Beispiel #1
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"):
        """
        Run the training loop for ``total_timesteps`` environment steps.

        Observations returned by the environment are indexed with the keys ``'observation'``,
        ``'desired_goal'`` and ``'achieved_goal'``; the policy is fed the concatenation of
        ``'observation'`` and ``'desired_goal'``. Transitions are buffered per episode and handed to
        the replay buffer in a single ``add`` call when the episode ends.

        :param total_timesteps: (int) number of environment steps to run
        :param callback: (callable or None) if given, called as ``callback(locals(), globals())`` at the
            end of every episode
        :param seed: value forwarded to ``self._setup_learn``
        :param log_interval: (int or None) dump tabular logs every ``log_interval`` episodes when
            ``self.verbose >= 1``; ``None`` disables the tabular logging
        :param tb_log_name: (str) run name handed to ``TensorboardWriter``
        :return: self
        """
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                # Fall back to annealing beta over the whole run when no explicit horizon is configured.
                # The schedule must be built in both cases: it is read unconditionally by the
                # prioritized sampling further down.
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size, hindsight=self.hindsight)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=1.0,
                                              final_p=self.exploration_final_eps)

            # NOTE: local names in this method are part of the callback contract, because the callback
            # receives locals(); do not rename them casually.
            episode_rewards = [0.0]
            episode_trans = []
            episode_replays = []
            # Sliding window over the outcome (1 = success, 0 = failure) of the last 100 episodes.
            episode_success = [0] * 100

            full_obs = self.env.reset()
            part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))

            reset = True
            self.episode_reward = np.zeros((1,))

            for step in range(total_timesteps):
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(part_obs)[None], update_eps=update_eps, **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Buffer the transition; the whole episode is pushed to the replay buffer once it ends.
                episode_replays.append((full_obs, action, rew, new_obs, float(done)))
                episode_trans.append((full_obs, action, rew, new_obs))
                full_obs = new_obs
                part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                                      step)

                episode_rewards[-1] += rew
                if done:
                    # An episode counts as successful when the final achieved goal equals the desired goal.
                    if np.array_equal(full_obs['achieved_goal'], full_obs['desired_goal']):
                        episode_success.append(1.)
                    else:
                        episode_success.append(0.)
                    # Drop the oldest entry so the window keeps its fixed length.
                    episode_success = episode_success[1:]

                    if not isinstance(self.env, VecEnv):
                        full_obs = self.env.reset()
                        part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))

                    self.replay_buffer.add(episode_replays)

                    if callback is not None:
                        callback(locals(), globals())

                    episode_rewards.append(0.0)
                    episode_trans = []
                    episode_replays = []
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + step) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess, options=run_options,
                                                                  run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % step)
                        else:
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess)
                        writer.add_summary(summary, step)
                    else:
                        _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                    # Metric: train on a separate batch drawn with the buffer's mtr_sample
                    obses_beg, obses_step, obses_fin, dist = self.replay_buffer.mtr_sample(self.batch_size)
                    self.mtr_train(obses_beg, obses_step, obses_fin, dist)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                # Mean over the (up to) 100 most recent *finished* episodes; the last entry is still running.
                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("100 episode success", np.mean(episode_success))
                    logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self
Beispiel #2
0
# Switch off two Grappler rewrite passes for the session used below.
config_proto.graph_options.rewrite_options.dependency_optimization = (
    rewriter_config_pb2.RewriterConfig.OFF)
config_proto.graph_options.rewrite_options.layout_optimizer = (
    rewriter_config_pb2.RewriterConfig.OFF)

sess = tf.Session(config=config_proto)
sess.run(tf.global_variables_initializer())

# Evaluate X and Y once; their values are fed back through the _X/_Y feed keys on every timed run.
X_, Y_ = sess.run([X, Y])
X_Y_ = X_ + Y_
_X_Y = _X + _Y

# The fetches and the feed mapping do not change between iterations, so build them once.
fetches = Z1 + Z2 + Z3
feed_dict = dict(zip(_X_Y, X_Y_))

tot_time = 0
for i in range(10):
    print(i)
    run_metadata = tf.RunMetadata()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                                output_partition_graphs=True)
    st = time.time()
    sess.run(fetches, feed_dict,
             options=run_options,
             run_metadata=run_metadata)
    tot_time += time.time() - st

    # The first two runs are not dumped.
    if i >= 2:
        # MessageToJson already returns serialized JSON text; write it as-is. Passing it through
        # json.dump would wrap the whole document in one escaped JSON string.
        jsonObj = MessageToJson(run_metadata)
        with open('%s/metadata_%d.json' % (logPath, i), 'w') as outfile:
            outfile.write(jsonObj)

        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
Beispiel #3
0
    def train(self, rnn_config, l_data_config, train_config, info_config, run):
        """Train the RNN over one or more sessions and return the collected metrics.

        A "session" is one graph / ``tf.Session`` pair. Mode ``'classic'`` uses a single session;
        mode ``'inc_lengths'`` uses one session per entry of ``train_config['mode']['in_seq_len']``
        and carries the trainable variables from one session to the next through a temporary
        checkpoint.

        :param rnn_config: RNN configuration; stored on ``self`` and passed to ``set_rnn_config``
        :param l_data_config: labelled-data configuration used to build the datasets / model
        :param train_config: training configuration (mode, learning rate, tau, algorithm, ...)
        :param info_config: logging configuration (timer, tensorboard, profiling, weight saving, ...)
        :param run: run identifier forwarded to ``self.save_weight_probs``
        :return: ``self.rnn.t_metrics.result_dict``
        :raises Exception: if ``train_config['mode']['name']`` is neither ``'inc_lengths'`` nor
            ``'classic'``
        """
        self.rnn_config = rnn_config
        self.info_config = info_config
        self.train_config = train_config
        set_rnn_config(rnn_config)
        set_info_config(info_config)

        self.timer = Timer(info_config['timer']['enabled'])
        print_config(rnn_config, train_config, l_data_config)
        temp_model_path = '../models/temp' + info_config[
            'filename'] + '_' + str(train_config['task_id'])
        pretrained_model_path = '../tr_models/' + str(
            train_config['pretraining']['path'])

        if train_config['mode']['name'] == 'inc_lengths':
            n_sessions = len(train_config['mode']['in_seq_len'])
        elif train_config['mode']['name'] == 'classic':
            n_sessions = 1
        else:
            raise Exception('training mode not understood')

        self.timer.start()
        set_train_config(train_config)
        # Sessions refer to training with different architectures. If one RNN is used throughout the training process
        # then only one session is created. Training with incremental sequence lengths for example requires multiple
        # RNNs, one for each sequence length. Evaluation datasets (validation and test) are always evaluated on a fixed
        # RNN, only the RNN structure used for the training set varies. current_epoch stores the total number of epochs
        # across sessions, while epoch counts the epochs within the current session.
        current_epoch = 0
        tau = self.train_config['tau']
        learning_rate = self.train_config['learning_rate']
        best_weight_probs_dict = None
        for session_idx in range(n_sessions):
            tf.reset_default_graph()
            if self.train_config['mode']['name'] == 'inc_lengths':
                max_epochs = self.train_config['mode']['max_epochs'][
                    session_idx]
                min_error = self.train_config['mode']['min_errors'][
                    session_idx]
                self.create_modificated_model(l_data_config, session_idx)
            elif self.train_config['mode']['name'] == 'classic':
                self.data_dict = load_dataset(l_data_config)
                l_data = LabeledData(l_data_config, self.data_dict)
                self.create_rnn(l_data, l_data_config)
                max_epochs = self.train_config['mode']['max_epochs']
                min_error = self.train_config['mode']['min_error']
            self.timer.restart('Graph creation')

            # Saver is used for restoring weights for new session if more than one is used for training
            model_saver = tf.train.Saver(var_list=tf.trainable_variables())
            with tf.Session() as sess:
                if info_config['profiling']['enabled']:
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                else:
                    options = tf.RunOptions(trace_level=tf.RunOptions.NO_TRACE)
                run_metadata = tf.RunMetadata()
                writer = tf.summary.FileWriter(
                    info_config['tensorboard']['path'] +
                    str(self.train_config['task_id']))
                sess.run(tf.global_variables_initializer())

                if session_idx != 0:
                    # Continue from the weights stored at the end of the previous session.
                    model_saver.restore(sess, temp_model_path)
                elif self.train_config['pretraining']['enabled'] == True:
                    self.optimistic_restore(sess, pretrained_model_path)
                    sess.run(self.rnn.init_op)
                self.timer.restart('Intialization')

                # Loading datasets into GPU (via tf.Variables)
                for key in self.data_dict:
                    sess.run(self.l_data.data[key]['load'],
                             feed_dict={
                                 self.l_data.data[key]['x_ph']:
                                 self.data_dict[key]['x'],
                                 self.l_data.data[key]['y_ph']:
                                 self.data_dict[key]['y']
                             })

                self.timer.restart('Loading data')

                traces = []

                for epoch in range(max_epochs):
                    if self.info_config['gradient']['evaluate']:
                        # NOTE: gradient-variance evaluation terminates the interpreter right after
                        # the first evaluation; no training happens in this mode.
                        self.save_gradient_variance(sess, epoch, tau)
                        quit()
                    # Evaluate performance on the different datasets and print some results on console
                    if current_epoch % info_config[
                            'calc_performance_every'] == 0:
                        self.rnn.t_metrics.retrieve_results(
                            sess, current_epoch, tau)
                        self.rnn.t_metrics.print(session_idx)

                    # Parenthesise the increment: `%` binds tighter than `+`, so without the
                    # parentheses the condition reads current_epoch + (1 % save_every) == 0,
                    # which is never true for save_every > 1.
                    if (current_epoch + 1) % info_config['save_weights'][
                            'save_every'] == 0:
                        self.save_weight_probs(
                            info_config['save_weights']['path'], current_epoch,
                            run, sess.run(self.rnn.get_weights_op))

                    if info_config['save_weights']['save_best']:
                        if self.rnn.t_metrics.best_va['is_current']:
                            best_weight_probs_dict = sess.run(
                                self.rnn.get_weights_op)

                    self.timer.restart('Metrics')

                    # Optionally store tensorboard summaries
                    if info_config['tensorboard']['enabled'] \
                            and current_epoch % info_config['tensorboard']['period'] == 0:
                        # All summaries are evaluated with the same feed: first batch, inference mode.
                        summary_feed = {
                            self.rnn.tau: (tau, ),
                            self.l_data.batch_idx: 0,
                            self.rnn.is_training: False
                        }
                        if info_config['tensorboard']['weights']:
                            weight_summary = sess.run(
                                self.rnn.weight_summaries,
                                feed_dict=summary_feed)
                            writer.add_summary(weight_summary, current_epoch)
                        if info_config['tensorboard']['gradients']:
                            gradient_summary = sess.run(
                                self.rnn.gradient_summaries,
                                feed_dict=summary_feed)
                            writer.add_summary(gradient_summary, current_epoch)
                        if info_config['tensorboard']['results']:
                            t_result_summaries = sess.run(
                                self.rnn.t_metric_summaries,
                                feed_dict=summary_feed)
                            writer.add_summary(t_result_summaries,
                                               current_epoch)
                        if info_config['tensorboard']['acts']:
                            act_summaries = sess.run(self.rnn.act_summaries,
                                                     feed_dict=summary_feed)
                            writer.add_summary(act_summaries, current_epoch)

                    self.timer.restart('Tensorboard')
                    # Train for one full epoch. First shuffle to create new minibatches from the given data and
                    # then do a training step for each minibatch.
                    # The learning rate is halved every 'learning_rate_tau' epochs.
                    if (current_epoch +
                            1) % self.train_config['learning_rate_tau'] == 0:
                        learning_rate /= 2

                    sess.run(self.l_data.data['tr']['shuffle'])
                    if 'c_ar' in self.train_config[
                            'algorithm'] or 'c_arm' in self.train_config[
                                'algorithm']:
                        sess.run(
                            self.rnn.assign_learning_rate,
                            feed_dict={self.rnn.learning_rate: learning_rate})
                    for minibatch_idx in range(
                            self.l_data.data['tr']['n_minibatches']):
                        if 'c_ar' in self.train_config['algorithm'] or 'c_arm' in self.train_config['algorithm']\
                                or 'log_der' in self.train_config['algorithm']:
                            # Average the gradient estimate over 'carm_iterations' fresh samples
                            # before applying a single update.
                            grads = []
                            for i in range(
                                    self.train_config['carm_iterations']):
                                sess.run(self.rnn.c_arm_sample_op)
                                gradients = sess.run(self.rnn.gradients,
                                                     feed_dict={
                                                         self.l_data.batch_idx:
                                                         minibatch_idx,
                                                         self.rnn.is_training:
                                                         True
                                                     })
                                if len(grads) == 0:
                                    # First sample: keep element 0 of every fetched entry.
                                    grads.extend(entry[0] for entry in gradients)
                                else:
                                    for j in range(len(grads)):
                                        if grads[j] is not None:
                                            grads[j] += gradients[j][0]
                            for j in range(len(grads)):
                                grads[j] /= self.train_config[
                                    'carm_iterations']
                            sess.run(self.rnn.train_b_op,
                                     feed_dict=dict(
                                         zip(self.rnn.gradient_ph, grads)))

                        else:
                            sess.run(self.rnn.train_b_op,
                                     feed_dict={
                                         self.rnn.learning_rate: learning_rate,
                                         self.rnn.tau: (tau, ),
                                         self.l_data.batch_idx: minibatch_idx,
                                         self.rnn.is_training: True
                                     },
                                     options=options,
                                     run_metadata=run_metadata)

                    if info_config['profiling']['enabled']:
                        traces.append(
                            timeline.Timeline(run_metadata.step_stats).
                            generate_chrome_trace_format())
                    current_epoch += 1
                    self.timer.restart('Training')

                # Optionally store the profiling traces collected during this session in files
                if info_config['profiling']['enabled']:
                    for trace_idx, trace in enumerate(traces):
                        path = info_config['profiling']['path'] + '_' + str(
                            current_epoch) + '_' + str(trace_idx)
                        with open(path + 'training.json', 'w') as f:
                            f.write(trace)

                # TODO: Clean the cell access code
                if info_config['cell_access']:
                    ca_1, ca_2 = sess.run([
                        self.rnn.layers[0].cell_access_mat,
                        self.rnn.layers[1].cell_access_mat
                    ],
                                          feed_dict={self.l_data.batch_idx: 0})
                    np.save(file='../nr/ca_1_' +
                            str(self.train_config['task_id']),
                            arr=ca_1)
                    np.save(file='../nr/ca_2_' +
                            str(self.train_config['task_id']),
                            arr=ca_2)
                model_saver.save(sess, temp_model_path)

            # Each session opens its own FileWriter, so close it here instead of only closing
            # the last one after the loop.
            writer.close()

        if info_config['save_weights']['save_best']:
            self.save_weight_probs(self.info_config['save_weights']['path'],
                                   'best', run, best_weight_probs_dict)
        return self.rnn.t_metrics.result_dict
Beispiel #4
0
def main():
    """Build the model from the module-level options ``a`` and run testing or training.

    In ``"test"`` mode a checkpoint is required; the display fetches are evaluated
    ``int(a.test_count / a.batch_size)`` times and written with ``save_images``. In any other
    mode the training op runs for up to ``max_steps`` steps, and summaries, display images,
    traces, progress lines and checkpoints are emitted at the frequencies configured on ``a``.

    Raises:
        Exception: if ``a.mode == "test"`` and no checkpoint was given.
    """
    # NOTE(review): the GPU index is hard-coded and overrides any value set by the caller.
    os.environ['CUDA_VISIBLE_DEVICES'] = '4'

    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(a.output_dir, exist_ok=True)

    if a.mode == "test" and a.checkpoint is None:
        raise Exception("checkpoint required for test mode")

    for k, v in a._get_kwargs():
        print(k, "=", v)

    # Persist the effective options next to the outputs.
    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    model = create_model(examples.inputs1, examples.inputs2, examples.inputs3, examples.inputs4, examples.targets, examples.inputs5
                         )
    with tf.name_scope("images"):
        display_fetches = {
            "targets": examples.targets,
            "outputs": model.outputs,
        }
    with tf.name_scope("inputs1_summary"):
        tf.summary.image("inputs1", examples.inputs1)
    with tf.name_scope("inputs2_summary"):
        tf.summary.image("inputs2", examples.inputs2)

    tf.summary.scalar("DUnet_loss_L1", model.DUnet_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The Supervisor only gets a log directory when tracing or summaries are requested;
    # checkpoints are written explicitly through `saver` below (saver=None on the Supervisor).
    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with sv.managed_session(config=config) as sess:
        print("parameter_count = ", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        # a.max_steps, when given, takes precedence over a.max_epochs.
        max_steps = 2 ** 32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            max_steps = int(a.test_count / a.batch_size)
            for i in range(max_steps):
                results = sess.run(display_fetches)
                print(results["outputs"].shape)
                save_images(results, i)
        else:
            start = time.time()

            def should(freq):
                # True every `freq` steps and on the final step; a non-positive freq disables
                # the action. `step` is looked up at call time, i.e. the current loop value.
                return freq > 0 and ((step + 1) % freq == 0 or step == max_steps - 1)

            for step in range(max_steps):
                # Decide once per step which side actions are due.
                do_trace = should(a.trace_freq)
                do_progress = should(a.progress_freq)
                do_summary = should(a.summary_freq)
                do_display = should(a.display_freq)
                do_save = should(a.save_freq)

                options = None
                run_metadata = None
                if do_trace:
                    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                # Only fetch what is going to be consumed on this step.
                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }
                if do_progress:
                    fetches["DUnet_loss_L1"] = model.DUnet_loss_L1

                if do_summary:
                    fetches["summary"] = sv.summary_op

                if do_display:
                    fetches["display"] = display_fetches

                results = sess.run(fetches, options=options, run_metadata=run_metadata)

                if do_summary:
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"], results["global_step"])

                if do_display:
                    print("saving display images")

                    save_images(results["display"], step=results["global_step"])

                if do_trace:
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(run_metadata, "step_%d" % results["global_step"])

                if do_progress:
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] / examples.steps_per_epoch)
                    train_step = (results["global_step"] - 1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (
                        train_epoch, train_step, rate, remaining / 60))
                    print("DUnet_loss_L1", results["DUnet_loss_L1"])

                if do_save:
                    print("saving model")
                    saver.save(sess, os.path.join(a.output_dir, "model"), global_step=sv.global_step)

                if sv.should_stop():
                    break
Beispiel #5
0
    def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1"):
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)
                                
            assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                "an instance of common.policies.ActorCriticPolicy."
                            
            with self.sess.as_default():
                self.adam.sync()
                
                # Prepare for rollouts
                seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)
                # seg_gen = filtered_traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, imbalance_limit = self.timesteps_per_actorbatch // 100, waste_limit=self.timesteps_per_actorbatch*10)
                # seg_gen = balanced_traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch, waste_limit=self.timesteps_per_actorbatch*10)
                
                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                
                # rolling buffer for episode lengths
                lenbuffer = deque(maxlen=100)
                # rolling buffer for episode rewards
                rewbuffer = deque(maxlen=100)
                
                self.episode_reward = np.zeros((self.n_envs,))
                
                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) == False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break
                    
                    if self.schedule == 'constant':
                        cur_lrmult = 1.0
                    elif self.schedule == 'linear':
                        cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                    else:
                        raise NotImplementedError
                    
                    logger.log("********** Iteration %i ************" % iters_so_far)
                    # logger.record_tabular("update_no", iters_so_far)
                    logger.logkv("update_no", iters_so_far)

                    
                    seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)
                    # seg = balanced_sample(seg_gen, self.timesteps_per_actorbatch, self.gamma, self.lam, waste_limit=self.timesteps_per_actorbatch*10)
                    
                    # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                    obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                    
                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                          seg["true_rew"].reshape((self.n_envs, -1)),
                                                                          seg["dones"].reshape((self.n_envs, -1)),
                                                                          writer, timesteps_so_far)
                        
                    # predicted value function before udpate
                    vpredbefore = seg["vpred"]
                    
                    # standardized advantage function estimate
                    atarg = (atarg - atarg.mean()) / atarg.std()
                    dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                      shuffle=not issubclass(self.policy, LstmPolicy))
                    optim_batchsize = self.optim_batchsize or obs_ph.shape[0]
                    
                    # set old parameter values to new parameter values
                    self.assign_old_eq_new(sess=self.sess)
                    logger.log("Optimizing...")
                    logger.log(fmt_row(13, self.loss_names))
                    
                    # Here we do a bunch of optimization epochs over the data
                    for k in range(self.optim_epochs):
                        # list of tuples, each of which gives the loss for a minibatch
                        losses = []
                        for i, batch in enumerate(dataset.iterate_once(optim_batchsize)):
                            steps = (timesteps_so_far +
                                     k * optim_batchsize +
                                     int(i * (optim_batchsize / len(dataset.data_map))))
                            if writer is not None:
                                # run loss backprop with summary, but once every 10 runs save the metadata
                                # (memory, compute time, ...)
                                if (1 + k) % 10 == 0:
                                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                                    run_metadata = tf.RunMetadata()
                                    summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                                 batch["atarg"], batch["vtarg"],
                                                                                 cur_lrmult, sess=self.sess,
                                                                                 options=run_options,
                                                                                 run_metadata=run_metadata)
                                    writer.add_run_metadata(run_metadata, 'step%d' % steps)
                                else:
                                    summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                                 batch["atarg"], batch["vtarg"],
                                                                                 cur_lrmult, sess=self.sess)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                       batch["atarg"], batch["vtarg"], cur_lrmult,
                                                                       sess=self.sess)
                                
                            self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                            losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(losses, axis=0)))
                        
                    logger.log("Evaluating losses...")
                    losses = []
                    for batch in dataset.iterate_once(optim_batchsize):
                        newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                        batch["vtarg"], cur_lrmult, sess=self.sess)
                        losses.append(newlosses)
                    mean_losses, _, _ = mpi_moments(losses, axis=0)
                    logger.log(fmt_row(13, mean_losses))
                    for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                        logger.record_tabular("loss_" + name, loss_val)
                    logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
                    
                    # local values
                    lrlocal = (seg["ep_lens"], seg["ep_rets"])
                    
                    # list of tuples
                    listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                    lens, rews = map(flatten_lists, zip(*listoflrpairs))
                    lenbuffer.extend(lens)
                    rewbuffer.extend(rews)
                    logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                    logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    timesteps_so_far += MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                    iters_so_far += 1
                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)
                    if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                        logger.dump_tabular()
                        
        return self
Beispiel #6
0
    def train_model(self, sess, max_iters):
        """Network training loop.

        Builds four losses from the network outputs (RPN classification,
        RPN box regression, R-CNN classification, R-CNN box regression),
        minimizes their sum with a momentum optimizer whose learning rate
        decays exponentially in staircase steps, and runs ``max_iters``
        update steps. Progress is printed every ``cfg.TRAIN.DISPLAY`` steps
        and a snapshot is written every ``cfg.TRAIN.SNAPSHOT_ITERS`` steps,
        plus once at the end if the last step was not already snapshotted.

        Args:
            sess: TensorFlow session used to initialize variables, run the
                update steps and write snapshots.
            max_iters: Number of update steps to run. With ``0`` no step is
                run and no snapshot is written.
        """

        data_layer = get_data_layer(self.roidb, self.imdb.num_classes)

        # RPN
        # classification loss
        rpn_cls_score = tf.reshape(
            self.net.get_output('rpn_cls_score_reshape'), [-1, 2])
        rpn_label = tf.reshape(self.net.get_output('rpn-data')[0], [-1])
        # Entries labelled -1 are excluded from the classification loss; the
        # same index set selects both the scores and the labels.
        rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
        rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_keep), [-1, 2])
        rpn_label = tf.reshape(tf.gather(rpn_label, rpn_keep), [-1])
        rpn_cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=rpn_cls_score, labels=rpn_label))

        # bounding box regression L1 loss
        rpn_bbox_pred = self.net.get_output('rpn_bbox_pred')
        rpn_bbox_targets = tf.transpose(
            self.net.get_output('rpn-data')[1], [0, 2, 3, 1])
        rpn_bbox_inside_weights = tf.transpose(
            self.net.get_output('rpn-data')[2], [0, 2, 3, 1])
        rpn_bbox_outside_weights = tf.transpose(
            self.net.get_output('rpn-data')[3], [0, 2, 3, 1])

        rpn_smooth_l1 = self._modified_smooth_l1(3.0, rpn_bbox_pred,
                                                 rpn_bbox_targets,
                                                 rpn_bbox_inside_weights,
                                                 rpn_bbox_outside_weights)
        rpn_loss_box = tf.reduce_mean(
            tf.reduce_sum(rpn_smooth_l1, reduction_indices=[1, 2, 3]))

        # R-CNN
        # classification loss
        cls_score = self.net.get_output('cls_score')
        label = tf.reshape(self.net.get_output('roi-data')[1], [-1])
        cross_entropy = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score,
                                                           labels=label))

        # bounding box regression L1 loss
        bbox_pred = self.net.get_output('bbox_pred')
        bbox_targets = self.net.get_output('roi-data')[2]
        bbox_inside_weights = self.net.get_output('roi-data')[3]
        bbox_outside_weights = self.net.get_output('roi-data')[4]

        smooth_l1 = self._modified_smooth_l1(1.0, bbox_pred, bbox_targets,
                                             bbox_inside_weights,
                                             bbox_outside_weights)
        loss_box = tf.reduce_mean(
            tf.reduce_sum(smooth_l1, reduction_indices=[1]))

        # final loss
        loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box

        # optimizer and learning rate
        global_step = tf.Variable(0, trainable=False)
        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE,
                                        global_step,
                                        cfg.TRAIN.STEPSIZE,
                                        0.1,
                                        staircase=True)
        momentum = cfg.TRAIN.MOMENTUM
        train_op = tf.train.MomentumOptimizer(lr, momentum).minimize(
            loss, global_step=global_step)

        # initialize variables
        sess.run(tf.global_variables_initializer())
        if self.pretrained_model is not None:
            print(('Loading pretrained model '
                   'weights from {:s}').format(self.pretrained_model))
            self.net.load(self.pretrained_model, sess, self.saver, True)

        last_snapshot_iter = -1
        # Pre-seed the loop variable so the trailing snapshot check is well
        # defined (and a no-op) when ``max_iters`` is 0.
        step = -1
        timer = Timer()
        for step in range(max_iters):
            # get one batch
            blobs = data_layer.forward()

            # Make one SGD update
            feed_dict = {
                self.net.data: blobs['data'],
                self.net.im_info: blobs['im_info'],
                self.net.keep_prob: 0.5,
                self.net.gt_boxes: blobs['gt_boxes'],
            }

            run_options = None
            run_metadata = None
            if cfg.TRAIN.DEBUG_TIMELINE:
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

            timer.tic()

            (rpn_loss_cls_value, rpn_loss_box_value, loss_cls_value,
             loss_box_value, _) = sess.run(
                 [
                     rpn_cross_entropy, rpn_loss_box, cross_entropy, loss_box,
                     train_op
                 ],
                 feed_dict=feed_dict,
                 options=run_options,
                 run_metadata=run_metadata)

            timer.toc()

            if cfg.TRAIN.DEBUG_TIMELINE:
                # One Chrome-trace file per step, named by the wall-clock
                # time in milliseconds. ``int`` replaces the Python-2-only
                # ``long`` and yields the same digits on both versions.
                trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                trace_path = (str(int(time.time() * 1000)) +
                              '-train-timeline.ctf.json')
                with open(trace_path, 'w') as trace_file:
                    trace_file.write(
                        trace.generate_chrome_trace_format(show_memory=False))

            if (step + 1) % (cfg.TRAIN.DISPLAY) == 0:
                total_loss_value = (rpn_loss_cls_value + rpn_loss_box_value +
                                    loss_cls_value + loss_box_value)
                # Evaluate the learning rate in the training session rather
                # than relying on an ambient default session.
                print('iter: %d / %d, total loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, loss_cls: %.4f, loss_box: %.4f, lr: %f' %
                      (step + 1, max_iters, total_loss_value,
                       rpn_loss_cls_value, rpn_loss_box_value,
                       loss_cls_value, loss_box_value,
                       lr.eval(session=sess)))
                print('speed: {:.3f}s / iter'.format(timer.average_time))

            if (step + 1) % cfg.TRAIN.SNAPSHOT_ITERS == 0:
                last_snapshot_iter = step
                self.snapshot(sess, step)

        # Snapshot the final step unless the periodic snapshot already did.
        if last_snapshot_iter != step:
            self.snapshot(sess, step)
Beispiel #7
0
def word2vec_basic(log_dir):
    """Example of building, training and visualizing a word2vec model.

    Downloads ``text8.zip`` into the system temp directory if it is not
    already there, builds a 50000-word vocabulary, trains a skip-gram model
    with NCE loss, writes TensorBoard summaries, a checkpoint and an
    embedding-projector config into ``log_dir``, and finally tries to save a
    t-SNE plot of the first 500 embeddings to ``tsne.png`` in the temp
    directory.

    The nested ``generate_batch`` declares ``global data_index``, so a
    module-level ``data_index`` must exist before this function is called.

    Args:
        log_dir: Directory for TensorBoard files; created if missing.

    Raises:
        Exception: If the downloaded file does not have the expected size.
    """
    # Create the directory for TensorBoard variables if there is not.
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    # Step 1: Download the data.
    url = 'http://mattmahoney.net/dc/'

    # pylint: disable=redefined-outer-name
    def maybe_download(filename, expected_bytes):
        """Download a file if not present, and make sure it's the right size."""
        local_filename = os.path.join(gettempdir(), filename)
        if not os.path.exists(local_filename):
            local_filename, _ = urllib.request.urlretrieve(
                url + filename, local_filename)
        statinfo = os.stat(local_filename)
        if statinfo.st_size == expected_bytes:
            print('Found and verified', filename)
        else:
            print(statinfo.st_size)
            raise Exception('Failed to verify ' + local_filename +
                            '. Can you get to it with a browser?')
        return local_filename

    filename = maybe_download('text8.zip', 31344016)

    # Read the data into a list of strings.
    def read_data(filename):
        """Extract the first file enclosed in a zip file as a list of words."""
        with zipfile.ZipFile(filename) as f:
            data = tf.compat.as_str(f.read(f.namelist()[0])).split()
        return data

    vocabulary = read_data(filename)
    print('Data size', len(vocabulary))

    # Step 2: Build the dictionary and replace rare words with UNK token.
    vocabulary_size = 50000

    def build_dataset(words, n_words):
        """Process raw inputs into a dataset.

        Args:
            words: Sequence of word strings.
            n_words: Vocabulary size including the 'UNK' entry.

        Returns:
            A tuple ``(data, count, dictionary, reversed_dictionary)``:
            ``data`` is ``words`` encoded as integer codes (0 for any word
            outside the vocabulary), ``count`` is a list of word/count pairs
            whose first entry is 'UNK', ``dictionary`` maps word to code and
            ``reversed_dictionary`` maps code to word.
        """
        count = [['UNK', -1]]
        count.extend(collections.Counter(words).most_common(n_words - 1))
        dictionary = {}
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = []
        unk_count = 0
        for word in words:
            index = dictionary.get(word, 0)
            if index == 0:  # dictionary['UNK']
                unk_count += 1
            data.append(index)
        # The placeholder -1 is replaced by the real number of unknown words.
        count[0][1] = unk_count
        reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reversed_dictionary

    # Names filled in here (locals of this function, shared with the nested
    # helpers below through their closures):
    # data - list of codes (integers from 0 to vocabulary_size-1).
    #   This is the original text but words are replaced by their codes
    # count - list of word/count pairs, 'UNK' first, then most common first
    # unused_dictionary - map of words(strings) to their codes(integers)
    # reverse_dictionary - maps codes(integers) to words(strings)
    data, count, unused_dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    print('Most common words (+UNK)', count[:5])
    print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

    # Step 3: Function to generate a training batch for the skip-gram model.
    def generate_batch(batch_size, num_skips, skip_window):
        """Return one ``(batch, labels)`` pair of skip-gram training examples.

        Reads ``data`` from the enclosing scope and advances the module-level
        ``data_index`` cursor, wrapping around at the end of ``data``.

        Args:
            batch_size: Number of examples; must be a multiple of
                ``num_skips``.
            num_skips: Context words sampled per centre word; at most
                ``2 * skip_window``.
            skip_window: Number of words considered on each side of the
                centre word.

        Returns:
            ``batch`` of shape ``(batch_size,)`` holding centre-word codes and
            ``labels`` of shape ``(batch_size, 1)`` holding context-word
            codes, both ``int32``.
        """
        global data_index
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
        if data_index + span > len(data):
            data_index = 0
        buffer.extend(data[data_index:data_index + span])
        data_index += span
        # Window positions other than the centre; identical for every window,
        # so it is built once rather than per iteration.
        context_words = [w for w in range(span) if w != skip_window]
        for i in range(batch_size // num_skips):
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                # maxlen=span makes the deque drop its oldest element, which
                # slides the window forward by one word.
                buffer.append(data[data_index])
                data_index += 1
        # Backtrack a little bit to avoid skipping words in the end of a batch
        data_index = (data_index + len(data) - span) % len(data)
        return batch, labels

    batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])

    # Step 4: Build and train a skip-gram model.

    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    num_sampled = 64  # Number of negative examples to sample.

    # We pick a random validation set to sample nearest neighbors. Here we limit
    # the validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent. These 3 variables are used only for
    # displaying model accuracy, they don't affect calculation.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    graph = tf.Graph()

    with graph.as_default():

        # Input data.
        with tf.name_scope('inputs'):
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            with tf.name_scope('embeddings'):
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0,
                                      1.0))
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)

            # Construct the variables for the NCE loss
            with tf.name_scope('weights'):
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 /
                                        math.sqrt(embedding_size)))
            with tf.name_scope('biases'):
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        with tf.name_scope('loss'):
            loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               labels=train_labels,
                               inputs=embed,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))

        # Add the loss value as a scalar to summary.
        tf.summary.scalar('loss', loss)

        # Construct the SGD optimizer using a learning rate of 1.0.
        with tf.name_scope('optimizer'):
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all
        # embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)

        # Merge all summaries.
        merged = tf.summary.merge_all()

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Create a saver.
        saver = tf.train.Saver()

    # Step 5: Begin training.
    num_steps = 100001

    with tf.Session(graph=graph) as session:
        # Open a writer to write summaries.
        writer = tf.summary.FileWriter(log_dir, session.graph)

        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(
                batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }

            # Define metadata variable.
            run_metadata = tf.RunMetadata()

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            # Also, evaluate the merged op to get all summaries from the returned
            # "summary" variable. Feed metadata variable to session for visualizing
            # the graph in TensorBoard.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict,
                                               run_metadata=run_metadata)
            average_loss += loss_val

            # Add returned summaries to writer in each step.
            writer.add_summary(summary, step)
            # Add metadata to visualize the graph for the last run.
            if step == (num_steps - 1):
                writer.add_run_metadata(run_metadata, 'step%d' % step)

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000
                # batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    # Index 0 of the sorted order is skipped: it is expected
                    # to be the validation word itself.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()

        # Write corresponding labels for the embeddings.
        with open(os.path.join(log_dir, 'metadata.tsv'), 'w') as f:
            for i in range(vocabulary_size):
                f.write(reverse_dictionary[i] + '\n')

        # Save the model for checkpoints.
        saver.save(session, os.path.join(log_dir, 'model.ckpt'))

        # Create a configuration for visualizing embeddings with the labels in
        # TensorBoard.
        config = projector.ProjectorConfig()
        embedding_conf = config.embeddings.add()
        embedding_conf.tensor_name = embeddings.name
        embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
        projector.visualize_embeddings(writer, config)

    writer.close()

    # Step 6: Visualize the embeddings.

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.
    def plot_with_labels(low_dim_embs, labels, filename):
        # ``plt`` is bound by the import in the try block below, which runs
        # before this helper is called.
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)

    try:
        # pylint: disable=g-import-not-at-top
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt

        tsne = TSNE(perplexity=30,
                    n_components=2,
                    init='pca',
                    n_iter=5000,
                    method='exact')
        plot_only = 500
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in range(plot_only)]
        plot_with_labels(low_dim_embs, labels,
                         os.path.join(gettempdir(), 'tsne.png'))

    except ImportError as ex:
        print(
            'Please install sklearn, matplotlib, and scipy to show embeddings.'
        )
        print(ex)
Beispiel #8
0
    def run(self, sess):
        """Train ``self.model`` in ``sess`` and save checkpoints as it goes.

        Does nothing when ``self.init`` is falsy. Otherwise it optimizes
        ``self.model.loss`` with Adam at a constant learning rate, optionally
        resuming from the latest checkpoint under the model's ``train``
        directory, and saves a checkpoint whenever the training loss
        improves, every ``args.save_interval`` steps, when the validation
        error is at most 0.08, and once more after the last step. Every
        ``args.val_interval`` steps a sample of the current predictions is
        drawn with matplotlib and a validation batch is evaluated.

        The train summary writer and ``sess`` are closed before returning.

        Args:
            sess: TensorFlow session to train in; closed on return.

        Raises:
            NotImplementedError: If ``args.lr_scheduler`` is not ``None``.
        """
        if not self.init:
            return
        train_data = data_parser(self.args)

        self.model.setup_training(sess)
        if self.args.lr_scheduler is not None:
            # Interpolate the scheduler name; passing it as a second
            # exception argument would leave the '%s' unformatted.
            raise NotImplementedError(
                'Learning rate scheduler type [{}] is not implemented'.format(
                    self.args.lr_scheduler))
        learning_rate = tf.constant(self.args.learning_rate,
                                    dtype=tf.float16)
        opt = tf.train.AdamOptimizer(learning_rate)
        trainG = opt.minimize(self.model.loss)  # like hed
        saver = tf.train.Saver(max_to_keep=7)

        sess.run(tf.global_variables_initializer())

        model_root = os.path.join(
            self.args.checkpoint_dir,
            self.args.model_name + '_' + self.args.train_dataset)

        # here to recovery previous training
        if self.args.use_previous_trained:
            # The previous checkpoints live in the 'train' sub-directory
            # regardless of the dataset name.
            model_path = os.path.join(model_root, 'train')
            if not os.path.exists(model_path) or len(
                    os.listdir(model_path)) == 0:
                ini = 0
                maxi = self.args.max_iterations + 1
                print_warning(
                    'There is not previous trained data for the current model... and'
                )
                print_warning(
                    '*** The training process is starting from scratch ***')
            else:
                # restoring using the last checkpoint
                last_ckpt = tf.train.latest_checkpoint(model_path)
                saver.restore(sess, last_ckpt)
                ini = self.args.max_iterations
                maxi = ini + self.args.max_iterations + 1  # check
                print_info(
                    '--> Previous model restored successfully: {}'.format(
                        last_ckpt))
        else:
            print_warning(
                '*** The training process is starting from scratch ***')
            ini = 0
            maxi = ini + self.args.max_iterations
        prev_loss = 1000.
        # directories for checkpoints
        checkpoint_dir = os.path.join(model_root, self.args.model_state)
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_prefix = os.path.join(checkpoint_dir, self.args.model_name)

        fig = plt.figure()
        # Stays None if the range below is empty, so the final save can be
        # skipped instead of failing on unbound ``idx``/``loss``.
        idx = None
        for idx in range(ini, maxi):

            x_batch, y_batch, _ = get_training_batch(self.args, train_data)
            # NOTE(review): this object is never passed to sess.run, so it is
            # logged below without any trace having been recorded into it.
            run_metadata = tf.RunMetadata()

            _, summary, loss, pred_maps = sess.run(
                [
                    trainG, self.model.merged_summary, self.model.loss,
                    self.model.predictions
                ],
                feed_dict={
                    self.model.images: x_batch,
                    self.model.edgemaps: y_batch
                })
            if idx % 5 == 0:
                self.model.train_writer.add_run_metadata(
                    run_metadata, 'step{:06}'.format(idx))
                self.model.train_writer.add_summary(summary, idx)
                print(time.ctime(), '[{}/{}]'.format(idx, maxi),
                      ' TRAINING loss: %.5f' % loss,
                      'prev_loss: %.5f' % prev_loss)

            # saving trained parameters
            if prev_loss > loss:
                saver.save(sess, checkpoint_prefix, global_step=idx)
                prev_loss = loss
                print("Weights saved in the lowest loss", idx, " Current Loss",
                      prev_loss)

            if idx % self.args.save_interval == 0:
                saver.save(sess, checkpoint_prefix, global_step=idx)
                prev_loss = loss
                print("Weights saved in the interval", idx, " Current Loss",
                      prev_loss)

            # ********* for validation **********
            if (idx + 1) % self.args.val_interval == 0:
                pause_show = 0.01
                # Show the third sample of the batch: input image, ground
                # truth, then each prediction map.
                imgs_list = [x_batch[2][:, :, 0:3], y_batch[2]]
                for pred_map in pred_maps:
                    imgs_list.append(pred_map[2, ...])
                vis_imgs = visualize_result(imgs_list, self.args)
                fig.suptitle("Iterac:" + str(idx + 1) + " Loss:" +
                             '%.5f' % loss + " training")
                fig.add_subplot(1, 1, 1)
                plt.imshow(np.uint8(vis_imgs))

                print("Evaluation in progress...")
                plt.draw()
                plt.pause(pause_show)

                im, em, _ = get_validation_batch(self.args, train_data)
                summary, error, pred_val = sess.run(
                    [
                        self.model.merged_summary, self.model.error,
                        self.model.fuse_output
                    ],
                    feed_dict={
                        self.model.images: im,
                        self.model.edgemaps: em
                    })
                if error <= 0.08:
                    saver.save(sess, checkpoint_prefix, global_step=idx)
                    prev_loss = loss
                    print(
                        "Parameters saved in the validation stage when its error is <=0.08::",
                        error)

                self.model.val_writer.add_summary(summary, idx)
                print_info(('[{}/{}]'.format(idx, self.args.max_iterations),
                            'VALIDATION error: %0.5f' % error,
                            'pError: %.5f' % prev_loss))
                if (idx + 1) % (self.args.val_interval * 150) == 0:
                    # Start a fresh figure periodically.
                    print('updating visualisation')
                    plt.close()
                    fig = plt.figure()

        if idx is not None:
            saver.save(sess, checkpoint_prefix, global_step=idx)
            print("Final Weights saved", idx, " Current Loss", loss)
        self.model.train_writer.close()
        sess.close()
Beispiel #9
0
def main(_):
    """Train a softmax-regression MNIST model and trace its final step.

    Builds a single linear layer (784 inputs -> 10 logits), trains it for
    1000 gradient-descent steps on batches of 100 examples, and prints the
    accuracy on the MNIST test set. XLA JIT compilation is switched on when
    ``FLAGS.xla`` is truthy. The last training step is executed with full
    tracing and its timeline is written to ``timeline.ctf.json`` in Chrome
    trace format (viewable at chrome://tracing/).

    Args:
        _: Ignored; present so the function can be used as an app entry
            point that passes ``argv``.
    """
    # Import data (labels are one-hot encoded).
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, w) + b

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    config = tf.ConfigProto()
    jit_level = 0
    if FLAGS.xla:
        # Turns on XLA JIT compilation.
        jit_level = tf.OptimizerOptions.ON_1

    config.graph_options.optimizer_options.global_jit_level = jit_level
    run_metadata = tf.RunMetadata()
    sess = tf.Session(config=config)
    tf.global_variables_initializer().run(session=sess)

    # Train
    train_loops = 1000
    for i in range(train_loops):
        batch_xs, batch_ys = mnist.train.next_batch(100)
        feed = {x: batch_xs, y_: batch_ys}

        # Create a timeline for the last loop and export to json to view with
        # chrome://tracing/.
        if i == train_loops - 1:
            sess.run(
                train_step,
                feed_dict=feed,
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            # A context manager guarantees the trace is flushed and the file
            # handle released even if serialisation raises.
            with open('timeline.ctf.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format())
        else:
            sess.run(train_step, feed_dict=feed)

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(
        sess.run(accuracy,
                 feed_dict={
                     x: mnist.test.images,
                     y_: mnist.test.labels
                 }))
    sess.close()
Beispiel #10
0
def model(X_train,
          Y_train,
          X_test,
          Y_test,
          learning_rate=0.003,
          num_epochs=50,
          minibatch_size=64,
          print_cost=True,
          log_dir="/anaconda3/dnn/con4"):
    """Train the network built by ``forward_propagation`` and report accuracy.

    The graph is assembled from the sibling helpers ``create_placeholders``,
    ``initialize_parameters``, ``forward_propagation`` and ``compute_cost``,
    optimised with Adam, and instrumented for TensorBoard: in every epoch the
    10th minibatch is run with full tracing and the 20th minibatch is used to
    write the merged summaries (that step only evaluates, it does not apply
    an optimizer update).

    Args:
        X_train: Training inputs; ``X_train.shape`` must unpack into four
            values ``(m, n_H0, n_W0, n_C0)``.
        Y_train: Training labels; ``Y_train.shape[1]`` is the class count.
        X_test: Test inputs.
        Y_test: Test labels.
        learning_rate: Learning rate handed to ``tf.train.AdamOptimizer``.
        num_epochs: Number of passes over the training set.
        minibatch_size: Minibatch size handed to ``random_mini_batches``.
        print_cost: When truthy, print the cost after each epoch and record
            it for the final plot.
        log_dir: Directory for the TensorBoard event file. Defaults to the
            path that used to be hard-coded.

    Returns:
        Tuple ``(train_accuracy, test_accuracy, parameters)``.
    """
    ops.reset_default_graph(
    )  # to be able to rerun the model without overwriting tf variables
    tf.set_random_seed(1)  # to keep results consistent (tensorflow seed)
    seed = 3  # to keep results consistent (numpy seed)
    (m, n_H0, n_W0, n_C0) = X_train.shape
    n_y = Y_train.shape[1]
    costs = []  # To keep track of the cost

    # Create Placeholders of the correct shape
    with tf.name_scope("input"):
        X, Y = create_placeholders(n_H0, n_W0, n_C0, n_y)

    with tf.name_scope('input_image'):
        # NOTE(review): the reshape is hard-coded to 64x64x3 and the summary
        # picks sample index 5, so every summarised batch needs at least six
        # images of that size -- confirm against the data set in use.
        image_in = tf.reshape(X, [-1, 64, 64, 3])
        tf.summary.image('input_image', [image_in[5, :, :, :]], 5)

    # Initialize parameters
    parameters = initialize_parameters()

    # Forward propagation: Build the forward propagation in the tensorflow graph
    Z3 = forward_propagation(X, parameters)

    # Cost function: Add cost function to tensorflow graph
    cost = compute_cost(Z3, Y)
    # Register the cost summary once, on the tensor, before merge_all().
    # Creating a summary op per minibatch (as was done previously) grows the
    # graph on every step and those late ops are never part of `merged`.
    tf.summary.scalar('temp_cost', cost)

    # Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer that minimizes the cost.
    with tf.name_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(cost)

    # Initialize all the variables globally
    init = tf.global_variables_initializer()
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())

    # number of minibatches of size minibatch_size in the train set
    # (loop-invariant, so computed once)
    num_minibatches = int(m / minibatch_size)

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:

        # Run the initialization
        sess.run(init)

        # Do the training loop
        for epoch in range(num_epochs):

            minibatch_cost = 0.
            seed = seed + 1  # reshuffle differently in every epoch
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size,
                                              seed)
            for batch_index, minibatch in enumerate(minibatches, start=1):
                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch
                feed = {X: minibatch_X, Y: minibatch_Y}

                if batch_index == 10:
                    # Training step with full tracing; the run metadata is
                    # stored in the event file under the epoch number.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, temp_cost = sess.run([optimizer, cost],
                                            feed_dict=feed,
                                            options=run_options,
                                            run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, str(epoch))
                elif batch_index == 20:
                    # Summary step: evaluates without an optimizer update.
                    # Keep the fetched cost so that this minibatch contributes
                    # its own value to the epoch average instead of re-adding
                    # the previous minibatch's cost.
                    summary, temp_cost = sess.run([merged, cost],
                                                  feed_dict=feed)
                    writer.add_summary(summary, epoch)
                else:
                    # Plain training step.
                    _, temp_cost = sess.run([optimizer, cost], feed_dict=feed)

                minibatch_cost += temp_cost / num_minibatches

            # Print and record the cost every epoch
            if print_cost:
                print("Cost after epoch %i: %f" % (epoch, minibatch_cost))
                costs.append(minibatch_cost)

        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per tens)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        # Calculate the correct predictions
        predict_op = tf.argmax(Z3, 1)
        correct_prediction = tf.equal(predict_op, tf.argmax(Y, 1))

        # Calculate accuracy on the train and test sets
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        train_accuracy = accuracy.eval({X: X_train, Y: Y_train})
        test_accuracy = accuracy.eval({X: X_test, Y: Y_test})
        print("Train Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        writer.close()
        return train_accuracy, test_accuracy, parameters
Beispiel #11
0
def timeGraph(gdef,
              batch_size=128,
              image_folder='images',
              latencyMS=30,
              powerCapW=50,
              result_file="ResultsLog.txt"):
    """Run all images through ``gdef`` while tuning batch size and GPU clock.

    Every file matching ``image_folder/*.*`` is fed once through the imported
    graph. Roughly every three seconds the peak of ``powerReading`` is
    compared with ``powerCapW``:

    * within [0.8 * cap, cap]: nothing changes;
    * below 0.8 * cap: the batch size is bisected upwards, or, when it is
      already at its maximum, the GPU clock is raised one step;
    * above the cap: the batch size is bisected downwards, or, when it is
      already at its minimum, the GPU clock is lowered one step.

    One CSV line per batch is written to ``result_file``.

    NOTE(review): ``powerReading``, ``N`` and ``_parse_function`` are not
    defined in this function; presumably they are module-level names (with
    ``powerReading`` filled by a separate power monitor) -- confirm.
    ``max(powerReading)`` raises ``ValueError`` if no reading is available.

    Args:
        gdef: GraphDef to import; it must expose an ``input`` node and a
            ``MobilenetV1/Predictions/Softmax`` operation.
        batch_size: Not used by this function.
        image_folder: Directory that is globbed for input images.
        latencyMS: Not used by this function.
        powerCapW: Power cap that the readings are compared against.
        result_file: Path of the CSV log that is (over)written.

    Returns:
        Tuple ``(timings, True, val[0], None)`` where ``timings`` holds the
        wall-clock seconds of each batch and ``val[0]`` is the output of the
        last batch.
    """
    tf.logging.info("Starting execution")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
    tf.reset_default_graph()
    g = tf.Graph()

    outlist = []
    with g.as_default():
        imageString = tf.placeholder(tf.string, name='imageString')
        imagenstack = tf.stack(imageString)
        batch_size_dynamic = tf.placeholder(tf.int64,
                                            shape=(),
                                            name='batch_size_dynamic')
        dataset = tf.data.Dataset.from_tensor_slices(imagenstack)
        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size_dynamic)
        dataset = dataset.repeat()
        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()
        out = tf.import_graph_def(
            graph_def=gdef,
            input_map={"input": next_element},
            return_elements=["MobilenetV1/Predictions/Softmax"])
        out = out[0].outputs[0]
        outlist.append(out)

    timings = []

    def apply_dvfs_level(level):
        """Set the GPU application clock to ``level`` through nvidia-smi."""
        # NOTE(review): the sudo password is piped in clear text through a
        # shell command line; consider a sudoers rule instead.
        os.system(
            "echo sudo_password | sudo -S nvidia-smi --applications-clocks=3615,"
            + str(level))

    # The results file shares the session's `with` so that it is closed (and
    # flushed) on every exit path, including exceptions.
    with tf.Session(graph=g,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess, \
            open(result_file, 'w') as resultsFile:
        processed_image = 0
        StringofImage = list(glob.glob(image_folder + '/*.*'))
        imageCounter = len(StringofImage)

        # Batch sizes 1..255; the *BS variables below are indices into this
        # list, not batch sizes themselves.
        listBatchSizeSteps = [x for x in range(1, 256)]
        MAXINDEX = len(listBatchSizeSteps) - 1
        MININDEX = 0
        minBS = MININDEX
        maxBS = MAXINDEX
        initialBS = 0
        BestBS = MININDEX

        ThroughputPerSecond = [-1] * N

        newbatchSize = listBatchSizeSteps[initialBS]

        resultsFile.write(
            "time stamp, Througput (image/sec), power, batch size, minBS, maxBS, BestBS, power cap, DVFS\n"
        )

        listDVFSSteps = [544, 632, 734, 835, 949, 1063, 1189, 1303, 1430, 1531]
        minDVFS = 0
        maxDVFS = 9
        initialDVFS = 5
        currentDVFS = 5

        DVFSLevel = listDVFSSteps[initialDVFS]
        apply_dvfs_level(DVFSLevel)

        two_second_start = time.time()
        while True:

            if (time.time() - two_second_start) > 3:
                two_second_start = time.time()

                # Take the peak once so that all three comparisons below see
                # the same value, then discard the consumed readings.
                peakPower = max(powerReading)
                powerReading[:] = []

                if 0.8 * powerCapW <= peakPower <= powerCapW:
                    # Power cap is met; the 0.8 factor is a margin that
                    # avoids constant fluctuation.
                    pass

                elif peakPower < (0.8 * powerCapW):  # Power less than power cap
                    if BestBS == MAXINDEX:
                        print("Max BS, no further improvement")
                        if currentDVFS != maxDVFS:
                            currentDVFS = currentDVFS + 1  # increase DVFS by one step
                            DVFSLevel = listDVFSSteps[currentDVFS]
                            apply_dvfs_level(DVFSLevel)

                    elif BestBS == maxBS:
                        # Reached the top of the current window: reopen it up
                        # to the global maximum.
                        minBS = maxBS
                        maxBS = MAXINDEX
                        BestBS = int(math.ceil((minBS + maxBS) / float(2)))
                        newbatchSize = listBatchSizeSteps[BestBS]

                    else:
                        # Bisect towards the upper bound.
                        minBS = BestBS
                        BestBS = int(math.ceil((minBS + maxBS) / float(2)))
                        newbatchSize = listBatchSizeSteps[BestBS]

                elif peakPower > powerCapW:  # Power more than power cap
                    if BestBS == MININDEX:
                        print("Min BS, No Possible Solution")
                        if currentDVFS != minDVFS:
                            currentDVFS = currentDVFS - 1  # decrease DVFS by one step
                            DVFSLevel = listDVFSSteps[currentDVFS]
                            apply_dvfs_level(DVFSLevel)

                    elif BestBS == minBS:
                        # Stuck at the bottom of the current window: reopen
                        # it down to the global minimum.
                        maxBS = minBS
                        minBS = MININDEX
                        BestBS = int(math.floor((minBS + maxBS) / float(2)))
                        newbatchSize = listBatchSizeSteps[BestBS]
                    else:
                        # Bisect towards the lower bound.
                        maxBS = BestBS
                        BestBS = int(math.floor((minBS + maxBS) / float(2)))
                        newbatchSize = listBatchSizeSteps[BestBS]

            # For last run to make sure that the batch size is not greater than the remaining number of images
            if (processed_image + newbatchSize) > imageCounter:
                print("Entered If for processed_image")
                newbatchSize = imageCounter - processed_image

            # Take the next `newbatchSize` file names off the front of the
            # list in one step (replaces a loop of O(n) pop(0) calls).
            StringImage_2 = StringofImage[:newbatchSize]
            del StringofImage[:newbatchSize]
            tstart = time.time()

            batch_feed = {
                batch_size_dynamic: newbatchSize,
                imageString: StringImage_2
            }
            sess.run(iterator.initializer, feed_dict=batch_feed)
            val = sess.run(outlist, feed_dict=batch_feed)

            timings.append(time.time() - tstart)

            printLables = 0  # SET TO ONE FOR LABELS TO BE PRINTED
            if printLables == 1:
                if os.path.exists('resultLables.txt'):
                    append_write = 'a'  # append if already exists
                else:
                    append_write = 'w'  # make a new file if not
                with open('resultLables.txt', append_write) as highscore:
                    for index1 in range(0, len(topX(val[0], f.topN)[1])):
                        highscore.write(
                            str(getLabels(labels,
                                          topX(val[0], f.topN)[1][index1])))
                        highscore.write("\n")

            # Images per second: convert the batch time to milliseconds,
            # divide by the number of processed images, then invert.
            ThroughputPerSecond.append(
                1000 / ((timings[-1] * 1000) / newbatchSize))
            if len(powerReading) == 0:
                # Give the power source a moment to deliver a sample.
                time.sleep(1)
            resultsFile.write(
                str(time.time()) + "," + str(ThroughputPerSecond[-1]) + "," +
                str(max(powerReading)) + "," + str(newbatchSize) + "," +
                str(minBS) + "," + str(maxBS) + "," + str(BestBS) + "," +
                str(powerCapW) + "," + str(DVFSLevel) + "\n")

            processed_image = processed_image + newbatchSize
            if processed_image == imageCounter:
                break

        tf.logging.info("Timing loop done!")
        return timings, True, val[0], None
def main(_):
    """Train a softmax-regression MNIST model with the NPU graph optimizer.

    A single linear layer (784 inputs -> 10 logits) is trained for 1000
    gradient-descent steps on batches of 100 examples using integer class
    labels. The final step runs with full tracing and its timeline is written
    to ``/tmp/timeline.ctf.json`` in Chrome trace format. The test-set
    accuracy is printed at the end.

    Args:
        _: Ignored; present so the function can be used as an app entry
            point that passes ``argv``.
    """
    mnist = input_data.read_data_sets(FLAGS.data_dir)

    # Linear model: logits = images . weights + bias
    images = tf.placeholder(tf.float32, [None, 784])
    weights = tf.Variable(tf.zeros([784, 10]))
    bias = tf.Variable(tf.zeros([10]))
    logits = tf.matmul(images, weights) + bias

    # Sparse (integer) labels. The loss is computed on the raw logits, which
    # is numerically more stable than applying softmax and log by hand.
    labels = tf.placeholder(tf.int64, [None])
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits)
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Register the custom "NpuOptimizer" graph rewriter with its
    # "use_off_line" flag switched on (offline training).
    config = tf.ConfigProto()
    npu_optimizer = config.graph_options.rewrite_options.custom_optimizers.add()
    npu_optimizer.name = "NpuOptimizer"
    npu_optimizer.parameter_map["use_off_line"].b = True

    run_metadata = tf.RunMetadata()
    sess = tf.compat.v1.Session(config=config)
    tf.global_variables_initializer().run(session=sess)

    # Train
    train_loops = 1000
    final_step = train_loops - 1
    for step in range(train_loops):
        batch_images, batch_labels = mnist.train.next_batch(100)
        feed = {images: batch_images, labels: batch_labels}

        if step != final_step:
            sess.run(train_step, feed_dict=feed)
            continue

        # Last step only: collect a full trace and export it as JSON for
        # chrome://tracing/.
        sess.run(train_step,
                 feed_dict=feed,
                 options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                 run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('/tmp/timeline.ctf.json', 'w') as trace_file:
            trace_file.write(trace.generate_chrome_trace_format())

    # Evaluate the trained model on the test split.
    hits = tf.equal(tf.argmax(logits, 1), labels)
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))
    test_feed = {images: mnist.test.images, labels: mnist.test.labels}
    print(sess.run(accuracy, feed_dict=test_feed))
    sess.close()
Beispiel #13
0
def train():
    """Train a two-layer MNIST classifier with TensorBoard summaries.

    The data-set files are looked up below ``$VH_INPUTS_DIR`` (default
    ``/tmp/tensorflow/mnist/inputs``) and copied into the current working
    directory before being loaded. A 784-500-10 network with dropout is
    trained with Adam for ``FLAGS.max_steps`` steps. Every 10th step the
    test-set accuracy is evaluated and printed as a JSON line; every step
    whose index ends in 99 is additionally run with full tracing. Finally
    all layer weights and biases are written as CSV files to
    ``$VH_OUTPUTS_DIR`` (default ``/tmp/tensorflow/mnist/outputs``).
    """
    # Import input data
    INPUTS_DIR = os.getenv('VH_INPUTS_DIR', '/tmp/tensorflow/mnist/inputs')
    data_set_files = [
        get_first_file(os.path.join(INPUTS_DIR, 'training-set-images')),
        get_first_file(os.path.join(INPUTS_DIR, 'training-set-labels')),
        get_first_file(os.path.join(INPUTS_DIR, 'test-set-images')),
        get_first_file(os.path.join(INPUTS_DIR, 'test-set-labels')),
    ]
    train_dir = os.getcwd()
    for data_file in data_set_files:
        copy2(data_file, train_dir)
    mnist = input_data.read_data_sets(train_dir, fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()

    # Create a multilayer model.

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    # Collected by nn_layer so the trained values can be exported at the end.
    all_weights = []
    all_biases = []

    def nn_layer(input_tensor,
                 input_dim,
                 output_dim,
                 layer_name,
                 act=tf.nn.relu):
        """Reusable code for making a simple neural net layer.

        It does a matrix multiply, bias add, and then applies ``act`` (relu
        by default) to nonlinearize. It also sets up name scoping so that the
        resultant graph is easy to read, and adds a number of summary ops.
        """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            # This Variable will hold the state of the weights for the layer
            with tf.name_scope('weights'):
                weights = weight_variable([input_dim, output_dim])
                variable_summaries(weights)
                all_weights.append(weights)
            with tf.name_scope('biases'):
                biases = bias_variable([output_dim])
                variable_summaries(biases)
                all_biases.append(biases)
            with tf.name_scope('Wx_plus_b'):
                preactivate = tf.matmul(input_tensor, weights) + biases
                tf.summary.histogram('pre_activations', preactivate)
            activations = act(preactivate, name='activation')
            tf.summary.histogram('activations', activations)
            return activations

    hidden1 = nn_layer(x, 784, 500, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    # Output layer produces raw logits, hence the identity activation.
    y = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        with tf.name_scope('total'):
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)

    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train \
            .AdamOptimizer(FLAGS.learning_rate) \
            .minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out below FLAGS.log_dir
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()

    # Train the model, and also write summaries.
    # Every 10th step, measure test-set accuracy, and write test summaries
    # All other steps, run train_step on training data, & add training summaries

    def feed_dict(training):
        """Make a TensorFlow feed_dict: maps data onto Tensor placeholders."""
        if training or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            k = FLAGS.dropout
        else:
            # Evaluation uses the whole test set with dropout disabled.
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps):

        if i % 10 == 0:
            # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print(json.dumps({'step': i, 'accuracy': acc.item()}))
        else:
            # Record train set summaries, and train
            if i % 100 == 99:
                # Record execution stats
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:
                # Record a summary
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True))
                train_writer.add_summary(summary, i)

    # Final test-set evaluation after the last training step.
    _, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
    print(json.dumps({'step': FLAGS.max_steps, 'accuracy': acc.item()}))

    train_writer.close()
    test_writer.close()

    # Saving weights and biases as outputs of the task.
    outputs_dir = os.getenv('VH_OUTPUTS_DIR', '/tmp/tensorflow/mnist/outputs')
    # np.savetxt does not create missing directories, so make sure the
    # target exists before writing.
    if not os.path.isdir(outputs_dir):
        os.makedirs(outputs_dir)
    for i, ws in enumerate(all_weights):
        filename = os.path.join(outputs_dir, 'layer-{}-weights.csv'.format(i))
        np.savetxt(filename, ws.eval(), delimiter=",")
    for i, bs in enumerate(all_biases):
        filename = os.path.join(outputs_dir, 'layer-{}-biases.csv'.format(i))
        np.savetxt(filename, bs.eval(), delimiter=",")

    # Release the resources held by the interactive session now that the
    # last `.eval()` has run.
    sess.close()
Beispiel #14
0
def train(num_epochs=20):
    """Train (or fine-tune) the MobileNet-v1 based classifier.

    Batches are pulled from a prefetch queue and fed back in through the
    ``x`` / ``y`` placeholders. With ``args.fine_tune`` set, the latest
    checkpoint from ``neuguen_model`` is restored and results go to
    ``neuguen_model_fine_tune``; otherwise the pretrained MobileNet weights
    are restored and results go to ``neuguen_model``. Summaries are written
    whenever ``step % 100 == 99``, a full run trace is added whenever
    ``step % 10000 == 9999``, and a checkpoint is saved after every epoch.

    Args:
        num_epochs: Number of passes over the training set. Defaults to 20,
            the value that used to be hard-coded.
    """
    dataset, testset = data_provider.config_to_slim_dataset(
        config=TRAINING_CONFIG, dataset_dir="./")

    # training data
    prefetch_queue = data_provider.slim_dataset_to_prefetch_queue(
        dataset, BATCH_SIZE)
    face_batch, label_batch = prefetch_queue.dequeue()
    face_batch = tf.cast(face_batch, tf.float32)

    tf.summary.image("face", face_batch[0:16], max_outputs=16)

    x = tf.placeholder(tf.uint8, shape=(None, 224, 224, 3))
    y = tf.placeholder(tf.int64, shape=(None, 1))

    if args.fine_tune:
        logit, trainable, total_reg_losses, _ = model_build.build_mobilenet_v1_debug(
            x, mobilenet_training=True, neuguen_training=True)
        print("fine tune")
    else:
        logit, trainable, total_reg_losses, _ = model_build.build_mobilenet_v1_debug(
            x)
    tf.summary.scalar("regularization_loss", tf.reduce_sum(total_reg_losses))

    loss = model_build.build_loss(logit, y)
    tf.summary.scalar("cross_entropy_loss", loss)

    loss = loss + tf.reduce_sum(total_reg_losses)
    tf.summary.scalar("total_loss", loss)

    global_step = tf.train.create_global_step()
    train_op = model_build.build_train_op(loss, trainable, global_step)
    for var in tf.global_variables():
        tf.summary.histogram(var.op.name, var)

    correct_prediction = tf.equal(tf.squeeze(y), tf.argmax(logit, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("batch_accuracy", accuracy)
    confusion_matrix_op = tf.confusion_matrix(tf.squeeze(y),
                                              tf.argmax(logit, 1))

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    neuguen_saver = tf.train.Saver(max_to_keep=10)
    merge_summary = tf.summary.merge_all()

    save_path_fine_tune = "neuguen_model_fine_tune"
    if not os.path.exists(save_path_fine_tune):
        os.makedirs(save_path_fine_tune)
    save_path = "neuguen_model"
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # Summaries and checkpoints of a run go to the same directory.
    output_dir = save_path_fine_tune if args.fine_tune else save_path
    checkpoint_prefix = os.path.join(output_dir, "neuguen.ckpt")
    steps_per_epoch = int(TRAINING_CONFIG["training_size"] / BATCH_SIZE)
    # Fetches shared by every training step; the merged summary is prepended
    # on logging steps.
    train_fetches = [loss, accuracy, confusion_matrix_op, train_op]

    with tf.Session(config=session_config) as session:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        summary_writer = None

        try:
            session.run(tf.global_variables_initializer())

            summary_writer = tf.summary.FileWriter(output_dir, session.graph)
            if args.fine_tune:
                model_build.restore_last_checkpoint(session, save_path)
            else:
                model_build.restore_pretrained_mobilenet(session)

            for j in xrange(num_epochs):
                confusion_matrix = np.array([[0., 0.], [0., 0.]])
                accuracy_avg = 0.0

                for i in xrange(steps_per_epoch):
                    faces, labels, step = session.run(
                        [face_batch, label_batch, global_step])
                    feed = {x: faces, y: labels}

                    if step % 100 == 99:
                        # Logging step; every 100th of these also records a
                        # full execution trace.
                        with_trace = step % 10000 == 9999
                        run_kwargs = {}
                        if with_trace:
                            run_metadata = tf.RunMetadata()
                            run_kwargs = {
                                "options": tf.RunOptions(
                                    trace_level=tf.RunOptions.FULL_TRACE),
                                "run_metadata": run_metadata,
                            }
                        summary, loss_value, accuracy_value, confusion, _ = session.run(
                            [merge_summary] + train_fetches,
                            feed_dict=feed,
                            **run_kwargs)
                        summary_writer.add_summary(summary, step)
                        if with_trace:
                            summary_writer.add_run_metadata(
                                run_metadata, "step{0}".format(step))
                    else:
                        loss_value, accuracy_value, confusion, _ = session.run(
                            train_fetches, feed_dict=feed)

                    confusion_matrix = confusion_matrix + confusion
                    # Incremental mean of the batch accuracies of this epoch.
                    accuracy_avg = accuracy_avg + (accuracy_value -
                                                   accuracy_avg) / (i + 1)
                    sys.stdout.write(
                        "\r{0}--{1} training accuracy(ma):{2}    ".format(
                            j, i, accuracy_avg))
                    sys.stdout.flush()
                print("")
                print(confusion_matrix)

                neuguen_saver.save(session,
                                   checkpoint_prefix,
                                   global_step=global_step)
        finally:
            # Flush pending summaries and stop the queue-runner threads even
            # when training is interrupted by an exception.
            if summary_writer is not None:
                summary_writer.close()
            print("thread.join")
            coord.request_stop()
            coord.join(threads)
Beispiel #15
0
def main():
    """Build the multi-GPU TSN training graph, then train, validate and checkpoint.

    Steps performed, in order:

    1. Parse command-line arguments and merge the optional config file into
       ``cfg``.
    2. On the CPU: create ``global_step``, an exponentially decayed learning
       rate, a momentum optimizer and the train/validation dataset iterators.
    3. For every GPU listed in ``cfg.GPUS``: build an InceptionV2 tower on its
       slice of the batch and collect that tower's gradients.
    4. Average the tower gradients and apply them in a single ``train_step``.
    5. Build a validation tower on GPU 0 that reuses the training variables.
    6. Restore pretrained weights, then run ``cfg.TRAIN.MAX_ITE`` iterations,
       running validation and saving a checkpoint every 100 iterations.

    Raises:
        ValueError: if ``cfg.INPUT.MODALITY`` is neither ``'rgb'`` nor
            ``'flow'``.
    """
    # ------------- Parse arguments -------------
    args = _parse_args()
    if args.cfg_file is not None:
        # Read args.cfg_file and merge its contents into cfg
        cfg_from_file(args.cfg_file)
    pprint.pprint(cfg)

    # ------------- Task-level configuration -------------
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # The name must be spelled exactly CUDA_VISIBLE_DEVICES; a misspelled
    # variable is silently ignored and the GPU selection never takes effect.
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPUS
    tf.logging.set_verbosity(tf.logging.INFO)  # set the log level

    # ------------- Build the graph -------------
    with tf.device('/cpu:0'):
        # Bookkeeping ops and the input pipeline are placed on the CPU
        global_step = tf.get_variable('global_step', [],
                                      dtype=None,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE_BASE,
                                        global_step,
                                        cfg.TRAIN.DECAY_STEP,
                                        cfg.TRAIN.DECAY_RATE,
                                        staircase=True)  # learning rate
        tf.summary.scalar('learnrate', lr)
        opt = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)  # optimizer
        num_gpus = len(cfg.GPUS.split(','))
        # Build the datasets and fetch their iterators
        reader.set_param(
            cfg.INPUT.DATA_DIR,
            cfg.INPUT.MODALITY,  # flow is read slightly differently from rgb
            cfg.VALID.SPLIT_PATH,
            cfg.VALID.BATCH_SIZE,
            num_segments=cfg.INPUT.NUM_SEGMENTS,
            new_length=cfg.INPUT.NEW_LENGTH,
            train_split_path=cfg.INPUT.SPLIT_PATH,
            train_batch_size=cfg.TRAIN.BATCH_SIZE,
            isTraining=True)
        ite_train, ite_valid = reader.get_dataset_iter()
        tsn_batch, label_batch = ite_train.get_next()
        tsn_batch_splits = tf.split(tsn_batch,
                                    num_or_size_splits=num_gpus,
                                    axis=0)
        label_batch_splits = tf.split(label_batch,
                                      num_or_size_splits=num_gpus,
                                      axis=0)

        tsn_valid_batch, label_valid_batch = ite_valid.get_next()

    # Integer division: these values are used as tensor dimensions, which must
    # be ints (true division would yield a float shape entry).
    per_gpu_batch = cfg.TRAIN.BATCH_SIZE // num_gpus
    frames_per_sample = cfg.INPUT.NUM_SEGMENTS * cfg.INPUT.NEW_LENGTH

    # Run training on the GPUs (towers in parallel)
    tower_grads = []
    with tf.variable_scope(tf.get_variable_scope(
    )) as vscope:  # see https://github.com/tensorflow/tensorflow/issues/6220
        for i in range(num_gpus):
            with tf.device('/gpu:%d' % i), tf.name_scope('GPU_%d' %
                                                         i) as scope:
                # Fetch this tower's data; tsn_batch layout:
                # (batch_size/num_gpus*num_seg*new_length) * h * w * num_channels
                tsn_batch_split, label_batch_split = tsn_batch_splits[
                    i], label_batch_splits[i]
                if cfg.INPUT.MODALITY == 'rgb':
                    tsn_batch_split = tf.reshape(tsn_batch_split, [
                        per_gpu_batch * frames_per_sample, 224, 224, 3
                    ])
                elif cfg.INPUT.MODALITY == 'flow':
                    tsn_batch_split = tf.reshape(tsn_batch_split, [
                        per_gpu_batch * frames_per_sample, 224, 224, 2
                    ])
                else:
                    raise ValueError("modality must be one of rgb or flow")

                # Build the network and run the forward pass
                with slim.arg_scope(inception_v2_arg_scope()):
                    logits, _ = inception_v2(
                        tsn_batch_split,
                        num_classes=cfg.NUM_CLASSES,
                        is_training=True,
                        dropout_keep_prob=cfg.TRAIN.DROPOUT_KEEP_PROB,
                        min_depth=16,
                        depth_multiplier=1.0,
                        prediction_fn=slim.softmax,
                        spatial_squeeze=True,
                        reuse=None,
                        scope='InceptionV2',
                        global_pool=False)
                # Every later tower (and the validation tower) shares these
                # variables instead of creating new ones.
                tf.get_variable_scope().reuse_variables()
                # Regroup per-frame logits by sample (TSN-specific layout)
                logits = tf.reshape(
                    logits, [per_gpu_batch, frames_per_sample, -1])
                # Average the outputs over the sampled frames
                logits = tf.reduce_mean(logits, 1)
                # Accuracy on this tower's batch
                prediction = tf.nn.softmax(logits)
                acc_batch = tf.reduce_mean(
                    tf.cast(
                        tf.equal(tf.argmax(prediction, 1),
                                 tf.argmax(label_batch_split, 1)), tf.float32))
                tf.summary.scalar('acc_on_batch', acc_batch)
                # Loss. Weight variables are first added to the
                # tf.GraphKeys.WEIGHTS collection so they can be regularized;
                # this must happen before the regularized loss is built.
                # NOTE(review): this loop runs once per tower, so with more
                # than one GPU the same variables are appended to the
                # collection repeatedly -- confirm tsn_loss tolerates that.
                for variable in tf.global_variables():
                    if variable.name.find('weights') > 0:
                        tf.add_to_collection(tf.GraphKeys.WEIGHTS, variable)
                loss = tsn_loss(logits, label_batch_split, regularization=True)
                tf.summary.scalar('loss', loss)
                # Compute this tower's gradients and collect them
                grads_and_vars = opt.compute_gradients(
                    loss, var_list=tf.trainable_variables(
                    ))  # list of (gradient, variable) pairs
                tower_grads.append(grads_and_vars)
    grads_and_vars = average_gradients(tower_grads)  # average over the GPUs
    train_step = opt.apply_gradients(grads_and_vars, global_step=global_step)

    # Run validation on a single GPU (serially)
    with tf.variable_scope(tf.get_variable_scope(
    )) as vscope:  # see https://github.com/tensorflow/tensorflow/issues/6220
        with tf.device('/gpu:0'), tf.name_scope('VALID') as scope:
            tf.get_variable_scope().reuse_variables()
            # 25 frames per validation sample is hard-coded here
            # (presumably what the reader emits for validation -- TODO confirm)
            if cfg.INPUT.MODALITY == 'rgb':
                tsn_valid_batch = tf.reshape(
                    tsn_valid_batch, [cfg.VALID.BATCH_SIZE * 25, 224, 224, 3])
            elif cfg.INPUT.MODALITY == 'flow':
                tsn_valid_batch = tf.reshape(
                    tsn_valid_batch, [cfg.VALID.BATCH_SIZE * 25, 224, 224, 2])
            else:
                raise ValueError("modality must be one of rgb or flow")

            with slim.arg_scope(inception_v2_arg_scope()):
                logits_valid, _ = inception_v2(
                    tsn_valid_batch,
                    num_classes=cfg.NUM_CLASSES,
                    is_training=False,
                    dropout_keep_prob=cfg.TRAIN.DROPOUT_KEEP_PROB,
                    min_depth=16,
                    depth_multiplier=1.0,
                    prediction_fn=slim.softmax,
                    spatial_squeeze=True,
                    reuse=None,
                    scope='InceptionV2',
                    global_pool=False)
            # Regroup per-frame logits by sample (TSN-specific layout)
            logits_valid = tf.reshape(
                logits_valid, [cfg.VALID.BATCH_SIZE, 25, -1])
            # Average the outputs over the sampled frames
            logits_valid = tf.reduce_mean(logits_valid, 1)
            # Accuracy on the validation batch
            prediction_valid = tf.nn.softmax(logits_valid)
            acc_valid_batch = tf.reduce_mean(
                tf.cast(
                    tf.equal(tf.argmax(prediction_valid, 1),
                             tf.argmax(label_valid_batch, 1)), tf.float32))

    merged = tf.summary.merge_all()

    # Saver: covers variables under 'InceptionV2' except the final
    # 'Conv2d_1c_1x1' layer and the optimizer's Momentum slots, so the
    # pretrained checkpoint's classifier layer is not loaded.
    # NOTE(review): the same saver is used for saving below, so the saved
    # checkpoints omit the 'Conv2d_1c_1x1' layer as well -- confirm intended.
    model_variables_map = {}
    for variable in tf.global_variables():
        if variable.name.split('/')[0] == 'InceptionV2' and variable.name.find(
                'Conv2d_1c_1x1') == -1 and variable.name.find(
                    'Momentum') == -1:
            model_variables_map[variable.name.replace(':0', '')] = variable
    print('####################################################')
    for i in model_variables_map.keys():
        print(i)
    print('#####################################################')
    saver_model = tf.train.Saver(
        var_list=model_variables_map,
        max_to_keep=20)

    # ------------- Start the session -------------
    # (train, periodically predict on the validation set and report accuracy)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    config = tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        joint_writer = tf.summary.FileWriter(cfg.SUMMARY_DIR, sess.graph)
        summary_writer = tf.summary.FileWriter(cfg.SUMMARY_DIR, sess.graph)

        # Initialize all variables, then overwrite with the pretrained weights
        tf.global_variables_initializer().run()
        saver_model.restore(sess, cfg.TRAIN.PRETRAINED_MODEL_NAME)

        sess.graph.finalize()
        start_time = time.time()
        for i in range(cfg.TRAIN.MAX_ITE):
            _, learnrate, loss_value, step, summary = sess.run(
                [train_step, lr, loss, global_step, merged],
                options=run_options,
                run_metadata=run_metadata)
            if i == 0:
                # Restart the clock after the first (warm-up) step
                start_time = time.time()
            if i % 10 == 0:
                if i >= 1:
                    end_time = time.time()
                    avg_time = (end_time - start_time) / float(i + 1)
                    print("Average time consumed per step is %0.2f secs." %
                          avg_time)
                print(
                    "After %d training step(s), learning rate is %g, loss on training batch is %g."
                    % (step, learnrate, loss_value))

            # Validate and save a checkpoint every 100 training iterations
            if i % 100 == 0:
                print('#############################################')
                print('valid and save model')
                accs = []
                # 849 validation batches are hard-coded (presumably the size
                # of the validation split in batches -- TODO confirm)
                for j in range(849):
                    acc = sess.run(acc_valid_batch)
                    accs.append(acc)
                print(len(accs))
                acc_valid = np.mean(np.array(accs))
                print('accuracy on validation set is %0.4f' % acc_valid)
                print('saving model...')
                saver_model.save(sess,
                                 cfg.TRAIN.SAVED_MODEL_PATTERN,
                                 global_step=global_step)
                print('successfully saved !')
                print('#############################################')

            joint_writer.add_run_metadata(run_metadata, 'step%03d' % i)
            summary_writer.add_summary(summary, i)

    # Flush and release both event-file writers
    joint_writer.close()
    summary_writer.close()
Beispiel #16
0
def train(model, x_train, y_train, x_validation, y_validation,
          epochs_list, name,
          batch_size=64,
          learning_rate=1e-3,
          lr_decay_ratio=0.1,
          data_augmentation=True):
    """Compile ``model`` with Nesterov SGD and fit it under a step-decay schedule.

    The module-level flags ``profiling`` and ``mem_stat`` (defined outside
    this function) switch on TensorFlow run tracing and memory reporting.

    Args:
        model: Keras model to compile and train.
        x_train, y_train: training inputs and targets.
        x_validation, y_validation: data evaluated as ``validation_data``.
        epochs_list: epoch milestones; the learning rate is multiplied by
            ``lr_decay_ratio`` for each leading milestone already reached,
            and the last entry is the total number of epochs.
        name: prefix prepended to the checkpoint file name pattern.
        batch_size: mini-batch size.
        learning_rate: initial SGD learning rate.
        lr_decay_ratio: multiplicative decay applied per reached milestone.
        data_augmentation: when true, train from an ``ImageDataGenerator``
            with horizontal flips and width/height shifts.
    """
    opt = keras.optimizers.SGD(lr=learning_rate, momentum=0.9, nesterov=True)

    # Arguments shared by both compile variants; profiling adds the
    # session-run tracing options on top.
    compile_args = {
        'loss': 'categorical_crossentropy',
        'optimizer': opt,
        'metrics': ['accuracy'],
    }
    run_options = None
    run_metadata = None
    if profiling:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        compile_args['options'] = run_options
        compile_args['run_metadata'] = run_metadata
    model.compile(**compile_args)

    # Checkpoint file name embeds the epoch and the monitored metrics
    filepath = name + 'model-ep{epoch:04d}-loss{loss:.3f}-acc{acc:.3f}-val_loss{val_loss:.3f}-val_acc{val_acc:.3f}.h5'
    checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, period=10)

    def schedule(epoch):
        """Return the learning rate for ``epoch`` after step decay."""
        rate = learning_rate
        for milestone in epochs_list:
            # Stop at the first milestone that has not been reached yet
            if epoch < milestone:
                return rate
            rate *= lr_decay_ratio
        return rate

    lr_scheduler = LearningRateScheduler(schedule, verbose=1)

    if data_augmentation:
        print('Using real-time data augmentation.')
        # Preprocessing and real-time augmentation of the training images
        datagen = ImageDataGenerator(
            horizontal_flip=True,
            width_shift_range=0.125,
            height_shift_range=0.125,
            fill_mode='constant')
        datagen.fit(x_train)

        # Train on the batches produced by datagen.flow()
        batches = datagen.flow(x_train, y_train, batch_size=batch_size)
        model.fit_generator(batches,
                            steps_per_epoch=x_train.shape[0] // batch_size,
                            epochs=epochs_list[-1],
                            callbacks=[lr_scheduler, checkpoint],
                            validation_data=(x_validation, y_validation))
    else:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs_list[-1],
                  validation_data=(x_validation, y_validation),
                  callbacks=[lr_scheduler, checkpoint],
                  shuffle=True)

    if mem_stat:
        sess = K.get_session()
        # Peak usage first, then current usage
        print(sess.run(tf.contrib.memory_stats.MaxBytesInUse()))
        print(sess.run(tf.contrib.memory_stats.BytesInUse()))

    if profiling:
        # Dump the collected step statistics as a Chrome trace file
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('timeline.densenet.json', 'w') as f:
            f.write(trace.generate_chrome_trace_format(show_memory=True))
Beispiel #17
0
def main():
    """Train a WaveNetModel on batches dequeued from an AudioReader.

    Parses the command-line arguments, validates the log/restore directories,
    builds the input reader, network, loss and optimizer, restores the latest
    checkpoint if there is one, and then runs the training loop. Summaries
    are written every step; a full trace is stored every 50 steps when
    ``args.store_metadata`` is set; a checkpoint is saved every
    ``args.checkpoint_every`` steps and once more on exit if the last
    completed step has not been saved yet.

    The coordinator is always asked to stop and its threads are joined, even
    when the training loop fails with an exception.
    """
    args = get_arguments()

    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Build the input pipeline.
    with tf.name_scope('create_inputs'):
        # Allow silence trimming to be skipped by specifying a threshold near
        # zero.
        silence_threshold = (args.silence_threshold
                             if args.silence_threshold > EPSILON else None)
        gc_enabled = args.gc_channels is not None
        reader = AudioReader(
            audio_dir=args.data_dir,
            coord=coord,
            sample_rate=wavenet_params["sample_rate"],
            gc_enabled=gc_enabled,
            receptive_field=WaveNetModel.calculate_receptive_field(
                wavenet_params["filter_width"], wavenet_params["dilations"],
                wavenet_params["scalar_input"],
                wavenet_params["initial_filter_width"]),
            sample_size=args.sample_size,
            mfsc_dim=wavenet_params["MFSC_channels"],
            ap_dim=wavenet_params["AP_channels"],
            F0_dim=wavenet_params["F0_channels"],
            phone_dim=wavenet_params["phones_channels"],
            phone_pos_dim=wavenet_params["phone_pos_channels"],
            silence_threshold=silence_threshold)

        ap_mfsc_batch, lc_batch = reader.dequeue(args.batch_size)
        if gc_enabled:
            gc_id_batch = reader.dequeue_gc(args.batch_size)
        else:
            gc_id_batch = None

    # Create network.
    net = WaveNetModel(
        batch_size=args.batch_size,
        dilations=wavenet_params["dilations"],
        filter_width=wavenet_params["filter_width"],
        residual_channels=wavenet_params["residual_channels"],
        dilation_channels=wavenet_params["dilation_channels"],
        skip_channels=wavenet_params["skip_channels"],
        use_biases=wavenet_params["use_biases"],
        scalar_input=wavenet_params["scalar_input"],
        initial_filter_width=wavenet_params["initial_filter_width"],
        histograms=args.histograms,
        global_condition_channels=args.gc_channels,
        global_condition_cardinality=reader.gc_category_cardinality,
        MFSC_channels=wavenet_params["MFSC_channels"],
        AP_channels=wavenet_params["AP_channels"],
        F0_channels=wavenet_params["F0_channels"],
        phone_channels=wavenet_params["phones_channels"],
        phone_pos_channels=wavenet_params["phone_pos_channels"])

    # A strength of exactly 0 means "no L2 regularization" (passed as None)
    if args.l2_regularization_strength == 0:
        args.l2_regularization_strength = None
    loss = net.loss(
        input_batch=ap_mfsc_batch,
        lc_batch=lc_batch,
        global_condition_batch=gc_id_batch,
        l2_regularization_strength=args.l2_regularization_strength)
    optimizer = optimizer_factory[args.optimizer](
        learning_rate=args.learning_rate, momentum=args.momentum)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up logging for TensorBoard.
    writer = tf.summary.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.summary.merge_all()

    # Set up session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=args.max_checkpoints)

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1

    except:
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    step = None
    last_saved_step = saved_global_step
    try:
        try:
            for step in range(saved_global_step + 1, args.num_steps):
                start_time = time.time()
                if args.store_metadata and step % 50 == 0:
                    # Slow run that stores extra information for debugging.
                    print('Storing metadata')
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    summary, loss_value, _ = sess.run(
                        [summaries, loss, optim],
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_summary(summary, step)
                    writer.add_run_metadata(run_metadata,
                                            'step_{:04d}'.format(step))
                    tl = timeline.Timeline(run_metadata.step_stats)
                    timeline_path = os.path.join(logdir, 'timeline.trace')
                    with open(timeline_path, 'w') as f:
                        f.write(
                            tl.generate_chrome_trace_format(show_memory=True))
                else:
                    summary, loss_value, _ = sess.run(
                        [summaries, loss, optim])
                    writer.add_summary(summary, step)
                duration = time.time() - start_time
                if step % 10 == 0:
                    print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                        step, loss_value, duration))
                if step % args.checkpoint_every == 0:
                    save(saver, sess, logdir, step)
                    last_saved_step = step

        except KeyboardInterrupt:
            # Introduce a line break after ^C is displayed so save message
            # is on its own line.
            print()
        # step is still None when the loop body never ran (nothing left to
        # train); comparing None with an int raises TypeError on Python 3.
        if step is not None and step > last_saved_step:
            save(saver, sess, logdir, step)
    finally:
        # Stop the input threads on every exit path so a failure in the loop
        # does not leave them running.
        coord.request_stop()
        coord.join(threads)
Beispiel #18
0
def main():
    """Run the image-to-image model in export, test or training mode.

    Behaviour is driven by the module-level options object ``a``:

    * Seeds TensorFlow, NumPy and ``random`` (drawing a seed if none given)
      and writes the effective options to ``options.json`` in
      ``a.output_dir``.
    * ``a.mode == "export"``: builds a standalone generator graph that takes
      a base64 PNG and returns a base64 PNG/JPEG, restores the checkpoint and
      writes ``export.meta`` plus the exported weights, then returns.
    * ``a.mode == "test"``: runs at most one pass over the examples, saving
      the images and an index.
    * otherwise: trains for ``max_steps`` steps, with summaries, traces,
      progress output, display images and checkpoints each emitted at their
      own ``a.*_freq`` frequency.

    Raises:
        Exception: if test/export mode is requested without a checkpoint, or
            if ``a.output_filetype`` is not ``"png"`` or ``"jpeg"`` in
            export mode.
    """
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # export the generator to a meta graph that can be imported later for standalone generation

        # Named input_placeholder so the builtin input() is not shadowed
        input_placeholder = tf.placeholder(tf.string, shape=[1])
        input_data = tf.decode_base64(input_placeholder[0])
        input_image = tf.image.decode_png(input_data)
        # remove alpha channel if present
        input_image = input_image[:, :, :3]
        input_image = tf.image.convert_image_dtype(input_image,
                                                   dtype=tf.float32)
        input_image.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(input_image, axis=0)

        with tf.variable_scope("generator"):
            batch_output = deprocess(
                create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output,
                                                    dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        # Record the tensor names of the graph's inputs/outputs in
        # collections so a consumer of the meta graph can look them up.
        key = tf.placeholder(tf.string, shape=[1])
        inputs = {"key": key.name, "input": input_placeholder.name}
        tf.add_to_collection("inputs", json.dumps(inputs))
        outputs = {
            "key": tf.identity(key).name,
            "output": output.name,
        }
        tf.add_to_collection("outputs", json.dumps(outputs))

        init_op = tf.global_variables_initializer()
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(
                filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess,
                              os.path.join(a.output_dir, "export"),
                              write_meta_graph=False)

        return

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    net1 = vgg16.Vgg16()
    net2 = vgg16.Vgg16()
    model = create_model(examples.inputs, examples.targets, net1, net2)

    # undo colorization splitting on images that we use for display/output

    inputs = deprocess(examples.inputs)
    targets = deprocess(examples.targets)
    outputs = deprocess(model.outputs)

    def convert(image):
        """Optionally rescale to ``a.aspect_ratio`` and convert to uint8."""
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(
                image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths":
            examples.paths,
            "inputs":
            tf.map_fn(tf.image.encode_png,
                      converted_inputs,
                      dtype=tf.string,
                      name="input_pngs"),
            "targets":
            tf.map_fn(tf.image.encode_png,
                      converted_targets,
                      dtype=tf.string,
                      name="target_pngs"),
            "outputs":
            tf.map_fn(tf.image.encode_png,
                      converted_outputs,
                      dtype=tf.string,
                      name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image(
            "predict_real",
            tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image(
            "predict_fake",
            tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)
    tf.summary.scalar("generator_loss_tv", model.gen_loss_tv)
    tf.summary.scalar("generator_loss_f", model.gen_loss_f)
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The Supervisor only gets a logdir when traces or summaries are wanted;
    # checkpoints are written explicitly through `saver` (saver=None here).
    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with sv.managed_session(config=config) as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        # a.max_steps, when given, takes precedence over a.max_epochs
        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            index_path = None
            for _ in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)

            # With zero steps no index was written, so there is no path to
            # report (and previously the name would not even be bound).
            if max_steps > 0:
                print("wrote index at", index_path)
        else:
            # training
            start = time.time()

            for step in range(max_steps):

                def should(freq):
                    """True every ``freq`` steps and on the final step."""
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)

                # Tracing is only enabled on the steps that record a trace
                run_options = None
                run_metadata = None
                if should(a.trace_freq):
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1
                    fetches["gen_loss_tv"] = model.gen_loss_tv
                    fetches["gen_loss_f"] = model.gen_loss_f
                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=run_options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])
                    print("gen_loss_tv", results["gen_loss_tv"])
                    print("gen_loss_f", results["gen_loss_f"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
Beispiel #19
0
def main(_):
    """Train one pruned-network configuration and periodically evaluate it.

    The configuration at index ``FLAGS.start_configuration_index`` is selected
    (the MPI rank is hard-coded to 0), the training and evaluation graphs are
    built for it, and variables are restored either from the checkpoints under
    ``FLAGS.checkpoint_path`` (fresh run) or from the run's own train directory
    (``FLAGS.continue_training`` with an existing checkpoint).  Training then
    runs up to ``FLAGS.max_number_of_steps``, writing summaries, timelines,
    test accuracy, checkpoints and a ``train_details.txt`` log into the train
    directory.

    Side effects: sets ``FLAGS.configuration_index`` and starts a background
    ``mpstat`` process through the shell.

    Args:
        _: Unused positional argument (presumably argv from the app runner).

    Raises:
        ValueError: if ``--dataset_dir`` is missing, if a requested kept
            percentage has no entry in the checkpoint folder, or if
            ``FLAGS.configuration_type`` is neither 'sample' nor 'special'.
    """
    tic = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    # init
    net_name_scope_pruned = FLAGS.net_name_scope_pruned
    net_name_scope_checkpoint = FLAGS.net_name_scope_checkpoint
    block_names = valid_block_names
    kept_percentages_dict = get_kept_percentages_dict_from_path(FLAGS.checkpoint_path)
    kept_percentages = sorted(map(float, FLAGS.kept_percentages.split(',')))

    # check networks with the kps are pre-trained.
    for kp in kept_percentages:
        if kp not in kept_percentages_dict:
            # The original raised the undefined name `Error`; ValueError matches
            # the dataset_dir check above.
            raise ValueError('kept_percentage='+str(kp)+' not in folder:'+ FLAGS.checkpoint_path)

    num_options = len(kept_percentages)
    num_units = len(block_names)
    print('num_options=%d, num_blocks=%d' %(num_options, num_units))
    print('HG: total number of configurations=%d' %(num_options**num_units))

    if FLAGS.configuration_type == 'sample':
        configs = get_sampled_configurations(num_units, num_options, FLAGS.total_num_configurations)
    elif FLAGS.configuration_type == 'special':
        configs = get_special_configurations(num_units, num_options)
    else:
        # Without this branch `configs` would be unbound below.
        raise ValueError('Unknown configuration_type: ' + str(FLAGS.configuration_type))
    num_configurations = len(configs)

    #Getting MPI rank integer
    # comm = MPI.COMM_WORLD
    # rank = comm.Get_rank()
    # if rank >= num_configurations:
    #     print("ERROR: rank(%d) > num_configurations(%d)" %(rank, num_configurations))
    #     return
    rank = 0
    FLAGS.configuration_index = FLAGS.start_configuration_index + rank
    config = configs[FLAGS.configuration_index]
    print('HG: kept_percentages=%s, num_configs=%d, start_config_index=%d, rank=%d, config_index=%d' \
           %(str(kept_percentages), num_configurations, FLAGS.start_configuration_index,  rank, FLAGS.configuration_index))

    # prepare for training with the specific config
    kept_percentage = config_to_kept_percentage_sequence(config, block_names, kept_percentages)
    prune_info = kept_percentage_sequence_to_prune_info(kept_percentage, block_names)
    print('HG: prune_info:')
    pprint(prune_info)

    # prepare file system
    results_dir = os.path.join(FLAGS.train_dir, "id"+str(FLAGS.configuration_index)) #+'_'+str(FLAGS.max_number_of_steps))
    train_dir = os.path.join(results_dir, 'train')

    if (not FLAGS.continue_training) or (not tf.train.latest_checkpoint(train_dir)):
        print('Start a new training')
        prepare_file_system(train_dir)
    else:
        print('Continue training')

    def write_detailed_info(info):
        """Append one line of text to train_details.txt in the train directory."""
        with open(os.path.join(train_dir, 'train_details.txt'), 'a') as f:
            f.write(info+'\n')

    info = 'train_dir: '+ train_dir+'\n'
    info += 'options:'+str(kept_percentages)+'\n'
    info += 'configuration: '+ str(config)+'\n'
    info += 'kept_percentage: ' + str(kept_percentage)
    print(info)
    write_detailed_info(info)

    with tf.Graph().as_default():

        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.train_dataset_name, FLAGS.dataset_dir)
        test_dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.test_dataset_name, FLAGS.dataset_dir)

        batch_queue = train_inputs(dataset, deploy_config, FLAGS)
        test_images, test_labels = test_inputs(test_dataset, deploy_config, FLAGS)
        images, labels = batch_queue.dequeue()

        ######################
        # Select the network#
        ######################
        network_fn_pruned = nets_factory.get_network_fn_pruned(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay)

        ####################
        # Define the model #
        ####################
        logits_train, _ = network_fn_pruned(images,
                                            prune_info=prune_info,
                                            is_training=True,
                                            is_local_train=False,
                                            reuse_variables=False,
                                            scope=net_name_scope_pruned)

        # The evaluation tower reuses the variables created by the train tower.
        logits_eval, _ = network_fn_pruned(test_images,
                                           prune_info=prune_info,
                                           is_training=False,
                                           is_local_train=False,
                                           reuse_variables=True,
                                           scope=net_name_scope_pruned)
        cross_entropy = add_cross_entropy(logits_train, labels)
        correct_prediction = add_correct_prediction(logits_eval, test_labels)

        #############################
        # Specify the loss functions #
        #############################
        collection_name = 'subgraph_losses'
        tf.add_to_collection(collection_name, cross_entropy)
        # get regularization loss
        regularization_losses = get_regularization_losses_within_scopes()
        print_list('regularization_losses', regularization_losses)
        # total loss and its summary
        total_loss = tf.add_n(tf.get_collection(collection_name), name='total_loss')
        for l in tf.get_collection(collection_name)+[total_loss]:
            tf.summary.scalar(l.op.name+'/summary', l)

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.variables_device()):
            global_step = tf.Variable(0, trainable=False, name='global_step')
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = configure_learning_rate(dataset.num_samples, global_step, FLAGS)
            optimizer = configure_optimizer(learning_rate, FLAGS)
            tf.summary.scalar('learning_rate', learning_rate)

        #############################
        # Add train operation       #
        #############################
        variables_to_train = get_trainable_variables_within_scopes()
        train_op = add_train_op(optimizer, total_loss, global_step, var_list=variables_to_train)
        print_list("variables_to_train", variables_to_train)

        # Gather update_ops: the updates for the batch_norm variables created by network_fn_pruned.
        update_ops = get_update_ops_within_scopes()
        print_list("update_ops", update_ops)

        # Evaluating train_tensor runs train_op and all update ops, then yields
        # the total loss.
        update_ops.append(train_op)
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # add summary op
        summary_op = tf.summary.merge_all()

        print("HG: trainable_variables=", len(tf.trainable_variables()))
        print("HG: model_variables=", len(tf.model_variables()))
        print("HG: global_variables=", len(tf.global_variables()))
        # print_list('model_variables but not trainable variables', list(set(tf.model_variables()).difference(tf.trainable_variables())))
        # print_list('global_variables but not model variables', list(set(tf.global_variables()).difference(tf.model_variables())))

        # get train scopes for each kept_percentage:
        # kept percentage -> list of block names using that percentage
        block_names_dict = {}
        for block_name, block_kept_percentage in zip(block_names, kept_percentage):
            if block_kept_percentage not in block_names_dict:
                block_names_dict[block_kept_percentage] = []
            block_names_dict[block_kept_percentage].append(block_name)

        #print_list("train_scopes", train_scopes)
        print('HG: block_names_dict:')
        pprint(block_names_dict)

        sess_config = tf.ConfigProto(intra_op_parallelism_threads=16,
                                     inter_op_parallelism_threads=16)
        with tf.Session(config=sess_config) as sess:
            ###########################
            # prepare for filewritter #
            ###########################
            train_writer = tf.summary.FileWriter(train_dir, sess.graph)

            # if restart the training or there is no checkpoint in the train_dir
            if (not FLAGS.continue_training) or (not tf.train.latest_checkpoint(train_dir)):
                #################################################
                # Restore  pruned model variable values. #
                #################################################
                all_variables_to_train = []
                for block_kept_percentage, block_name in block_names_dict.items():
                    print('HG: kept_percentage', block_kept_percentage)
                    checkpoint_path = os.path.join(
                        FLAGS.checkpoint_path,
                        kept_percentages_dict[block_kept_percentage][0], 'train')
                    #    'model.ckpt-'+str(FLAGS.local_train_steps))

                    # Map checkpoint variable names (scope suffixed with
                    # "_p<kept percentage>") onto the variables of this graph.
                    variables_to_train = {re.sub(net_name_scope_pruned, net_name_scope_pruned+"_p"+str(block_kept_percentage), v.op.name):
                                          v for v in get_model_variables_with_block_names(net_name_scope_pruned, block_name)}
                    print_list("restore pruned model variables", variables_to_train.values())
                    load_checkpoint(sess, checkpoint_path, var_list=variables_to_train)
                    all_variables_to_train.extend(variables_to_train.values())

                #################################################
                # Restore  orignal  model variable values. #
                #################################################
                # Build the membership set once instead of once per variable.
                pruned_variable_set = set(all_variables_to_train)
                variables_to_restore = {re.sub(net_name_scope_pruned, net_name_scope_checkpoint, v.op.name):
                                        v for v in get_model_variables_within_scopes()
                                        if v not in pruned_variable_set}
                print_list("restore original model variables", variables_to_restore.values())
                # NOTE(review): checkpoint_path here is whatever the last
                # iteration of the loop above left behind -- confirm that this
                # is the intended source for the non-pruned variables.
                load_checkpoint(sess, checkpoint_path, var_list=variables_to_restore)

            else:
                ###########################################
                ## Restore all variables from checkpoint ##
                ###########################################
                variables_to_restore = get_global_variables_within_scopes()
                load_checkpoint(sess, train_dir, var_list=variables_to_restore)

            #################################################
            # init unitialized global variable. #
            #################################################
            variables_to_init = get_global_variables_within_scopes(sess.run( tf.report_uninitialized_variables() ))
            print_list("init unitialized variables", variables_to_init)
            sess.run( tf.variables_initializer(variables_to_init) )

            init_global_step_value = sess.run(global_step)
            print('initial global step: ', init_global_step_value)
            if init_global_step_value >= FLAGS.max_number_of_steps:
                print('Exit: init_global_step_value (%d) >= FLAGS.max_number_of_steps (%d)' \
                    %(init_global_step_value, FLAGS.max_number_of_steps))
                # Release the event file before leaving early.
                train_writer.close()
                return

            ###########################
            # Record CPU usage  #
            ###########################
            # NOTE(review): the mpstat process is started in the background and
            # is never stopped by this function.
            mpstat_output_filename = os.path.join(train_dir, "cpu-usage.log")
            os.system("mpstat -P ALL 1 > " + mpstat_output_filename + " 2>&1 &")

            ###########################
            # Kicks off the training. #
            ###########################
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
            print('HG: # of threads=', len(threads))

            # duration/duration_cnt are reset at every log line;
            # train_time/train_only_cnt accumulate over the whole run.
            duration = 0
            duration_cnt = 0
            train_time = 0
            train_only_cnt = 0

            print("start to train at:", datetime.now())
            for i in range(init_global_step_value, FLAGS.max_number_of_steps+1):
                #train_step = i+FLAGS.local_train_steps
                train_step = i
                # run optional meta data, or summary, while run train tensor
                if i > init_global_step_value:
                #if i < FLAGS.max_number_of_steps:

                    # run metadata and train
                    if i % FLAGS.runmeta_every_n_steps == FLAGS.runmeta_every_n_steps-1:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()

                        loss_value = sess.run(train_tensor,
                                              options=run_options,
                                              run_metadata=run_metadata)
                        train_writer.add_run_metadata(run_metadata, 'step%d-train' % i)

                        # Create the Timeline object, and write it to a json file
                        fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                        chrome_trace = fetched_timeline.generate_chrome_trace_format()
                        with open(os.path.join(train_dir, 'timeline_'+str(i)+'.json'), 'w') as f:
                            f.write(chrome_trace)

                    # record summary and train
                    elif i % FLAGS.summary_every_n_steps == 0:
                        train_summary, loss_value = sess.run([summary_op, train_tensor])
                        train_writer.add_summary(train_summary, train_step)

                    # train only (the only branch that contributes to timing)
                    else:
                        start_time = time.time()
                        loss_value = sess.run(train_tensor)
                        train_only_cnt += 1
                        train_time += time.time() - start_time
                        duration_cnt += 1
                        duration += time.time() - start_time

                    if i % FLAGS.log_every_n_steps == 0 and duration_cnt > 0:
                        log_frequency = duration_cnt
                        examples_per_sec = log_frequency * FLAGS.batch_size / duration
                        sec_per_batch = float(duration / log_frequency)
                        summary = tf.Summary()
                        summary.value.add(tag='examples_per_sec', simple_value=examples_per_sec)
                        summary.value.add(tag='sec_per_batch', simple_value=sec_per_batch)
                        train_writer.add_summary(summary, train_step)
                        format_str = ('%s: step %d, loss = %.3f (%.1f examples/sec; %.3f sec/batch)')
                        # Format once so stdout and the details file carry the same line.
                        info = format_str % (datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)
                        print(info)
                        duration = 0
                        duration_cnt = 0
                        write_detailed_info(info)
                else:
                    # First iteration: evaluate the loss and summaries without
                    # taking a training step.
                    train_summary, loss_value = sess.run([summary_op, total_loss]) #loss_value = sess.run(total_loss)
                    train_writer.add_summary(train_summary, train_step)
                    format_str = ('%s: step %d, loss = %.3f')
                    info = format_str % (datetime.now(), i, loss_value)
                    print(info)
                    write_detailed_info(info)

                # record the evaluation accuracy
                is_last_step = (i == FLAGS.max_number_of_steps)
                if i % FLAGS.evaluate_every_n_steps == 0 or is_last_step:

                    test_accuracy, run_metadata = evaluate_accuracy(sess, coord, test_dataset.num_samples,
                                  test_images, test_labels, test_images, test_labels,
                                  correct_prediction, FLAGS.test_batch_size, run_meta=False)
                    summary = tf.Summary()
                    summary.value.add(tag='accuracy', simple_value=test_accuracy)
                    train_writer.add_summary(summary, train_step)

                    info = ('%s: step %d, test_accuracy = %.6f') % (datetime.now(), train_step,  test_accuracy)
                    print(info)
                    write_detailed_info(info)

                    ###########################
                    # Save model parameters . #
                    ###########################
                    save_path = saver.save(sess, os.path.join(train_dir, 'model.ckpt-'+str(i)))
                    print("HG: Model saved in file: %s" % save_path)

            coord.request_stop()
            coord.join(threads)
            total_time = time.time()-tic

            # Every executed step may have been a metadata or summary step, in
            # which case there is no timing sample; report NaN instead of
            # dividing by zero.
            if train_only_cnt > 0:
                train_speed = train_time / train_only_cnt
            else:
                train_speed = float('nan')
            # Estimated time as if every step had been a plain train step.
            train_time = (FLAGS.max_number_of_steps)*train_speed
            info = "HG: training speed(sec/batch): %.6f\n" %(train_speed)
            info += "HG: training time(min): %.1f, total time(min): %.1f \n" %( train_time/60.0,  total_time/60.0)

            print(info)
            write_detailed_info(info)
            # Flush pending summaries and release the event file.
            train_writer.close()
Beispiel #20
0
    def _train_loop(self, model, data=None, sess=None, indxs=None):
        """Training function for adam optimizer to clean up code in `train`.

        Runs `self.epochs_training` passes over `indxs['train']`, reshuffling
        the indices before each pass and executing `model.train_step` once per
        full batch of `self.batch_size` indices (a trailing partial batch is
        dropped).  After each epoch it optionally prints updates, writes a
        checkpoint, saves summaries and checks early stopping, each gated by
        the corresponding `self.epochs_*` / `self.early_stop` setting.

        Args:
            model: object providing `np_seed`, `train_step` and
                `checkpoint_model`; its `checkpoint` attribute is set to the
                most recent checkpoint path.
            data: passed through to `self._get_feed_dict` and the
                printing/summary/early-stop helpers.
            sess: session-like object whose `run` executes the train step.
            indxs: mapping with a 'train' entry holding an index array.
        """

        if self.run_diagnostics:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        if self.early_stop > 0:
            self.early_stop_params = {
                'prev_costs': np.full(self.early_stop, np.nan),
                'best_epoch': 0,
                'best_cost': np.inf,
                'chkpted': False,
                'stop_training': False
            }

        def is_due(period, epoch):
            """True on epoch 0 and on the last epoch of every `period` epochs."""
            return period is not None and (
                epoch % period == period - 1 or epoch == 0)

        num_batches = indxs['train'].shape[0] // self.batch_size

        np.random.seed(model.np_seed)

        # start training loop
        self.epoch = np.nan
        for epoch in range(self.epochs_training):

            self.epoch = epoch

            # shuffle data before each pass
            train_indxs_perm = np.random.permutation(indxs['train'])

            # pass through dataset once
            start = time.time()
            for batch in range(num_batches):
                # get training indices for this batch
                lo = batch * self.batch_size
                batch_indxs = train_indxs_perm[lo:lo + self.batch_size]

                # one step of optimization routine
                feed_dict = self._get_feed_dict(model=model,
                                                data=data,
                                                batch_indxs=batch_indxs)
                sess.run(model.train_step, feed_dict=feed_dict)

            epoch_time = time.time() - start

            # print training updates
            if is_due(self.epochs_display, epoch):
                self._train_print_updates(sess, model, data, indxs, epoch_time)

            # save model checkpoints
            if is_due(self.epochs_ckpt, epoch):
                checkpoint_file = os.path.join(self.checkpoints_dir,
                                               'epoch_%05g.ckpt' % epoch)
                model.checkpoint_model(sess,
                                       checkpoint_file=checkpoint_file,
                                       print_filepath=True)
                # store most recent checkpoint as model attribute
                model.checkpoint = checkpoint_file

            # save model summaries
            if is_due(self.epochs_summary, epoch):
                self._train_save_summaries(sess, model, data, indxs,
                                           run_options, run_metadata)

            # perform early stopping
            if self.early_stop > 0:
                self._train_early_stop(sess, model, data, indxs)
                if self.early_stop_params['stop_training']:
                    break

        # perform final checkpoint if not early stopping (handles case on own)
        # Compare by value: `is np.inf` only matched the np.inf object itself
        # and silently skipped this for e.g. float('inf').
        if self.epochs_ckpt == np.inf and self.early_stop == 0:
            checkpoint_file = os.path.join(self.checkpoints_dir,
                                           'epoch_%05g.ckpt' % self.epoch)
            model.checkpoint_model(sess,
                                   checkpoint_file=checkpoint_file,
                                   print_filepath=True)
            # store most recent checkpoint as model attribute
            model.checkpoint = checkpoint_file
Beispiel #21
0
def train(_):
    """Train a small convolutional classifier on augmented MNIST.

    Recreates ``FLAGS.log_dir``, builds the graph in an interactive session,
    then runs ``FLAGS.max_steps + 1`` iterations: every 100th step evaluates
    on the MNIST test set and writes a test summary, all other steps train on
    an augmented batch of 500 images and write a train summary on every 10th
    step and on steps where ``i % 100 == 99``.

    Args:
        _: Unused positional argument (presumably argv from the app runner).
    """
    # create new log files
    if tf.gfile.Exists(FLAGS.log_dir):
        tf.gfile.DeleteRecursively(FLAGS.log_dir)
    tf.gfile.MakeDirs(FLAGS.log_dir)

    tf.reset_default_graph()
    tf.set_random_seed(2)
    np.random.seed(2)

    # Import data
    mnist = input_data.read_data_sets("MNIST-data/", one_hot=True)

    X_train = mnist.train.images.reshape(mnist.train.images.shape[0], 28, 28, 1)
    y_train = mnist.train.labels.astype(np.int64)
    batch_size = 500

    gen = ImageDataGenerator(rotation_range=6, width_shift_range=0.06, shear_range=0.27,
                             height_shift_range=0.06, zoom_range=0.06)
    train_gen = gen.flow(X_train, y_train, batch_size=batch_size, seed=0)

    # Create a multilayer model.
    sess = tf.InteractiveSession()

    # Input placeholders
    with tf.name_scope('input'):
        x  = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64,   [None,10], name='y-input')

    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv2d(x, W):
        """Stride-1, SAME-padded 2-D convolution of x with kernel W."""
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    x_image = tf.reshape(x, [-1,28,28,1])

    #conv1
    W_conv1 = weight_variable([3, 3, 1, 32])
    b_conv1 = bias_variable([32])
    conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    print ("conv1" + str(conv1.get_shape()))

    #conv2
    W_conv2 = weight_variable([3, 3, 32, 64])
    b_conv2 = bias_variable([64])
    conv2 = tf.nn.relu(conv2d(conv1, W_conv2) + b_conv2)
    print ("conv2" + str(conv2.get_shape()))

    #pool1
    pool1 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
    print ("pool1" + str(pool1.get_shape()))

    #conv3
    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])
    conv3 = tf.nn.relu(conv2d(pool1, W_conv3) + b_conv3)
    print ("conv3" + str(conv3.get_shape()))

    #conv4
    W_conv4 = weight_variable([3, 3, 64, 64])
    b_conv4 = bias_variable([64])
    conv4 = tf.nn.relu(conv2d(conv3, W_conv4) + b_conv4)
    print ("conv4" + str(conv4.get_shape()))

    #pool2
    pool2 = tf.nn.max_pool(conv4, ksize=[1,2,2,1], strides=[1,2,2,1], padding='SAME')
    # The label previously read "pool1", which mislabelled this shape line.
    print ("pool2" + str(pool2.get_shape()))

    # dense1 with flatten
    W_fc1 = weight_variable([28 * 28 * 16, 512])
    b_fc1 = bias_variable([512])

    # NOTE(review): the dense layer is fed from conv3, so conv4 and pool2 are
    # built but never contribute to the output.  Left unchanged because
    # switching the input would alter the architecture and W_fc1's shape;
    # confirm whether pool2 was intended.
    flat = tf.reshape(conv3, [-1, 28*28*16])
    fc1 = tf.nn.relu(tf.matmul(flat, W_fc1) + b_fc1)
    print ("fc1" + str(fc1.get_shape()))

    keep_prob = tf.placeholder(tf.float32)
    fc1_drop = tf.nn.dropout(fc1, keep_prob)
    print ("fc1_drop" + str(fc1_drop.get_shape()))

    W_fc2 = weight_variable([512, 10])
    b_fc2 = bias_variable([10])

    # Keep the raw logits so the loss can be computed in a stable way.
    logits = tf.matmul(fc1_drop, W_fc2) + b_fc2
    y = tf.nn.softmax(logits)
    print ("y" + str(y.get_shape()))

    with tf.name_scope('cross_entropy'):
        with tf.name_scope('total'):
            # Taking log(softmax(...)) by hand yields -inf/NaN once a
            # probability underflows to 0; the fused op works on the logits.
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=tf.cast(y_, tf.float32), logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_,1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph,flush_secs=10)
    test_writer  = tf.summary.FileWriter(FLAGS.log_dir + '/test',flush_secs=10)
    tf.global_variables_initializer().run()

    def feed_dict(train):
        """Build the feed for one step: an augmented train batch or the full test set."""
        if train:
            xs, ys = next(train_gen)
            # Flatten by the actual batch length; the generator may yield a
            # final batch shorter than batch_size.
            xs = xs.reshape(-1, 28*28)
            k = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps+1):
        if i % 100 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('%s: Accuracy at step %s: %s' % (datetime.now(), i, acc))
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                # NOTE(review): RunOptions() is built without a trace level;
                # confirm whether a full trace was intended here.
                run_options = tf.RunOptions()
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                # print('Adding run metadata for', i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
                if i % 10 == 0:
                    train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()
    # Release the session's resources; the original left it open.
    sess.close()
Beispiel #22
0
def main(argv, neptune_logger=None):
    """Train a classification network, optionally with an auxiliary metric loss.

    Parses ``argv`` into a config with ``BaseConfig().parse``, dumps the parsed
    arguments to ``args.json`` inside ``cfg.checkpoint_dir``, builds the train
    and validation datasets, constructs the TF1 graph (network, loss selected
    by ``cfg.train_mode``, momentum optimizer with optional gradient
    accumulation) and runs the training loop with periodic validation,
    classification-report CSV export and checkpointing.

    Args:
        argv: Argument list handed to ``BaseConfig().parse``.
        neptune_logger: Optional experiment logger. When truthy, an experiment
            is created and the train loss and validation metrics are sent to it
            through ``log_metric``.

    Raises:
        NotImplementedError: If ``cfg.train_mode`` has no dataset branch below.
    """
    cfg = BaseConfig().parse(argv)
    # Expose only the configured GPU id(s) to CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu
    save_model_dir = cfg.checkpoint_dir
    if neptune_logger:
        # The experiment is named after the last path component of the
        # checkpoint directory.
        neptune_logger.create_experiment(name=save_model_dir.split('/')[-1],
                                         params=vars(cfg))
    print(save_model_dir)
    model_basename = os.path.basename(save_model_dir)
    # NOTE(review): this uses a bare `touch_dir` while the call a few lines
    # below uses `os_utils.touch_dir` -- confirm both names are in scope.
    touch_dir(save_model_dir)

    # Persist the full configuration next to the checkpoints.
    args_file = os.path.join(cfg.checkpoint_dir, 'args.json')
    with open(args_file, 'w') as f:
        json.dump(vars(cfg), f, ensure_ascii=False, indent=2, sort_keys=True)
    # os_utils.touch_dir(save_model_dir)

    log_file = os.path.join(cfg.checkpoint_dir, cfg.log_filename + '.txt')
    os_utils.touch_dir(cfg.checkpoint_dir)
    logger = log_utils.create_logger(log_file)

    # The loader class is resolved from its name in the config; the same
    # `args` dict is reused for the train and the validation loader, with only
    # `csv_file` differing between the two.
    img_generator_class = locate(cfg.db_tuple_loader)
    args = dict()
    args['db_path'] = cfg.db_path
    args['tuple_loader_queue_size'] = cfg.tuple_loader_queue_size
    args['preprocess_func'] = cfg.preprocess_func
    args['batch_size'] = cfg.batch_size
    args['shuffle'] = False
    args['csv_file'] = cfg.train_csv_file
    args['img_size'] = const.max_frame_size
    args['gen_hot_vector'] = True
    train_iter = img_generator_class(args)
    # Re-assigns the same batch size that is already stored above.
    args['batch_size'] = cfg.batch_size
    args['csv_file'] = cfg.test_csv_file
    val_iter = img_generator_class(args)

    trn_images, trn_lbls = train_iter.imgs_and_lbls()
    val_imgs, val_lbls = val_iter.imgs_and_lbls()
    # First 50 training samples; only referenced by the commented-out
    # `log_dataset` line below.
    test_imgs, test_lbls = trn_images[:50], trn_lbls[:50]

    with tf.Graph().as_default():
        # ---- Dataset selection by train mode ----
        # NOTE(review): 'hard_fossils' is handled by the loss selection further
        # down but has no branch here, so that mode ends in the
        # NotImplementedError below -- confirm whether that is intended.
        if cfg.train_mode == 'semi_hard' or cfg.train_mode == 'hard' or cfg.train_mode == 'cntr':
            train_dataset = TripletTupleLoader(trn_images, trn_lbls,
                                               cfg).dataset
            #log_dataset = TripletTupleLoader(test_imgs,test_lbls,cfg).dataset
        elif cfg.train_mode == 'semi_hard_anchor' or cfg.train_mode == 'hard_anchor' or cfg.train_mode == 'cntr_anchor':
            train_dataset = TripletTupleLoaderAnchor(trn_images, trn_lbls,
                                                     cfg).dataset
        elif cfg.train_mode == 'hard_anchor_fossils':
            train_dataset = TripletTupleLoaderAnchor(trn_images, trn_lbls,
                                                     cfg).dataset
        elif cfg.train_mode == 'vanilla':
            train_dataset = QuickTupleLoader(trn_images,
                                             trn_lbls,
                                             cfg,
                                             is_training=True,
                                             shuffle=True,
                                             repeat=True).dataset
        else:
            raise NotImplementedError('{} is not a valid train mode'.format(
                cfg.train_mode))

        val_dataset = QuickTupleLoader(val_imgs,
                                       val_lbls,
                                       cfg,
                                       is_training=False,
                                       repeat=False).dataset
        # Feedable iterator: the string handle fed at run time decides whether
        # a batch comes from the training or the validation iterator. Its
        # structure is taken from the training dataset.
        handle = tf.placeholder(tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            handle, train_dataset.output_types, train_dataset.output_shapes)
        images_ph, lbls_ph = iterator.get_next()
        #batch_xs,batch_ys = training_iterator.get_next()

        network_class = locate(cfg.network_name)
        model = network_class(cfg, images_ph=images_ph, lbls_ph=lbls_ph)

        # Which loss fn to impose. For example, softmax only is applied in vanilla mode,
        # while softmax + semi-hard triplet is applied in semi_hard mode.
        # Every non-vanilla branch builds an L2-normalised embedding from
        # `model.train_pre_logits` with ConvEmbed and adds a weighted metric
        # loss to `model.train_loss`.
        if cfg.train_mode == 'semi_hard' or cfg.train_mode == 'semi_hard_anchor':
            pre_logits = model.train_pre_logits
            _, w, h, channels = pre_logits.shape
            embed_dim = cfg.emb_dim
            embedding_net = ConvEmbed(emb_dim=embed_dim,
                                      n_input=channels,
                                      n_h=h,
                                      n_w=w)
            embedding = embedding_net.forward(pre_logits)
            embedding = tf.nn.l2_normalize(embedding, dim=-1, epsilon=1e-10)
            margin = cfg.margin
            # Labels are converted to class indices with argmax over axis 1.
            gt_lbls = tf.argmax(model.gt_lbls, 1)
            metric_loss = triplet_semi.triplet_semihard_loss(
                gt_lbls, embedding, margin)
            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            total_loss = model.train_loss + cfg.triplet_loss_lambda * tf.reduce_mean(
                metric_loss)
        elif cfg.train_mode == 'hard' or cfg.train_mode == 'hard_anchor':
            pre_logits = model.train_pre_logits
            _, w, h, channels = pre_logits.shape
            embed_dim = cfg.emb_dim
            embedding_net = ConvEmbed(emb_dim=embed_dim,
                                      n_input=channels,
                                      n_h=h,
                                      n_w=w)
            embedding = embedding_net.forward(pre_logits)
            embedding = tf.nn.l2_normalize(embedding, dim=-1, epsilon=1e-10)
            margin = cfg.margin

            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            gt_lbls = tf.argmax(model.gt_lbls, 1)
            metric_loss = triplet_hard.batch_hard(gt_lbls, embedding, margin)
            total_loss = model.train_loss + cfg.triplet_loss_lambda * tf.reduce_mean(
                metric_loss)
        elif cfg.train_mode == 'hard_fossils' or cfg.train_mode == 'hard_anchor_fossils':
            pre_logits = model.train_pre_logits
            _, w, h, channels = pre_logits.shape
            embed_dim = cfg.emb_dim
            embedding_net = ConvEmbed(emb_dim=embed_dim,
                                      n_input=channels,
                                      n_h=h,
                                      n_w=w)
            embedding = embedding_net.forward(pre_logits)
            embedding = tf.nn.l2_normalize(embedding, dim=-1, epsilon=1e-10)
            margin = cfg.margin

            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            gt_lbls = tf.argmax(model.gt_lbls, 1)
            metric_loss_far = triplet_hard.batch_hard_fossils(
                gt_lbls, embedding, margin)
            metric_loss = triplet_hard.batch_hard(gt_lbls, embedding, margin)
            # The two metric losses are mixed 0.8 / 0.2 before scaling by the
            # triplet lambda.
            total_loss = model.train_loss + 0.8 * cfg.triplet_loss_lambda * tf.reduce_mean(
                metric_loss) + 0.2 * cfg.triplet_loss_lambda * tf.reduce_mean(
                    metric_loss_far)

        elif cfg.train_mode == 'cntr' or cfg.train_mode == 'cntr_anchor':

            pre_logits = model.train_pre_logits
            _, w, h, channels = pre_logits.shape
            embed_dim = cfg.emb_dim
            embedding_net = ConvEmbed(emb_dim=embed_dim,
                                      n_input=channels,
                                      n_h=h,
                                      n_w=w)
            embedding = embedding_net.forward(pre_logits)
            embedding = tf.nn.l2_normalize(embedding, dim=-1, epsilon=1e-10)
            # Center-loss hyper-parameters are hard-coded here rather than
            # read from the config.
            CENTER_LOSS_LAMBDA = 0.003
            CENTER_LOSS_ALPHA = 0.5
            num_fg_classes = cfg.num_classes
            gt_lbls = tf.argmax(model.gt_lbls, 1)
            center_loss_order, centroids, centers_update_op, appear_times, diff = center_loss.get_center_loss(
                embedding, gt_lbls, CENTER_LOSS_ALPHA, num_fg_classes)
            # sample_centroid = tf.reshape(tf.gather(centroids, gt_lbls), [-1, config.emb_dim])
            # center_loss_order = center_loss.center_loss(sample_centroid , embedding)
            logger.info('Center loss lambda {}'.format(CENTER_LOSS_LAMBDA))
            total_loss = model.train_loss + CENTER_LOSS_LAMBDA * tf.reduce_mean(
                center_loss_order)

        elif cfg.train_mode == 'vanilla':
            total_loss = model.train_loss

        logger.info('Train Mode {}'.format(cfg.train_mode))
        # variables_to_train = model.var_2_train();
        # logger.info('variables_to_train  ' + str(variables_to_train))

        trainable_vars = tf.trainable_variables()
        if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
            ## Creation of a list of variables with the same shape as the trainable ones
            # initialized with 0s
            accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in trainable_vars
            ]
            # Ops that reset every accumulator back to zero.
            zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # NOTE(review): the 'cntr_anchor' mode also creates `centers_update_op`
        # above; whether it is added here depends on the value of
        # const.Train_Mode.CNTR, which is not visible -- confirm.
        if cfg.train_mode == const.Train_Mode.CNTR:
            update_ops.append(centers_update_op)

        # print(update_ops)

        # Everything created in this block runs only after `update_ops`.
        with tf.control_dependencies(update_ops):

            global_step = tf.Variable(0, name='global_step', trainable=False)
            learning_rate = tf_utils.poly_lr(global_step, cfg)
            optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)

            if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
                # grads = tf.Print(grads,[grads],'Grad Print');
                grads = optimizer.compute_gradients(total_loss, trainable_vars)
                # Adds to each element from the list you initialized earlier with zeros its gradient (works because accum_vars and gvs are in the same order)
                accum_ops = [
                    accum_vars[i].assign_add(gv[0])
                    for i, gv in enumerate(grads)
                ]
                iter_size = cfg.caffe_iter_size
                # Define the training step (part with variable value update)
                # The applied gradient is the accumulator divided by iter_size.
                train_op = optimizer.apply_gradients(
                    [(accum_vars[i] / iter_size, gv[1])
                     for i, gv in enumerate(grads)],
                    global_step=global_step)

            else:
                grads = optimizer.compute_gradients(total_loss)
                train_op = optimizer.apply_gradients(grads,
                                                     global_step=global_step)

        sess = tf.InteractiveSession()
        training_iterator = train_dataset.make_one_shot_iterator()

        validation_iterator = val_dataset.make_initializable_iterator()
        # String handles that are fed into `handle` to pick the data source.
        training_handle = sess.run(training_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())

        tb_path = save_model_dir
        logger.info(tb_path)
        # Training resumes after the iteration reported by this helper.
        start_iter = tb_utils.get_latest_iteration(tb_path)

        train_writer = tf.summary.FileWriter(tb_path, sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()  # saves variables learned during training

        ckpt_file = tf.train.latest_checkpoint(save_model_dir)
        logger.info('Model Path {}'.format(ckpt_file))
        load_model_msg = model.load_model(save_model_dir,
                                          ckpt_file,
                                          sess,
                                          saver,
                                          load_logits=False)
        logger.info(load_model_msg)

        # From here on `ckpt_file` is the path new checkpoints are written to.
        ckpt_file = os.path.join(save_model_dir, cfg.checkpoint_filename)

        # `train_loss` and `train_accuracy` summary ops are created but not
        # fetched in this function; the training loop writes hand-built
        # Summary protos with the same tags instead.
        train_loss = tf.summary.scalar('Train_loss', model.train_loss)
        train_accuracy = tf.summary.scalar('Train_Acc', model.train_accuracy)
        val_loss = tf.summary.scalar('Val_Loss', model.val_loss)
        val_acc_op = tf.summary.scalar('Batch_Val_Acc', model.val_accuracy)
        model_acc_op = tf.summary.scalar('Split_Val_Accuracy',
                                         model.val_accumulated_accuracy)

        best_model_step = 0
        best_acc = 0
        logger.info('Start Training from {}, till {}'.format(
            start_iter, cfg.train_iters))

        # Start Training
        for step in range(start_iter + 1, cfg.train_iters + 1):

            start_time_train = time.time()

            # Update network weights while supporting caffe_iter_size
            # The first caffe_iter_size - 1 mini-batches only accumulate
            # gradients (the loop is empty when caffe_iter_size is 1).
            for mini_batch in range(cfg.caffe_iter_size - 1):
                feed_dict = {handle: training_handle}
                model_loss_value, accuracy_value, _ = sess.run(
                    [model.train_loss, model.train_accuracy, accum_ops],
                    feed_dict)

            feed_dict = {handle: training_handle}

            # NOTE(review): in accumulation mode `train_op` applies
            # accum_vars / iter_size and this run does not fetch `accum_ops`,
            # so the batch drawn here does not appear to contribute to the
            # applied gradient -- confirm this is intended.
            model_loss_value, accuracy_value, _ = sess.run(
                [model.train_loss, model.train_accuracy, train_op], feed_dict)

            if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
                sess.run(zero_ops)

            train_time = time.time() - start_time_train
            #training loss
            # Loss/accuracy of the last sess.run are written as hand-built
            # Summary protos.
            loss_summary = tf.Summary(value=[
                tf.Summary.Value(tag="Train_loss",
                                 simple_value=model_loss_value)
            ])
            acc_summary = tf.Summary(value=[
                tf.Summary.Value(tag="Train_Acc", simple_value=accuracy_value)
            ])
            train_writer.add_summary(loss_summary, step)
            train_writer.add_summary(acc_summary, step)
            if neptune_logger:
                neptune_logger.log_metric('Train_loss', model_loss_value)
            if cfg.training_mode_debug:
                logger.info(
                    'Training mode debug is ON, will save images every iteration.'
                )
                # NOTE(review): `get_next()` and `tf.summary.image` create new
                # graph ops on every iteration this branch runs, so the graph
                # keeps growing; the summary is also written without a step.
                batch_xs, batch_ys = training_iterator.get_next()
                summary_op = tf.summary.image('image-batch',
                                              batch_xs,
                                              max_outputs=10)
                summary = sess.run(summary_op)
                train_writer.add_summary(summary)

            if (step == 1 or step % cfg.logging_threshold == 0):
                logger.info(
                    'i {0:04d} loss {1:4f} Acc {2:2f} Batch Time {3:3f}'.
                    format(step, model_loss_value, accuracy_value, train_time))

                # Validation is nested inside the logging condition, so it
                # only runs on steps that satisfy both conditions.
                if (step % cfg.test_interval == 0):
                    run_metadata = tf.RunMetadata()
                    # presumably resets the model's accumulated validation
                    # metrics held in local variables -- TODO confirm
                    tf.local_variables_initializer().run()
                    sess.run(validation_iterator.initializer)

                    _val_acc_op = 0

                    # Ground truth and top-1 / top-3 / top-5 "predictions"
                    # collected over the whole validation pass.
                    gts = []
                    preds = []
                    pred_3 = []
                    pred_5 = []
                    while True:
                        try:
                            # Eval network on validation/testing split
                            feed_dict = {handle: validation_handle}
                            # `val_loss_op`, `accuracy_op` and `_val_acc_op`
                            # receive the evaluated summary ops (`val_loss`,
                            # `model_acc_op`, `val_acc_op`), which are passed
                            # to add_summary after the pass.
                            gt, preds_raw, predictions, val_loss_op, batch_accuracy, accuracy_op, _val_acc_op, _val_acc, c_cnf_mat, macro_acc = sess.run(
                                [
                                    model.val_gt, model.val_preds,
                                    model.val_class_prediction, val_loss,
                                    model.val_accuracy, model_acc_op,
                                    val_acc_op, model.val_accumulated_accuracy,
                                    model.val_confusion_mat,
                                    model.val_per_class_acc_acc
                                ], feed_dict)
                            gts += list(gt)
                            preds += list(predictions)
                            # np.argsort is ascending, so the last k indices
                            # are the k largest entries of `p` and index [-1]
                            # is the largest one. A sample counts as correct
                            # for top-k when its label is among those indices;
                            # otherwise the largest-entry index is recorded.
                            for g, p in zip(gt, preds_raw):
                                preds_sort_3 = np.argsort(p)[-3:]
                                preds_sort_5 = np.argsort(p)[-5:]
                                if g in preds_sort_3:
                                    pred_3 += [g]
                                else:
                                    pred_3 += [preds_sort_3[-1]]

                                if g in preds_sort_5:
                                    pred_5 += [g]
                                else:
                                    pred_5 += [preds_sort_5[-1]]

                        except tf.errors.OutOfRangeError:
                            # The validation iterator is exhausted: report the
                            # values from the last successful batch.
                            # NOTE(review): if the iterator yields no batch at
                            # all, `_val_acc` and `macro_acc` are unbound here.
                            logger.info('Val Acc {0}, Macro Acc: {1}'.format(
                                _val_acc, macro_acc))
                            if neptune_logger:
                                neptune_logger.log_metric(
                                    'Validation Accuracy Macro', macro_acc)
                            logger.info('____ Clasification Report Top 1 ____')
                            report = classification_report(gts,
                                                           preds,
                                                           output_dict=True)
                            if neptune_logger:
                                neptune_logger.log_metric(
                                    'Top 1 f-1',
                                    report['weighted avg']['f1-score'])
                                neptune_logger.log_metric(
                                    'Top 1 precision',
                                    report['weighted avg']['precision'])
                                neptune_logger.log_metric(
                                    'Top 1 recall',
                                    report['weighted avg']['recall'])
                            csv_pd = classification_report_csv(report)
                            csv_pd.to_csv(
                                os.path.join(
                                    save_model_dir,
                                    'Classification_Report_top1%04d.csv' %
                                    step))
                            logger.info(report)
                            logger.info('____ Clasification Report Top 3 ____')
                            report = classification_report(gts,
                                                           pred_3,
                                                           output_dict=True)
                            if neptune_logger:
                                neptune_logger.log_metric(
                                    'Top 3 f-1',
                                    report['weighted avg']['f1-score'])
                                neptune_logger.log_metric(
                                    'Top 3 precision',
                                    report['weighted avg']['precision'])
                                neptune_logger.log_metric(
                                    'Top 3 recall',
                                    report['weighted avg']['recall'])
                            csv_pd = classification_report_csv(report)
                            csv_pd.to_csv(
                                os.path.join(
                                    save_model_dir,
                                    'Classification_Report_top3%04d.csv' %
                                    step))
                            logger.info(report)
                            logger.info('____ Clasification Report Top 5 ____')
                            report = classification_report(gts,
                                                           pred_5,
                                                           output_dict=True)
                            if neptune_logger:
                                neptune_logger.log_metric(
                                    'Top 5 f-1',
                                    report['weighted avg']['f1-score'])
                                neptune_logger.log_metric(
                                    'Top 5 precision',
                                    report['weighted avg']['precision'])
                                neptune_logger.log_metric(
                                    'Top 5 recall',
                                    report['weighted avg']['recall'])
                            csv_pd = classification_report_csv(report)
                            csv_pd.to_csv(
                                os.path.join(
                                    save_model_dir,
                                    'Classification_Report_top5%04d.csv' %
                                    step))
                            logger.info(report)
                            break
                    #with train_writer.as_default():

                    # NOTE(review): as in the debug branch above, these calls
                    # add new ops to the graph on every validation pass and
                    # the image summary is written without a step.
                    batch_xs, batch_ys = training_iterator.get_next()
                    summary_op = tf.summary.image('image-batch',
                                                  batch_xs,
                                                  max_outputs=10)
                    summary = sess.run(summary_op)
                    train_writer.add_summary(summary)
                    # NOTE(review): `run_metadata` was created above but never
                    # passed to a sess.run call in this function.
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(val_loss_op, step)
                    train_writer.add_summary(_val_acc_op, step)
                    train_writer.add_summary(accuracy_op, step)
                    train_writer.flush()

                    # Checkpoints are only written on validation steps that
                    # are also multiples of 100.
                    if (step % 100 == 0):
                        #log_iterator = log_dataset.make_initializable_iterator()

                        saver.save(sess, ckpt_file)
                        # Keep a separate '<ckpt_file>best' checkpoint for the
                        # highest accumulated validation accuracy seen so far.
                        if best_acc < _val_acc:
                            saver.save(sess, ckpt_file + 'best')
                            best_acc = _val_acc
                            best_model_step = step

                        logger.info('Best Acc {0} at {1} == {2}'.format(
                            best_acc, best_model_step, model_basename))

        logger.info('Triplet loss lambda {}'.format(cfg.triplet_loss_lambda))
        logger.info('Mode {}'.format(cfg.train_mode))
        logger.info('Loop complete')
        sess.close()
Beispiel #23
0
def train():
  """Train a two-layer fully connected MNIST classifier with summaries.

  Builds the graph (input placeholders, one hidden ReLU layer, dropout and a
  linear output layer), trains it with Adam for ``FLAGS.max_steps`` steps and
  writes train/test summaries below ``FLAGS.log_dir``.
  """
  # Load the dataset with one-hot encoded labels.
  mnist = input_data.read_data_sets(
      FLAGS.data_dir, one_hot=True, fake_data=FLAGS.fake_data)

  sess = tf.InteractiveSession()

  # Placeholders for the flattened images and their one-hot labels.
  with tf.name_scope('input'):
    images_ph = tf.placeholder(tf.float32, [None, 784], name='x-input')
    labels_ph = tf.placeholder(tf.float32, [None, 10], name='y-input')

  # Log up to 10 inputs as 28x28 single-channel images.
  with tf.name_scope('input_reshape'):
    tf.summary.image('input', tf.reshape(images_ph, [-1, 28, 28, 1]), 10)

  def weight_variable(shape):
    """Return a weight variable drawn from a truncated normal (stddev 0.1).

    Weights are not initialised to 0, otherwise the network gets stuck.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

  def bias_variable(shape):
    """Return a bias variable filled with the constant 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

  def variable_summaries(var):
    """Record mean, stddev, max, min and a histogram of ``var``."""
    with tf.name_scope('summaries'):
      avg = tf.reduce_mean(var)
      tf.summary.scalar('mean', avg)
      with tf.name_scope('stddev'):
        spread = tf.sqrt(tf.reduce_mean(tf.square(var - avg)))
      tf.summary.scalar('stddev', spread)
      tf.summary.scalar('max', tf.reduce_max(var))
      tf.summary.scalar('min', tf.reduce_min(var))
      tf.summary.histogram('histogram', var)

  def nn_layer(input_tensor, input_dim, output_dim, layer_name, act=tf.nn.relu):
    """Build one dense layer: matmul, bias add, then the ``act`` activation.

    All ops live in a name scope called ``layer_name`` and summaries are
    attached to the weights, biases, pre-activations and activations.
    """
    with tf.name_scope(layer_name):
      with tf.name_scope('weights'):
        layer_w = weight_variable([input_dim, output_dim])
        variable_summaries(layer_w)
      with tf.name_scope('biases'):
        layer_b = bias_variable([output_dim])
        variable_summaries(layer_b)
      with tf.name_scope('Wx_plus_b'):
        pre_act = tf.matmul(input_tensor, layer_w) + layer_b
        tf.summary.histogram('pre_activations', pre_act)
      layer_out = act(pre_act, name='activation')
      tf.summary.histogram('activations', layer_out)
      return layer_out

  hidden1 = nn_layer(images_ph, 784, 500, 'layer1')

  with tf.name_scope('dropout'):
    keep_prob = tf.placeholder(tf.float32)
    tf.summary.scalar('dropout_keep_probability', keep_prob)
    dropped = tf.nn.dropout(hidden1, keep_prob)

  # The output layer stays linear; softmax is folded into the loss below.
  logits = nn_layer(dropped, 500, 10, 'layer2', act=tf.identity)

  with tf.name_scope('cross_entropy'):
    # Computing softmax and cross-entropy separately,
    #
    # tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.softmax(y)),
    #                               reduction_indices=[1]))
    #
    # can be numerically unstable, so the fused op is applied to the raw
    # layer outputs and the result is averaged over the batch.
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=labels_ph, logits=logits)
    with tf.name_scope('total'):
      cross_entropy = tf.reduce_mean(per_example_loss)
  tf.summary.scalar('cross_entropy', cross_entropy)

  with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
    train_step = optimizer.minimize(cross_entropy)

  with tf.name_scope('accuracy'):
    with tf.name_scope('correct_prediction'):
      is_correct = tf.equal(tf.argmax(logits, 1), tf.argmax(labels_ph, 1))
    with tf.name_scope('accuracy'):
      accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
  tf.summary.scalar('accuracy', accuracy)

  # Merge every summary and create one writer each for train and test logs.
  merged = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
  test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
  tf.global_variables_initializer().run()

  def feed_dict(train):
    """Return the placeholder feed for a training batch or the test set."""
    if not (train or FLAGS.fake_data):
      # Evaluation: the whole test set, with dropout disabled.
      return {
          images_ph: mnist.test.images,
          labels_ph: mnist.test.labels,
          keep_prob: 1.0,
      }
    batch_xs, batch_ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
    return {images_ph: batch_xs, labels_ph: batch_ys, keep_prob: FLAGS.dropout}

  # Every 10th step evaluates on the test set; every other step trains and
  # logs a training summary, with run metadata added on steps ending in 99.
  for step in range(FLAGS.max_steps):
    if step % 10 == 0:
      summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
      test_writer.add_summary(summary, step)
      print('Accuracy at step %s: %s' % (step, acc))
      continue

    if step % 100 != 99:
      summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
      train_writer.add_summary(summary, step)
      continue

    # Traced training step: also record execution statistics.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    summary, _ = sess.run([merged, train_step],
                          feed_dict=feed_dict(True),
                          options=run_options,
                          run_metadata=run_metadata)
    train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
    train_writer.add_summary(summary, step)
    print('Adding run metadata for', step)

  train_writer.close()
  test_writer.close()
Beispiel #24
0
def _pixel_distances(model, sess, images, truths, shapes):
    """Run the model on one batch and measure the per-sample pixel error.

    :return: tuple ``(diff, preds_x, preds_y, w)`` where ``diff`` is the
             Euclidean distance between the upscaled (x, y) predictions and
             columns 0 and 1 of ``truths``; the rest is what
             ``upscale_preds`` returned.
    """
    predictions = model.predict(sess, images)
    preds_x, preds_y, w = upscale_preds(predictions, shapes)
    dx = preds_x - truths[:, 0]
    dy = preds_y - truths[:, 1]
    return np.sqrt(dx * dx + dy * dy), preds_x, preds_y, w


def _threshold_accuracies(results, pixel_errors):
    """Return, per result key, the fraction of errors below each threshold.

    :param results: dict mapping a dataset name to a sequence of distances
    :param pixel_errors: thresholds; one accuracy is produced per threshold
    """
    accuracies = {}
    for name, distances in results.items():
        # Convert once per dataset instead of once per threshold.
        dist = np.asarray(distances, dtype=np.float32)
        # Builtin `int` replaces the `np.int` alias removed in NumPy 1.24.
        accuracies[name] = [
            np.mean(np.asarray(dist < e, dtype=int)) for e in pixel_errors
        ]
    return accuracies


def _evaluate_reader(reader, model, sess):
    """Collect pixel errors per dataset name from a batch reader.

    ``reader`` must yield ``(images, truths, dataset_name, shapes)`` tuples.
    """
    results = {}
    for imgs, truths, d_name, shapes in reader:
        diff, _, _, _ = _pixel_distances(model, sess, imgs, truths, shapes)
        results.setdefault(d_name, []).extend(diff)
    return results


def main(m_type, m_name, logger, save_videos=False, run_lpw_swirski=False):
    """
    run an evaluation on the test datasets listed in data/emma_data/*.txt and,
    optionally, on the LPW and Swirski readers
    :param m_type: need model type: inception, yolo, gap,...
    :param m_name: name of the model ( model folder name: 3A4Bh-Ref25)
    :param logger: need logger to log the events
    :param save_videos: if True, call video_creator for every dataset with the
                        collected images and predicted labels
    :param run_lpw_swirski: if True, additionally evaluate on the LPW and
                            Swirski readers. This section used to sit behind
                            an unconditional ``return`` and never ran, so it
                            is off by default to keep the previous behaviour.
    :return: show the results in terminal
    """
    # Local import so this block does not rely on a file-level `import os`.
    import os

    run_meta = tf.RunMetadata()
    with tf.Session() as sess:

        # load best model
        model = load_model(sess, m_type, m_name, logger)
        # calculate the FLOPS
        opts_f = tf.profiler.ProfileOptionBuilder.float_operation()
        flops = tf.profiler.profile(run_meta=run_meta,
                                    cmd='op',
                                    options=opts_f)

        opts_p = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter(
        )
        params = tf.profiler.profile(sess.graph,
                                     run_meta=run_meta,
                                     cmd='op',
                                     options=opts_p)

        if flops is not None:
            print('TF stats gives', flops.total_float_ops)

        if params is not None:
            print('TF stats gives', params.total_parameters)

        # print the result for different pixel error
        pixel_errors = [1, 2, 3, 4, 5, 7, 10, 15, 20]

        # get the dataset list files, in a stable order
        datasets = sorted(glob.glob('data/emma_data/*.txt'))

        # we save the results of all dataset in to this dict
        dataset_results = {}

        for d in datasets:

            # Dataset name = file name up to its first dot. basename() does
            # not depend on the path depth or on the separator glob returns.
            dataset_name = os.path.basename(d).split(".")[0]

            # save the result (differences) in the list
            dataset_results[dataset_name] = []

            dataset_len = get_len(d)

            batch_size = 2 * config["batch_size"]
            batch = read_batch(d, batch_size, dataset_name)

            # use tqdm progress bar
            tqdm_len = np.ceil(dataset_len / batch_size)
            with tqdm(total=tqdm_len, unit='batch') as t:
                # set the name of dataset as the title of progress bar
                t.set_description_str(dataset_name)

                test_images = []
                pred_labels = []

                # loop over batch of images
                for images, truths, shapes, pngs in batch:
                    diff, preds_x, preds_y, w = _pixel_distances(
                        model, sess, images, truths, shapes)

                    dataset_results[dataset_name].extend(diff)
                    t.update()

                    # Keep images and (x, y, w) rows for the optional video.
                    # The shape is passed positionally because the `newshape`
                    # keyword of np.reshape is deprecated in recent NumPy.
                    len_data = len(preds_x)
                    col_shape = (len_data, 1)
                    upscale_center = np.concatenate(
                        (np.reshape(preds_x, col_shape),
                         np.reshape(preds_y, col_shape),
                         np.reshape(w, col_shape)),
                        axis=1)
                    test_images.extend(pngs)
                    pred_labels.extend(upscale_center)

                # create the predicted labels on test sets
                if save_videos:
                    video_creator(dataset_name, test_images, pred_labels)

        # accuracy per dataset for every pixel-error threshold
        dataset_errors = _threshold_accuracies(dataset_results, pixel_errors)

        # NOTE(review): `dataset_names` is not defined anywhere in this
        # function, and the other print_resutls calls pass two arguments.
        # Unless it is a module-level name this raises NameError -- confirm.
        print_resutls(dataset_errors, pixel_errors, dataset_names)

        if not run_lpw_swirski:
            return

        print("####### LPW #######")
        # run model on LPW dataset
        lpw_r = lpw_reader(batch_size=2 * config["batch_size"],
                           normalize_image=True)
        lpw_errors = _threshold_accuracies(
            _evaluate_reader(lpw_r, model, sess), pixel_errors)
        print_resutls(lpw_errors, pixel_errors)

        print("####### SWIRSKI #######")
        # run model on Swirski dataset
        swk_r = swirski_reader(batch_size=2 * config["batch_size"])
        swk_errors = _threshold_accuracies(
            _evaluate_reader(swk_r, model, sess), pixel_errors)
        print_resutls(swk_errors, pixel_errors)
Beispiel #25
0
def minimize(scope, log_norm, energy, steps, check_obs=None,
             check_every=100, save_every=1000, lr=0.001, restore_path=None,
             output_path=None, profiling=False):
    """Trains the wavefunction to minimize the energy.

    Arguments:
        scope (string): the variable scope where all variables to be trained reside
        log_norm (tensor): a batch of log wavefunction norm of samples, shape (batch_shape,)
        energy (tensor): energies of the same batch of samples, shape (batch_shape,)
        steps (int): the total number of training steps
        check_obs (dict from strings to tensors, or None): names and values of
                the tensors to log. None (the default) means no extra tensors;
                "energy" is always logged.
        check_every (int): the number of steps between logging
        save_every (int): the number of steps between saving the model
        lr (float): learning rate for the optimizer
        restore_path (string or None): the directory (ending with "/") to restore the model
                If None, start from scratch by default.
        output_path (string or None): the directory to save the trained model
                If None, save to "results/" + scope + "/" by default.
        profiling (bool): whether to profile the code. When set, every
                save_every-th step is run with full tracing and a Chrome
                trace is written to output_path as timeline_<n>.json.

    Returns:
        obs (dict from strings to scalars or lists of scalars): statistics of the tensors in check_obs
            For each name in check_obs, obs contains an entry of the same name for the average value of
            the observable, and an entry of name_std for its standard deviation, with an entry of name_raw
            for its history of values during the training.
    """
    # A None default avoids sharing one mutable dict object between calls.
    if check_obs is None:
        check_obs = {}
    # set up training ops
    mean_energy = tf.reduce_mean(energy)
    loss = energy + 2 * log_norm * tf.stop_gradient(energy - mean_energy)
    loss = tf.reduce_mean(loss)
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                  scope=scope)
    # Count the parameters with a single short-lived session that is closed
    # as soon as the count is known, instead of leaking an anonymous session.
    with tf.Session() as count_sess:
        num_params = sum(count_sess.run(tf.size(v)) for v in variables)
    print("Total number of parameters:", num_params)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    grads_and_vars = optimizer.compute_gradients(loss, variables)
    # Clip each gradient to norm 1.0 and pass it through zero_nan; variables
    # that receive no gradient are dropped.
    grads_and_vars = [(zero_nan(tf.clip_by_norm(grad, 1.0)), var)
                      for grad, var in grads_and_vars if grad is not None]
    train_op = optimizer.apply_gradients(grads_and_vars)
    # set up loggings
    output_path = output_path or "results/" + scope + "/"
    model_path = output_path + "model.ckpt"
    saver = tf.train.Saver(variables)
    file_writer = tf.summary.FileWriter(output_path)
    # Rebind to a fresh dict of batch means; the caller's dict is not modified.
    check_obs = {
        key: tf.reduce_mean(value)
        for key, value in check_obs.items()
    }
    check_obs["energy"] = mean_energy
    for key, value in check_obs.items():
        tf.summary.scalar(key, value)
    summary = tf.summary.merge_all()
    res = {key: [] for key in check_obs}
    # initialize variables
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    sess = tf.Session(config=config)
    try:
        sess.run(tf.variables_initializer(optimizer.variables()))
        # Fall back to fresh initialization when no checkpoint is requested
        # or when restore() reports failure.
        if restore_path is None or not restore(sess, saver,
                                               restore_path + "model.ckpt",
                                               True):
            sess.run(tf.variables_initializer(variables))
            print("Starting from scratch ...")
        progbar = tf.keras.utils.Progbar(
            steps, stateful_metrics=list(check_obs.keys()))
        if profiling:
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        # training
        for step in range(1, steps + 1):
            if profiling and step % save_every == 0:
                e, _ = sess.run([mean_energy, train_op],
                                options=options,
                                run_metadata=run_metadata)
            else:
                e, _ = sess.run([mean_energy, train_op])
            # log
            if step % check_every == 0:
                # Each observable is evaluated with its own sess.run call.
                obs = [(key, sess.run(value))
                       for key, value in check_obs.items()]
                for key, value in obs:
                    res[key].append(value)
                progbar.update(step, obs)
                file_writer.add_summary(sess.run(summary), step)
            else:
                progbar.update(step, [("energy", e)])  # always update energy
            # save the model
            if step % save_every == 0:
                saver.save(sess, model_path)
                print(" Model saved to", model_path)
                if profiling:
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = \
                        fetched_timeline.generate_chrome_trace_format()
                    trace_path = output_path + 'timeline_%d.json' % (
                        step // save_every)
                    with open(trace_path, 'w') as f:
                        f.write(chrome_trace)
    finally:
        # Flush pending summaries and release the session even if training
        # was interrupted by an exception.
        file_writer.close()
        sess.close()
    # return the observables
    raw = {key + "_raw": res[key] for key in res}
    mean = {key: np.mean(truncate(res[key])) for key in res}
    std = {key + "_std": np.std(truncate(res[key])) for key in res}
    return {**raw, **mean, **std}
Beispiel #26
0
def timeGraph(gdef,
              batch_size=128,
              image_folder='images',
              nvidiasmi='output.out',
              latencyF='latency.txt',
              StopTime=100):
    """Time batched inference of an imported graph over a folder of JPEGs.

    The sorted ``image_folder/*.JPEG`` file names are turned into a repeating,
    batched ``tf.data`` pipeline (mapped through ``_parse_function``) that is
    wired to the node named ``input`` of ``gdef``. The first output of
    ``final_layer/predictions`` is then evaluated once per batch. While the
    loop runs, a background ``nvidia-smi`` process logs GPU power draw.

    Args:
        gdef: GraphDef to import and time.
        batch_size (int): number of images per ``sess.run`` call.
        image_folder (str): directory searched for ``*.JPEG`` files.
        nvidiasmi (str): output file passed to ``nvidia-smi -f``.
        latencyF (str): file that receives one line per batch with the
            wall-clock seconds of that ``sess.run`` call (appended to if it
            already exists).
        StopTime (number): the loop stops after the first batch that finishes
            more than this many seconds after the loop started.

    Returns:
        tuple: ``(timings, True, last_output, None)`` where ``timings`` is a
        one-element list holding the total elapsed seconds and ``last_output``
        is the value fetched by the final ``sess.run`` (``None`` if no batch
        was run).
    """
    tf.logging.info("Starting execution")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
    tf.reset_default_graph()
    g = tf.Graph()
    outlist = []
    with g.as_default():
        imageString = sorted(glob.glob(image_folder + '/*.JPEG'))
        imageCounter = len(imageString)
        imagenstack = tf.stack(imageString)

        dataset = tf.data.Dataset.from_tensor_slices(imagenstack)
        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat()
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        out = tf.import_graph_def(graph_def=gdef,
                                  input_map={"input": next_element},
                                  return_elements=["final_layer/predictions"])
        out = out[0].outputs[0]
        print("\n\n image out", out, "\n\n")
        outlist.append(out)
        print("\n\n image out", outlist[-1], "\n\n")

    timings = []
    val = None  # stays None only if the timing loop never executes
    # Manual debug switch: set to 1 to append the top-N labels of every batch
    # to resultLables_PNASNet_5_Large_331.txt.
    printLables = 0

    with tf.Session(graph=g,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        num_iters = int(math.ceil(imageCounter / batch_size))
        print("\n\n\nNumber of Iterations = ", num_iters)
        nvidiasmiCommand = "nohup nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -l 1 -f " + nvidiasmi + " &"
        os.system(nvidiasmiCommand)
        try:
            tstart = time.time()
            # Mode 'a' creates the file when it is missing, so no separate
            # existence check is needed; the with-block guarantees the handle
            # is closed even if sess.run raises.
            with open(latencyF, 'a') as runtimeResults:
                start_process = time.time()

                for _ in range(num_iters):
                    tic = time.time()
                    val = sess.run(outlist)
                    tac = time.time()

                    runtimeResults.write(str(tac - tic))
                    runtimeResults.write("\n")

                    if (tac - start_process) > StopTime:
                        break

                    if printLables == 1:
                        with open('resultLables_PNASNet_5_Large_331.txt',
                                  'a') as highscore:
                            for label_index in topX(val[0], f.topN)[1]:
                                highscore.write(
                                    str(getLabels(labels, label_index)))
                                highscore.write("\n")
                timings.append(time.time() - tstart)
        finally:
            # Always stop the background power logger, including on failure.
            os.system("pkill nvidia-smi")
        tf.logging.info("Timing loop done!")
        return timings, True, (None if val is None else val[0]), None
Beispiel #27
0
def CIFAR10_train():
    """Build the CIFAR-10 training graph and run the training loop.

    Runs ``FLAGS.TRAINING_STEPS`` optimisation steps. Every 1000 steps the
    step is executed with full tracing, summaries and run metadata are
    written to ``../log_train``, accuracy on the current training batch and
    on one fixed test batch is printed, and a checkpoint is saved under
    ``MODEL_SAVE_PATH``.

    The queue-runner threads are stopped and the summary writer is closed
    even when training is interrupted by an exception.
    """
    # Keep all input-related ops under the 'input' name scope.
    with tf.name_scope('input'):
        # Input pipelines for training and evaluation batches.
        images_train, lables_train = CIFAR10_input.distorted_inputs(
            data_dir=data_dir, batch_size=FLAGS.BATCH_SIZE)
        images_test, lables_test = CIFAR10_input.inputs(
            eval_data=True, data_dir=data_dir, batch_size=FLAGS.BATCH_SIZE)
        # Placeholders for images and one-hot labels.
        x = tf.placeholder(tf.float32, [
            None, CIFAR10_inference.IMAGE_SIZE, CIFAR10_inference.IMAGE_SIZE,
            CIFAR10_inference.NUM_CHANNELS
        ],
                           name='x-input')
        y_ = tf.placeholder(tf.float32, [None, CIFAR10_inference.OUTPUT_NODE],
                            name='y-input')

    # Forward pass defined by CIFAR10_inference.
    y = CIFAR10_inference.inference(x, True, 'L2')
    global_step = tf.Variable(0, trainable=False)

    # Exponential moving average over all trainable variables.
    with tf.name_scope('moving_average'):
        variable_average = tf.train.ExponentialMovingAverage(
            FLAGS.MOVING_AVERAGE_DECAY, global_step)
        variables_average_op = variable_average.apply(tf.trainable_variables())

    # Total loss = mean cross entropy + whatever else is in the 'losses'
    # collection.
    with tf.name_scope('loss_function'):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=y, labels=tf.argmax(y_, 1))
        cross_entropy_mean = tf.reduce_mean(cross_entropy)
        tf.add_to_collection('losses', cross_entropy_mean)
        loss = tf.add_n(tf.get_collection('losses'))
        tf.summary.scalar('loss_function', loss)

    # Learning-rate schedule, optimiser and the combined per-step train op.
    with tf.name_scope('train_step'):
        learning_rate = tf.train.exponential_decay(FLAGS.LEARNING_RATE_BASE,
                                                   global_step,
                                                   50000 / FLAGS.BATCH_SIZE,
                                                   FLAGS.LEARNING_RATE_DECAY,
                                                   staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss, global_step)
        # Running train_op triggers both the gradient step and the
        # moving-average update.
        with tf.control_dependencies([train_step, variables_average_op]):
            train_op = tf.no_op(name='train')

    with tf.name_scope('accuracy'):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy_train = tf.reduce_mean(tf.cast(correct_prediction, 'float'))
        tf.summary.scalar('accuracy_train', accuracy_train)

        # Same expression as accuracy_train; it is evaluated with the test
        # batch fed in and is deliberately not added to the summaries.
        accuracy_test = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # Checkpoint writer.
    saver = tf.train.Saver()

    # The label placeholder is OUTPUT_NODE wide, so the one-hot encoding must
    # use the same width rather than a hard-coded 10.
    one_hot = np.eye(CIFAR10_inference.OUTPUT_NODE, dtype=float)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Merge all summaries and open the log writer.
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('../log_train', tf.get_default_graph())
        try:
            # One fixed test batch, reused for every periodic evaluation.
            xs_test, ys_test = sess.run([images_test, lables_test])
            ys_test_onehot = one_hot[ys_test]

            for i in range(FLAGS.TRAINING_STEPS):
                xs, ys = sess.run([images_train, lables_train])
                ys_onehot = one_hot[ys]

                # Every 1000 steps: traced step, logging, evaluation and
                # checkpoint.
                if i % 1000 == 0:
                    # Record per-node time and memory usage for this step.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, loss_value, _, result = sess.run(
                        [train_op, loss, global_step, merged],
                        feed_dict={
                            x: xs,
                            y_: ys_onehot
                        },
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%03d' % i)
                    writer.add_summary(result, i)

                    train_accuracy = accuracy_train.eval(feed_dict={
                        x: xs,
                        y_: ys_onehot
                    })
                    test_accuracy = accuracy_test.eval(feed_dict={
                        x: xs_test,
                        y_: ys_test_onehot
                    })
                    print(
                        '%s:After %d training steps, loss = %g, accuracy = %g, validation accuracy=%g'
                        % (datetime.now(), i, loss_value, train_accuracy,
                           test_accuracy))

                    # Passing global_step appends the step count to the
                    # checkpoint file name (e.g. 'model.ckpt-1000').
                    saver.save(sess,
                               os.path.join(MODEL_SAVE_PATH, MODEL_NAME),
                               global_step=global_step)
                else:
                    sess.run([train_op, loss, global_step],
                             feed_dict={
                                 x: xs,
                                 y_: ys_onehot
                             })
        finally:
            # Stop the input threads and flush the log even if a step failed.
            try:
                coord.request_stop()
                coord.join(threads)
            finally:
                writer.close()
def test_embedding_lookup_sparse():
    """Run the sparse embedding transform 1000 times and print profiles.

    Builds two sparse tensors (ids and values) of dense shape
    ``[batch, nzdim]`` from generated inputs, feeds them to
    ``sparse_transform`` with a ``[1000000, 30]`` weight shape, executes the
    resulting op ``max_steps`` times with full tracing, and then prints
    three profiler reports for the default graph.
    """
    batch = 256  #FLAGS.batch
    nzdim = 100  #FLAGS.nonzero_dim
    layers = "1000000,30"
    weight_shape = [int(d) for d in layers.split(",")]
    inputs = gen_sparse_inputs(batch, weight_shape[0], nzdim)

    batch_ids = gen_sparse_indices(batch, nzdim)

    embedding_op = sparse_transform(
        tf.SparseTensor(indices=batch_ids,
                        values=inputs["ids"],
                        dense_shape=[batch, nzdim]),
        tf.SparseTensor(indices=batch_ids,
                        values=inputs["values"],
                        dense_shape=[batch, nzdim]), weight_shape)

    init_op = tf.global_variables_initializer()

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

    graph_options = tf.GraphOptions(enable_bfloat16_sendrecv=False)
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=True,
                                 graph_options=graph_options)

    max_steps = 1000  #FLAGS.max_steps
    # The context manager closes the session once the traced runs are done;
    # the profiler calls below only need the graph and the run metadata.
    with tf.Session(config=sess_config) as sess:
        sess.run(init_op)
        # NOTE(review): the same run_metadata object is passed to every run,
        # so presumably only the last step's trace is profiled - confirm.
        for _ in range(max_steps):
            sess.run([embedding_op],
                     options=run_options,
                     run_metadata=run_metadata)

    ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder
    opts = ProfileOptionBuilder(
        ProfileOptionBuilder.time_and_memory()).with_node_names(
            show_name_regexes=['.*train.py.*']).build()

    tf.profiler.profile(tf.get_default_graph(),
                        run_meta=run_metadata,
                        cmd='code',
                        options=opts)

    # Print to stdout an analysis of the memory usage and the timing information
    # broken down by operation types.
    tf.profiler.profile(
        tf.get_default_graph(),
        run_meta=run_metadata,
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.time_and_memory())

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        run_meta=run_metadata,
        tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY
    )
Beispiel #29
0
    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="TRPO",
              reset_num_timesteps=True):
        """Run the policy / value-function (and optional GAIL discriminator) training loop.

        Each iteration samples ``self.g_step`` trajectory segments, performs a
        KL-constrained policy update per segment (conjugate gradient followed
        by a backtracking line search), fits the value function, optionally
        updates the GAIL discriminator, and records statistics with ``logger``.

        :param total_timesteps: training stops once the accumulated timestep
            count reaches this value; a falsy value disables this stop
            condition.
        :param callback: optional callable invoked at the start of every
            iteration as ``callback(locals(), globals())``; training stops
            only when it returns exactly ``False``. Because it receives
            ``locals()``, the local variable names in this method are visible
            to callbacks.
        :param seed: forwarded to ``self._setup_learn``.
        :param log_interval: not referenced in the body of this method.
        :param tb_log_name: name passed to ``TensorboardWriter``.
        :param reset_num_timesteps: forwarded to ``self._init_num_timesteps``.
        :return: ``self``
        """

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            with self.sess.as_default():
                seg_gen = traj_segment_generator(
                    self.policy_pi,
                    self.env,
                    self.timesteps_per_batch,
                    reward_giver=self.reward_giver,
                    gail=self.using_gail)

                episodes_so_far = 0
                timesteps_so_far = 0
                iters_so_far = 0
                t_start = time.time()
                len_buffer = deque(
                    maxlen=40)  # rolling buffer for episode lengths
                reward_buffer = deque(
                    maxlen=40)  # rolling buffer for episode rewards
                self.episode_reward = np.zeros((self.n_envs, ))

                true_reward_buffer = None
                if self.using_gail:
                    true_reward_buffer = deque(maxlen=40)

                    # Initialize dataloader
                    batchsize = self.timesteps_per_batch // self.d_step
                    self.expert_dataset.init_dataloader(batchsize)

                    #  Stats not used for now
                    # TODO: replace with normal tb logging
                    #  g_loss_stats = Stats(loss_names)
                    #  d_loss_stats = Stats(reward_giver.loss_name)
                    #  ep_stats = Stats(["True_rewards", "Rewards", "Episode_length"])

                while True:
                    if callback is not None:
                        # Only stop training if return value is False, not when it is None. This is for backwards
                        # compatibility with callbacks that have no return statement.
                        if callback(locals(), globals()) is False:
                            break
                    if total_timesteps and timesteps_so_far >= total_timesteps:
                        break

                    logger.log("********** Iteration %i ************" %
                               iters_so_far)

                    # Closure over `fvpargs`, which is (re)assigned inside the
                    # g_step loop below before this function is first called.
                    def fisher_vector_product(vec):
                        return self.allmean(
                            self.compute_fvp(
                                vec, *fvpargs,
                                sess=self.sess)) + self.cg_damping * vec

                    # ------------------ Update G ------------------
                    logger.log("Optimizing Policy...")
                    # g_step = 1 when not using GAIL
                    mean_losses = None
                    vpredbefore = None
                    tdlamret = None
                    observation = None
                    action = None
                    seg = None
                    for k in range(self.g_step):
                        with self.timed("sampling"):
                            seg = seg_gen.__next__()
                        add_vtarg_and_adv(seg, self.gamma, self.lam)
                        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                        observation, action, atarg, tdlamret = seg["ob"], seg[
                            "ac"], seg["adv"], seg["tdlamret"]
                        vpredbefore = seg[
                            "vpred"]  # predicted value function before update
                        atarg = (atarg - atarg.mean()) / atarg.std(
                        )  # standardized advantage function estimate

                        # true_rew is the reward without discount
                        if writer is not None:
                            self.episode_reward = total_episode_reward_logger(
                                self.episode_reward, seg["true_rew"].reshape(
                                    (self.n_envs, -1)), seg["dones"].reshape(
                                        (self.n_envs, -1)), writer,
                                self.num_timesteps)

                        # NOTE(review): seg["ob"] is passed twice; the reason
                        # depends on compute_lossandgrad's signature - confirm.
                        args = seg["ob"], seg["ob"], seg["ac"], atarg
                        # Fisher-vector products use every 5th sample only.
                        fvpargs = [arr[::5] for arr in args]

                        self.assign_old_eq_new(sess=self.sess)

                        with self.timed("computegrad"):
                            steps = self.num_timesteps + (k + 1) * (
                                seg["total_timestep"] / self.g_step)
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata(
                            ) if self.full_tensorboard_log else None
                            # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                            if writer is not None:
                                summary, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)
                                if self.full_tensorboard_log:
                                    writer.add_run_metadata(
                                        run_metadata, 'step%d' % steps)
                                writer.add_summary(summary, steps)
                            else:
                                _, grad, *lossbefore = self.compute_lossandgrad(
                                    *args,
                                    tdlamret,
                                    sess=self.sess,
                                    options=run_options,
                                    run_metadata=run_metadata)

                        lossbefore = self.allmean(np.array(lossbefore))
                        grad = self.allmean(grad)
                        if np.allclose(grad, 0):
                            logger.log("Got zero gradient. not updating")
                        else:
                            with self.timed("conjugate_gradient"):
                                stepdir = conjugate_gradient(
                                    fisher_vector_product,
                                    grad,
                                    cg_iters=self.cg_iters,
                                    verbose=self.rank == 0
                                    and self.verbose >= 1)
                            assert np.isfinite(stepdir).all()
                            shs = .5 * stepdir.dot(
                                fisher_vector_product(stepdir))
                            # abs(shs) to avoid taking square root of negative values
                            lagrange_multiplier = np.sqrt(
                                abs(shs) / self.max_kl)
                            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                            fullstep = stepdir / lagrange_multiplier
                            expectedimprove = grad.dot(fullstep)
                            surrbefore = lossbefore[0]
                            stepsize = 1.0
                            thbefore = self.get_flat()
                            thnew = None
                            # Backtracking line search: try the full step, then
                            # halve the step size up to 10 times until losses are
                            # finite, KL <= 1.5 * max_kl and the surrogate does
                            # not decrease.
                            for _ in range(10):
                                thnew = thbefore + fullstep * stepsize
                                self.set_from_flat(thnew)
                                mean_losses = surr, kl_loss, *_ = self.allmean(
                                    np.array(
                                        self.compute_losses(*args,
                                                            sess=self.sess)))
                                improve = surr - surrbefore
                                logger.log("Expected: %.3f Actual: %.3f" %
                                           (expectedimprove, improve))
                                if not np.isfinite(mean_losses).all():
                                    logger.log(
                                        "Got non-finite value of losses -- bad!"
                                    )
                                elif kl_loss > self.max_kl * 1.5:
                                    logger.log(
                                        "violated KL constraint. shrinking step."
                                    )
                                elif improve < 0:
                                    logger.log(
                                        "surrogate didn't improve. shrinking step."
                                    )
                                else:
                                    logger.log("Stepsize OK!")
                                    break
                                stepsize *= .5
                            else:
                                # No candidate was accepted: restore the
                                # parameters from before the line search.
                                logger.log("couldn't compute a good step")
                                self.set_from_flat(thbefore)
                            # Every 20 iterations, check that all workers hold
                            # (numerically) the same parameter sums.
                            if self.nworkers > 1 and iters_so_far % 20 == 0:
                                # list of tuples
                                paramsums = MPI.COMM_WORLD.allgather(
                                    (thnew.sum(), self.vfadam.getflat().sum()))
                                assert all(
                                    np.allclose(ps, paramsums[0])
                                    for ps in paramsums[1:])

                        # Value-function fit: vf_iters passes over shuffled
                        # minibatches of 128 samples (final partial batch dropped).
                        with self.timed("vf"):
                            for _ in range(self.vf_iters):
                                # NOTE: for recurrent policies, use shuffle=False?
                                for (mbob, mbret) in dataset.iterbatches(
                                    (seg["ob"], seg["tdlamret"]),
                                        include_final_partial_batch=False,
                                        batch_size=128,
                                        shuffle=True):
                                    grad = self.allmean(
                                        self.compute_vflossandgrad(
                                            mbob, mbob, mbret, sess=self.sess))
                                    self.vfadam.update(grad, self.vf_stepsize)

                    # mean_losses holds the losses from the last line-search
                    # evaluation of the last g step.
                    for (loss_name, loss_val) in zip(self.loss_names,
                                                     mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                    logger.record_tabular(
                        "explained_variance_tdlam_before",
                        explained_variance(vpredbefore, tdlamret))

                    if self.using_gail:
                        # ------------------ Update D ------------------
                        logger.log("Optimizing Discriminator...")
                        logger.log(fmt_row(13, self.reward_giver.loss_name))
                        assert len(observation) == self.timesteps_per_batch
                        batch_size = self.timesteps_per_batch // self.d_step

                        # NOTE: uses only the last g step for observation
                        d_losses = [
                        ]  # list of tuples, each of which gives the loss for a minibatch
                        # NOTE: for recurrent policies, use shuffle=False?
                        for ob_batch, ac_batch in dataset.iterbatches(
                            (observation, action),
                                include_final_partial_batch=False,
                                batch_size=batch_size,
                                shuffle=True):
                            ob_expert, ac_expert = self.expert_dataset.get_next_batch(
                            )
                            # update running mean/std for reward_giver
                            if self.reward_giver.normalize:
                                self.reward_giver.obs_rms.update(
                                    np.concatenate((ob_batch, ob_expert), 0))

                            # Reshape actions if needed when using discrete actions
                            if isinstance(self.action_space,
                                          gym.spaces.Discrete):
                                if len(ac_batch.shape) == 2:
                                    ac_batch = ac_batch[:, 0]
                                if len(ac_expert.shape) == 2:
                                    ac_expert = ac_expert[:, 0]
                            *newlosses, grad = self.reward_giver.lossandgrad(
                                ob_batch, ac_batch, ob_expert, ac_expert)
                            self.d_adam.update(self.allmean(grad),
                                               self.d_stepsize)
                            d_losses.append(newlosses)
                        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"],
                                    seg["ep_true_rets"])  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews, true_rets = map(flatten_lists,
                                                    zip(*list_lr_pairs))
                        true_reward_buffer.extend(true_rets)
                    else:
                        # lr: lengths and rewards
                        lr_local = (seg["ep_lens"], seg["ep_rets"]
                                    )  # local values
                        list_lr_pairs = MPI.COMM_WORLD.allgather(
                            lr_local)  # list of tuples
                        lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                    len_buffer.extend(lens)
                    reward_buffer.extend(rews)

                    if len(len_buffer) > 0:
                        logger.record_tabular("EpLenMean", np.mean(len_buffer))
                        logger.record_tabular("EpRewMean",
                                              np.mean(reward_buffer))
                    if self.using_gail:
                        logger.record_tabular("EpTrueRewMean",
                                              np.mean(true_reward_buffer))
                    logger.record_tabular("EpThisIter", len(lens))
                    episodes_so_far += len(lens)
                    # Sum this iteration's timesteps across all MPI workers.
                    current_it_timesteps = MPI.COMM_WORLD.allreduce(
                        seg["total_timestep"])
                    timesteps_so_far += current_it_timesteps
                    self.num_timesteps += current_it_timesteps
                    iters_so_far += 1

                    logger.record_tabular("EpisodesSoFar", episodes_so_far)
                    logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                    logger.record_tabular("TimeElapsed", time.time() - t_start)

                    if self.verbose >= 1 and self.rank == 0:
                        logger.dump_tabular()

        return self
Beispiel #30
0
def main():
    """Run the image-to-image pipeline in the mode selected by ``a.mode``.

    All configuration is read from the module-level options object ``a``.

    * ``"export"``: builds a stand-alone generator graph that takes a
      base64-encoded PNG and returns a base64-encoded PNG/JPEG, restores the
      latest checkpoint found in ``a.checkpoint``, writes ``export.meta`` and
      the ``export`` weights into ``a.output_dir`` and returns.
    * ``"test"``: runs the model over at most one epoch of examples, saves
      the images, merges them, copies ``facades_test/merged3.png`` to
      ``a.output_file`` and prints timing information.
    * any other mode: runs the training loop for ``max_steps`` steps,
      periodically writing summaries, display images, traces, progress
      output and checkpoints according to the ``a.*_freq`` options.

    Side effects visible here: seeds the TensorFlow/NumPy/``random`` RNGs,
    creates ``a.output_dir``, writes ``options.json`` into it and, for every
    non-export mode, deletes all files below ``facades/val`` and
    ``facades_test``.

    Raises:
        Exception: if a checkpoint is required but missing, if export is
            requested together with ``lab_colorization``, if the output
            file type is unknown, or if ``a.which_direction`` is invalid.
    """
    # Start of the overall wall-clock timing. time.time() is used instead of
    # time.clock(): the latter was removed in Python 3.8 and measured CPU
    # time on Unix, which is inconsistent with the other timers below.
    mainStart = time.time()

    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode in ("test", "export"):
        if a.checkpoint is None:
            # Same text as before for test mode; export mode is now named
            # correctly instead of being reported as "test".
            raise Exception("checkpoint required for %s mode" % a.mode)

        # Load some options from the checkpoint so the restored graph
        # matches the one that was trained.
        restorable_options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.load(f).items():
                if key in restorable_options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # Disable these features in test mode.
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # Export the generator to a meta graph that can be imported later
        # for standalone generation.
        if a.lab_colorization:
            raise Exception("export not supported for lab_colorization")

        # Named input_b64 (not "input") so the builtin is not shadowed; the
        # tensor name in the graph does not depend on the Python name.
        input_b64 = tf.placeholder(tf.string, shape=[1])
        input_data = tf.decode_base64(input_b64[0])
        input_image = tf.image.decode_png(input_data)

        # Remove alpha channel if present.
        input_image = tf.cond(
            tf.equal(tf.shape(input_image)[2], 4),
            lambda: input_image[:, :, :3],
            lambda: input_image)
        # Convert grayscale to RGB.
        input_image = tf.cond(
            tf.equal(tf.shape(input_image)[2], 1),
            lambda: tf.image.grayscale_to_rgb(input_image),
            lambda: input_image)

        input_image = tf.image.convert_image_dtype(input_image, dtype=tf.float32)
        input_image.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(input_image, axis=0)

        with tf.variable_scope("generator"):
            batch_output = deprocess(create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output, dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        # The "key" placeholder is passed straight through to the outputs.
        key = tf.placeholder(tf.string, shape=[1])
        inputs = {
            "key": key.name,
            "input": input_b64.name
        }
        tf.add_to_collection("inputs", json.dumps(inputs))
        outputs = {
            "key":  tf.identity(key).name,
            "output": output.name,
        }
        tf.add_to_collection("outputs", json.dumps(outputs))

        init_op = tf.global_variables_initializer()
        # One Saver restores the checkpoint, a second one writes the export.
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess, os.path.join(a.output_dir, "export"), write_meta_graph=False)

        return

    loadExamplesCreateModelStart = time.time()

    def del_file(path):
        """Recursively delete every file below *path*, keeping directories."""
        # A directory that does not exist yet has nothing to clean up;
        # previously this raised FileNotFoundError from os.listdir().
        if not os.path.isdir(path):
            return
        for entry in os.listdir(path):
            c_path = os.path.join(path, entry)
            if os.path.isdir(c_path):
                del_file(c_path)
            else:
                os.remove(c_path)

    # Clear out all files under facades/val and facades_test before the
    # examples are loaded (the original note says "before test", but this
    # runs for every non-export mode).
    del_file("facades/val")
    del_file("facades_test")

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # Undo colorization splitting on images that we use for display/output.
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are
            # single channel grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        """Resize *image* to ``a.aspect_ratio`` if needed and cast to uint8."""
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(image, size=size, method=tf.image.ResizeMethod.BICUBIC)
        # Change the dtype of the image data (saturating cast to uint8).
        return tf.image.convert_image_dtype(image, dtype=tf.uint8, saturate=True)

    # Reverse any processing on images so they can be written to disk or
    # displayed to the user.
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png, converted_inputs, dtype=tf.string, name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png, converted_targets, dtype=tf.string, name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png, converted_outputs, dtype=tf.string, name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image("predict_real", tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image("predict_fake", tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    # Keep only the most recent checkpoint on disk.
    saver = tf.train.Saver(max_to_keep=1)

    # The Supervisor only gets a log directory when traces or summaries are
    # enabled; saving is handled by the explicit saver above (saver=None).
    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)

    loadExamplesCreateModelStop = time.time()

    loadingModelStart = time.time()
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            # tf.train.latest_checkpoint() finds the most recently saved
            # model in the directory; restore(sess, save_path) loads it.
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        max_steps = 2**32
        # a.max_epochs = number of training epochs
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        # a.max_steps = number of training steps (takes precedence)
        if a.max_steps is not None:
            max_steps = a.max_steps

        loadingModelStop = time.time()

        if a.mode == "test":
            # testing
            # at most, process the test data once
            testStart = time.time()

            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                # Write the generated images into the output directory.
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)
            print("wrote index at", index_path)
            print("rate", (time.time() - start) / max_steps)

            testStop = time.time()

            # Merge the small images produced by the test run into one
            # large image.
            mergeStart = time.time()
            import mergeAllImages
            # NOTE(review): the merge only runs when this module is executed
            # as a script; when imported, the copy below relies on an
            # already existing merged3.png - confirm this is intended.
            if __name__ == "__main__":
                mergeAllImages.main()

            # Copy the merged image to the requested output file.
            shutil.copy("facades_test/merged3.png", a.output_file)
            mergeStop = time.time()

            # load-examples / create-model time
            print("★★★loadExamplesCreateModel Time used     :",str(loadExamplesCreateModelStop-loadExamplesCreateModelStart) + "秒")

            # split time
            # NOTE(review): splitStart/splitStop are not assigned in this
            # function; presumably module-level globals set elsewhere -
            # verify, otherwise this line raises NameError.
            print("(Split Time used                            :",str(splitStop-splitStart) + "秒")

            # loading model time
            print("★★★loading Model time used               :",str(loadingModelStop-loadingModelStart) + "秒")

            # test time
            print("★★★test Time used                        :",str(testStop-testStart) + "秒")

            # merge time
            print("★★★Merge Time used                       :",str(mergeStop-mergeStart) + "秒")

            # total time
            elapsed = (time.time() - mainStart)
            print("★★★Time used(Total)                      : ",str(elapsed) + "秒")

        else:
            # training
            start = time.time()

            def should(freq):
                """Return True when an action with period *freq* is due.

                Reads the current ``step`` of the training loop below; an
                action is also due on the very last step. A non-positive
                *freq* disables the action.
                """
                return freq > 0 and ((step + 1) % freq == 0 or step == max_steps - 1)

            for step in range(max_steps):
                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                # fetch the losses when progress output is due
                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                # fetch the merged summaries when a summary is due
                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                # fetch the current training images when a display is due
                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches, options=options, run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"], results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"], step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] / examples.steps_per_epoch)
                    train_step = (results["global_step"] - 1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])

                # Save the model every a.save_freq steps.
                if should(a.save_freq):
                    print("saving model")
                    # The second argument is the checkpoint path prefix and
                    # global_step is appended as a suffix, e.g.
                    # saver.save(sess, 'my-model', global_step=1000)
                    # ==> filename: 'my-model-1000'
                    saver.save(sess, os.path.join(a.output_dir, "model"), global_step=sv.global_step)

                if sv.should_stop():
                    break