Example #1
    def test_set_writer(self):
        """
        Check that when using an EventFileWriter from a FileWriter,
        the resulting events file contains events from both the FileWriter
        and easy_tf_log.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)

            writer = tf.summary.FileWriter('logs')

            var = tf.Variable(0.0)
            summary_op = tf.summary.scalar('tf_var', var)
            sess = tf.Session()
            sess.run(var.initializer)
            summary = sess.run(summary_op)
            writer.add_summary(summary)

            easy_tf_log.set_writer(writer.event_writer)
            easy_tf_log.tflog('easy-tf-log_var', 0)

            self.assertEqual(os.listdir(), ['logs'])
            event_filename = osp.join('logs', os.listdir('logs')[0])
            self.assertIn('events.out.tfevents', event_filename)

            tags = set()
            for event in tf.train.summary_iterator(event_filename):
                for value in event.summary.value:
                    tags.add(value.tag)
            self.assertIn('tf_var', tags)
            self.assertIn('easy-tf-log_var', tags)
Example #2
    def update(self, *args, **kwargs):
        episode_length = len(self.state_set)

        discounted_rewards = self.discount_rewards(self.reward_set)
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards) + 0.0000001

        update_inputs = np.zeros((episode_length, self.config.config_dict['STATE_SPACE'][0]))
        advantages = np.zeros((episode_length, self.action_size))

        for i in range(episode_length):
            update_inputs[i] = self.state_set[i]
            advantages[i][self.action_set[i]] = discounted_rewards[i]

        average_loss = 0.0
        for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
            re, _ = self.sess.run(fetches=[self.loss, self.optimize_op],
                                  feed_dict={
                                      self.state_input: update_inputs,
                                      self.advantages: advantages
                                  })
            average_loss += np.sum(re)
        average_loss /= self.config.config_dict['ITERATION_EVER_EPOCH']
        self.log_queue.put({self.name + '_LOSS': average_loss})
        easy_tf_log.tflog(key=self.name + 'TRAIN_LOSS', value=average_loss)
        self.state_set, self.action_set, self.reward_set = [], [], []
        self.print_log_queue(status=self.status_key['TRAIN'])
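The update above calls a `discount_rewards` helper that is not shown in this snippet. A minimal sketch of what such a helper typically computes (the `gamma` default is an assumption, not taken from the source):

import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # Discounted returns G_t = r_t + gamma * G_{t+1}, computed back to front
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted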
Example #3
 def update(self):
     average_loss = 0.0
     for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
         #            print("memory length=", self.memory.)
         if self.memory.observations0.length < self.config.config_dict[
                 'BATCH_SIZE']:
             return
         batch_data = self.memory.sample(
             batch_size=self.config.config_dict['BATCH_SIZE'])
         target_q_value_list = []
         for state in batch_data['obs1']:
             _, target_q_value = self.predict_target(sess=self.sess,
                                                     new_obs=state)
             target_q_value_list.append(target_q_value)
         re = self.sess.run(fetches=[self.loss, self.optimize],
                            feed_dict={
                                self.reward_input: batch_data['rewards'],
                                self.action_input: batch_data['actions'],
                                self.state_input: batch_data['obs0'],
                                self.done_input: batch_data['terminals1'],
                                self.target_q_input: target_q_value_list
                            })
         average_loss += re[0]
     average_loss /= self.config.config_dict['ITERATION_EVER_EPOCH']
     self.log_queue.put({self.name + '_LOSS': average_loss})
     easy_tf_log.tflog(key=self.name + 'TRAIN_LOSS', value=average_loss)
     # TODO POLICY FOR UPDATE DQN TARGET
     self.sess.run(self.update_target_q_op)
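The `update_target_q_op` run at the end is not defined in this snippet. A common way to build such an op in TF1 is a Polyak ("soft") update over paired variable lists; this is a sketch under that assumption (the variable lists and `tau` are placeholders, not from the source):

import tensorflow as tf

def build_update_target_q_op(q_vars, target_q_vars, tau=0.01):
    # target <- tau * online + (1 - tau) * target, applied variable by variable
    updates = [t_var.assign(tau * q_var + (1.0 - tau) * t_var)
               for q_var, t_var in zip(q_vars, target_q_vars)]
    return tf.group(*updates)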
Example #4
    def test_full(self):
        """
        Log a few values and check that the event file contains the expected
        values.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)

            for i in range(10):
                easy_tf_log.tflog('foo', i)
            for i in range(10):
                easy_tf_log.tflog('bar', i)

            event_filename = osp.join('logs', os.listdir('logs')[0])
            event_n = 0
            for event in tf.train.summary_iterator(event_filename):
                if event_n == 0:  # metadata
                    event_n += 1
                    continue
                if event_n <= 10:
                    self.assertEqual(event.step, event_n - 1)
                    self.assertEqual(event.summary.value[0].tag, "foo")
                    self.assertEqual(event.summary.value[0].simple_value,
                                     float(event_n - 1))
                if event_n > 10 and event_n <= 20:
                    self.assertEqual(event.step, event_n - 10 - 1)
                    self.assertEqual(event.summary.value[0].tag, "bar")
                    self.assertEqual(event.summary.value[0].simple_value,
                                     float(event_n - 10 - 1))
                event_n += 1
Example #5
    def compute_grad(self):
        batch = self.ddpg_model.memory.sample(
            batch_size=self.ddpg_model.batch_size)
        if self.ddpg_model.normalize_returns and self.ddpg_model.enable_popart:
            old_mean, old_std, target_Q = self.ddpg_model.sess.run(
                [
                    self.ddpg_model.ret_rms.mean, self.ddpg_model.ret_rms.std,
                    self.ddpg_model.target_Q
                ],
                feed_dict={
                    self.ddpg_model.obs1:
                    batch['obs1'],
                    self.ddpg_model.rewards:
                    batch['rewards'],
                    self.ddpg_model.terminals1:
                    batch['terminals1'].astype('float32'),
                })
            self.ddpg_model.ret_rms.update(target_Q.flatten())
            self.ddpg_model.sess.run(self.ddpg_model.renormalize_Q_outputs_op,
                                     feed_dict={
                                         self.ddpg_model.old_std:
                                         np.array([old_std]),
                                         self.ddpg_model.old_mean:
                                         np.array([old_mean]),
                                     })

        else:
            target_Q = self.ddpg_model.sess.run(
                self.ddpg_model.target_Q,
                feed_dict={
                    self.ddpg_model.obs1:
                    batch['obs1'],
                    self.ddpg_model.rewards:
                    batch['rewards'],
                    self.ddpg_model.terminals1:
                    batch['terminals1'].astype('float32'),
                })

        # Get all gradients and perform a synced update.
        ops = [self.ddpg_model.actor_grads, self.ddpg_model.critic_grads]
        actor_grads, critic_grads = self.ddpg_model.sess.run(
            ops,
            feed_dict={
                self.ddpg_model.obs0: batch['obs0'],
                self.ddpg_model.actions: batch['actions'],
                self.ddpg_model.critic_target: target_Q,
            })
        actor_grads_norm = np.sqrt(np.sum(actor_grads**2))
        critic_grads_norm = np.sqrt(np.sum(critic_grads**2))

        easy_tf_log.tflog(key=self.name + '_' + self.current_env_status +
                          '_ACTOR_GRADS_2_NORM',
                          value=actor_grads_norm)
        easy_tf_log.tflog(key=self.name + '_' + self.current_env_status +
                          '_CRITIC_GRADS_2_NORM',
                          value=critic_grads_norm)
Example #6
 def test_no_setup(self):
     """
     Test that if tflog() is used without any extra setup, a directory
     'logs' is created in the current directory containing the event file.
     """
     with tempfile.TemporaryDirectory() as temp_dir:
         os.chdir(temp_dir)
         easy_tf_log.tflog('var', 0)
         self.assertEqual(os.listdir(), ['logs'])
         self.assertIn('events.out.tfevents', os.listdir('logs')[0])
Example #7
 def test_set_dir(self):
     """
     Confirm that set_dir works.
     """
     with tempfile.TemporaryDirectory() as temp_dir:
         os.chdir(temp_dir)
         easy_tf_log.set_dir('logs2')
         easy_tf_log.tflog('var', 0)
         self.assertEqual(os.listdir(), ['logs2'])
         self.assertIn('events.out.tfevents', os.listdir('logs2')[0])
Example #8
def run_manager(worker_threads, sess, lr, step_counter, update_counter, log_dir, saver,
                wake_interval_seconds, ckpt_interval_seconds):
    checkpoint_file = osp.join(log_dir, 'checkpoints', 'network.ckpt')  # osp.join: join path components intelligently; exactly one separator follows each non-empty part except the last, and an absolute component discards everything before it

    ckpt_timer = utils.Timer(duration_seconds=ckpt_interval_seconds)
    ckpt_timer.reset()

    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))

    while True:
        time.sleep(wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break
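`utils.Timer` and `utils.RateMeasure` are project helpers that are not shown here. A minimal sketch of the rate-measure interface assumed by this loop (reset to a counter value, then measure its per-second rate of change):

import time

class RateMeasure:
    def reset(self, value):
        self._value = value
        self._time = time.time()

    def measure(self, value):
        # Rate of change of `value` (per second) since the last reset/measure call
        now = time.time()
        rate = (value - self._value) / (now - self._time)
        self._value = value
        self._time = now
        return rate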
Example #9
def run_manager(worker_threads, sess, lr, step_counter, update_counter, log_dir, saver,
                wake_interval_seconds, ckpt_interval_seconds):
    checkpoint_file = osp.join(log_dir, 'checkpoints', 'network.ckpt')

    ckpt_timer = utils.Timer(duration_seconds=ckpt_interval_seconds)
    ckpt_timer.reset()

    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))

    while True:
        time.sleep(wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break
Example #10
    def step(self, action):
        if self.episode_done:
            raise Exception("Attempted to call step() after episode done")

        obs, reward, done, info = self.env.step(action)

        self.episode_rewards.append(reward)
        self.episode_length_steps += 1
        if done:
            reward_sum = sum(self.episode_rewards)
            print("{}Episode {} finished; reward sum {}".format(
                self.log_prefix, self.episode_n, reward_sum))
            if self.log_dir is not None:
                tflog('rl/episode_reward_sum', reward_sum)
                tflog('rl/episode_length_steps', self.episode_length_steps)
            self.episode_done = True

        return obs, reward, done, info
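For context, a hypothetical companion `reset()` for this wrapper might look like the following; nothing here is taken from the source beyond the attribute names used in `step()`:

    def reset(self):
        # Start a new episode and clear the per-episode accumulators
        self.episode_n += 1
        self.episode_rewards = []
        self.episode_length_steps = 0
        self.episode_done = False
        return self.env.reset()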
Example #11
    def train(self, prefs_train, prefs_val, val_interval):
        """
        Train all ensemble members for one epoch.
        """

        start_steps = self.n_steps
        start_time = time.time()

        for ind, batch in enumerate(
                batch_iter(prefs_train.prefs, batch_size=32, shuffle=True)):
            self.train_step(batch, prefs_train)
            self.n_steps += 1

            if self.n_steps and self.n_steps % val_interval == 0:
                self.val_step(prefs_val)
        end_time = time.time()
        end_steps = self.n_steps
        rate = (end_steps - start_steps) / (end_time - start_time)
        easy_tf_log.tflog('reward_predictor_training_steps_per_second', rate)
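`batch_iter` is not defined in this snippet; a minimal sketch of a helper with the signature used above (yield shuffled mini-batches from a list):

import numpy as np

def batch_iter(data, batch_size, shuffle=False):
    idxs = np.arange(len(data))
    if shuffle:
        np.random.shuffle(idxs)
    for start in range(0, len(data), batch_size):
        yield [data[i] for i in idxs[start:start + batch_size]]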
Example #12
    def recv_prefs(self, pref_pipe):
        n_recvd = 0
        while not self.stop_recv:
            try:
                s1, s2, pref = pref_pipe.get(block=True, timeout=1)
                logging.debug("Pref DB got segment pair plus preferences from pref pipe")
            except queue.Empty:
                logging.debug("Pref DB got no segments")
                continue
            n_recvd += 1

            val_fraction = self.val_db.maxlen / (self.val_db.maxlen +
                                                 self.train_db.maxlen)

            self.lock.acquire(blocking=True)
            if np.random.rand() < val_fraction:
                self.val_db.append(s1, s2, pref)
                easy_tf_log.tflog('val_db_len', len(self.val_db))
            else:
                self.train_db.append(s1, s2, pref)
                easy_tf_log.tflog('train_db_len', len(self.train_db))

            self.lock.release()

            easy_tf_log.tflog('n_prefs_recvd', n_recvd)
Example #13
 def update(self):
     self.update_count += 1
     if self.update_count % 50 == 0:
         self.ddpg_model.adapt_param_noise()
     # TODO CHECK THIS API
     critic_loss, actor_loss = self.ddpg_model.train()
     self.ddpg_model.update_target_net()
     self.log_queue.put({
         self.name + '_ACTOR': actor_loss,
         self.name + '_CRITIC': critic_loss
     })
     easy_tf_log.tflog(key=self.name + '_' + self.current_env_status +
                       '_ACTOR_TRAIN_LOSS',
                       value=actor_loss)
     easy_tf_log.tflog(key=self.name + '_' + self.current_env_status +
                       '_CRITIC_TRAIN_LOSS',
                       value=critic_loss)
     self.compute_grad()
     return {
         'VALUE_FUNCTION_LOSS': critic_loss,
         'CONTROLLER_LOSS': actor_loss
     }
Example #14
    def predict(self, state, *args, **kwargs):
        state = np.reshape(state, [-1])
        count = self._real_env_sample_count
        eps = 1.0 - (self.config.config_dict['EPS'] - self.config.config_dict['EPS_GREEDY_FINAL_VALUE']) * \
              (count / self.config.config_dict['EPS_ZERO_FLAG'])
        if eps < 0:
            eps = 0.0
        rand_eps = np.random.rand(1)
        if self.config.config_dict[
                'EPS_GREEDY_FLAG'] == 1 and rand_eps < eps and self.status == self.status_key[
                    'TRAIN']:
            res = self.env.action_space.sample()
        else:
            res = np.array(self.model.predict(state))

        if self.config.config_dict[
                'NOISE_FLAG'] > 0 and self.status == self.status_key['TRAIN']:
            res, noise = noise_adder(action=res, agent=self)
            for i in range(len(noise)):
                easy_tf_log.tflog(key=self.name + '_ACTION_NOISE_DIM_' +
                                  str(i),
                                  value=noise[i])
        return np.reshape(res, [-1])
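`noise_adder` is an external helper; judging from how it is called, it returns the perturbed action plus the noise itself. A sketch under that assumption (Gaussian noise with an assumed scale, clipped to the agent's action space):

import numpy as np

def noise_adder(action, agent):
    noise = np.random.normal(loc=0.0, scale=0.1, size=np.shape(action))
    noisy_action = np.clip(action + noise,
                           agent.env.action_space.low,
                           agent.env.action_space.high)
    return noisy_action, noise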
Example #15
    def test_explicit_step(self):
        """
        Log a few values explicitly setting the step number.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            os.chdir(temp_dir)

            for i in range(5):
                easy_tf_log.tflog('foo', i, step=(10 * i))
            # These ones should continue from where the previous ones left off
            for i in range(5):
                easy_tf_log.tflog('foo', i)

            event_filename = osp.join('logs', os.listdir('logs')[0])
            event_n = 0
            for event in tf.train.summary_iterator(event_filename):
                if event_n == 0:  # metadata
                    event_n += 1
                    continue
                if event_n <= 5:
                    self.assertEqual(event.step, 10 * (event_n - 1))
                if event_n > 5 and event_n <= 10:
                    self.assertEqual(event.step, 40 + (event_n - 5))
                event_n += 1
Example #16
 def recv_segments(self, seg_pipe):
     """
     Receive segments from `seg_pipe` into circular buffer `segments`.
     """
     max_wait_seconds = 0.5
     start_time = time.time()
     n_recvd = 0
     while time.time() - start_time < max_wait_seconds:
         try:
             segment = seg_pipe.get(block=True, timeout=max_wait_seconds)
         except queue.Empty:
             return
         if len(self.segments) < self.max_segs:
             self.segments.append(segment)
         else:
             self.segments[self.seg_idx] = segment
             self.seg_idx = (self.seg_idx + 1) % self.max_segs
         n_recvd += 1
     easy_tf_log.tflog('segment_idx', self.seg_idx)
     easy_tf_log.tflog('n_segments_rcvd', n_recvd)
     easy_tf_log.tflog('n_segments', len(self.segments))
Example #17
    def recv_prefs(self, pref_pipe):
        n_recvd = 0
        while not self.stop_recv:
            try:
                s1, s2, pref = pref_pipe.get(block=True, timeout=1)
            except queue.Empty:
                continue
            n_recvd += 1

            val_fraction = self.val_db.maxlen / (self.val_db.maxlen +
                                                 self.train_db.maxlen)

            self.lock.acquire(blocking=True)
            if np.random.rand() < val_fraction:
                self.val_db.append(s1, s2, pref)
                easy_tf_log.tflog('val_db_len', len(self.val_db))
            else:
                self.train_db.append(s1, s2, pref)
                easy_tf_log.tflog('train_db_len', len(self.train_db))
            self.lock.release()

            easy_tf_log.tflog('n_prefs_recvd', n_recvd)
Example #18
#%% Tensorflow / Keras

# Imports assumed by the snippets below
import tensorflow as tf
from tensorflow import keras
from datetime import datetime
import easy_tf_log as etl
import matplotlib.pyplot as plt

#For specifying device to use
with tf.device('/gpu:0'): pass

# Adding new axis to array
x_train = train[..., tf.newaxis]

# Tensorboard setup
logdir="logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
callbacks=[tensorboard_callback] # in model.fit()

# Easy tf log to tensorboard for scalars
etl.set_dir('logs2')
for k in range(20, 30): etl.tflog('baz', k)
# to start tensorboard put this into the terminal: tensorboard --logdir path/to/log/dir

# Plot Graphs
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
pass

# Class for displaying progress on the end of an epoch
class DisplayCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs=None):
    # end of an episode
    if done:
        print('At the end of episode', episode_nb, 'the total reward was :',
              reward_sum)

        # increment episode number
        episode_nb += 1

        # training
        model.fit(x=np.vstack(x_train),
                  y=np.vstack(y_train),
                  verbose=1,
                  callbacks=[tbCallBack],
                  sample_weight=discount_rewards(rewards, gamma))

        # Saving the weights used by our model
        if episode_nb % epochs_before_saving == 0:
            model.save_weights('my_model_weights' +
                               datetime.now().strftime("%Y%m%d-%H%M%S") +
                               '.h5')

        # Log the reward
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        tflog('running_reward', running_reward, custom_dir=log_dir)

        # Reinitialization
        x_train, y_train, rewards = [], [], []
        observation = env.reset()
        reward_sum = 0
        prev_input = None
Example #20
    def sample(self):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []

        for agent, current_observation in zip(self.agents, self._current_observation_n):
            action, _ = agent.policy.get_action(self._current_observation_n)
            # print(action)
            if agent.joint_policy:
                action_n.append(np.array(action)[0:agent._action_dim])
            else:
                action_n.append(np.array(action))

        try:
            action_n = np.asarray(action_n)
            # action_n = .5 * np.ones_like(action_n)
            next_observation_n, reward_n, done_n, info = self.env.step(action_n)
            self.step_act_dict[self.step] = action_n
            self.step_rew_dict[self.step] = reward_n
            print(reward_n)
        except:
            import pdb; pdb.set_trace()
        if self.global_reward:
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n[0], dtype=np.float32)
        self._total_samples += 1

        for i, agent in enumerate(self.agents):
            action = deepcopy(action_n[i])
            if agent.pool.joint:
                # opponent_action = deepcopy(action_n)
                # opponent_action = np.delete(opponent_action, i, 0)
                # opponent_action = np.array(opponent_action).flatten()
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i])
            else:
                agent.pool.add_sample(observation=self._current_observation_n[i],
                                      action=action,
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n[i])
        self._current_observation_n = next_observation_n
        for i, rew in enumerate(reward_n):
            self.episode_rewards[-1] += rew
            self.agent_rewards[-1] += rew

        if self.step % (25 * 1000) == 0:
            print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))
        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self.episode_rewards.append(0)
            # import pdb; pdb.set_trace()
            self.agent_rewards.append(0)
                # a.append(0)
            self._path_length = 0

            self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
            self._n_episodes += 1
            # self.log_diagnostics()
            # logger.dump_tabular(with_prefix=False)
            tflog('mean-return', self._mean_path_return[0])

        else:
            self._current_observation_n = next_observation_n
Example #21
        # Sample a minibatch from memory
        if t_steps % train_every == 0: 
            samples = random.sample(replay_memory, batch_size * train_every)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*samples))        

            # Compute target
            #q_values_next = target_estimator.predict(next_states_batch)
            q_values_next = target_estimator.predict(next_states_batch)
            targets = reward_batch + (1-done_batch) * discount_factor * np.amax(q_values_next, axis=1)

            # Update estimator weights
            target_f = q_estimator.predict(states_batch)

            for i, action in enumerate(action_batch):
                target_f[i,action] = targets[i]

            loss = q_estimator.train_on_batch(states_batch, target_f)

            eps_loss += loss
        
        if done:
            break
        
        obs = new_obs
        t_steps += 1

    tflog('running_reward', eps_reward, custom_dir=log_dir)
    tflog('eps_length', t, custom_dir=log_dir)
    tflog('epsilon', epsilon, custom_dir=log_dir)
    tflog('loss', eps_loss, custom_dir=log_dir)
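The `q_estimator`/`target_estimator` objects above expose Keras-style `predict` and `train_on_batch` methods. A minimal sketch of a compatible Q-network (layer sizes, optimizer, and learning rate are assumptions):

from tensorflow import keras

def build_q_estimator(state_dim, n_actions, learning_rate=1e-3):
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(state_dim,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(n_actions, activation='linear'),
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mse')
    return model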
Example #22
 def f(queue):
     easy_tf_log.tflog('foo', 0)
     queue.put(True)
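A hypothetical driver for `f()` above, checking that `tflog` also works from a child process (the queue carries the success flag back):

import multiprocessing

if __name__ == '__main__':
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=f, args=(q,))
    p.start()
    assert q.get(timeout=60)
    p.join()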
Example #23
    def sample(self):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()

        action_n = self.agents[0].policy.get_actions(self._current_observation_n)
        if self._do_nego > 0:
            action_n = self.agents[0].nego_policy.get_actions(self._current_observation_n, action_n)
            
        try:
            action_n = np.asarray(action_n).reshape(-1)
            # action_n = .5 * np.ones_like(action_n)
            next_observation_n, reward_n, done_n, info = self.env.step(action_n)
            print(reward_n)
        except:
            import pdb; pdb.set_trace()
        if self.global_reward:
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1

        for i, agent in enumerate(self.agents):
            action = deepcopy(action_n[i])
            if agent.pool.joint:
                agent.pool.add_sample(observation=self._current_observation_n.reshape(-1),
                                      action=action.reshape(-1),
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n.reshape(-1))
            else:
                agent.pool.add_sample(observation=self._current_observation_n.reshape(-1),
                                      action=action.reshape(-1),
                                      reward=reward_n[i],
                                      terminal=done_n[i],
                                      next_observation=next_observation_n.reshape(-1))
        self._current_observation_n = next_observation_n
        for i, rew in enumerate(reward_n):
            self.episode_rewards[-1] += rew
            self.agent_rewards[i][-1] += rew

        if self.step % (25 * 1000) == 0:
            print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))
        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self.episode_rewards.append(0)
            for a in self.agent_rewards:
                a.append(0)
            self._path_length = 0

            self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
            self._n_episodes += 1
            tflog('mean-return', self._mean_path_return[0])
            # self.log_diagnostics()
            # logger.dump_tabular(with_prefix=False)

        else:
            self._current_observation_n = next_observation_n
Example #24
def objective(arglist):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)

    game_name = arglist.game_name
    # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = model_names_setting
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix  = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)

    elif 'diff' in game_name:
        diff_game_name = game_name.split('-')[-1]
        agent_num = 3
        s2 = arglist.s2
        x2 = arglist.x2
        y2 = arglist.y2
        con = arglist.con
        env = DifferentialGame(diff_game_name, agent_num, x2, y2, s2, con)

    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)

    print(suffix)

    # logger.add_tabular_output('./log/{}.csv'.format(suffix))
    # snapshot_dir = './snapshot/{}'.format(suffix)
    # policy_dir = './policy/{}'.format(suffix)
    # os.makedirs(snapshot_dir, exist_ok=True)
    # os.makedirs(policy_dir, exist_ok=True)
    # logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size

    sampler = MASampler(agent_num=agent_num, joint=True, global_reward=arglist.global_reward, max_path_length=25, min_pool_size=100, batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    _alpha = arglist.alpha
    lr = arglist.lr
    n_pars = arglist.n_pars
    result = 0.

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,  lr=lr, n_pars=n_pars, k=k, g=g, mu=mu, game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,  lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'ROMMEO':
                agent = rom_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M, u_range, base_kwargs,lr=lr, game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()
        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        initial_exploration_done = False
        # noise = .1
        noise = .5


        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass
        # alpha = .5
        for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            # import pdb; pdb.set_trace()
            # alpha = _alpha + np.exp(-0.1 * max(steps-10, 0)) * 500.
            if steps < base_kwargs['n_epochs']//3:
                # alpha = _alpha
                alpha = _alpha + np.exp(-0.1 * max(steps-10, 0)) * 500.
            elif  steps < base_kwargs['n_epochs']//2:
                alpha = _alpha/10
            else:
                alpha = .3
            tflog('alpha', alpha)
            print('alpha', alpha)
            # if steps > 100 and steps<150:
            #     alpha = .1 - 0.099 * steps/(150)
            # elif steps >= 150:
            #     alpha = 1e-3
            print('alpha', alpha)
            # logger.push_prefix('Epoch #%d | ' % steps)
            if steps % (25*1000) == 0:
                print(suffix)
            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # if steps >= 1000:
                    if steps >= 10:
                        initial_exploration_done = True
                sampler.sample()
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                print('Sample Done')
                if steps == 10000:
                    noise = 0.1

                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = 10.
                # if steps == 2000:
                if steps > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if steps > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps % arglist.training_interval != 0:
                    continue
                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    receent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            receent_indices = list(range(agent.pool._top-batch_size, agent.pool._top))

                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(receent_indices))

                    # print(len(batch_n))
                    target_next_actions_n = []
                    # try:
                    all_obs = np.array(np.concatenate([batch['observations'] for batch in batch_n], axis=-1))
                    all_next_obs = np.array(np.concatenate([batch['next_observations'] for batch in batch_n], axis=-1))
                    # print(all_obs[0])
                    for batch in batch_n:
                        # print('making all obs')
                        batch['all_observations'] = deepcopy(all_obs)
                        batch['all_next_observations'] = deepcopy(all_next_obs)
                    opponent_current_actions_n = []
                    for agent, batch in zip(agents, batch_n):
                        target_next_actions_n.append(agent.target_policy.get_actions(batch['next_observations']))
                        opponent_current_actions_n.append(agent.policy.get_actions(batch['observations']))

                    for i, agent in enumerate(agents):
                        batch_n[i]['opponent_current_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_current_actions_n), i, 0), (-1, agent._opponent_action_dim))
                    # except:
                    #     pass


                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])


                    current_actions = [agents[i].policy.get_actions(batch_n[i]['next_observations'])[0][0] for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent.policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    # if len(all_actions_k) > 0:
                    #     with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                    #         f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    # with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                    #     f.write(','.join(list(map(str, current_actions)))+'\n')
                    # print('============')
                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(np.delete(deepcopy(opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(np.delete(deepcopy(recent_opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(np.delete(deepcopy(target_next_actions_n), i, 0), (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO):
                            agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i])
                gt.stamp('train')
            result = sampler.terminate()
    clear_session()
    return result
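The annealing logic inside the epoch loop above reads more easily as a standalone function; this restates exactly the schedule used there:

import numpy as np

def annealed_alpha(steps, n_epochs, base_alpha):
    if steps < n_epochs // 3:
        return base_alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
    elif steps < n_epochs // 2:
        return base_alpha / 10
    else:
        return .3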
Example #25
def main():
    args, lr_args, log_dir, preprocess_wrapper, ckpt_timer = parse_args()
    easy_tf_log.set_dir(log_dir)

    utils.set_random_seeds(args.seed)
    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops,
                     args.n_workers, args.seed, args.debug, log_dir)

    step_counter = utils.GraphCounter(sess)
    update_counter = utils.GraphCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    networks = make_networks(n_workers=args.n_workers,
                             n_actions=envs[0].action_space.n,
                             weight_inits=args.weight_inits,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             debug=args.debug)

    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file written uses relative paths,
    # which seems to be needed in order to avoid confusing saver.restore()
    # when restoring from FloydHub runs.
    global_vars = tf.trainable_variables('global')
    saver = tf.train.Saver(global_vars,
                           max_to_keep=1,
                           save_relative_paths=True)
    checkpoint_dir = osp.join(log_dir, 'checkpoints')
    os.makedirs(checkpoint_dir)
    checkpoint_file = osp.join(checkpoint_dir, 'network.ckpt')

    if args.load_ckpt:
        print("Restoring from checkpoint '%s'..." % args.load_ckpt,
              end='',
              flush=True)
        saver.restore(sess, args.load_ckpt)
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    workers = make_workers(sess=sess,
                           envs=envs,
                           networks=networks,
                           n_workers=args.n_workers,
                           log_dir=log_dir)

    worker_threads = start_workers(n_steps=args.n_steps,
                                   steps_per_update=args.steps_per_update,
                                   step_counter=step_counter,
                                   update_counter=update_counter,
                                   workers=workers)
    ckpt_timer.reset()
    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))
    while True:
        time.sleep(args.wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break

    for env in envs:
        env.close()
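`utils.GraphCounter` is a project helper that is not shown. A minimal sketch of the interface assumed above: a TF variable exposed as `.value` for graph code, readable from Python via `int()`, with an `increment` method the workers would call:

import tensorflow as tf

class GraphCounter:
    def __init__(self, sess):
        self.sess = sess
        self.value = tf.Variable(0, trainable=False, dtype=tf.int64)
        self._increment_by = tf.placeholder(tf.int64, [])
        self._increment_op = self.value.assign_add(self._increment_by)
        sess.run(self.value.initializer)

    def increment(self, n=1):
        self.sess.run(self._increment_op, feed_dict={self._increment_by: n})

    def __int__(self):
        return int(self.sess.run(self.value))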
Example #26
#!/usr/bin/env python
import time

import easy_tf_log

# Logging using the global logger

# Will log to automatically-created 'logs' directory
for i in range(10):
    easy_tf_log.tflog('foo', i)
for j in range(10, 20):
    easy_tf_log.tflog('bar', j)

easy_tf_log.set_dir('logs2')

for k in range(20, 30):
    easy_tf_log.tflog('baz', k)
for l in range(5):
    easy_tf_log.tflog('qux', l, step=(10 * l))

# Logging using a Logger object

logger = easy_tf_log.Logger(log_dir='logs3')

for i in range(10):
    logger.log_key_value('quux', i)

logger.log_list_stats('quuz', [1, 2, 3, 4, 5])

logger.measure_rate('corge', 10)
time.sleep(1)
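To verify what the script above wrote, the event files can be read back with the same summary iterator the tests on this page use; a short sketch for the 'logs3' directory:

import os
import os.path as osp
import tensorflow as tf

event_file = osp.join('logs3', os.listdir('logs3')[0])
for event in tf.train.summary_iterator(event_file):
    for value in event.summary.value:
        print(event.step, value.tag, value.simple_value)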
Example #27
    def run(self):
        nenvs = len(self.env.remotes)
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = \
            [], [], [], [], []
        mb_states = self.states

        # Run for nsteps steps in the environment
        for _ in range(self.nsteps):
            actions, values, states = self.model.step(self.obs, self.states,
                                                      self.dones)
            mb_obs.append(np.copy(self.obs))
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(self.dones)
            # len({obs, rewards, dones}) == nenvs
            obs, rewards, dones, _ = self.env.step(actions)
            self.states = states
            self.dones = dones
            for n, done in enumerate(dones):
                if done:
                    self.obs[n] = self.obs[n] * 0
            # SubprocVecEnv automatically resets when done
            self.update_obs(obs)
            mb_rewards.append(rewards)
        mb_dones.append(self.dones)
        # batch of steps to batch of rollouts
        # i.e. from nsteps, nenvs to nenvs, nsteps
        mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        # The first entry was just the init state of 'dones' (all False),
        # before we'd actually run any steps, so drop it.
        mb_dones = mb_dones[:, 1:]

        # Log original rewards
        for env_n, (rs, dones) in enumerate(zip(mb_rewards, mb_dones)):
            assert_equal(rs.shape, (self.nsteps, ))
            assert_equal(dones.shape, (self.nsteps, ))
            for step_n in range(self.nsteps):
                self.orig_reward[env_n] += rs[step_n]
                if dones[step_n]:
                    easy_tf_log.tflog("orig_reward_{}".format(env_n),
                                      self.orig_reward[env_n])
                    self.orig_reward[env_n] = 0

        if self.env.env_id == 'MovingDotNoFrameskip-v0':
            # For MovingDot, reward depends on both current observation and
            # current action, so encode action in the observations.
            # (We only need to set this in the most recent frame,
            # because that's all that the reward predictor for MovingDot
            # uses.)
            mb_obs[:, :, 0, 0, -1] = mb_actions[:, :]

        # Generate segments
        # (For MovingDot, this has to happen _after_ we've encoded the action
        # in the observations.)
        if self.gen_segments:
            self.update_segment_buffer(mb_obs, mb_rewards, mb_dones)

        # Replace rewards with those from reward predictor
        # (Note that this also needs to be done _after_ we've encoded the
        # action.)
        logging.debug("Original rewards:\n%s", mb_rewards)
        if self.reward_predictor:
            assert_equal(mb_obs.shape, (nenvs, self.nsteps, 84, 84, 4))
            mb_obs_allenvs = mb_obs.reshape(nenvs * self.nsteps, 84, 84, 4)

            rewards_allenvs = self.reward_predictor.reward(mb_obs_allenvs)
            assert_equal(rewards_allenvs.shape, (nenvs * self.nsteps, ))
            mb_rewards = rewards_allenvs.reshape(nenvs, self.nsteps)
            assert_equal(mb_rewards.shape, (nenvs, self.nsteps))

            logging.debug("Predicted rewards:\n%s", mb_rewards)

        # Save frames for episode rendering
        if self.episode_vid_queue is not None:
            self.update_episode_frame_buffer(mb_obs, mb_dones)

        # Discount rewards
        mb_obs = mb_obs.reshape(self.batch_ob_shape)
        last_values = self.model.value(self.obs, self.states,
                                       self.dones).tolist()
        # discount/bootstrap off value fn
        for n, (rewards, dones,
                value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            dones = dones.tolist()
            if dones[-1] == 0:
                # Make sure that the first iteration of the loop inside
                # discount_with_dones picks up 'value' as the initial
                # value of r
                rewards = discount_with_dones(rewards + [value], dones + [0],
                                              self.gamma)[:-1]
            else:
                rewards = discount_with_dones(rewards, dones, self.gamma)
            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()
        return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
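`discount_with_dones` comes from the A2C utilities in OpenAI baselines; a sketch of that helper (discounted returns that are cut off at episode boundaries):

def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]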
Example #28
    def run(self):
        """Performs training.

    Trains a model using episodic training.
    Every so often, runs some evaluations on validation data.
    """

        train_data, valid_data = self.train_data, self.valid_data
        input_dim, output_dim = self.input_dim, self.output_dim
        rep_dim, episode_length = self.rep_dim, self.episode_length
        episode_width, memory_size = self.episode_width, self.memory_size
        batch_size = self.batch_size
        # create data generator
        birds_data = FewshotBirdsDataGenerator(self.batch_size,
                                               self.episode_length,
                                               self.episode_width,
                                               image_dim=input_dim)

        train_size = len(train_data)
        valid_size = len(valid_data)
        logging.info('train_size (number of labels) %d', train_size)
        logging.info('valid_size (number of labels) %d', valid_size)
        logging.info('input_dim %d', input_dim)
        logging.info('output_dim %d', output_dim)
        logging.info('rep_dim %d', rep_dim)
        logging.info('episode_length %d', episode_length)
        logging.info('episode_width %d', episode_width)
        logging.info('memory_size %d', memory_size)
        logging.info('batch_size %d', batch_size)

        assert all(
            len(v) >= float(episode_length) / episode_width
            for v in train_data.values())
        assert all(
            len(v) >= float(episode_length) / episode_width
            for v in valid_data.values())

        output_dim = episode_width
        self.model = self.get_model()
        self.model.setup()

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(max_to_keep=10)
        # for inception
        #ckpt = tf.train.get_checkpoint_state(INCEPTION_CKPT)
        print('use resnet:', FLAGS.use_resnet)
        ckpt = RESNET_CKPT if FLAGS.use_resnet else INCEPTION_CKPT
        scope = 'core/resnet_v2_50' if FLAGS.use_resnet else 'core/InceptionV3'
        incpt_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope)
        incpt_vars = [v for v in incpt_vars if 'Adam' not in v.name]
        incpt_vars = [v for v in incpt_vars if 'BatchNorm' not in v.name]
        incpt_vars = {v.name.split('core/')[1][0:-2]: v for v in incpt_vars}
        assign_fn = tf.contrib.framework.assign_from_checkpoint_fn(
            ckpt,
            incpt_vars,
            ignore_missing_vars=True,
            reshape_variables=False)
        assign_fn(sess)

        ckpt = None
        if FLAGS.save_dir:
            ckpt = tf.train.get_checkpoint_state(FLAGS.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            logging.info('restoring from %s', ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        logging.info('starting now')
        losses = []
        random.seed(FLAGS.seed)
        np.random.seed(FLAGS.seed)
        # used for sampling data
        use_parts = FLAGS.num_parts > 1
        for i in xrange(FLAGS.num_episodes):
            # TODO: add parts
            x, p1, p2, y = birds_data.sample_episode_batch(
                birds_data.train_data, use_parts=use_parts)
            #outputs = self.model.episode_step_with_parts(sess, x, p1, p2, y, True, clear_memory=True)
            # TODO: this doesn't make sense
            if FLAGS.num_parts > 1:
                parts = [
                    np.concatenate([pp1, pp2], axis=0)
                    for pp1, pp2 in zip(p1, p2)
                ]
            else:
                parts = x
            #outputs = self.model.episode_step_with_parts(sess, parts, y, True, clear_memory=True)
            outputs = self.model.episode_step_n_parts(sess,
                                                      parts,
                                                      y,
                                                      True,
                                                      clear_memory=True)
            #x, y = self.sample_episode_batch(
            #    train_data, episode_length, episode_width, batch_size)
            #outputs = self.model.episode_step(sess, x, y, clear_memory=True)
            # plot a histogram of the different labels
            loss = outputs
            losses.append(loss)

            if i % FLAGS.validation_frequency == 0:
                logging.info('episode batch %d, avg train loss %f', i,
                             np.mean(losses))
                tflog('loss', np.mean(losses))
                losses = []

                # validation
                correct = []
                num_shots = episode_length // episode_width
                correct_by_shot = dict((k, []) for k in xrange(num_shots))
                for _ in xrange(FLAGS.validation_length):
                    # TODO: add parts
                    #x, y = self.sample_episode_batch(
                    #    valid_data, episode_length, episode_width, 1)
                    x, p1, p2, y = birds_data.sample_episode_batch(
                        birds_data.val_data, use_parts=use_parts)
                    if FLAGS.num_parts > 1:
                        parts = [
                            np.concatenate([pp1, pp2], axis=0)
                            for pp1, pp2 in zip(p1, p2)
                        ]
                    else:
                        parts = x
                    #outputs = self.model.episode_predict_with_parts(
                    #    sess, x, p1, p2, y, False, clear_memory=True)
                    outputs = self.model.episode_predict_n_parts(
                        sess, parts, y, False, clear_memory=True)
                    y_preds = outputs
                    correct.append(self.compute_correct(np.array(y), y_preds))

                    # compute per-shot accuracies
                    seen_counts = [0] * episode_width
                    # loop over episode steps
                    for yy, yy_preds in zip(y, y_preds):
                        # loop over batch examples
                        yyy, yyy_preds = int(yy[0]), int(yy_preds[0])
                        count = seen_counts[yyy % episode_width]
                        if count in correct_by_shot:
                            correct_by_shot[count].append(
                                self.individual_compute_correct(
                                    yyy, yyy_preds))
                        seen_counts[yyy % episode_width] = count + 1

                tflog('val_accuracy', np.mean(correct))
                for k_shot, correct in correct_by_shot.items():
                    tflog(str(k_shot) + '_shot_accuracy', np.mean(correct))

                logging.info('validation overall accuracy %f',
                             np.mean(correct))
                logging.info(
                    '%d-shot: %.3f, ' * num_shots,
                    *sum([[k, np.mean(correct_by_shot[k])]
                          for k in xrange(num_shots)], []))

                if saver and FLAGS.save_dir:
                    saved_file = saver.save(sess,
                                            os.path.join(
                                                FLAGS.save_dir, 'model.ckpt'),
                                            global_step=self.model.global_step)
                    logging.info('saved model to %s', saved_file)
Example #29
    def train(self):

        self.build_model()
        self.__model.summary()
        self.__model.compile(loss='binary_crossentropy',
                             optimizer='adam',
                             metrics=['accuracy'])

        UP_ACTION = 2
        DOWN_ACTION = 3

        # hyperparameters
        gamma = .99

        # initializing variables
        x_train, y_train, rewards = [], [], []
        reward_sum = 0
        episode_nb = 0

        # initialize variables
        resume = True
        running_reward = None
        epochs_before_saving = 10
        log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

        # load pre-trained model if exist
        if (resume and os.path.isfile(LOG_DIR + 'my_model_weights.h5')):
            print("loading previous weights")
            self.__model.load_weights(LOG_DIR + 'my_model_weights.h5')

        # add a callback tensorboard object to visualize learning
        tbCallBack = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0, \
                                           write_graph=True, write_images=True)

        # initializing environment
        env = gym.make('Pong-v0')
        observation = env.reset()
        prev_input = None

        # main loop
        while (True):

            # preprocess the observation, set input as difference between images
            cur_input = prepro(observation)
            x = cur_input - prev_input if prev_input is not None else np.zeros(
                80 * 80)
            prev_input = cur_input

            # forward the policy network and sample action according to the proba distribution
            proba = self.__model.predict(np.expand_dims(x, axis=1).T)
            action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
            y = 1 if action == 2 else 0  # 0 and 1 are our labels

            # log the input and label to train later
            x_train.append(x)
            y_train.append(y)

            # do one step in our environment
            observation, reward, done, info = env.step(action)
            rewards.append(reward)
            reward_sum += reward

            # end of an episode
            if done:
                print('At the end of episode', episode_nb,
                      'the total reward was :', reward_sum)

                # increment episode number
                episode_nb += 1
                # training
                self.__model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1, callbacks=[tbCallBack], \
                                 sample_weight=discount_rewards(rewards, gamma))

                # Saving the weights used by our model
                if episode_nb % epochs_before_saving == 0:
                    self.__model.save_weights(
                        'my_model_weights' +
                        datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')
                    # Log the reward
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                tflog('running_reward', running_reward, custom_dir=log_dir)

                # Reinitialization
                x_train, y_train, rewards = [], [], []
                observation = env.reset()
                reward_sum = 0
                prev_input = None
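`prepro` is not defined in this snippet; the call pattern (an 80*80 input vector) matches the standard Karpathy-style Pong preprocessing, sketched here as an assumption:

import numpy as np

def prepro(I):
    # Crop, downsample by 2, erase background, binarize, and flatten to 80*80
    I = I[35:195]
    I = I[::2, ::2, 0]
    I[I == 144] = 0
    I[I == 109] = 0
    I[I != 0] = 1
    return I.astype(np.float32).ravel()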
Example #30
import easy_tf_log

for i in range(10):
    easy_tf_log.tflog('foo', i)
for j in range(10, 20):
    easy_tf_log.tflog('bar', j)

easy_tf_log.set_dir('logs2')

for k in range(20, 30):
    easy_tf_log.tflog('baz', k)