Example #1
 def train(self, sess, saver, summary_writer, progress_fd, model_path, batch_size=64, step=10, start_episode=0,
           train_episodes=1000, save_episodes=100, epsilon=0.3, apply_her=False, n_goals=10):
     total_rewards = []
     sess.run([self.init_critic])
     for i_episode in tqdm(range(train_episodes), ncols=100):
         states, actions, returns, nexts, are_non_terminal, total_reward = self.collect_trajectory()
         feed_dict = {self.states: states, self.actions: actions, self.rewards: returns,
                      self.nexts: nexts, self.are_non_terminal: are_non_terminal, self.training: True}
         total_rewards.append(total_reward)
         # Take several critic gradient steps on the collected trajectory,
         # then run the critic update op followed by a single actor step.
         for s in range(step):
             sess.run([self.critic_step], feed_dict=feed_dict)
         sess.run([self.update_critic])
         sess.run([self.actor_step], feed_dict=feed_dict)
         
         # summary_writer.add_summary(summary, global_step=self.global_step.eval())
         critic_loss = self.critic_loss.eval(feed_dict=feed_dict).mean()
         actor_loss = self.actor_loss.eval(feed_dict=feed_dict).mean()
         append_summary(progress_fd, "{},{:.2f},{:.4f},{:.4f}".format(
             start_episode + i_episode, total_reward, actor_loss, critic_loss))
         if (i_episode + 1) % save_episodes == 0:
             saver.save(sess, model_path)
     return total_rewards
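All of the examples above log progress through an `append_summary` helper that is not included in these snippets. A minimal sketch of what it might look like, assuming it simply writes one comma-separated line per call and flushes so the progress file can be tailed during training:

 def append_summary(progress_fd, line):
     # Hypothetical helper: append one CSV row of training progress and flush
     # immediately so the file stays readable while training runs.
     progress_fd.write(line + '\n')
     progress_fd.flush()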
Example #2
 def train(self,
           sess,
           saver,
           summary_writer,
           progress_fd,
           model_path,
           batch_size=64,
           step=10,
           start_episode=0,
           train_episodes=1000,
           save_episodes=100,
           epsilon=0.3,
           apply_her=False,
           n_goals=10):
     total_rewards = []
     sess.run([self.init_actor, self.init_critic])
     for i_episode in tqdm(range(train_episodes), ncols=100):
         total_reward = self.collect_trajectory(epsilon, apply_her, n_goals)
         append_summary(progress_fd, '{},{:.2f}'.format(
             start_episode + i_episode, total_reward))
         total_rewards.append(total_reward)
         states, actions, rewards, nexts, are_non_terminal = self.replay_memory.sample_batch(
             step * batch_size)
         # Per mini-batch: one critic step, one actor step, then the
         # update_actor / update_critic ops.
         for t in range(step):
             batch = slice(t * batch_size, (t + 1) * batch_size)
             sess.run([self.critic_step],
                      feed_dict={
                          self.states: states[batch],
                          self.actions: actions[batch],
                          self.rewards: rewards[batch],
                          self.nexts: nexts[batch],
                          self.are_non_terminal: are_non_terminal[batch],
                          self.training: True
                      })
             sess.run([self.actor_step],
                      feed_dict={
                          self.states: states[batch],
                          self.training: True
                      })
             sess.run([self.update_actor, self.update_critic])
         # summary_writer.add_summary(summary, global_step=self.global_step.eval())
         if (i_episode + 1) % save_episodes == 0:
             saver.save(sess, model_path)
     return total_rewards
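Example #2 draws `step * batch_size` transitions at once from `self.replay_memory` and slices them into mini-batches. The buffer implementation is not shown; a minimal sketch of a uniform replay buffer with the same `sample_batch` contract (five parallel arrays), written here as an assumption rather than the original class:

 import numpy as np

 class ReplayMemory:
     # Hypothetical uniform replay buffer; only the interface used above is sketched.
     def __init__(self, capacity):
         self.capacity = capacity
         self.buffer = []

     def append(self, state, action, reward, next_state, non_terminal):
         if len(self.buffer) >= self.capacity:
             self.buffer.pop(0)
         self.buffer.append((state, action, reward, next_state, non_terminal))

     def sample_batch(self, n):
         # Sample n transitions uniformly with replacement and return them as
         # parallel arrays: states, actions, rewards, nexts, are_non_terminal.
         idx = np.random.randint(len(self.buffer), size=n)
         cols = list(zip(*[self.buffer[i] for i in idx]))
         return [np.asarray(c) for c in cols]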
Example #3
 def train(self,
           sess,
           saver,
           summary_writer,
           progress_fd,
           model_path,
           batch_size=64,
           step=10,
           start_episode=0,
           train_episodes=1000,
           save_episodes=100,
           epsilon=0.3,
           max_episode_len=25):
     total_rewards = []
     sess.run([agent.init_qnetwork for agent in self.agents])
     for i_episode in tqdm(range(train_episodes), ncols=100):
         cur_epsilon = self.linear_decay_epsilon(i_episode,
                                                 train_episodes * 0.5,
                                                 epsilon)
         total_reward = self.collect_trajectory(cur_epsilon,
                                                max_episode_len)
         append_summary(progress_fd, '{},{:.2f}'.format(
             start_episode + i_episode, total_reward))
         total_rewards.append(total_reward)
         for agent in self.agents:
             states, actions, rewards, nexts, are_non_terminal = agent.replay_memory.sample_batch(
                 step * batch_size)
             for t in range(step):
                 batch = slice(t * batch_size, (t + 1) * batch_size)
                 sess.run([agent.step],
                          feed_dict={
                              agent.states: states[batch],
                              agent.actions: actions[batch],
                              agent.rewards: rewards[batch],
                              agent.nexts: nexts[batch],
                              agent.are_non_terminal: are_non_terminal[batch],
                              agent.training: True
                          })
             sess.run([agent.update_qnetwork])
         # summary_writer.add_summary(summary, global_step=self.global_step.eval())
         if (i_episode + 1) % save_episodes == 0:
             saver.save(sess, model_path)
     return total_rewards
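Example #3 anneals exploration with `self.linear_decay_epsilon(i_episode, train_episodes * 0.5, epsilon)`, whose body is not shown. A plausible sketch, assuming epsilon is interpolated linearly from its initial value down to a small floor (the `min_epsilon` default here is an assumption) over the given number of episodes and held constant afterwards:

 def linear_decay_epsilon(self, i_episode, decay_episodes, epsilon, min_epsilon=0.01):
     # Hypothetical schedule: move linearly from epsilon to min_epsilon over
     # decay_episodes episodes, then stay at min_epsilon.
     frac = min(float(i_episode) / float(decay_episodes), 1.0)
     return epsilon + frac * (min_epsilon - epsilon)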
Example #4
 def train(self,
           sess,
           saver,
           summary_writer,
           progress_fd,
           model_path,
           batch_size=64,
           step=10,
           start_episode=0,
           train_episodes=1000,
           save_episodes=100,
           epsilon=0.3,
           apply_her=False,
           n_goals=10,
           train_steps=-1):
     total_rewards = []
     n_step = 0
     i_episode = 0
     for i_episode in tqdm(range(train_episodes), ncols=100):
         states_mem, actions_mem, action_loglikelihood_mem, returns_mem, advantage_mem, epi_avg_reward = self.collect_transitions(
             sess)
         #self.global_step.assign_add(1)
         for s in range(step):
             perm = np.random.permutation(len(states_mem))
             for sample_id in range(0, len(perm), batch_size):
                 idx = perm[sample_id:sample_id + batch_size]
                 feed_dict = {
                     self.states: states_mem[idx],
                     self.actions: actions_mem[idx],
                     self.action_loglikelihood: action_loglikelihood_mem[idx],
                     self.returns: returns_mem[idx],
                     self.advantages: advantage_mem[idx],
                     self.training: True
                 }
                 sess.run([self.actor_step, self.critic_step],
                          feed_dict=feed_dict)
         n_step += len(states_mem)
         if epi_avg_reward is not None:
             append_summary(progress_fd, "{},{:.2f},{}".format(
                 start_episode + i_episode, epi_avg_reward, n_step))
             total_rewards.append(epi_avg_reward)
         if (i_episode + 1) % save_episodes == 0:
             saver.save(sess, model_path)
     return total_rewards
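Examples #4 and #5 share the same shuffled mini-batch pattern: permute the sample indices once per pass over the collected transitions, then walk through them in `batch_size` chunks. A standalone sketch of that iteration pattern (not part of the original code) for reference:

 import numpy as np

 def iterate_minibatches(n_samples, batch_size):
     # Yield index arrays covering a random permutation of range(n_samples)
     # in batch_size chunks; the last chunk may be smaller.
     perm = np.random.permutation(n_samples)
     for start in range(0, n_samples, batch_size):
         yield perm[start:start + batch_size]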
Example #5
 def train(self,
           sess,
           saver,
           summary_writer,
           progress_fd,
           model_path,
           filter_path,
           batch_size=64,
           step=10,
           start_episode=0,
           train_episodes=1000,
           save_episodes=100,
           max_episode_len=25,
           **kargs):
     total_rewards = []
     n_step = 0
     i_episode = 0
     for i_episode in tqdm(range(train_episodes), ncols=100):
         states_mem, actions_mem, action_loglikelihood_mem, returns_mem, advantage_mem, epi_avg_reward = self.collect_transitions(
             sess, max_episode_len)
         for s in range(step):
             perm = np.random.permutation(len(states_mem))
             for sample_id in range(0, len(perm), batch_size):
                 idx = perm[sample_id:sample_id + batch_size]
                 feed_dict = {
                     self.states: states_mem[idx],
                     self.actions: actions_mem[idx],
                     self.action_loglikelihood: action_loglikelihood_mem[idx],
                     self.returns: returns_mem[idx],
                     self.advantages: advantage_mem[idx],
                     self.training: True
                 }
                 sess.run(self.actor_step_list + self.critic_step_list,
                          feed_dict=feed_dict)
         n_step += len(states_mem)
         append_summary(progress_fd, "{},{:.2f},{}".format(
             start_episode + i_episode, epi_avg_reward, n_step))
         total_rewards.append(epi_avg_reward)
         if (i_episode + 1) % save_episodes == 0:
             saver.save(sess, model_path)
             self.save_state_filter(filter_path)
     return total_rewards
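Example #5 and the driver in Example #7 call `save_state_filter` / `load_state_filter`, which are not included in these snippets. Such a filter is commonly a running observation normaliser; the sketch below covers only the persistence side and assumes (hypothetically) that the agent keeps its statistics in `state_filter_mean` and `state_filter_std`:

 import pickle

 def save_state_filter(self, filter_path):
     # Hypothetical: persist the observation filter's running statistics.
     with open(filter_path, 'wb') as f:
         pickle.dump({'mean': self.state_filter_mean, 'std': self.state_filter_std}, f)

 def load_state_filter(self, filter_path):
     # Hypothetical counterpart: restore the statistics saved above.
     with open(filter_path, 'rb') as f:
         data = pickle.load(f)
     self.state_filter_mean = data['mean']
     self.state_filter_std = data['std']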
Example #6
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(log_path,
                                           graph=tf.get_default_graph())

    with tf.Session(config=config) as sess:
        if args.eval or args.restore:
            saver.restore(sess, model_path)
            if not args.eval:
                progress_fd = open(progress_file, 'r')
                start_episode = len(progress_fd.readlines()) - 1
                progress_fd.close()
                progress_fd = open(progress_file, 'a')
        else:
            progress_fd = open(progress_file, 'w')
            append_summary(progress_fd, 'episode, avg-reward, n_step')
            progress_fd.flush()
            start_episode = 0
            tf.global_variables_initializer().run()
        if not args.eval:
            total_rewards = agent.train(sess,
                                        saver,
                                        summary_writer,
                                        progress_fd,
                                        model_path,
                                        batch_size=args.batch_size,
                                        step=args.step,
                                        train_episodes=args.train_episodes,
                                        start_episode=start_episode,
                                        save_episodes=args.save_episodes,
                                        epsilon=args.epsilon,
Example #7
    config = tf.ConfigProto(gpu_options=gpu_ops, allow_soft_placement=True)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(log_path, graph=tf.get_default_graph())

    with tf.Session(config=config) as sess:
        if args.eval or args.restore:
            saver.restore(sess, model_path)
            agent.load_state_filter(filter_path)
            if not args.eval:
                progress_fd = open(progress_file, 'r')
                start_episode = len(progress_fd.readlines()) - 1
                progress_fd.close()
                progress_fd = open(progress_file, 'a')
        else:
            progress_fd = open(progress_file, 'w')
            append_summary(progress_fd, 'episode, first-agent-reward')
            progress_fd.flush()
            start_episode = 0
            tf.global_variables_initializer().run()
        if not args.eval:
            total_rewards = agent.train(
                sess, saver, summary_writer, progress_fd, model_path, filter_path,
                batch_size=args.batch_size, step=args.step,
                train_episodes=args.train_episodes, start_episode=start_episode,
                save_episodes=args.save_episodes,
                max_episode_len=args.max_episode_len)
            progress_fd.close()
            plot(os.path.join(args.plot_dir, args.model + '_' + args.env), np.array(total_rewards) + 1e-10)
            summary_writer.close()
        else:
            if args.benchmark:
                infos = []
                n_epi = 400