Example no. 1
#     for i in range(200):
#         observation, reward, done, info = env.step(tools.active_one_muscle("iliopsoas", "r", 1))
#
#         if i == 20:
#             state_desc = env.get_state_desc()
#             print(type(state_desc["body_pos"]["toes_r"][0:2]))
#             print(state_desc["body_pos"]["talus_l"][0:2])
#             print(state_desc["body_pos"]["talus_r"][0:2])
#             print(state_desc["misc"]["mass_center_pos"])
#             print(state_desc["body_pos_rot"])
#             input("Press Enter to continue...")
#             print(reward)

from osim.env import L2M2019Env  # osim-rl "Learn to Move 2019" environment
from tools import Tools  # assumed project-local helper module (not shown in this excerpt)

if __name__ == '__main__':
    #init()
    # with open("C:\\Users\\YunfeiZHAO\\Desktop\\1.json", 'r') as f:
    #     content = f.read()
    #     print(content)
    #     parsed = json.loads(content)
    #     jp = json.dumps(parsed, indent=4, sort_keys=True)
    #     print(jp)

    env = L2M2019Env(visualize=True)
    observation = env.reset(project=True,
                            seed=None,
                            init_pose=None,
                            obs_as_dict=True)
    tools = Tools()
    state_desc = env.get_state_desc()
    print(tools.get_reward(state_desc))
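
The `Tools` helper used by both examples is not part of this excerpt. The class below is only a rough sketch of its assumed interface: `get_reward` is read as the cosine (in [-1, 1]) between the model's centre-of-mass velocity and a target direction, and `active_one_muscle` as a builder for a 22-muscle activation vector for `L2M2019Env`. The direction map, muscle indices and exact reward definition are illustrative, and note that Example 1 calls `get_reward` with a single argument, so the real signature may differ.

import numpy as np


class Tools:
    """Hypothetical sketch of the project's Tools helper (interface assumed)."""

    # assumed mapping from a direction keyword to a unit vector in the ground plane
    DIRECTIONS = {"front": np.array([1.0, 0.0]), "back": np.array([-1.0, 0.0]),
                  "left": np.array([0.0, 1.0]), "right": np.array([0.0, -1.0])}

    # illustrative indices into the 22-dimensional L2M2019Env action vector
    MUSCLE_INDEX = {("iliopsoas", "r"): 14, ("iliopsoas", "l"): 3}

    def active_one_muscle(self, name, side, activation):
        # build an action vector that activates a single muscle at the given level
        action = np.zeros(22)
        action[self.MUSCLE_INDEX[(name, side)]] = activation
        return action

    def get_reward(self, direction, state_desc):
        # cosine between the mass-centre velocity (ground plane) and the target direction
        vx, _, vz = state_desc["misc"]["mass_center_vel"]
        velocity = np.array([vx, vz])
        speed = np.linalg.norm(velocity)
        if speed == 0.0:
            return 0.0
        return float(np.dot(velocity, self.DIRECTIONS[direction]) / speed)

Under this reading the reward is bounded in [-1, 1], which is consistent with the `np.arccos` call that appears in the Trainer of Example 2.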
Example no. 2
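Example 2 is an excerpt from a larger DDPG training script: the `Actor_Critic` model wrapper, its prioritized replay buffer, the `Tools` reward helper and the argparse namespace `args` all live in modules that are not shown here. The `OUNoise` exploration process it instantiates is also external; the class below is a minimal sketch, assuming the standard Ornstein-Uhlenbeck formulation (the `theta`, `sigma` and `dt` defaults are illustrative, not the project's values).

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu = np.asarray(mu, dtype=float)
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = np.copy(self.mu)

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I)
        x = self.state
        dx = (self.theta * (self.mu - x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(*x.shape))
        self.state = x + dx
        return self.state

OU noise is the usual choice for DDPG on continuous-control tasks because consecutive samples are correlated, which produces smoother exploratory muscle activations than independent Gaussian noise.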
import itertools
import sys
import time

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.Session, tf.placeholder, ...)

# Actor_Critic, Tools and the argparse namespace `args` come from project
# modules that are not part of this excerpt.


class Trainer:
    def __init__(self, env, args):
        self.direction = args.direction
        self.env = env
        self.num_episodes = args.episodes
        self.episode_start = 0
        self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
        self.noise_decay = args.noise_decay
        self.count_exp_replay = 0
        self.train_iteration = 0
        self.tau = args.TAU
        self.tools = Tools()

    def tryLoadWeights(self):
        print("Loading weights\n")
        try:
            with open("./log/data.txt", 'r') as f:
                data = f.read()
            num_episodes, model_name_prefix, noise, epsilon = data.split(" ")
            # the log stores plain strings, so cast before using them as numbers
            self.episode_start = int(num_episodes)
            self.num_episodes += int(num_episodes)
            self.noise_decay = float(noise)  # assumed: the logged value is the scalar noise decay
            self.epsilon = float(epsilon)
            print("Log loaded!\n")

            self.saver.restore(self.sess, "./model/model.ckpt")
            print("Weights loaded successfully!\n")
            self.memory_buffer.load()
            print("Memory buffer loaded successfully!\n")
            return 0
        except Exception:
            # if self.episode_start == 0:
            #    return(False)
            print("New training\n")
            return 1

    def play_to_init_buffer(self):
        state = self.env.reset(obs_as_dict=False)
        state = np.asarray(state)
        for random_step in range(1, args.init_buffer_size + 1):
            #self.env.render()
            print("\r Examples : {}/{}".format(random_step,
                                               args.init_buffer_size),
                  end="")
            sys.stdout.flush()
            action = self.env.action_space.sample()
            next_state, reward, terminal, _ = self.env.step(action,
                                                            obs_as_dict=False)

            reward = self.tools.get_reward(self.direction,
                                           self.env.get_state_desc())

            next_state = np.asarray(next_state)
            self.model.memory_buffer.add(state, action, reward, next_state,
                                         terminal)
            state = np.copy(next_state)
            if terminal:
                # start a fresh episode and continue collecting from its first state
                state = np.asarray(self.env.reset(obs_as_dict=False))

    def DDPG(self):

        tf.reset_default_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            self.model = Actor_Critic(env, args)
            self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
            self.actions_grads_ph = tf.placeholder(
                tf.float32, shape=((None, ) + self.env.action_space.shape))
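            # target_Q_ph carries the Bellman targets for the critic loss;
            # actions_grads_ph carries dQ/da from the critic into the actor update.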
            # train operation
            self.actor_train_ops = self.model.Actor.train_step(
                self.actions_grads_ph)
            self.critic_train_ops = self.model.Critic.train_step(
                self.target_Q_ph)
            # update operation
            self.update_critic_target = self.model.update_target_network(
                self.model.Critic.network_params,
                self.model.Critic_target.network_params, self.tau)
            self.update_actor_target = self.model.update_target_network(
                self.model.Actor.network_params,
                self.model.Actor_target.network_params, self.tau)
            # reward summary for tensorboard
            self.tf_reward = tf.Variable(0.0,
                                         trainable=False,
                                         name='Reward_per_episode')
            self.tf_reward_summary = tf.summary.scalar("Reward by episode",
                                                       self.tf_reward)
            # time
            self.tf_time = tf.Variable(0.0,
                                       trainable=False,
                                       name='Time_per_episode')
            self.tf_time_summary = tf.summary.scalar("Time per episode",
                                                     self.tf_time)
            # step
            self.tf_step = tf.Variable(0.0,
                                       trainable=False,
                                       name='Step_per_episode')
            self.tf_step_summary = tf.summary.scalar("Step per episode",
                                                     self.tf_step)
            # writer
            self.writer = tf.summary.FileWriter('./graphs', sess.graph)
            sess.run(tf.global_variables_initializer())
            # init target networks by copying weights from actor and critic network
            sess.run(
                self.model.update_target_network(
                    self.model.Critic.network_params,
                    self.model.Critic_target.network_params))
            sess.run(
                self.model.update_target_network(
                    self.model.Actor.network_params,
                    self.model.Actor_target.network_params))
            # saver for periodic checkpoints
            saver = tf.train.Saver()

            scores = []
            for i_episode in range(self.episode_start, self.num_episodes):
                start = time.time()
                one_episode_score = 0

                # write log of training
                name = "./log/training.txt"
                with open(name, 'a') as f:
                    f.write("Episode {}/{} \n".format(i_episode + 1,
                                                      self.num_episodes))

                if (i_episode + 1) % 100 == 0:
                    avg = np.mean(np.asarray(scores))
                    self.noise_decay *= 0.95
                    if (i_episode + 1) % 500 == 0:
                        #self.saveWeights("./checkpoints/{}_{}_".format(args.direction, i_episode), sess)
                        save = saver.save(
                            sess,
                            "./checkpoints/left_{}.ckpt".format(i_episode))
                        print(save)
                    print(
                        "Episode {}/{} : Average score in 100 latest episodes : {}"
                        .format(i_episode + 1, self.num_episodes, avg))
                    scores.clear()
                # reset env
                state = self.env.reset(obs_as_dict=False)
                state = np.asarray(state)
                self.noise.reset()
                # scalar state: the directional reward, same representation as in the step loop below
                angle_state = self.tools.get_reward(self.direction,
                                                    self.env.get_state_desc())

                for i_step in itertools.count():
                    action = sess.run(self.model.Actor.output,
                                      feed_dict={
                                          self.model.states_ph:
                                          np.expand_dims(
                                              np.array([angle_state]), 0),
                                          self.model.is_training_ph:
                                          False
                                      })[0]

                    action += self.noise() * self.noise_decay

                    # execute the noisy action and observe reward r_t and state s_{t+1}
                    next_state, reward, terminal, _ = self.env.step(
                        action, obs_as_dict=False)
                    reward = self.tools.get_reward(self.direction,
                                                   self.env.get_state_desc())
                    #angle_next_state = np.arccos(self.tools.get_reward(self.direction, self.env.get_state_desc()))
                    # the directional reward doubles as the scalar next-state representation
                    angle_next_state = reward
                    #reward = np.cos(angle_next_state)

                    name = "./log/training.txt"
                    with open(name, 'a') as f:
                        f.write("Episode {}/{} == Step : {} =>>> Reward {} \n".
                                format(i_episode + 1, self.num_episodes,
                                       i_step, reward))

                    next_state = np.asarray(next_state)
                    self.model.memory_buffer.add(angle_state, action, reward,
                                                 angle_next_state, terminal)
                    angle_state = angle_next_state

                    one_episode_score += reward
                    state = np.copy(next_state)
                    #self.experience_replay()

                    if self.model.memory_buffer.count(
                    ) >= self.model.batch_size * 10:
                        batch, w_id, eid = self.model.memory_buffer.getBatch(
                            self.model.batch_size)
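                        # the prioritized buffer returns the sampled transitions plus
                        # per-sample weights (w_id, unused here) and the sample ids
                        # needed for the priority update further below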

                        batch_state = np.zeros((self.model.batch_size, 1))
                        batch_reward = np.zeros((self.model.batch_size, ))
                        batch_action = np.zeros(
                            (self.model.batch_size,
                             self.env.action_space.shape[0]))
                        batch_next_state = np.zeros((self.model.batch_size, 1))
                        batch_done = np.zeros((self.model.batch_size, ))
                        e_id = eid

                        for k, (s0, a, r, s1, done) in enumerate(batch):
                            batch_state[k] = s0
                            batch_reward[k] = r
                            batch_action[k] = a
                            batch_next_state[k] = s1
                            batch_done[k] = done
                        batch_done = batch_done.astype(bool)  # boolean mask of terminal transitions

                        future_action = sess.run(
                            self.model.Actor_target.output,
                            feed_dict={self.model.states_ph: batch_next_state})
                        future_Q = sess.run(self.model.Critic_target.output,
                                            feed_dict={
                                                self.model.states_ph:
                                                batch_next_state,
                                                self.model.actions_ph:
                                                future_action
                                            })[:, 0]

                        future_Q[batch_done] = 0
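                        # Bellman target: y = r + gamma * Q_target(s', mu_target(s'))
                        # (the bootstrap term was zeroed above for terminal transitions)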
                        targets = batch_reward + (future_Q *
                                                  self.model.discount)
                        # train Critic
                        sess.run(self.critic_train_ops,
                                 feed_dict={
                                     self.model.states_ph: batch_state,
                                     self.model.actions_ph: batch_action,
                                     self.target_Q_ph:
                                     np.expand_dims(targets, 1)
                                 })

                        # train Actor
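                        # deterministic policy gradient: evaluate dQ/da at a = mu(s)
                        # and feed it back so the actor ascends the critic's estimate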
                        actor_actions = sess.run(
                            self.model.Actor.output,
                            feed_dict={self.model.states_ph: batch_state})
                        action_grads = sess.run(self.model.Critic.action_grads,
                                                feed_dict={
                                                    self.model.states_ph:
                                                    batch_state,
                                                    self.model.actions_ph:
                                                    actor_actions
                                                })
                        sess.run(self.actor_train_ops,
                                 feed_dict={
                                     self.model.states_ph: batch_state,
                                     self.actions_grads_ph: action_grads[0]
                                 })
                        # update target
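                        # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target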
                        sess.run(self.update_critic_target)
                        sess.run(self.update_actor_target)

                        # compute the absolute TD error for the priority update
                        old_Q_value = sess.run(self.model.Critic.output,
                                               feed_dict={
                                                   self.model.states_ph:
                                                   batch_state,
                                                   self.model.actions_ph:
                                                   batch_action
                                               })[:, 0]
                        future_action = sess.run(
                            self.model.Actor_target.output,
                            feed_dict={self.model.states_ph: batch_next_state})
                        future_Q_value = sess.run(
                            self.model.Critic_target.output,
                            feed_dict={
                                self.model.states_ph: batch_next_state,
                                self.model.actions_ph: future_action
                            })[:, 0]
                        future_Q_value[batch_done] = 0.0
                        # TD error: |r + gamma * Q_target(s', mu_target(s')) - Q(s, a)|
                        error = np.absolute(batch_reward +
                                            self.model.discount * future_Q_value -
                                            old_Q_value)

                        # update priority
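                        # the absolute TD error becomes each transition's sampling priority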
                        self.model.memory_buffer.update_priority(e_id, error)
                        self.train_iteration += 1
                        if self.train_iteration % 100 == 0:
                            self.model.memory_buffer.rebalance()

                    if terminal or i_step == 50000:
                        end = time.time()
                        print("Episode {} =>>>>> Score {}".format(
                            i_episode + 1, one_episode_score))
                        scores.append(one_episode_score)
                        # write reward for tensorboard
                        summary = sess.run(
                            self.tf_reward_summary,
                            feed_dict={self.tf_reward: one_episode_score})
                        # add summary to writer
                        self.writer.add_summary(summary, i_episode)
                        # timer
                        summary = sess.run(
                            self.tf_time_summary,
                            feed_dict={self.tf_time: end - start})
                        self.writer.add_summary(summary, i_episode)
                        # steps in the episode
                        summary = sess.run(self.tf_step_summary,
                                           feed_dict={self.tf_step: i_step})
                        self.writer.add_summary(summary, i_episode)
                        break
                name = "./log/training.txt"
                with open(name, 'a') as f:
                    f.write("Total score : {} \n".format(one_episode_score))

    """def experience_replay(self):