# for i in range(200):
#     observation, reward, done, info = env.step(tools.active_one_muscle("iliopsoas", "r", 1))
#
#     if i == 20:
#         state_desc = env.get_state_desc()
#         print(type(state_desc["body_pos"]["toes_r"][0:2]))
#         print(state_desc["body_pos"]["talus_l"][0:2])
#         print(state_desc["body_pos"]["talus_r"][0:2])
#         print(state_desc["misc"]["mass_center_pos"])
#         print(state_desc["body_pos_rot"])
#         input("Press Enter to continue...")
#     print(reward)


if __name__ == '__main__':
    # init()
    # with open("C:\\Users\\YunfeiZHAO\\Desktop\\1.json", 'r') as f:
    #     content = f.read()
    #     print(content)
    #     parsed = json.loads(content)
    #     jp = json.dump(parsed, indent=4, sort_keys=True)
    #     print(jp)
    env = L2M2019Env(visualize=True)
    observation = env.reset(project=True, seed=None, init_pose=None, obs_as_dict=True)
    tools = Tools()
    state_desc = env.get_state_desc()
    print(tools.get_reward(state_desc))
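
# Usage sketch (an assumption, not part of the original script): the Trainer class
# below reads args.direction, args.episodes, args.noise_decay, args.TAU and
# args.init_buffer_size, and refers to `args` and `env` as module-level names, so a
# launch could look roughly like this (flag names exist in the code, defaults are
# illustrative only):
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--direction", default="left")
#   parser.add_argument("--episodes", type=int, default=1000)
#   parser.add_argument("--noise_decay", type=float, default=1.0)
#   parser.add_argument("--TAU", type=float, default=0.001)
#   parser.add_argument("--init_buffer_size", type=int, default=1000)
#   args = parser.parse_args()
#   env = L2M2019Env(visualize=False)
#   trainer = Trainer(env, args)
#   trainer.DDPG()
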
class Trainer():
    def __init__(self, env, args):
        self.direction = args.direction
        self.env = env
        self.num_episodes = args.episodes
        self.episode_start = 0
        self.noise = OUNoise(mu=np.zeros(self.env.action_space.shape))
        self.noise_decay = args.noise_decay
        self.count_exp_replay = 0
        self.train_iteration = 0
        self.tau = args.TAU
        self.tools = Tools()

    def tryLoadWeights(self):
        print("Load weights \n")
        try:
            with open("./log/data.txt", 'r') as f:
                data = f.read()
            num_episodes, model_name_prefix, noise, epsilon = data.split(" ")
            self.episode_start = int(num_episodes)
            self.num_episodes += int(num_episodes)
            self.noise = noise
            self.epsilon = epsilon
            print("Log loaded!\n")
            self.saver.restore(self.sess, "./model/model.ckpt")
            print("Weights loaded successfully!\n")
            self.memory_buffer.load()
            print("Memory buffer loaded successfully!\n")
            return 0
        except Exception:
            # if self.episode_start == 0:
            #     return False
            print("New training \n")
            return 1

    def play_to_init_buffer(self):
        # fill the replay buffer with transitions collected from random actions
        state = self.env.reset(obs_as_dict=False)
        state = np.asarray(state)
        for random_step in range(1, args.init_buffer_size + 1):
            # self.env.render()
            print("\r Examples : {}/{}".format(random_step, args.init_buffer_size), end="")
            sys.stdout.flush()
            action = self.env.action_space.sample()
            next_state, reward, terminal, _ = self.env.step(action, obs_as_dict=False)
            reward = self.tools.get_reward(self.direction, self.env.get_state_desc())
            next_state = np.asarray(next_state)
            self.model.memory_buffer.add(state, action, reward, next_state, terminal)
            state = np.copy(next_state)
            if terminal:
                state = np.asarray(self.env.reset(obs_as_dict=False))

    def DDPG(self):
        tf.reset_default_graph()
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            self.model = Actor_Critic(env, args)
            self.target_Q_ph = tf.placeholder(tf.float32, shape=(None, 1))
            self.actions_grads_ph = tf.placeholder(
                tf.float32, shape=((None, ) + self.env.action_space.shape))

            # train operations
            self.actor_train_ops = self.model.Actor.train_step(self.actions_grads_ph)
            self.critic_train_ops = self.model.Critic.train_step(self.target_Q_ph)

            # soft-update operations for the target networks
            self.update_critic_target = self.model.update_target_network(
                self.model.Critic.network_params,
                self.model.Critic_target.network_params, self.tau)
            self.update_actor_target = self.model.update_target_network(
                self.model.Actor.network_params,
                self.model.Actor_target.network_params, self.tau)

            # reward summary for tensorboard
            self.tf_reward = tf.Variable(0.0, trainable=False, name='Reward_per_episode')
            self.tf_reward_summary = tf.summary.scalar("Reward by episode", self.tf_reward)
            # time per episode
            self.tf_time = tf.Variable(0.0, trainable=False, name='Time_per_episode')
            self.tf_time_summary = tf.summary.scalar("Time per episode", self.tf_time)
            # steps per episode
            self.tf_step = tf.Variable(0.0, trainable=False, name='Step_per_episode')
            self.tf_step_summary = tf.summary.scalar("Step per episode", self.tf_step)

            # writer
            self.writer = tf.summary.FileWriter('./graphs', sess.graph)

            sess.run(tf.global_variables_initializer())

            # init target networks by copying weights from the actor and critic networks
            sess.run(self.model.update_target_network(
                self.model.Critic.network_params,
                self.model.Critic_target.network_params))
            sess.run(self.model.update_target_network(
                self.model.Actor.network_params,
                self.model.Actor_target.network_params))

            # saver for checkpoints
            saver = tf.train.Saver()

            scores = []
            for i_episode in range(self.episode_start, self.num_episodes):
                start = time.time()
                one_episode_score = 0

                # write the training log
                name = "./log/training.txt"
                with open(name, 'a') as f:
                    f.write("Episode {}/{} \n".format(i_episode + 1, self.num_episodes))

                if (i_episode + 1) % 100 == 0:
                    avg = np.mean(np.asarray(scores))
                    self.noise_decay *= 0.95
                    if (i_episode + 1) % 500 == 0:
                        # self.saveWeights("./checkpoints/{}_{}_".format(args.direction, i_episode), sess)
                        save = saver.save(sess, "./checkpoints/left_{}.ckpt".format(i_episode))
                        print(save)
                    print("Episode {}/{} : Average score in 100 latest episodes : {}".format(
                        i_episode + 1, self.num_episodes, avg))
                    scores.clear()

                # reset the environment
                state = self.env.reset(obs_as_dict=False)
                state = np.asarray(state)
                self.noise.reset()
                angle_state = np.arccos(
                    self.tools.get_reward(self.direction, self.env.get_state_desc()))

                for i_step in itertools.count():
                    action = sess.run(self.model.Actor.output,
                                      feed_dict={
                                          self.model.states_ph: np.expand_dims(np.array([angle_state]), 0),
                                          self.model.is_training_ph: False
                                      })[0]
                    action += self.noise() * self.noise_decay

                    # execute the noisy action and observe reward r_t and state s_{t+1}
                    next_state, reward, terminal, _ = self.env.step(action, obs_as_dict=False)
                    reward = self.tools.get_reward(self.direction, self.env.get_state_desc())
                    # angle_next_state = np.arccos(self.tools.get_reward(self.direction, self.env.get_state_desc()))
                    angle_next_state = self.tools.get_reward(self.direction, self.env.get_state_desc())
                    # reward = np.cos(angle_next_state)

                    name = "./log/training.txt"
                    with open(name, 'a') as f:
                        f.write("Episode {}/{} == Step : {} =>>> Reward {} \n".format(
                            i_episode + 1, self.num_episodes, i_step, reward))

                    next_state = np.asarray(next_state)
                    self.model.memory_buffer.add(angle_state, action, reward, angle_next_state, terminal)
                    angle_state = angle_next_state
                    one_episode_score += reward
                    state = np.copy(next_state)

                    # self.experience_replay()
                    if self.model.memory_buffer.count() >= self.model.batch_size * 10:
                        batch, w_id, e_id = self.model.memory_buffer.getBatch(self.model.batch_size)
                        batch_state = np.zeros((self.model.batch_size, 1))
                        batch_reward = np.zeros((self.model.batch_size, ))
                        batch_action = np.zeros((self.model.batch_size, self.env.action_space.shape[0]))
                        batch_next_state = np.zeros((self.model.batch_size, 1))
                        batch_done = np.zeros((self.model.batch_size, ))
                        for k, (s0, a, r, s1, done) in enumerate(batch):
                            batch_state[k] = s0
                            batch_reward[k] = r
                            batch_action[k] = a
                            batch_next_state[k] = s1
                            batch_done[k] = done
                        # boolean mask so terminal transitions get a zero bootstrap value
                        batch_done = batch_done.astype(bool)

                        # target Q-values from the target actor and critic
                        future_action = sess.run(self.model.Actor_target.output,
                                                 feed_dict={self.model.states_ph: batch_next_state})
                        future_Q = sess.run(self.model.Critic_target.output,
                                            feed_dict={
                                                self.model.states_ph: batch_next_state,
                                                self.model.actions_ph: future_action
                                            })[:, 0]
                        future_Q[batch_done] = 0
                        targets = batch_reward + (future_Q * self.model.discount)

                        # train the Critic
                        sess.run(self.critic_train_ops,
                                 feed_dict={
                                     self.model.states_ph: batch_state,
                                     self.model.actions_ph: batch_action,
                                     self.target_Q_ph: np.expand_dims(targets, 1)
                                 })

                        # train the Actor with the Critic's action gradients
                        actor_actions = sess.run(self.model.Actor.output,
                                                 feed_dict={self.model.states_ph: batch_state})
                        action_grads = sess.run(self.model.Critic.action_grads,
                                                feed_dict={
                                                    self.model.states_ph: batch_state,
                                                    self.model.actions_ph: actor_actions
                                                })
                        sess.run(self.actor_train_ops,
                                 feed_dict={
                                     self.model.states_ph: batch_state,
                                     self.actions_grads_ph: action_grads[0]
                                 })

                        # update the target networks
                        sess.run(self.update_critic_target)
                        sess.run(self.update_actor_target)

                        # compute the TD error for the priority update
                        old_Q_value = sess.run(self.model.Critic.output,
                                               feed_dict={
                                                   self.model.states_ph: batch_state,
                                                   self.model.actions_ph: batch_action
                                               })[:, 0]
                        future_action = sess.run(self.model.Actor_target.output,
                                                 feed_dict={self.model.states_ph: batch_next_state})
                        future_Q_value = sess.run(self.model.Critic_target.output,
                                                  feed_dict={
                                                      self.model.states_ph: batch_next_state,
                                                      self.model.actions_ph: future_action
                                                  })[:, 0]
                        # zero the bootstrap term for terminal transitions, as for the targets above
                        future_Q_value[batch_done] = 0
                        error = np.absolute(batch_reward + self.model.discount * future_Q_value - old_Q_value)

                        # update the priorities in the replay buffer
                        self.model.memory_buffer.update_priority(e_id, error)

                        self.train_iteration += 1
                        if self.train_iteration % 100 == 0:
                            self.model.memory_buffer.rebalance()

                    if terminal or i_step == 50000:
                        end = time.time()
                        print("Episode {} =>>>>> Score {}".format(i_episode + 1, one_episode_score))
                        scores.append(one_episode_score)

                        # write the episode reward for tensorboard
                        summary = sess.run(self.tf_reward_summary,
                                           feed_dict={self.tf_reward: one_episode_score})
                        # add summary to writer
                        self.writer.add_summary(summary, i_episode)

                        # episode duration
                        summary = sess.run(self.tf_time_summary,
                                           feed_dict={self.tf_time: end - start})
                        self.writer.add_summary(summary, i_episode)

                        # steps in the episode
                        summary = sess.run(self.tf_step_summary,
                                           feed_dict={self.tf_step: i_step})
                        self.writer.add_summary(summary, i_episode)
                        break

                name = "./log/training.txt"
                with open(name, 'a') as f:
                    f.write("Total score : {} \n".format(one_episode_score))

            sess.close()

    """def experience_replay(self):