class DDPG:
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Hyperparameters
        self.actor_lr = 0.0001
        self.critic_lr = 0.001
        self.tau = 0.001
        self.minibatch_size = 64
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        # self.max_episode = 100
        # self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)
        self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                    self.critic_lr, self.tau, self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

        # Ornstein-Uhlenbeck process for exploration noise
        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

    def reset_episode(self):
        # self.noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                               np.reshape(a, (self.actor.a_dim,)),
                               r, terminal,
                               np.reshape(s2, (self.actor.s_dim,)))

        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                self.replay_buffer.sample_batch(self.minibatch_size)
            self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)

        # Roll over last state
        self.last_state = s2

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor.predict(states)[0]
        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):
        # Critic targets: y = r + gamma * Q'(s2, mu'(s2)), or y = r at terminal states
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
        # self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)
        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars
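# Hypothetical usage sketch (not part of the original code above): one way the
# DDPG agent defined here could be driven from an episode loop. It assumes a
# `task.step(action)` method returning (next_state, reward, done), which is not
# shown in this file.
def run_episodes(agent, task, num_episodes):
    for _ in range(num_episodes):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)                      # policy action + OU exploration noise
            next_state, reward, done = task.step(action)   # assumed task API
            agent.step(state, action, reward, done, next_state)
            state = next_state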
class DrlAgent:
    def __init__(self, sess, is_train, dim_state, dim_action, num_paths,
                 actor_learn_rate, critic_learn_rate, tau, buffer_size,
                 mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66):
        self.__is_train = is_train
        self.__dim_state = dim_state
        self.__dim_action = dim_action
        self.__mini_batch = mini_batch
        self.__ep_begin = ep_begin
        self.__gamma = gamma
        self.__max_epoch = max_epoch

        self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0,
                                    actor_learn_rate, tau, num_paths)
        self.__critic = CriticNetwork(sess, dim_state, dim_action,
                                      critic_learn_rate, tau)
        self.__replay = ReplayBuffer(buffer_size, seed)
        self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch,
                                   dim_action, num_paths, seed)

        self.__state_curt = np.zeros(dim_state)
        self.__action_curt = self.__explorer.convert_action(np.ones(dim_action))

        self.__episode = 0
        self.__step = 0

    def target_paras_init(self):
        self.__actor.update_target_paras()
        self.__critic.update_target_paras()

    def predict(self, state, reward):
        action_original = self.__actor.predict([state])[0]
        if not self.__is_train:
            return action_original

        action = self.__explorer.get_act(action_original)

        self.__replay.add(self.__state_curt, self.__action_curt, reward, state)
        self.__state_curt = state
        self.__action_curt = action

        if len(self.__replay) > self.__mini_batch:
            self.train()

        self.__step += 1
        if self.__step >= self.__max_epoch:
            self.__step = 0
            self.__episode += 1
            self.__explorer.reset_ep(self.__ep_begin)

        return action

    def train(self):
        batch_state, batch_action, batch_reward, batch_state_next = \
            self.__replay.sample_batch(self.__mini_batch)
        weights = [1.0] * self.__mini_batch
        weights = np.expand_dims(weights, axis=1)

        target_q = self.__critic.predict_target(
            batch_state_next, self.__actor.predict_target(batch_state_next))
        value_q = self.__critic.predict(batch_state, batch_action)

        batch_y = []
        batch_error = []
        for k in range(len(batch_reward)):
            target_y = batch_reward[k] + self.__gamma * target_q[k]
            batch_error.append(abs(target_y - value_q[k]))
            batch_y.append(target_y)

        predicted_q, _ = self.__critic.train(batch_state, batch_action,
                                             batch_y, weights)

        a_outs = self.__actor.predict(batch_state)
        grads = self.__critic.calculate_gradients(batch_state, a_outs)
        weighted_grads = weights * grads[0]
        self.__actor.train(batch_state, weighted_grads)

        self.__actor.update_target_paras()
        self.__critic.update_target_paras()
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA=0.99, epsilon=1.0,
            min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True,
            render=False):
    with tf.Session() as sess:
        # configuring environment
        env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = np.float64(10)  # chosen because MountainCarContinuous has no natural bound here

        # Creating agent
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, DEVICE)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), DEVICE)

        sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

        goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception:
            pass  # failed to restore models; start from scratch

        for i in range(epochs):
            state = env.reset()
            state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= (epsilon / EXPLORE)
            epsilon = np.maximum(min_epsilon, epsilon)

            while not done:
                if render:
                    env.render()

                # 1. get action with actor, and add exploration noise
                action_original = actor.predict(np.reshape(state, (1, state_dim)))
                action = action_original + max(epsilon, 0) * ruido.noise()

                # uncomment for a step-by-step update
                # print(step, 'a', action_original, action, 's', state[0],
                #       'max state', max_state_episode)

                # 2. take action, observe next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train critic network on targets R + gamma * Q'(s', a'),
                        #    where a' is obtained from the target actor: a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch))

                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Update the actor with the sampled policy gradient:
                        #    first compute the actions the current actor would take,
                        #    then the critic's gradient w.r.t. those actions.
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if done:
                ruido.reset()
                if state[0] > 0.45:
                    goal += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'n steps', step, 'R:', round(ep_reward, 3),
                  'epsilon', round(epsilon, 3),
                  'Efficiency', round(100. * (goal / (i + 1.)), 3))

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def actor_critic(epochs=1000, GAMMA=0.99, load_file=False, render=False,
                 temp=False, verbose=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        # robot = gym_pendulum(render, temp)
        robot = gym_mountaincar(render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE, ACTION_BOUND, device=DEVICE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars(), device=DEVICE)

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        if load_file:
            actor.recover_actor()
            critic.recover_critic()

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # Choose and take action, and observe reward
                action, mu, sigma = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                new_action = action + 0.2 * np.random.rand(1)[0]
                action_noise = np.clip(new_action, -ACTION_BOUND, ACTION_BOUND)
                # print(round(action, 3), round(new_action, 3),
                #       round(action_noise, 3), round(mu, 3), round(sigma, 3))

                next_state, reward, done, step = robot.update(action_noise)

                # Train: one-step TD target and TD error for the critic
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                if verbose:
                    print(step, 'action', round(action, 3),
                          'state', round(robot.state[0], 3),
                          round(robot.state[1], 3), 'r', round(reward, 3))

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
class DDPG:
    def __init__(self, pretrain=False):
        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        self.graph = self.session.graph

        with self.graph.as_default():
            # View the state batches
            # self.visualize_input = VISUALIZE_BUFFER
            # if self.visualize_input:
            #     self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 662
            self.width = 1
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state
            # for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='float32')
            self.old_action = np.ones(2, dtype='float32')
            self.network_action = np.zeros(2, dtype='float32')
            self.noise_action = np.zeros(2, dtype='float32')
            self.action = np.zeros(2, dtype='float32')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # Initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: load the full pre-trained net.
            # Else: initialize all variables, then overwrite the conv layers
            # with the pre-trained filters.
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize an Ornstein-Uhlenbeck random process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled, add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):
        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():
            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 10.0)
            next_state_batch = np.divide(next_state_batch, 10.0)

            # Are we visualizing the first state batch for debugging?
            # If so: we have to scale up the values for grey scale before plotting
            # if self.visualize_input:
            #     state_batch_np = np.asarray(state_batch)
            #     state_batch_np = np.multiply(state_batch_np, -100.0)
            #     state_batch_np = np.add(state_batch_np, 100.0)
            #     self.viewer.set_data(state_batch_np)
            #     self.viewer.run()
            #     self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(
                next_state_batch, action_batch)
            q_value_batch = self.critic_network.target_evaluate(
                next_state_batch, next_action_batch)

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it.
            # Then get the action gradient batch and adapt the gradient with the
            # gradient inverting method.
            action_batch_for_gradients = self.actor_network.evaluate(
                state_batch, action_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(
                q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch, action_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1
            if self.training_step % 400 == 0:
                print "iter: ", self.training_step

        self.data_manager.check_for_enqueue()

    def get_action(self, state, old_action):
        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 10.0)

        # Get the action
        self.action = self.actor_network.get_action(state, old_action)
        self.action = self.action.reshape((2,))

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            self.action = 0.8 * self.exploration_noise.noise()
            # if self.training_step < MAX_NOISE_STEP:
            #     self.action += (MAX_NOISE_STEP - self.training_step) / \
            #         MAX_NOISE_STEP * self.exploration_noise.noise()

            # if the action value lies outside the action bounds, rescale the action vector
            # if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
            # if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
            #     self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):
        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            state.astype('float32')
            self.old_action.astype('float32')
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action,
                                                       reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state,
            #               is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):
        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env

        # Randomly initialize actor network and critic network
        # along with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()
        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.sess.run(tf.global_variables_initializer())

        # target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()

    def train(self):
        # print "train step", self.time_step
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.sample(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Store transitions until the replay start size is reached, then start training
        if self.replay_buffer.size > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
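# Hypothetical usage sketch (not part of the original code above): a minimal
# gym-style training loop for this agent. It assumes `env` follows the classic
# gym API (reset() -> state, step(action) -> (next_state, reward, done, info))
# and that the module-level constants used by DDPG are already defined.
def run_ddpg(env, num_episodes, max_steps):
    agent = DDPG(env)
    for _ in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = agent.noise_action(state)                        # exploratory action
            next_state, reward, done, _ = env.step(action)
            agent.perceive(state, action, reward, next_state, done)   # store transition, train once buffer is warm
            state = next_state
            if done:
                break
    return agent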
class DDPG:
    def __init__(self):
        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():
            # View the state batches
            self.visualize_input = VISUALIZE_BUFFER
            if self.visualize_input:
                self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 86
            self.width = self.height
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state
            # for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth),
                                      dtype='int8')
            self.old_action = np.ones(2, dtype='float')
            self.network_action = np.zeros(2, dtype='float')
            self.noise_action = np.zeros(2, dtype='float')
            self.action = np.zeros(2, dtype='float')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.merge_all_summaries()
            self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim,
                                              self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim,
                                                self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # Initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH,
                                            self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: load the full pre-trained net.
            # Else: initialize all variables, then overwrite the conv layers
            # with the pre-trained filters.
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize an Ornstein-Uhlenbeck random process for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled, add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):
        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():
            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 100.0)
            next_state_batch = np.divide(next_state_batch, 100.0)

            # Are we visualizing the first state batch for debugging?
            # If so: we have to scale up the values for grey scale before plotting
            if self.visualize_input:
                state_batch_np = np.asarray(state_batch)
                state_batch_np = np.multiply(state_batch_np, -100.0)
                state_batch_np = np.add(state_batch_np, 100.0)
                self.viewer.set_data(state_batch_np)
                self.viewer.run()
                self.visualize_input = False

            # Calculate y for the td_error of the critic
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch,
                                                                next_action_batch)
            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch, train the critic
            self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it.
            # Then get the action gradient batch and adapt the gradient with the
            # gradient inverting method.
            action_batch_for_gradients = self.actor_network.evaluate(state_batch)
            q_gradient_batch = self.critic_network.get_action_gradient(
                state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch,
                                                    action_batch_for_gradients)

            # Now we can train the actor
            self.actor_network.train(q_gradient_batch, state_batch)

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH,
                                global_step=self.training_step)

            # Update time step
            self.training_step += 1

        self.data_manager.check_for_enqueue()

    def get_action(self, state):
        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 100.0)

        # Get the action
        self.action = self.actor_network.get_action(state)

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            if self.training_step < MAX_NOISE_STEP:
                self.action += (MAX_NOISE_STEP - self.training_step) / \
                    MAX_NOISE_STEP * self.exploration_noise.noise()

            # if the action value lies outside the action bounds, rescale the action vector
            if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
                self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
            if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
                self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):
        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            self.data_manager.store_experience_to_file(self.old_state,
                                                       self.old_action,
                                                       reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state,
            #               is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for the next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):
        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        # clamp the marker position to the [0, 60] character bar
        stroke_pos = int(30 * q_value[0][0] + 30)
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
        print '[' + stroke_pos * string + '|' + (60 - stroke_pos) * string + ']', \
            "Q: ", q_value[0][0], "\tt: ", self.training_step
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0

            while not done:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # Train: one-step TD target and TD error for the critic
                V_minib = critic.predict(
                    np.reshape(state, (1, robot.state_dim)))
                V_minib_next = critic.predict(
                    np.reshape(next_state, (1, robot.state_dim)))

                if done:
                    td_target = reward
                    td_error = reward - V_minib
                else:
                    td_target = reward + GAMMA * V_minib_next
                    td_error = reward + GAMMA * V_minib_next - V_minib

                critic.train(np.reshape(state, (1, robot.state_dim)),
                             np.reshape(td_target, (1, 1)))
                actor.train(np.reshape(state, (1, robot.state_dim)),
                            np.reshape(action, (1, 1)),
                            np.reshape(td_error, (1, 1)))

                state = next_state
                ep_reward = ep_reward + reward

                # this print is useful for debugging
                # print(step, 'action', action, 'state', robot.uncodedstate,
                #       'r', round(reward, 3), 'prob', action_prob)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def actor_critic(epochs=1000, GAMMA=0.99, train_indicator=True, render=False,
                 temp=False, baseline=True):
    with tf.Session() as sess:
        # define objects
        # The gym environment is wrapped in a class; this keeps the main loop
        # clear and makes it portable to other robots in the lab.
        robot = gym_environment('FrozenLakeNonskid8x8-v0', False, render, temp)
        actor = ActorNetwork(sess, robot.state_dim, robot.action_dim,
                             ACTOR_LEARNING_RATE)
        critic = CriticNetwork(sess, robot.state_dim, CRITIC_LEARNING_RATE,
                               actor.get_num_trainable_vars())

        # starting tensorflow
        sess.run(tf.global_variables_initializer())

        for i in range(epochs):
            # Reset the environment
            state, done, step = robot.reset()
            ep_reward = 0
            total_reward = np.zeros(max_episode)
            total_state = deque()
            total_action = deque()
            k = 0

            while (not done) and k < max_episode:
                # Choose and take action, and observe reward
                action_prob = actor.predict(
                    np.reshape(state, (1, robot.state_dim)))
                action = np.random.choice(np.arange(len(action_prob)),
                                          p=action_prob)
                next_state, reward, done, step = robot.update(action)

                # store episode information
                total_reward[k] = reward
                total_state.append(state)
                total_action.append(action)

                state = next_state
                ep_reward = ep_reward + reward
                k = k + 1

            # Train: for each visited state compute the return G and,
            # when using a baseline, subtract the critic's value estimate
            for l in range(k):
                G = np.sum(total_reward[l:k + 1])
                # print(l, G)  # print for debug
                state = np.reshape(total_state[l], (1, robot.state_dim))
                action = np.reshape(total_action[l], (1, 1))
                if baseline:
                    delta = G - critic.predict(state)
                    critic.train(state, delta)
                    actor.train(state, action, delta)
                else:
                    actor.train(state, action, G)

            print('episode', i + 1, 'Steps', step, 'Reward:', ep_reward,
                  'goal achieved:', robot.goal,
                  'Efficiency', round(100. * (robot.goal / (i + 1.)), 0), '%')

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
def trainer(env, outdir, epochs=100, MINIBATCH_SIZE=64, GAMMA=0.99,
            epsilon=0.01, min_epsilon=0.01, BUFFER_SIZE=10000,
            train_indicator=False, render=False):
    tf.reset_default_graph()
    with tf.Session(config=config) as sess:
        # configuring environment
        # env = gym.make(ENV_NAME)

        # configuring the random processes
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        # info of the environment to pass to the agent
        state_dim = env.observation_space
        action_dim = env.action_space
        action_bound = np.float64(1)  # chosen because this action space has no natural bound

        # Creating agent
        # For the RNN variant, see tf.contrib.rnn.BasicLSTMCell and
        # https://github.com/tensorflow/tensorflow/issues/8771
        # cell = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        # cell_target = tf.contrib.rnn.BasicLSTMCell(num_units=300, state_is_tuple=True, reuse=None)
        ruido = OUNoise(action_dim, mu=0.4)  # Ornstein-Uhlenbeck noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU, outdir)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars(), outdir)
        # sess.run(tf.global_variables_initializer())

        # Initialize target network weights
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory
        replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
        replay_buffer.load()

        # goal = 0
        max_state = -1.

        try:
            critic.recover_critic()
            actor.recover_actor()
            print('********************************')
            print('models restored successfully')
            print('********************************')
        except Exception as e:
            print('********************************')
            print(e)
            print('********************************')

        for i in range(epochs):
            state = env.reset()
            # state = np.hstack(state)
            ep_reward = 0
            ep_ave_max_q = 0
            done = False
            step = 0
            max_state_episode = -1

            epsilon -= epsilon / EXPLORE
            if epsilon < min_epsilon:
                epsilon = min_epsilon

            while not done:
                if render:
                    env.render()

                np.set_printoptions(precision=4)

                # 1. get action with actor (exploration noise currently disabled)
                action_original = actor.predict(
                    np.reshape(state, (1, actor.s_dim)))
                action = action_original  # + max(epsilon, 0) * ruido.noise()
                action = np.reshape(action, (actor.a_dim,))

                # 2. take action, observe next state and reward
                next_state, reward, done, info = env.step(action)

                if train_indicator:
                    # 3. Save in replay buffer
                    replay_buffer.add(np.reshape(state, (actor.s_dim,)),
                                      np.reshape(action, (actor.a_dim,)),
                                      reward, done,
                                      np.reshape(next_state, (actor.s_dim,)))

                    # Keep adding experience to the memory until
                    # there are at least minibatch size samples
                    if replay_buffer.size() > MINIBATCH_SIZE:
                        # 4. sample random minibatch of transitions
                        s_batch, a_batch, r_batch, t_batch, s2_batch = \
                            replay_buffer.sample_batch(MINIBATCH_SIZE)

                        # 5. Train critic network on targets R + gamma * Q'(s', a'),
                        #    where a' is obtained from the target actor: a' = actor(s')
                        target_q = critic.predict_target(
                            s2_batch, actor.predict_target(s2_batch), 20)

                        y_i = []
                        for k in range(MINIBATCH_SIZE):
                            if t_batch[k]:
                                y_i.append(r_batch[k])
                            else:
                                y_i.append(r_batch[k] + GAMMA * target_q[k])

                        predicted_q_value, _ = critic.train(
                            s_batch, a_batch,
                            np.reshape(y_i, (MINIBATCH_SIZE, 1)), 20)
                        ep_ave_max_q += np.amax(predicted_q_value)

                        # 6. Update the actor with the sampled policy gradient:
                        #    first compute the actions the current actor would take,
                        #    then the critic's gradient w.r.t. those actions.
                        a_outs = actor.predict(s_batch)
                        grads = critic.action_gradients(s_batch, a_outs, 20)
                        c = np.array(grads)
                        actor.train(s_batch, grads[0])

                        # Update target networks
                        actor.update_target_network()
                        critic.update_target_network()

                state = next_state
                if next_state[0] > max_state_episode:
                    max_state_episode = next_state[0]
                ep_reward = ep_reward + reward
                step += 1

            if max_state_episode > max_state:
                max_state = max_state_episode

            print('th', i + 1, 'Step', step, 'Reward:', ep_reward,
                  'Pos', next_state[0], next_state[1], 'epsilon', epsilon)

        print('*************************')
        print('now we save the model')
        critic.save_critic()
        actor.save_actor()
        print('model saved successfully')
        print('*************************')
        replay_buffer.save()

        sess.close()
        return 0