class DDPG:
    """docstring for DDPG"""

    def __init__(self, env, results_file):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        results_file.write(ActorNetwork.get_settings())

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
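The `OUNoise` exploration process that this and most of the snippets below construct with `OUNoise(action_dim)` (and, in one robot variant, with explicit `(dim, mu, theta, sigma)` arguments) is not defined anywhere in this listing. The sketch below is a standard implementation consistent with those call sites; it is an assumption about the missing helper, not the snippets' actual source.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: state drifts toward mu with Gaussian jitter."""

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # Return the process to its mean; the agents above call this at episode boundaries
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # One Euler step: mean reversion plus scaled white noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state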
class DDPG:
    """docstring for DDPG"""

    def __init__(self, environment):
        self.name = 'DDPG'  # name for uploading results
        self.environment = environment
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.actor_network = ActorNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        self.critic_network = CriticNetwork(
            state_size=environment.observation_space.shape[0],
            action_size=environment.action_space.shape[0])
        # initialize replay buffer
        self.replay_buffer = deque()
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(environment.action_space.shape[0])
        # Initialize time step
        self.time_step = 0

    def set_init_observation(self, observation):
        # receive the initial observation state
        self.state = observation

    def train(self):
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        action_batch = np.resize(action_batch, [BATCH_SIZE, 1])

        # Calculate y
        y_batch = []
        next_action_batch = self.actor_network.target_evaluate(next_state_batch)
        q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)
        for i in range(BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.evaluate(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients) / BATCH_SIZE
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def get_action(self):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.get_action(self.state)
        return np.clip(action + self.exploration_noise.noise(),
                       self.environment.action_space.low,
                       self.environment.action_space.high)

    def set_feedback(self, observation, action, reward, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        next_state = observation
        self.replay_buffer.append((self.state, action, reward, next_state, done))
        # Update current state
        self.state = next_state
        # Update time step
        self.time_step += 1

        # Limit the replay buffer size
        if len(self.replay_buffer) > REPLAY_BUFFER_SIZE:
            self.replay_buffer.popleft()

        # Wait until the buffer reaches the replay start size, then start training
        if self.time_step > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 10000 == 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
class RDPG:
    """docstring for RDPG"""

    def __init__(self, env):
        self.name = 'RDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.saver = tf.train.Saver()

    def train(self):
        # Sample a random minibatch of N sequences from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # Construct histories
        observations = []
        next_observations = []
        actions = []
        rewards = []
        dones = []
        for each in minibatch:
            for i in range(1, len(each.observations)):
                observations.append(self.pad(each.observations[0:i]))
                # was observations[1, i + 1]: the next-observation history is the slice 1..i
                next_observations.append(self.pad(each.observations[1:i + 1]))
                actions.append(each.actions[0:i - 1])
                rewards.append(each.rewards[0:i])
                if i == len(each.observations) - 1:
                    dones.append(True)
                else:
                    dones.append(False)

        # Calculate y_batch
        next_action_batch = self.actor_network.target_action(observations)
        # append each target-policy action to its action history (was `i + j`,
        # which would add element-wise instead of concatenating)
        q_value_batch = self.critic_network.target_q(
            next_observations,
            [self.pad(a + [na]) for (a, na) in zip(actions, next_action_batch)])
        y_batch = []
        for i in range(len(observations)):
            if dones[i]:
                y_batch.append(rewards[i][-1])
            else:
                y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [len(observations), 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, observations, [self.pad(a) for a in actions])

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(observations)
        q_gradient_batch = self.critic_network.gradients(observations, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, observations)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def save_model(self, path, episode):
        self.saver.save(self.sess, path + "model.ckpt", episode)

    def noise_action(self, history):
        # Select action a_t according to a sequence of observations and actions
        action = self.actor_network.action(history)
        return action + self.exploration_noise.noise()

    def action(self, history):
        return self.actor_network.action(history)

    def perceive(self, history):
        # Store the history sequence in the replay buffer
        self.replay_buffer.add(history)

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # Re-initialize the random process: perceive() receives a complete episode
        # history, so the episode has ended here (the original guarded this on an
        # undefined `done` variable)
        self.exploration_noise.reset()

    def pad(self, seq):
        # Zero-pad a history to a fixed length of 1000 steps
        dim = len(seq[0])
        return seq + [[0] * dim] * (1000 - len(seq))
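The fixed-length zero padding that `RDPG.pad` applies to observation and action histories is easiest to see on a toy history. Below is a minimal standalone copy of the helper with the hard-coded cap of 1000 made configurable so the result fits on one line; the names are illustrative only.

# Standalone copy of RDPG.pad with a configurable cap (the class hard-codes 1000)
def pad(seq, max_len=1000):
    dim = len(seq[0])  # width of each per-step vector
    return seq + [[0] * dim] * (max_len - len(seq))

history = [[0.1, 0.2], [0.3, 0.4]]  # two observed 2-D steps
assert pad(history, max_len=4) == [[0.1, 0.2], [0.3, 0.4], [0, 0], [0, 0]]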
class DDPG:
    def __init__(self, state_dim, state_channel, action_dim):
        self.state_dim = state_dim
        self.state_channel = state_channel
        self.action_dim = action_dim

        self.sess = tf.InteractiveSession()
        self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel])
        self.action_input = tf.placeholder('float', [None, action_dim])

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim)

        # create network
        self.actor_network.create_network(self.state_input)
        self.critic_network.create_q_network(self.state_input, self.actor_network.action_output)

        # create target network
        self.actor_network.create_target_network(self.target_state_input)
        self.critic_network.create_target_q_network(self.target_state_input,
                                                    self.actor_network.target_action_output)

        # create training method
        self.actor_network.create_training_method(self.critic_network.q_value_output)
        self.critic_network.create_training_method()

        self.sess.run(tf.initialize_all_variables())
        self.actor_network.update_target()
        self.critic_network.update_target()

        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg'
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

        # for log
        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)
        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph)

        self.episode_reward = 0.0
        self.episode_start_time = 0.0

        self.time_step = 1
        self.saver = tf.train.Saver(tf.all_variables())
        self.load_time_step()
        self.load_network()

    def train(self):
        action_dim = self.action_dim

        # sample BATCH_SIZE transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # if action_dim = 1, each action is a number, not an array
        action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim])

        # calculate y_batch via the target networks
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(BATCH_SIZE):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # print(np.shape(reward_batch), np.shape(y_batch))

        # train actor network
        self.actor_network.train(state_batch)
        # train critic network
        self.critic_network.train(y_batch, state_batch, action_batch)

        # update target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor_network.action(state)

    def _record_log(self, reward, living_time):
        summary_str = self.sess.run(self.summary_op, feed_dict={
            self.reward_input: reward,
            self.time_input: living_time
        })
        self.summary_writer.add_summary(summary_str, self.time_step)

    def perceive(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.episode_start_time == 0.0:
            self.episode_start_time = time.time()

        # for testing
        # self.time_step += 1
        # if self.time_step == 100:
        #     print('--------------------------------')
        #     self.replay_buffer.save_to_pickle()
        #     return

        self.episode_reward += reward
        living_time = time.time() - self.episode_start_time
        if self.time_step % 1000 == 0 or done:
            self._record_log(self.episode_reward, living_time)

        if self.replay_buffer.size() > REPLAY_START_SIZE:
            self.train()

        if self.time_step % 100000 == 0:
            self.save_network()

        if done:
            print('===============reset noise=========================')
            self.exploration_noise.reset()
            self.episode_reward = 0.0
            self.episode_start_time = time.time()

        self.time_step += 1

    def load_time_step(self):
        if not os.path.exists(self.dir_path):
            return
        files = os.listdir(self.dir_path)
        step_list = []
        for filename in files:
            if ('meta' in filename) or ('-' not in filename):
                continue
            step_list.append(int(filename.split('-')[-1]))
        step_list = sorted(step_list)
        if len(step_list) == 0:
            return
        self.time_step = step_list[-1] + 1

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.dir_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded:', checkpoint.model_checkpoint_path)
        else:
            print('Could not find old network weights')

    def save_network(self):
        print('save actor-critic network...', self.time_step)
        self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step)
class Worker:
    """docstring for DDPG"""

    def __init__(self, sess, number, model_path, global_episodes, explore, decay, training):
        self.name = 'worker_' + str(number)  # name for uploading results
        self.number = number
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 41
        self.action_dim = 18
        self.model_path = model_path
        self.global_episodes = global_episodes
        self.increment = self.global_episodes.assign_add(1)
        self.sess = sess
        self.explore = explore
        self.decay = decay
        self.training = training

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim,
                                          self.name + '/actor')
        self.actor_network.update_target(self.sess)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim,
                                            self.name + '/critic')
        self.critic_network.update_target(self.sess)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.update_local_ops_actor = update_target_graph('global/actor', self.name + '/actor')
        self.update_local_ops_critic = update_target_graph('global/critic', self.name + '/critic')

    def start(self, setting=0):
        self.env = RunEnv(visualize=True)
        self.setting = setting

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(self.sess, next_state_batch)
        q_value_batch = self.critic_network.target_q(self.sess, next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        # (was `selfstate_batch`, a typo for `state_batch`)
        action_batch_for_gradients = self.actor_network.actions(self.sess, state_batch)
        q_gradient_batch = self.critic_network.gradients(self.sess, state_batch,
                                                         action_batch_for_gradients)
        self.actor_network.train(self.sess, q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target(self.sess)
        self.critic_network.update_target(self.sess)

    def save_model(self, saver, episode):
        # if self.episode % 10 == 1:
        if self.name == 'worker_0':
            saver.save(self.sess, self.model_path + "/model-" + str(episode) + ".ckpt")

    def noise_action(self, state, decay):
        # Select action a_t according to the current policy and exploration noise,
        # which gradually vanishes
        action = self.actor_network.action(self.sess, state)
        return action + self.exploration_noise.noise() * decay

    def action(self, state):
        return self.actor_network.action(self.sess, state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE and self.training:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def work(self, coord, saver):
        if self.training:
            episode_count = self.sess.run(self.global_episodes)
        else:
            episode_count = 0
        wining_episode_count = 0
        total_steps = 0
        print("Starting worker_" + str(self.number))

        with self.sess.as_default(), self.sess.graph.as_default():
            # not_start_training_yet = True
            while not coord.should_stop():
                returns = []
                rewards = []
                episode_reward = 0

                if np.random.rand() < 0.9:  # change Aug20: stochastically apply noise
                    noisy = True
                    self.decay -= 1. / self.explore
                else:
                    noisy = False

                self.sess.run(self.update_local_ops_actor)
                self.sess.run(self.update_local_ops_critic)

                state = self.env.reset(difficulty=self.setting)
                # print(observation)
                state = process_frame(state)
                print("episode:", episode_count)

                # Train
                for step in range(self.env.spec.timestep_limit):
                    if noisy:
                        # change Aug20: decay noise (no noise once episodes >= self.explore)
                        action = np.clip(
                            self.noise_action(state, np.maximum(self.decay, 0)), 0.0, 1.0)
                    else:
                        action = self.action(state)
                    next_state, reward, done, _ = self.env.step(action)
                    # print('state={}, action={}, reward={}, next_state={}, done={}'.format(
                    #     state, action, reward, next_state, done))
                    next_state = process_frame(next_state)
                    self.perceive(state, action, reward * 100, next_state, done)
                    state = next_state
                    episode_reward += reward
                    if done:
                        break

                if episode_count % 5 == 0:
                    print("episode reward:", episode_reward)

                # Testing:
                # if episode_count % 1 == 0:
                if self.name == 'worker_0' and episode_count % 50 == 0 and episode_count > 1:  # change Aug19
                    self.save_model(saver, episode_count)
                    total_return = 0
                    ave_reward = 0
                    for i in range(TEST):
                        state = self.env.reset()
                        reward_per_step = 0
                        for j in range(self.env.spec.timestep_limit):
                            action = self.action(process_frame(state))  # direct action for test
                            state, reward, done, _ = self.env.step(action)
                            total_return += reward
                            if done:
                                break
                            reward_per_step += (reward - reward_per_step) / (j + 1)
                        ave_reward += reward_per_step

                    ave_return = total_return / TEST
                    ave_reward = ave_reward / TEST
                    returns.append(ave_return)
                    rewards.append(ave_reward)
                    print('episode:', episode_count,
                          'Evaluation Average Return:', ave_return,
                          'Evaluation Average Reward:', ave_reward)

                if self.name == 'worker_0' and self.training:
                    self.sess.run(self.increment)
                episode_count += 1

        # All done; stop trial and confirm exit
        print('Done ' + self.name)
class DDPG:
    """docstring for DDPG"""

    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DDPG'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim
        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(save_location)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def saveNetwork(self):
        self.saver.save(self.sess,
                        save_location + self.env_name + 'network' + '-ddpg',
                        global_step=self.time_step)

    def action(self, state):
        action = self.actor_network.action(state)
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        # print("Action:", action)
        return action

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        # print(action.shape)
        # print("Action_No_Noise:", action)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.OU.function(action[0], 0.0, 0.60, 0.80)
        noise_t[1] = epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
        noise_t[2] = epsilon * self.OU.function(action[2], -0.1, 1.00, 0.05)
        if random.random() <= 0.01:  # 0.1
            print("********Stochastic brake***********")
            noise_t[2] = epsilon * self.OU.function(action[2], 0.2, 1.00, 0.10)
        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        action[1] = np.clip(action[1], 0, 1)
        action[2] = np.clip(action[2], 0, 1)
        # print("Action_Noise:", action)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        if not math.isnan(reward):
            self.replay_buffer.add(state, action, reward, next_state, done)
        self.time_step = self.time_step + 1
        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()
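The `OU.function(x, mu, theta, sigma)` helper that this snippet calls per action dimension (steering, throttle, brake) is not shown in the listing. A common one-step Ornstein-Uhlenbeck drift term consistent with these call sites is sketched below; this is an assumption about the missing class, not the snippet's own code.

import numpy as np

class OU:
    def function(self, x, mu, theta, sigma):
        # Drift pulls the action value x toward the mean mu at rate theta;
        # sigma scales an additive Gaussian perturbation.
        return theta * (mu - x) + sigma * np.random.randn()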
class DDPG:
    """docstring for DDPG"""

    def __init__(self):
        self.name = 'DDPG'  # name for uploading results
        # self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = 12
        self.action_dim = 10
        self.has_kicked = False
        self.laststep_haskicked = False
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.saver = tf.train.Saver(max_to_keep=1)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        # print(minibatch)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        # print(q_value_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("action_batch[0]", file=f)
            print(action_batch[0], file=f)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        with open('/home/ruizhao/Desktop/a.txt', 'a') as f:
            print("q_gradient_batch[0]", file=f)
            print(q_gradient_batch[0], file=f)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action2(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def noise_action(self, state):
        # Epsilon-greedy exploration: with probability EPSILON follow the policy,
        # otherwise return a random structured action
        action = self.actor_network.action(state)
        random_action = np.zeros(10, float)
        random_action[random.randint(0, 3)] = 1
        random_action[4] = random.uniform(-100, 100)  # DASH POWER
        random_action[5] = random.uniform(-180, 180)  # DASH DEGREES
        random_action[6] = random.uniform(-180, 180)  # TURN DEGREES
        random_action[7] = random.uniform(-180, 180)  # TACKLE DEGREES
        random_action[8] = random.uniform(0, 100)     # KICK POWER
        random_action[9] = random.uniform(-180, 180)  # KICK DEGREES
        if np.random.uniform() < EPSILON:
            return action
        else:
            return random_action

    def action(self, state):
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
class DDPG(object):
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        self.epsilon_expert_range = (1.0, 0.1)
        self.epsilon_expert = self.epsilon_expert_range[0]
        self.epsilon_random_range = (0.1, 0.01)
        self.epsilon_random = self.epsilon_random_range[0]
        # Randomly initialize actor network and critic network
        # with both their target networks
        # self.state_dim = env.observation_space.shape[0]
        self.state_dim = 16
        # self.action_dim = env.action_space.shape[0]
        self.action_dim = 3
        self.time_step = 0
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        # self.exploration_noise = OUNoise()
        self.OU = OU()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            path = checkpoint.model_checkpoint_path
            self.saver.restore(self.sess, path)
            self.time_step = int(path[path.rindex('-') + 1:])
            self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                    self.epsilon_expert_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
            self.epsilon_random -= (self.epsilon_random_range[0] -
                                    self.epsilon_random_range[1]) * self.time_step / EXPLORE_COUNT
            self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
            logger.warn("Successfully loaded: %s, step: %d, epsilon_expert: %s, epsilon_random: %s" %
                        (path, self.time_step, self.epsilon_expert, self.epsilon_random))
        else:
            logger.warn("Could not find old network weights")
        self.critic_cost = 0

    def train(self):
        self.time_step = self.time_step + 1
        self.epsilon_expert -= (self.epsilon_expert_range[0] -
                                self.epsilon_expert_range[1]) / EXPLORE_COUNT
        self.epsilon_expert = max(self.epsilon_expert, self.epsilon_expert_range[1])
        self.epsilon_random -= (self.epsilon_random_range[0] -
                                self.epsilon_random_range[1]) / EXPLORE_COUNT
        self.epsilon_random = max(self.epsilon_random, self.epsilon_random_range[1])
        logger.debug("step: %d, epsilon_expert: %s, epsilon_random: %s" %
                     (self.time_step, self.epsilon_expert, self.epsilon_random))
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
            # if done_batch[i]:
            #     y_batch.append(reward_batch[i])
            # else:
            #     y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_cost = self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    # def noise_action(self, state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = self.exploration_noise.noise(action)
    #     noise_action = action + noise
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    # def noise_action(self, state):
    #     # Select action a_t according to the current policy and exploration noise
    #     action = self.actor_network.action(state)
    #     noise = np.zeros(self.action_dim)
    #     noise[0] = self.epsilon * self.OU.function(action[0], 0.5, 1.00, 0.10)
    #     noise[1] = self.epsilon * self.OU.function(action[1], 0.5, 1.00, 0.10)
    #     noise[2] = self.epsilon * self.OU.function(action[2], 0.5, 1.00, 0.10)
    #     noise_action = action + noise
    #     logger.debug("action: %s, noise: %s" % (action, noise))
    #     clipped_noise_action = np.clip(noise_action, 0, 1)
    #     return clipped_noise_action

    def action(self, state):
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        return action

    def opposite_action(self, state):
        logger.debug("state: %s" % (state))
        action = self.actor_network.action(state)
        logger.debug("action: %s" % (action))
        action[0] = 1 - action[0]
        logger.debug("opposite action: %s" % (action))
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        # self.time_step = self.time_step + 1

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() >= REPLAY_START_SIZE:
            # logger.debug("train...")
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'DDPG')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
class DDPG:
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        # self.state_dim = env.observation_space.shape[0] * 2
        self.action_dim = env.action_space.shape[0]
        self.time_step = 0
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        # self.exploration_noise = OUNoise(self.action_dim)
        self.exploration_noise = OUNoise()
        # loading networks
        self.saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state(MODEL_PATH)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            my_config.logger.warn("Successfully loaded: %s" % (checkpoint.model_checkpoint_path))
        else:
            my_config.logger.error("Could not find old network weights")

    def train(self):
        # my_config.logger.debug("......enter train......")
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        noise = self.exploration_noise.noise(action)
        # if random.random() <= 0.5:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75, 0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5])
        # else:
        #     noise = self.exploration_noise.noise(action,
        #         mu=[0, 0, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0, 0, 1, 0, 0, 0.25, 0.75, 0.75])
        noise_action = action + noise
        clipped_noise_action = np.clip(noise_action, 0, 1)
        # if self.time_step < 5:
        #     my_config.logger.debug("action: %s, noise: %s, clip: %s" %
        #                            (action, noise, clipped_noise_action))
        return clipped_noise_action

    def action(self, state):
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.time_step = self.time_step + 1

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        # if done:
        #     self.exploration_noise.reset()

    def saveNetwork(self):
        # my_config.logger.warn("time step: %s, save model" % (self.time_step))
        ckpt_file = os.path.join(MODEL_PATH, 'ltr')
        self.saver.save(self.sess, ckpt_file, global_step=self.time_step)
class DDPG:
    def __init__(self, env, state_dim, action_dim):
        self.name = 'DDPG'
        self.environment = env
        self.time_step = 0
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.sess = tf.InteractiveSession()
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
        self.linear_noise = OUNoise(1, 0.5, 0.3, 0.6)
        self.angular_noise = OUNoise(1, 0, 0.6, 0.8)

    def train(self):
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state, epsilon):
        action = self.actor_network.action(state)
        noise_t = np.zeros(self.action_dim)
        noise_t[0] = epsilon * self.linear_noise.noise()
        noise_t[1] = epsilon * self.angular_noise.noise()
        action = action + noise_t
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        # print(a_linear, a_angular)
        return [a_linear, a_angular]

    def action(self, state):
        action = self.actor_network.action(state)
        a_linear = np.clip(action[0], 0, 1)
        a_linear = round(a_linear, 1)
        a_angular = np.clip(action[1], -1, 1)
        a_angular = round(a_angular, 1)
        return [a_linear, a_angular]

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)
        if self.replay_buffer.count() == REPLAY_START_SIZE:
            print('\n---------------Start training---------------')

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.time_step += 1
            self.train()

        if self.time_step % 10000 == 0 and self.time_step > 0:
            self.actor_network.save_network(self.time_step)
            self.critic_network.save_network(self.time_step)

        if done:
            self.linear_noise.reset()
            self.angular_noise.reset()

        return self.time_step
class ddpg:
    def __init__(self, env_name, sess, state_dim, action_dim, models_dir, img_dim):
        self.name = 'DDPG'
        self.env_name = env_name
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.img_dim = img_dim
        self.models_dir = models_dir
        # Ensure action bound is symmetric
        self.time_step = 0
        self.sess = sess
        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim, self.img_dim)
        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.saver = tf.train.Saver()

    def train(self):
        minibatch = self.replay_buffer.getBatch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])
        img_batch = np.asarray([data[5] for data in minibatch])
        next_img_batch = np.asarray([data[6] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch, next_img_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch,
                                                     next_img_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        critic_cost = self.critic_network.train(y_batch, state_batch, action_batch, img_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch, img_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients,
                                                         img_batch)
        self.actor_network.train(q_gradient_batch, state_batch, img_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()
        return critic_cost

    def save_network(self, step):
        self.saver.save(self.sess,
                        self.models_dir + self.env_name + '-network-ddpg.ckpt',
                        global_step=step)

    def load_network(self):
        checkpoint = tf.train.get_checkpoint_state(self.models_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")

    '''
    def action(self, state):
        action = self.actor_network.action(state)
        action[0][0] = np.clip(action[0][0], -1, 1)
        action[0][1] = np.clip(action[0][1], 0, 1)
        action[0][2] = np.clip(action[0][2], 0, 1)
        # print "Action:", action
        return action[0]

    def noise_action(self, state, epsilon):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        print action.shape
        print "Action_No_Noise:", action
        noise_t = np.zeros([1, self.action_dim])
        noise_t[0][0] = epsilon * self.OU.function(action[0][0], 0.0, 0.60, 0.80)
        noise_t[0][1] = epsilon * self.OU.function(action[0][1], 0.5, 1.00, 0.10)
        noise_t[0][2] = epsilon * self.OU.function(action[0][2], -0.1, 1.00, 0.05)
        action = action + noise_t
        action[0][0] = np.clip(action[0][0], -1, 1)
        action[0][1] = np.clip(action[0][1], 0, 1)
        action[0][2] = np.clip(action[0][2], 0, 1)
        print "Action_Noise:", action
        return action[0]
    '''

    def action(self, state, img):
        action = self.actor_network.action(state, img)
        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip(action[1], 0, 1)
        # action[2] = np.clip(action[2], 0, 1)
        return action

    def noise_action(self, state, epsilon, img):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state, img)
        noise_t = np.zeros(self.action_dim)
        if self.time_step < 100000:
            noise_t[0] = epsilon * ornstein_uhlenbeck_process(action[0], 0.0, 0.60, 0.80)
            # noise_t[1] = epsilon * ornstein_uhlenbeck_process(action[1], 0.5, 1.00, 0.10)
            # noise_t[2] = epsilon * ornstein_uhlenbeck_process(action[2], -0.1, 1.00, 0.05)
        elif self.time_step < 200000:
            if np.random.random() < 0.1:
                noise_t[0] = 0.1 * ornstein_uhlenbeck_process(action[0], 0.0, 0.60, 0.80)
        action = action + noise_t
        action[0] = np.clip(action[0], -1, 1)
        # action[1] = np.clip(action[1], 0, 1)
        # action[2] = np.clip(action[2], 0, 1)
        return action

    def perceive(self, state, action, reward, next_state, done, img, next_img):
        if not math.isnan(reward):
            self.replay_buffer.add(state, action, reward, next_state, done, img, next_img)
        self.time_step = self.time_step + 1

        # Once the buffer reaches the replay start size, train and return the critic cost
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            return self.train()
        else:
            return 0
class DDPG_TF:
    """docstring for DDPG"""

    def __init__(self, env, loadfilename=None, printVars=False):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.InteractiveSession()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        # print('init complete')

        # initialize replay buffer and exploration noise; these are used by
        # perceive() and noise_action() below but were never created in the
        # original __init__ (constructed here with the same globals the other
        # agents in this file use)
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)
        self.exploration_noise = OUNoise(self.action_dim)

        self.all_vars = tf.global_variables()
        if printVars:
            for v in self.all_vars:
                print(v.name.ljust(30), v.shape)
        self.saver = tf.train.Saver(self.all_vars)
        if loadfilename is not None:
            self.saver.restore(self.sess, loadfilename)
            # print('restore complete')

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor_network.action(state)

    def actions(self, states):
        return self.actor_network.actions_no_training(states)

    def target_actions(self, states):
        return self.actor_network.target_actions(states)

    def value(self, states):
        actions = self.actor_network.actions_no_training(states)
        return self.critic_network.q_value(states, actions)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Wait until the buffer reaches the replay start size, then start training
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
class DDPG:
    def __init__(self, env, replay_buffer, sample_batch, train_iter, gamma, tau,
                 batch_size, n_train, n_episode):
        # Gym environment
        self.env = env
        env_flattened = gym.wrappers.FlattenDictWrapper(
            env, dict_keys=['observation', 'achieved_goal', 'desired_goal'])

        # Get space sizes
        self.state_dim = env_flattened.observation_space.shape[0]
        # self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Get the replay buffer and a function that samples a batch from it
        self.replay_buffer = replay_buffer
        self.sample_batch = sample_batch

        self.sess = tf.InteractiveSession()

        # Hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.n_train = n_train
        self.n_episode = n_episode

        # Initialize networks
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim)
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim)

        self.exploration_noise = OUNoise(self.action_dim)

    def train(self):
        batch = self.sample_batch(self.batch_size)
        state_batch = np.asarray([data[0] for data in batch])
        action_batch = np.asarray([data[1] for data in batch])
        reward_batch = np.asarray([data[2] for data in batch])
        next_state_batch = np.asarray([data[3] for data in batch])
        done_batch = np.asarray([data[4] for data in batch])

        next_action_batch = self.actor.target_actions(next_state_batch)
        q_value_batch = self.critic.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(batch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.gamma * q_value_batch[i])
        y_batch = np.resize(y_batch, [self.batch_size, 1])

        # Update critic by minimizing the loss L
        self.critic.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor.actions(state_batch)
        q_gradient_batch = self.critic.gradients(state_batch, action_batch_for_gradients)
        self.actor.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor.update_target()
        self.critic.update_target()

    def noise_action(self, state):
        action = self.actor.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor.action(state)

    def reset_noise(self):
        self.exploration_noise.reset()

    def save_policy(self, save_path):
        self.actor.save_network(save_path)
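All of these variants share essentially the same driving interface: `noise_action` for exploratory rollouts, `action` for evaluation, and `perceive` to store a transition (training is triggered internally once the buffer passes REPLAY_START_SIZE). A minimal episode loop in that style is sketched below; the environment name and the EPISODES/STEPS budgets are placeholders, not values from the listing.

# Hypothetical driver loop for the perceive/noise_action interface used above.
import gym

env = gym.make('Pendulum-v0')   # placeholder continuous-control task
agent = DDPG(env)               # any of the variants with an (env) constructor

EPISODES, STEPS = 1000, 200     # placeholder budgets
for episode in range(EPISODES):
    state = env.reset()
    for step in range(STEPS):
        action = agent.noise_action(state)              # policy output + exploration noise
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)  # store, maybe train
        state = next_state
        if done:
            break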