def main():
    experiment = 'model-builder-v0'  # specify environments here
    env = gym.make(experiment)
    # steps = env.spec.timestep_limit  # steps per episode
    steps = 20
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")
            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
def main():
    experiment = 'InvertedPendulum-v1'  # specify environments here
    env = gym.make(experiment)
    steps = env.spec.timestep_limit  # steps per episode
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"
            observation, reward, done, info = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def main():
    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm, CA_OBS_SPACE, CA_ACTION_SPACE, CA_ACTION_BOUND)
    exploration_noise = OUNoise(CA_ACTION_SPACE)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = CA_OBS_SPACE
    num_actions = CA_ACTION_SPACE

    print "Number of States:", num_states
    print "Number of Actions:", num_actions
    print "Number of Steps per episode:", steps

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        print "==== Starting episode no:", i, "====", "\n"
        # observation = env.reset()
        observation = ca_reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # select action according to current policy and exploration noise
            print "Action at step", t, " :", action, "\n"
            # observation, reward, done, info = env.step(action)
            observation, reward, done, info = ca_step(action)
            print x, observation, action, reward, done

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                print "Printing reward to file"
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
def main():
    env = Env(19997)
    steps = 10000
    num_states = 59
    num_actions = 3

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(num_actions)
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    reward_st = np.array([0])

    agent.actor_net.load_actor(os.getcwd() + '/weights/actor/model.ckpt')
    agent.critic_net.load_critic(os.getcwd() + '/weights/critic/model.ckpt')

    for i in range(episodes):
        # print "==== Starting episode no:",i,"====","\n"
        observation = env.reset()
        done = False
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise  # select action according to current policy and exploration noise
            # clip each action dimension to [-1, 1]; use a separate index so the
            # episode counter i is not shadowed
            for j in range(num_actions):
                if action[j] > 1.0:
                    action[j] = 1.0
                if action[j] < -1.0:
                    action[j] = -1.0
            observation, reward, done = env.step(action)
            print("reward:", reward, "\n")
            agent.add_experience(x, observation, action, reward, done)

            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done or (t == steps - 1):
                print('Episode', i, 'Steps: ', t, 'Episode Reward:', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor(os.getcwd() + '/weights/actor/model.ckpt')
                agent.critic_net.save_critic(os.getcwd() + '/weights/critic/model.ckpt')
                break
        total_reward += reward_per_episode
def main():
    env = Env(19997)
    steps = 300

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(2)
    counter = 0
    reward_per_episode = 0.
    num_states = 32 * 16
    num_actions = 2

    # saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", str(i), "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            observation, reward, done = env.step(action, t)
            agent.add_experience(x, observation, action, reward, done)
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done:
                print('EPISODE: ', str(i), ' Steps: ', str(t), ' Total Reward: ', str(reward_per_episode))
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                agent.actor_net.save_actor('/home/lee/Projects/Tracking/RL/weights/actor/model.ckpt')
                agent.critic_net.save_critic('/home/lee/Projects/Tracking/RL/weights/critic/model.ckpt')
                print('\n\n')
                break
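# Every training loop above relies on an OUNoise helper with noise() and reset()
# methods, but none of the snippets define it. The class below is an assumed,
# minimal Ornstein-Uhlenbeck process sketch (mu, theta, sigma are illustrative
# defaults), not the exact implementation used by these repositories; note that
# some of the agents further down use a slightly different interface
# (e.g. sample() instead of noise(), or a seed argument).
import numpy as np


class OUNoise(object):
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # re-center the process at mu at the start of every episode
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(*np.shape(x))
        self.state = x + dx
        return self.state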
class DDPG_REC:

    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):
        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)

    def gene_actions(self, weight_batch):
        """use output of actor network to calculate action list
        Args:
            weight_batch: actor network outputs

        Returns:
            recommendation list
        """
        item_ids = list(self.item_space.keys())
        item_weights = list(self.item_space.values())
        max_ids = list()
        for weight in weight_batch:
            score = np.dot(item_weights, np.transpose(weight))
            idx = np.argmax(score, 0)
            max_ids.append([item_ids[_] for _ in idx])
        return max_ids

    # def gene_action(self, weight):
    #     """use output of actor network to calculate action list
    #     Args:
    #         weight: actor network outputs
    #
    #     Returns:
    #         recommendation list
    #     """
    #     item_ids = list(self.item_space.keys())
    #     item_weights = list(self.item_space.values())
    #     score = np.dot(item_weights, np.transpose(weight))
    #     idx = np.argmax(score)
    #     return item_ids[idx]

    @staticmethod
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.)
        tf.summary.scalar("max_q_value", episode_max_q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar("critic_loss", critic_loss)
        summary_vars = [episode_reward, episode_max_q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    def _train(self):
        samples = self.replay_buffer.sample_batch(self.batch_size)
        state_batch = np.asarray([_[0] for _ in samples])
        action_batch = np.asarray([_[1] for _ in samples])
        reward_batch = np.asarray([_[2] for _ in samples])
        n_state_batch = np.asarray([_[3] for _ in samples])
        done_batch = np.asarray([_[4] for _ in samples])
        seq_len_batch = np.asarray([self.state_item_num] * self.batch_size)

        # calculate predicted q value
        action_weights = self.actor.predict_target(state_batch, seq_len_batch)  # [batch_size,
        n_action_batch = self.gene_actions(action_weights.reshape((-1, self.action_item_num, self.emb_dim)))
        n_action_emb_batch = get_item_emb(n_action_batch, item_ids_emb_dict)
        target_q_batch = self.critic.predict_target(n_state_batch.reshape((-1, self.s_dim)),
                                                    n_action_emb_batch.reshape((-1, self.a_dim)),
                                                    seq_len_batch)
        y_batch = []
        for i in range(self.batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.critic.gamma * target_q_batch[i])

        # train critic
        q_value, critic_loss, _ = self.critic.train(state_batch, action_batch,
                                                    np.reshape(y_batch, (self.batch_size, 1)),
                                                    seq_len_batch)

        # train actor
        action_weight_batch_for_gradients = self.actor.predict(state_batch, seq_len_batch)
        action_batch_for_gradients = self.gene_actions(action_weight_batch_for_gradients)
        action_emb_batch_for_gradients = get_item_emb(action_batch_for_gradients, item_ids_emb_dict)
        a_gradient_batch = self.critic.action_gradients(state_batch,
                                                        action_emb_batch_for_gradients.reshape((-1, self.a_dim)),
                                                        seq_len_batch)
        self.actor.train(state_batch, a_gradient_batch[0], seq_len_batch)

        # update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(q_value), critic_loss

    def action(self, state):
        weight = self.actor.predict(np.reshape(state, [1, self.s_dim]), np.array([self.state_item_num])) + \
            self.exploration_noise.noise().reshape((1, self.action_item_num,
                                                    int(self.a_dim / self.action_item_num)))
        action = self.gene_actions(weight)
        return np.array(action[0])

    def perceive_and_train(self, state, action, reward, n_state, done):
        action_emb = get_item_emb(action, item_ids_emb_dict)
        self.replay_buffer.add(list(state.reshape((self.s_dim,))),
                               list(action_emb.reshape((self.a_dim,))),
                               [reward],
                               list(n_state.reshape((self.s_dim,))),
                               [done])

        # Store transitions to replay start size then start training
        ep_q_value_, critic_loss = 0, 0
        if self.replay_buffer.size() > self.batch_size:
            ep_q_value_, critic_loss = self._train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return ep_q_value_, critic_loss

    def write_summary(self, ep_reward, ep_q_value, loss, i):
        summary_str = self.sess.run(self.summary_ops, feed_dict={self.summary_vars[0]: ep_reward,
                                                                 self.summary_vars[1]: ep_q_value,
                                                                 self.summary_vars[2]: loss})
        self.writer.add_summary(summary_str, i)

    def save(self):
        self.writer.close()
        saver = tf.train.Saver()
        ckpt_path = os.path.join(os.path.dirname(__file__), "models")
        saver.save(self.sess, ckpt_path, write_meta_graph=False)
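# DDPG_REC above (and several agents below) depend on a ReplayBuffer class that is
# not shown here. The class below is an assumed, minimal deque-based sketch matching
# the add()/sample_batch()/size() interface used by DDPG_REC; other snippets in this
# collection use different method names (e.g. get_batch(), count(), popn()), so
# treat this only as an illustration of the idea, not the original implementation.
import random
from collections import deque


class ReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)

    def add(self, state, action, reward, next_state, done):
        # store one transition (s_t, a_t, r_t, s_{t+1}, done)
        self.buffer.append((state, action, reward, next_state, done))

    def sample_batch(self, batch_size):
        # uniform random minibatch; callers check size() before sampling
        return random.sample(self.buffer, batch_size)

    def size(self):
        return len(self.buffer)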
def main():
    experiment = 'InvertedPendulum-v1'
    env = gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor network and replay buffer
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]

    # saving reward:
    reward_st = np.array([0])

    for i in xrange(episodes):
        observation = env.reset()
        reward_per_episode = 0
        for t in xrange(steps):
            # rendering environment (optional)
            # env.render()
            x = observation
            # select action using actor network model
            action = agent.evaluate_actor(np.reshape(x, [num_actions, num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise
            print 'Agent.Action :', action
            print '\n'
            print '\n'
            observation, reward, done, _ = env.step(action)

            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            # train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1

            # check if episode ends:
            if done:
                print 'EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print '\n'
                print '\n'
                break
        total_reward += reward_per_episode
    print "Average reward per episode {}".format(total_reward / episodes)
class DDPG:

    def __init__(self, pretrain=False):

        # Make sure all the directories exist
        if not tf.gfile.Exists(TFLOG_PATH):
            tf.gfile.MakeDirs(TFLOG_PATH)
        if not tf.gfile.Exists(EXPERIENCE_PATH):
            tf.gfile.MakeDirs(EXPERIENCE_PATH)
        if not tf.gfile.Exists(NET_SAVE_PATH):
            tf.gfile.MakeDirs(NET_SAVE_PATH)

        # Initialize our session
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        # self.session = tf.Session()
        self.graph = self.session.graph

        with self.graph.as_default():

            # View the state batches
            # self.visualize_input = VISUALIZE_BUFFER
            # if self.visualize_input:
            #     self.viewer = CostmapVisualizer()

            # Hardcode input size and action size
            self.height = 662
            self.width = 1
            self.depth = 4
            self.action_dim = 2

            # Initialize the current action and the old action and old state for setting experiences
            self.old_state = np.zeros((self.width, self.height, self.depth), dtype='float32')
            self.old_action = np.ones(2, dtype='float32')
            self.network_action = np.zeros(2, dtype='float32')
            self.noise_action = np.zeros(2, dtype='float32')
            self.action = np.zeros(2, dtype='float32')

            # Initialize the grad inverter object to keep the action bounds
            self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session)

            # Make sure the directory for the data files exists
            if not tf.gfile.Exists(DATA_PATH):
                tf.gfile.MakeDirs(DATA_PATH)

            # Initialize summary writers to plot variables during training
            self.summary_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(TFLOG_PATH)

            # Initialize actor and critic networks
            self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session,
                                              self.summary_writer)
            self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session,
                                                self.summary_writer)

            # Initialize the saver to save the network params
            self.saver = tf.train.Saver()

            # initialize the experience data manager
            self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session)

            # Uncomment if collecting a buffer for the autoencoder
            # self.buffer = deque()

            # Should we load the pre-trained params?
            # If so: Load the full pre-trained net
            # Else: Initialize all variables then overwrite the conv layers with the pretrained filters
            if PRE_TRAINED_NETS:
                self.saver.restore(self.session, NET_LOAD_PATH)
            else:
                self.session.run(tf.initialize_all_variables())

            tf.train.start_queue_runners(sess=self.session)
            time.sleep(1)

            # Initialize a random process (the Ornstein-Uhlenbeck process) for action exploration
            self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA)
            self.noise_flag = True

            # Initialize time step
            self.training_step = 0

            # Flag: don't learn the first experience
            self.first_experience = True

            # After the graph has been filled add it to the summary writer
            self.summary_writer.add_graph(self.graph)

    def train(self):

        # Check if the buffer is big enough to start training
        if self.data_manager.enough_data():

            # start_ = time.time()
            # get the next random batch from the data manager
            state_batch, \
                action_batch, \
                reward_batch, \
                next_state_batch, \
                is_episode_finished_batch = self.data_manager.get_next_batch()

            state_batch = np.divide(state_batch, 10.0)
            next_state_batch = np.divide(next_state_batch, 10.0)

            # Are we visualizing the first state batch for debugging?
            # If so: We have to scale up the values for grey scale before plotting
            # if self.visualize_input:
            #     state_batch_np = np.asarray(state_batch)
            #     state_batch_np = np.multiply(state_batch_np, -100.0)
            #     state_batch_np = np.add(state_batch_np, 100.0)
            #     self.viewer.set_data(state_batch_np)
            #     self.viewer.run()
            #     self.visualize_input = False

            # Calculate y for the td_error of the critic
            # start = time.time()
            y_batch = []
            next_action_batch = self.actor_network.target_evaluate(next_state_batch, action_batch)
            q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward actor and critic time is: ", elapsed

            for i in range(0, BATCH_SIZE):
                if is_episode_finished_batch[i]:
                    y_batch.append([reward_batch[i]])
                else:
                    y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])

            # Now that we have the y batch lets train the critic
            # start = time.time()
            self.critic_network.train(y_batch, state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train critic time is: ", elapsed
            # self.critic_network.train(y_batch, state_batch, action_batch)

            # Get the action batch so we can calculate the action gradient with it
            # Then get the action gradient batch and adapt the gradient with the gradient inverting method
            # start = time.time()
            action_batch_for_gradients = self.actor_network.evaluate(state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "forward action after critic training time is: ", elapsed
            q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients)
            q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients)

            # Now we can train the actor
            # start = time.time()
            self.actor_network.train(q_gradient_batch, state_batch, action_batch)
            # done = time.time()
            # elapsed = done - start
            # print "train actor time is: ", elapsed

            # done = time.time()
            # elapsed = done - start_
            # print "====== total time is: ", elapsed

            # Save model if necessary
            if self.training_step > 0 and self.training_step % SAVE_STEP == 0:
                self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step)

            # Update time step
            self.training_step += 1
            if self.training_step % 400 == 0:
                print "iter: ", self.training_step

        # start_ = time.time()
        self.data_manager.check_for_enqueue()
        # done = time.time()
        # elapsed = done - start_
        # print "############ check enqueue time is: ", elapsed

    def get_action(self, state, old_action):

        # normalize the state
        state = state.astype(float)
        state = np.divide(state, 10.0)

        # Get the action
        self.action = self.actor_network.get_action(state, old_action)
        self.action = self.action.reshape((2,))

        # Are we using noise?
        if self.noise_flag:
            # scale noise down to 0 at training step 3000000
            self.action = 0.8 * self.exploration_noise.noise()
            # if self.training_step < MAX_NOISE_STEP:
            #     self.action += (MAX_NOISE_STEP - self.training_step) / \
            #         MAX_NOISE_STEP * self.exploration_noise.noise()

        # if action value lies outside of action bounds, rescale the action vector
        # if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]:
        #     self.action *= np.fabs(A0_BOUNDS[0] / self.action[0])
        # if self.action[1] < A0_BOUNDS[0] or self.action[1] > A0_BOUNDS[1]:
        #     self.action *= np.fabs(A1_BOUNDS[0] / self.action[1])

        # Live q value output for this action and state
        self.print_q_value(state, self.action)

        return self.action

    def set_experience(self, state, reward, is_episode_finished):

        # Make sure we're saving a new old_state for the first experience of every episode
        if self.first_experience:
            self.first_experience = False
        else:
            state.astype('float32')
            self.old_action.astype('float32')
            self.old_action.astype('float32')
            self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state,
                                                       is_episode_finished)

            # Uncomment if collecting data for the auto_encoder
            # experience = (self.old_state, self.old_action, reward, state, is_episode_finished)
            # self.buffer.append(experience)

        if is_episode_finished:
            self.first_experience = True
            self.exploration_noise.reset()

        # Save old state and old action for next experience
        self.old_state = state
        self.old_action = self.action

    def print_q_value(self, state, action):

        string = "-"
        q_value = self.critic_network.evaluate([state], [action])
        stroke_pos = 30 * q_value[0][0] + 30
        if stroke_pos < 0:
            stroke_pos = 0
        elif stroke_pos > 60:
            stroke_pos = 60
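# The agent above keeps its actions inside bounds with a GradInverter, i.e. the
# "inverting gradients" technique of Hausknecht & Stone (2016): a gradient that
# pushes an action toward a bound is scaled by the remaining headroom, and the
# scaling factor becomes negative (inverting the gradient) once the action lies
# outside the bound. The class below is an assumed NumPy sketch of that rule for
# the two action dimensions used here (A0_BOUNDS, A1_BOUNDS); it is not the
# TensorFlow GradInverter the repository actually ships.
import numpy as np


class GradInverterSketch(object):
    def __init__(self, a0_bounds, a1_bounds):
        self.p_min = np.array([a0_bounds[0], a1_bounds[0]], dtype=np.float32)
        self.p_max = np.array([a0_bounds[1], a1_bounds[1]], dtype=np.float32)
        self.p_range = self.p_max - self.p_min

    def invert(self, grad_batch, action_batch):
        grad_batch = np.asarray(grad_batch, dtype=np.float32)
        action_batch = np.asarray(action_batch, dtype=np.float32)
        increasing = grad_batch >= 0
        # headroom to the upper bound when the gradient increases the action,
        # headroom to the lower bound when it decreases the action
        scale_up = (self.p_max - action_batch) / self.p_range
        scale_down = (action_batch - self.p_min) / self.p_range
        return grad_batch * np.where(increasing, scale_up, scale_down)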
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Set the learning rate suggested by paper: https://pdfs.semanticscholar.org/71f2/03de1a53deae81a7707143f0ed564661e279.pdf
        self.actor_learning_rate = 0.001
        self.actor_decay = 0.0
        self.critic_learning_rate = 0.001
        self.critic_decay = 0.0

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_learning_rate, self.actor_decay)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                                  self.action_high, self.actor_learning_rate, self.actor_decay)

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_learning_rate, self.critic_decay)

        # initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.15
        # self.exploration_sigma = 0.2
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.02
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.best_w = None
        self.best_score = -np.inf
        # self.noise_scale = 0.7
        self.score = 0

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Indicate if we want to learn (or use to predict without learning)
        self.set_train(train)

    def reset_episode(self):
        self.total_reward = 0.0
        self.score = 0
        self.step_count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.step_count += 1
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.score = self.total_reward / float(self.step_count) if self.step_count else 0.0

        # Update the noise factor depending on the new score value
        if self.score >= self.best_score:
            self.best_score = self.score

        # Learn, if enough samples are available in memory
        if self.train and len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, done)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy"""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add noise for exploration

    def learn(self, experiences, done):
        """Update policy and value parameters using the given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        # rewards must stay floating point; casting to uint8 would destroy negative and fractional rewards
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_state = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        next_action = self.actor_target.model.predict_on_batch(next_state)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_state, next_action])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]),
                                      (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def set_train(self, train):
        self.train = train
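# A minimal sketch of how the Keras DDPG agent above is typically driven. `task`
# stands for any environment object exposing reset()/step() plus the state_size,
# action_size, action_low and action_high attributes the constructor reads; the
# three-value return of task.step() and the names num_episodes/max_steps are
# assumptions for illustration, not taken from the original code.
def run_training(task, num_episodes=500, max_steps=1000):
    agent = DDPG(task, train=True)
    for episode in range(num_episodes):
        state = agent.reset_episode()          # also resets the OU noise
        for _ in range(max_steps):
            action = agent.act(state)          # policy output plus exploration noise
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)  # store transition and learn
            state = next_state
            if done:
                break
        print("episode {}: avg reward per step = {:.3f}".format(episode, agent.score))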
class DDPG:

    def __init__(self, state_size, action_size, tau, lr_actor, lr_critic, num_agents,
                 agent_idx, seed, device, gamma, tensorboard_writer=None):
        self.state_size = state_size
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.num_agents = num_agents
        self.agent_idx = agent_idx
        self.seed = seed
        self.device = device
        self.gamma = gamma
        random.seed(seed)
        self.tensorboard_writer = tensorboard_writer

        self.actor_local = Actor(state_size, action_size, seed)
        self.actor_target = Actor(state_size, action_size, seed)

        critic_state_size = (state_size + action_size) * num_agents
        self.critic_local = Critic(critic_state_size, seed)
        self.critic_target = Critic(critic_state_size, seed)

        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)

        self.actor_optim = torch.optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optim = torch.optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        self.noise = OUNoise(action_size, seed)
        self.iteration = 0

    def to(self, device):
        self.actor_local.to(device)
        self.actor_target.to(device)
        self.critic_local.to(device)
        self.critic_target.to(device)
        return self

    def act(self, state, noise_scale, use_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if use_noise:
            action += self.noise.sample() * noise_scale
        return np.clip(action, -1, 1)

    def learn(self, experiences, all_curr_pred_actions, all_next_pred_actions):
        agent_idx_device = torch.tensor(self.agent_idx).to(self.device)
        states, actions, rewards, next_states, dones = experiences
        rewards = rewards.index_select(1, agent_idx_device)
        dones = dones.index_select(1, agent_idx_device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        batch_size = next_states.shape[0]
        actions_next = torch.cat(all_next_pred_actions, dim=1).to(self.device)
        next_states = next_states.reshape(batch_size, -1)
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        states = states.reshape(batch_size, -1)
        actions = actions.reshape(batch_size, -1)
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets.detach())
        # Minimize the loss
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optim.zero_grad()
        predicted_actions = torch.cat([action if idx == self.agent_idx
                                       else action.detach()
                                       for idx, action in enumerate(all_curr_pred_actions)],
                                      dim=1).to(self.device)
        actor_loss = -self.critic_local(states, predicted_actions).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optim.step()

        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()

        if self.tensorboard_writer is not None:
            self.tensorboard_writer.add_scalar("agent{}/actor_loss".format(self.agent_idx), al, self.iteration)
            self.tensorboard_writer.add_scalar("agent{}/critic_loss".format(self.agent_idx), cl, self.iteration)
            self.tensorboard_writer.file_writer.flush()

        self.iteration += 1

        # ----------------------- update target networks ----------------------- #
        soft_update(self.critic_target, self.critic_local, self.tau)
        soft_update(self.actor_target, self.actor_local, self.tau)

    def reset(self):
        self.noise.reset()
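# The PyTorch agent above calls hard_update() and soft_update() helpers that are
# defined elsewhere in its repository. The functions below are an assumed sketch of
# the conventional implementations, matching the argument order used above:
# hard_update(local, target) copies weights exactly, and soft_update(target, local,
# tau) applies Polyak averaging theta_target <- tau * theta_local + (1 - tau) * theta_target.
def hard_update(source_model, target_model):
    # copy parameters from the source network into the target network
    for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(target_model, source_model, tau):
    # blend the source parameters into the target parameters with factor tau
    for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)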
class PilotNode(object): def __init__(self, model, logfolder): print('initialize pilot node') np.random.seed(FLAGS.random_seed) tf.set_random_seed(FLAGS.random_seed) # Initialize replay memory self.logfolder = logfolder self.world_name = '' self.logfile = logfolder + '/tensorflow_log' self.run = 0 self.run_eva = 0 self.maxy = -10 self.speed = FLAGS.speed self.accumlosses = {} self.current_distance = 0 self.furthest_point = 0 self.average_distance = 0 self.average_distance_eva = 0 self.last_pose = [] self.model = model self.ready = False self.finished = True self.target_control = [] self.target_depth = [] self.target_odom = [] self.aux_depth = [] self.aux_odom = [] self.odom_error = [] self.prev_control = [0] self.nfc_images = [ ] #used by n_fc networks for building up concatenated frames self.nfc_poses = [] #used by n_fc networks for calculating odometry rospy.init_node('pilot', anonymous=True) self.exploration_noise = OUNoise(4, 0, FLAGS.ou_theta, 1) self.state = [] # self.delay_evaluation = 5 #can't be set by ros because node is started before ros is started... if FLAGS.show_depth: self.depth_pub = rospy.Publisher('/depth_prediction', numpy_msg(Floats), queue_size=1) if FLAGS.show_odom: self.odom_pub = rospy.Publisher('/odom_prediction', numpy_msg(Floats), queue_size=1) # if FLAGS.off_policy: # self.action_pub = rospy.Publisher('/supervised_vel', Twist, queue_size=1) # if rospy.has_param('control'): # rospy.Subscriber(rospy.get_param('control'), Twist, self.supervised_callback) if FLAGS.real or FLAGS.off_policy: self.action_pub = rospy.Publisher('/pilot_vel', Twist, queue_size=1) else: rospy.Subscriber('/supervised_vel', Twist, self.supervised_callback) if rospy.has_param('control'): self.action_pub = rospy.Publisher(rospy.get_param('control'), Twist, queue_size=1) if rospy.has_param('ready'): rospy.Subscriber(rospy.get_param('ready'), Empty, self.ready_callback) if rospy.has_param('finished'): rospy.Subscriber(rospy.get_param('finished'), Empty, self.finished_callback) if rospy.has_param('rgb_image') and not FLAGS.depth_input: rospy.Subscriber(rospy.get_param('rgb_image'), Image, self.image_callback) if rospy.has_param('depth_image'): if FLAGS.depth_input or FLAGS.auxiliary_depth or FLAGS.rl: rospy.Subscriber(rospy.get_param('depth_image'), Image, self.depth_callback) if FLAGS.recovery_cameras: # callbacks={'left':{'30':image_callback_left_30,'60':image_callback_left_60},'right':{'30':image_callback_right_30,'60':image_callback_right_60}} # callbacks_depth={'left':{'30':depth_callback_left_30,'60':depth_callback_left_60},'right':{'30':depth_callback_right_30,'60':depth_callback_right_60}} self.recovery_images = {} for d in ['left', 'right']: self.recovery_images[d] = {} for c in ['30', '60']: self.recovery_images[d][c] = {} self.recovery_images[d][c]['rgb'] = [] self.recovery_images[d][c]['depth'] = [] rospy.Subscriber( re.sub(r"kinect", "kinect_" + d + "_" + c, rospy.get_param('rgb_image')), Image, self.image_callback_recovery, (d, c)) rospy.Subscriber( re.sub(r"kinect", "kinect_" + d + "_" + c, rospy.get_param('depth_image')), Image, self.depth_callback_recovery, (d, c)) if not FLAGS.real: # in simulation self.replay_buffer = ReplayBuffer(FLAGS.buffer_size, FLAGS.random_seed) self.accumloss = 0 rospy.Subscriber('/ground_truth/state', Odometry, self.gt_callback) def ready_callback(self, msg): if not self.ready and self.finished: print('Neural control activated.') self.ready = True self.start_time = rospy.get_time() self.finished = False self.exploration_noise.reset() self.speed = 
FLAGS.speed + ( not FLAGS.evaluate) * np.random.uniform( -FLAGS.sigma_x, FLAGS.sigma_x) if rospy.has_param('evaluate') and not FLAGS.real: # FLAGS.evaluate = False FLAGS.evaluate = rospy.get_param('evaluate') print '--> set evaluate to: ', FLAGS.evaluate # if FLAGS.lstm: # self.state=self.model.get_init_state(True) # print 'set state to: ', self.state if rospy.has_param('world_name'): self.world_name = os.path.basename( rospy.get_param('world_name').split('.')[0]) if 'sandbox' in self.world_name: self.world_name = 'sandbox' def gt_callback(self, data): if not self.ready: return # Keep track of positions for logging current_pos = [ data.pose.pose.position.x, data.pose.pose.position.y, data.pose.pose.position.z ] if len(self.last_pose) != 0: self.current_distance += np.sqrt( (self.last_pose[0, 3] - current_pos[0])**2 + (self.last_pose[1, 3] - current_pos[1])**2) self.furthest_point = max([ self.furthest_point, np.sqrt(current_pos[0]**2 + current_pos[1]**2) ]) # Get pose (rotation and translation) for odometry quaternion = (data.pose.pose.orientation.x, data.pose.pose.orientation.y, data.pose.pose.orientation.z, data.pose.pose.orientation.w) self.last_pose = transformations.quaternion_matrix( quaternion ) # orientation of current frame relative to global frame self.last_pose[0:3, 3] = current_pos def process_rgb(self, msg): # self.time_1 = time.time() # if not self.ready or self.finished or (rospy.get_time()-self.start_time) < self.delay_evaluation: return if not self.ready or self.finished: return [] try: # Convert your ROS Image message to OpenCV2 im = bridge.imgmsg_to_cv2( msg, 'rgb8' ) # changed to normal RGB order as i ll use matplotlib and PIL instead of opencv # an idea could be to swap these channels during online training as this shouldnt matter though this could # explain the performance drop coming from a pretrained network. # This does mean that online trained nets might be worth nothing... # im = bridge.imgmsg_to_cv2(msg, 'bgr8') except CvBridgeError as e: print(e) else: # self.time_2 = time.time() size = self.model.input_size[1:] im = sm.imresize(im, tuple(size), 'nearest') # im = im*1/255. # Basic preprocessing: center + make 1 standard deviation # im -= FLAGS.mean # im = im*1/FLAGS.std return im def process_depth(self, msg): # if not self.ready or self.finished or (rospy.get_time()-self.start_time) < self.delay_evaluation: return if not self.ready or self.finished: return [] try: # Convert your ROS Image message to OpenCV2 im = bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough' ) #gets float of 32FC1 depth image except CvBridgeError as e: print(e) else: im = im[::8, ::8] shp = im.shape # assume that when value is not a number it is due to a too large distance # values can be nan for when they are closer than 0.5m but than the evaluate node should # kill the run anyway. 
im = np.asarray([ e * 1.0 if not np.isnan(e) else 5 for e in im.flatten() ]).reshape(shp) # clipping nans: dur: 0.010 # print 'min: ',np.amin(im),' and max: ',np.amax(im) # im=np.asarray([ e*1.0 if not np.isnan(e) else 0 for e in im.flatten()]).reshape(shp) # clipping nans: dur: 0.010 # Resize image if FLAGS.auxiliary_depth or FLAGS.rl: size = self.model.depth_input_size #(55,74) im = sm.imresize(im, size, 'nearest') # dur: 0.002 # cv2.imshow('depth', im) # dur: 0.002 if FLAGS.depth_input: size = (self.model.input_size[1], self.model.input_size[1]) im = sm.imresize(im, size, 'nearest') # dur: 0.009 im = im[im.shape[0] / 2, :] # cv2.imshow('depth', im.reshape(1,im.shape[0])) # dur: 0.002 # cv2.waitKey(2) im = im * 1 / 255. * 5. # dur: 0.00004 return im def image_callback(self, msg): im = self.process_rgb(msg) if len(im) != 0: if FLAGS.n_fc: self.nfc_images.append(im) self.nfc_poses.append(copy.deepcopy(self.last_pose)) if len(self.nfc_images) < FLAGS.n_frames: # print('filling concatenated frames: ',len(self.nfc_images)) return else: # concatenate last n-frames im = np.concatenate(np.asarray( self.nfc_images[-FLAGS.n_frames:]), axis=2) self.nfc_images = self.nfc_images[ -FLAGS.n_frames + 1:] # concatenate last n-1-frames self.nfc_poses.pop(0) #get rid of the first one assert len(self.nfc_poses) == FLAGS.n_frames - 1 # # calculate target odometry from previous global pose and current global pose # euler = transformations.euler_from_matrix(self.nfc_poses[1], 'rxyz') # # print 'current: ',str(euler[2]),str(self.nfc_poses[1][0,3]),str(self.nfc_poses[1][1,3]) # i_T_pg = transformations.inverse_matrix(self.nfc_poses[0]) # euler = transformations.euler_from_matrix(i_T_pg, 'rxyz') # # print 'inverse prev: ',str(euler[2]), str(i_T_pg[0,3]),str(i_T_pg[1,3]) # T_cp = transformations.concatenate_matrices(i_T_pg, self.nfc_poses[1]) # r,p,yw = transformations.euler_from_matrix(T_cp, 'rxyz') # x,y,z = T_cp[0:3,3] # self.target_odom = [x,y,z,r,p,yw] # print 'odom: ',str(self.target_odom[5]),str(self.target_odom[0]),str(self.target_odom[1]) # self.target_odom = [self.nfc_poses[1][i]-self.nfc_poses[0][i] for i in range(len(self.nfc_poses[0]))] # print 'Target odometry: ', self.target_odom self.process_input(im) def image_callback_recovery(self, msg, args): im = self.process_rgb(msg) if len(im) == 0: return trgt = -100. if FLAGS.auxiliary_depth and len( self.recovery_images[args[0]][args[1]]['depth']) == 0: print("No target depth: {0} {1}".format(args[0], args[1])) return else: trgt_depth = copy.deepcopy( self.recovery_images[args[0]][args[1]]['depth']) if len(self.target_control) == 0: print("No target control: {0} {1}".format(args[0], args[1])) return else: # left ==> -1, right ==> +1, 30dg ==> 0.5, 60dg ==> 1.0 compensation = -(args[0] == 'left') * int( args[1]) / 60. + (args[0] == 'right') * int(args[1]) / 60. trgt = compensation + self.target_control[5] if FLAGS.experience_replay and not FLAGS.evaluate and trgt != -100: if FLAGS.auxiliary_depth: print('added experience of camera: {0} {1} with control {2}'. 
format(args[0], args[1], trgt)) self.replay_buffer.add(im, [trgt], [trgt_depth]) else: self.replay_buffer.add(im, [trgt]) def depth_callback(self, msg): im = self.process_depth(msg) if len(im) != 0: if FLAGS.auxiliary_depth or FLAGS.rl: self.target_depth = im #(64,) if FLAGS.depth_input: if FLAGS.network == 'nfc_control': self.nfc_images.append(im) if len(self.nfc_images) < 4: # print('filling concatenated frames: ',len(self.nfc_images)) return else: # print np.asarray(self.nfc_images).shape im = np.concatenate(np.asarray(self.nfc_images)) # print im.shape self.nfc_images.pop(0) self.process_input(im) def depth_callback_recovery(self, msg, args): im = self.process_depth(msg) self.recovery_images[args[0]][args[1]]['depth'] = im def process_input(self, im): self.time_3 = time.time() trgt = -100. # if self.target_control == None or FLAGS.evaluate: if FLAGS.evaluate: ### EVALUATE trgt_depth = [] trgt_odom = [] with_loss = False if len( self.target_control ) != 0 and not FLAGS.auxiliary_depth and not FLAGS.auxiliary_odom: trgt = self.target_control[5] with_loss = True elif len(self.target_control ) != 0 and FLAGS.auxiliary_depth and len( self.target_depth) != 0 and not FLAGS.auxiliary_odom: trgt = self.target_control[5] trgt_depth = [copy.deepcopy(self.target_depth)] with_loss = True elif len( self.target_control ) != 0 and not FLAGS.auxiliary_depth and FLAGS.auxiliary_odom and len( self.target_odom) != 0: trgt = self.target_control[5] trgt_odom = [copy.deepcopy(self.target_odom)] with_loss = True elif len( self.target_control ) != 0 and FLAGS.auxiliary_depth and len( self.target_depth) != 0 and FLAGS.auxiliary_odom and len( self.target_odom) != 0: trgt = self.target_control[5] trgt_odom = [copy.deepcopy(self.target_odom)] trgt_depth = [copy.deepcopy(self.target_depth)] with_loss = True if with_loss and False: # for now skip calculating accumulated loses. prev_ctr = [[self.prev_control[0]]] control, self.state, losses, aux_results = self.model.forward( [[im]] if FLAGS.lstm else [im], states=self.state, auxdepth=FLAGS.show_depth, auxodom=FLAGS.show_odom, prev_action=prev_ctr, targets=[[trgt]], target_depth=trgt_depth, target_odom=trgt_odom) if len(self.accumlosses.keys()) == 0: self.accumlosses = losses else: # self.accumlosses=[self.accumlosses[i]+losses[i] for i in range(len(losses))] for v in losses.keys(): self.accumlosses[v] = self.accumlosses[v] + losses[v] else: prev_ctr = [[self.prev_control[0]]] control, self.state, losses, aux_results = self.model.forward( [[im]] if FLAGS.lstm else [im], states=self.state, auxdepth=FLAGS.show_depth, auxodom=FLAGS.show_odom, prev_action=prev_ctr) if FLAGS.show_depth and FLAGS.auxiliary_depth and len( aux_results) > 0: self.aux_depth = aux_results['depth'] if FLAGS.show_odom and FLAGS.auxiliary_odom and len( aux_results) > 0: self.aux_odom = aux_results['odom'] else: ###TRAINING # Get necessary labels, if label is missing wait... if len(self.target_control) == 0: print('No target control') return else: trgt = self.target_control[5] # print(trgt) if (FLAGS.auxiliary_depth or FLAGS.rl) and len( self.target_depth) == 0: print('No target depth') return else: trgt_depth = copy.deepcopy(self.target_depth) # self.target_depth = [] if FLAGS.auxiliary_odom and (len(self.target_odom) == 0 or len(self.prev_control) == 0): print('no target odometry or previous control') return else: trgt_odom = copy.deepcopy(self.target_odom) # check if depth image corresponds to rgb image # cv2.imshow('rgb', im) # cv2.waitKey(2) # cv2.imshow('depth', trgt_depth*1/5.) 
# cv2.waitKey(2) # ---------------------------------------------------------- DEPRECATED # if not FLAGS.experience_replay: ### TRAINING WITHOUT EXPERIENCE REPLAY # if FLAGS.auxiliary_depth: # control, losses = self.model.backward([im],[[trgt]], [[[trgt_depth]]]) # else: # control, losses = self.model.backward([im],[[trgt]]) # print 'Difference: '+str(control[0,0])+' and '+str(trgt)+'='+str(abs(control[0,0]-trgt)) # self.accumlosses += losses[0] # else: ### TRAINING WITH EXPERIENCE REPLAY # wait for first target depth in case of auxiliary depth. # in case the network can predict the depth self.time_4 = time.time() prev_ctr = [[self.prev_control[0]]] control, self.state, losses, aux_results = self.model.forward( [[im]] if FLAGS.lstm else [im], states=self.state, auxdepth=FLAGS.show_depth, auxodom=FLAGS.show_odom, prev_action=prev_ctr) if FLAGS.show_depth and FLAGS.auxiliary_depth: self.aux_depth = aux_results['depth'] if FLAGS.show_odom and FLAGS.auxiliary_odom: self.aux_odom = aux_results['odom'] self.time_5 = time.time() # print 'state: ', self.state ### SEND CONTROL noise = self.exploration_noise.noise() # yaw = control[0,0] # if np.random.binomial(1,FLAGS.epsilon) and not FLAGS.evaluate: # yaw = max(-1,min(1,np.random.normal())) if trgt != 100 and not FLAGS.evaluate: action = trgt if np.random.binomial(1, FLAGS.alpha** self.run) else control[0, 0] else: action = control[0, 0] msg = Twist() if FLAGS.type_of_noise == 'ou': msg.linear.x = self.speed #0.8 # 1.8 # # msg.linear.x = FLAGS.speed+(not FLAGS.evaluate)*FLAGS.sigma_x*noise[0] #0.8 # 1.8 # msg.linear.y = (not FLAGS.evaluate) * noise[1] * FLAGS.sigma_y msg.linear.z = (not FLAGS.evaluate) * noise[2] * FLAGS.sigma_z msg.angular.z = max( -1, min(1, action + (not FLAGS.evaluate) * FLAGS.sigma_yaw * noise[3])) elif FLAGS.type_of_noise == 'uni': msg.linear.x = self.speed # msg.linear.x = FLAGS.speed + (not FLAGS.evaluate)*np.random.uniform(-FLAGS.sigma_x, FLAGS.sigma_x) msg.linear.y = (not FLAGS.evaluate) * np.random.uniform( -FLAGS.sigma_y, FLAGS.sigma_y) msg.linear.z = (not FLAGS.evaluate) * np.random.uniform( -FLAGS.sigma_z, FLAGS.sigma_z) msg.angular.z = max( -1, min( 1, action + (not FLAGS.evaluate) * np.random.uniform(-FLAGS.sigma_yaw, FLAGS.sigma_yaw))) else: raise IOError('Type of noise is unknown: {}'.format( FLAGS.type_of_noise)) self.action_pub.publish(msg) self.prev_control = [msg.angular.z] self.time_6 = time.time() if FLAGS.show_depth and len(self.aux_depth) != 0 and not self.finished: # print('shape aux depth: {}'.format(self.aux_depth.shape)) self.aux_depth = self.aux_depth.flatten() self.depth_pub.publish(self.aux_depth) self.aux_depth = [] if FLAGS.show_odom and len(self.aux_odom) != 0 and not self.finished: # trgt_odom = [copy.deepcopy(self.target_odom)] # final_img = cv2.hconcat((im[:,:,0:3], im[:,:,3:6],im[:,:,6:])) # final_img = cv2.hconcat((im[:,:,[2,1,0]], im[:,:,[5,4,3]],im[:,:,[8,7,6]])) # print trgt_odom # cv2.imshow('Final', final_img) # cv2.waitKey(100) # cv2.destroyAllWindows() concat_odoms = np.concatenate( (self.aux_odom.astype(np.float32).flatten(), np.array(trgt_odom).astype(np.float32).flatten())) # self.odom_pub.publish(self.aux_odom.flatten()) # print concat_odoms[4:6],' and ',concat_odoms[0:2] self.odom_pub.publish(concat_odoms.astype(np.float32)) # self.odom_error.append(np.abs(np.array(trgt_odom).flatten()-self.aux_odom.flatten())) self.aux_odom = [] # ADD EXPERIENCE REPLAY if FLAGS.experience_replay and not FLAGS.evaluate and trgt != -100: aux_info = {} if FLAGS.auxiliary_depth or FLAGS.rl: 
aux_info['target_depth'] = trgt_depth if FLAGS.auxiliary_odom: # print trgt_odom # print 'target odom ',trgt_odom aux_info['target_odom'] = trgt_odom aux_info['prev_action'] = prev_ctr if FLAGS.lstm: # aux_info['state']=(np.zeros(())) # state type: <type 'tuple'> len: 2 len sub: 2 len subsub: 1 len subsubsub: 100 aux_info['state'] = self.state # aux_info['state']=((np.zeros((1,100)),np.zeros((1,100))+10),(np.ones((1,100)),np.ones((1,100))+20)) # print aux_info['state'] # (state layer0,output layer0,state layer1,output layer1) # print 'state type: ',type(aux_info['state']),' len: ', len(aux_info['state']),' len sub: ', len(aux_info['state'][0]),' len subsub: ', len(aux_info['state'][0][0]),' len subsubsub: ', len(self.state[0][0][0]) self.replay_buffer.add(im, [trgt], aux_info=aux_info) self.time_7 = time.time() if FLAGS.save_input: self.depthfile = open(self.logfolder + '/depth_input', 'a') np.set_printoptions(precision=5) message = "{0} : {1} : {2:.4f} \n".format( self.run, ' '.join('{0:.5f}'.format(k) for k in np.asarray(im)), trgt) self.depthfile.write(message) self.depthfile.close() self.time_8 = time.time() # print 'processed image @: {0:.2f}'.format(time.time()) # print("Time debugging: \n cvbridge: {0} , \n resize: {1}, \n copy: {2} , \n net pred: {3}, \n pub: {4},\n exp buf: {5},\n pos file: {6} s".format((self.time_2-self.time_1), # (self.time_3-self.time_2),(self.time_4-self.time_3),(self.time_5-self.time_4),(self.time_6-self.time_5),(self.time_7-self.time_6),(self.time_8-self.time_7))) # Delay values with auxiliary depth (at the beginning of training) # cv bridge (RGB): 0.0003s # resize (RGB): 0.0015s # copy control+depth: 2.7e-5 s # net prediction: 0.011s # publication: 0.0002s # fill experience buffer: 1.8e-5 s # write position: 2.1e-6 s def supervised_callback(self, data): if not self.ready: return self.target_control = [ data.linear.x, data.linear.y, data.linear.z, data.angular.x, data.angular.y, data.angular.z ] def finished_callback(self, msg): if self.ready and not self.finished: # self.depth_pub.publish(self.aux_depth) print('neural control deactivated.') self.ready = False self.finished = True # Train model from experience replay: # Train the model with batchnormalization out of the image callback loop activation_images = [] depth_predictions = [] endpoint_activations = [] tloss = [] #total loss closs = [] #control loss dloss = [] #depth loss oloss = [] #odometry loss qloss = [] #RL cost-to-go loss tlossm, clossm, dlossm, olossm, qlossm, tlossm_eva, clossm_eva, dlossm_eva, olossm_eva, qlossm_eva = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 #tot_batch_loss = [] if FLAGS.experience_replay and self.replay_buffer.size() > ( FLAGS.batch_size if not FLAGS.lstm else FLAGS.batch_size * FLAGS.num_steps) and not FLAGS.evaluate: for b in range( min(int(self.replay_buffer.size() / FLAGS.batch_size), 10)): inputs, targets, aux_info = self.replay_buffer.sample_batch( FLAGS.batch_size) # import pdb; pdb.set_trace() #print('time to smaple batch of images: ',time.time()-st) if b == 0: if FLAGS.plot_activations: activation_images = self.model.plot_activations( inputs, targets.reshape((-1, 1))) if FLAGS.plot_depth and FLAGS.auxiliary_depth: depth_predictions = self.model.plot_depth( inputs, aux_info['target_depth'].reshape(-1, 55, 74)) if FLAGS.plot_histograms: endpoint_activations = self.model.get_endpoint_activations( inputs) init_state = [] depth_targets = [] odom_targets = [] prev_action = [] if FLAGS.lstm: init_state = (aux_info['state'][:, 0, 0, 0, 0, :], aux_info['state'][:, 0, 0, 1, 0, :], 
aux_info['state'][:, 0, 1, 0, 0, :], aux_info['state'][:, 0, 1, 1, 0, :]) # if FLAGS.use_init_state: # init_state= assert init_state[0].shape[0] == FLAGS.batch_size # print 'init_state sizes ',init_state[0].shape if FLAGS.auxiliary_depth or FLAGS.rl: depth_targets = aux_info['target_depth'].reshape( -1, 55, 74) # depth_targets=aux_info['target_depth'].reshape(-1,55,74) if not FLAGS.lstm else aux_info['target_depth'].reshape(-1,FLAGS.num_steps, 55,74) if FLAGS.auxiliary_odom: odom_targets = aux_info['target_odom'].reshape( -1, 4) if not FLAGS.lstm else aux_info[ 'target_odom'].reshape(-1, FLAGS.num_steps, 4) # odom_targets=aux_info['target_odom'].reshape(-1,6) if not FLAGS.lstm else aux_info['target_odom'].reshape(-1,FLAGS.num_steps, 6) prev_action = aux_info['prev_action'].reshape( -1, 1 ) #if not FLAGS.lstm else aux_info['prev_action'].reshape(-1,FLAGS.num_steps, 1) # todo add initial state for each rollout in the batch controls, losses = self.model.backward( inputs, init_state, targets[:].reshape(-1, 1), depth_targets, odom_targets, prev_action) tloss = losses['t'] if not FLAGS.rl or FLAGS.auxiliary_ctr: closs = losses['c'] if FLAGS.auxiliary_depth: dloss.append(losses['d']) if FLAGS.auxiliary_odom: oloss.append(losses['o']) if FLAGS.rl: qloss.append(losses['q']) tlossm = np.mean(tloss) clossm = np.mean( closs) if not FLAGS.rl or FLAGS.auxiliary_ctr else 0 dlossm = np.mean(dloss) if FLAGS.auxiliary_depth else 0 olossm = np.mean(oloss) if FLAGS.auxiliary_odom else 0 qlossm = np.mean(qloss) if FLAGS.rl else 0 else: print('Evaluating or filling buffer or no experience_replay: ', self.replay_buffer.size()) if 't' in self.accumlosses.keys(): tlossm_eva = self.accumlosses['t'] if 'c' in self.accumlosses.keys(): clossm_eva = self.accumlosses['c'] if 'd' in self.accumlosses.keys(): dlossm_eva = self.accumlosses['d'] if 'o' in self.accumlosses.keys(): olossm_eva = self.accumlosses['o'] if 'q' in self.accumlosses.keys(): qlossm_eva = self.accumlosses['q'] if not FLAGS.evaluate: self.average_distance = self.average_distance - self.average_distance / ( self.run + 1) self.average_distance = self.average_distance + self.current_distance / ( self.run + 1) else: self.average_distance_eva = self.average_distance_eva - self.average_distance_eva / ( self.run_eva + 1) self.average_distance_eva = self.average_distance_eva + self.current_distance / ( self.run_eva + 1) odom_errx, odom_erry, odom_errz, odom_erryaw = 0, 0, 0, 0 if len(self.odom_error) != 0: odom_errx = np.mean([e[0] for e in self.odom_error]) odom_erry = np.mean([e[1] for e in self.odom_error]) odom_errz = np.mean([e[2] for e in self.odom_error]) odom_erryaw = np.mean([e[3] for e in self.odom_error]) try: sumvar = {} # sumvar={k : 0 for k in self.model.summary_vars.keys()} sumvar["Distance_current_" + self.world_name if len(self.world_name) != 0 else "Distance_current"] = self.current_distance sumvar["Distance_furthest_" + self.world_name if len(self.world_name) != 0 else "Distance_furthest"] = self.furthest_point if FLAGS.evaluate: sumvar["Distance_average_eva"] = self.average_distance_eva else: sumvar["Distance_average"] = self.average_distance if tlossm != 0: sumvar["Loss_total"] = tlossm if clossm != 0: sumvar["Loss_control"] = clossm if dlossm != 0: sumvar["Loss_depth"] = dlossm if olossm != 0: sumvar["Loss_odom"] = olossm if qlossm != 0: sumvar["Loss_q"] = qlossm if tlossm_eva != 0: sumvar["Loss_total_eva"] = tlossm_eva if clossm_eva != 0: sumvar["Loss_control_eva"] = clossm_eva if dlossm_eva != 0: sumvar["Loss_depth_eva"] = dlossm_eva if 
olossm_eva != 0: sumvar["Loss_odom_eva"] = olossm_eva if qlossm_eva != 0: sumvar["Loss_q_eva"] = qlossm_eva if odom_errx != 0: sumvar["odom_errx"] = odom_errx if odom_erry != 0: sumvar["odom_erry"] = odom_erry if odom_errz != 0: sumvar["odom_errz"] = odom_errz if odom_erryaw != 0: sumvar["odom_erryaw"] = odom_erryaw if FLAGS.plot_activations and len(activation_images) != 0: sumvar["conv_activations"] = activation_images # sumvar.append(activation_images) if FLAGS.plot_depth and FLAGS.auxiliary_depth: sumvar["depth_predictions"] = depth_predictions # sumvar.append(depth_predictions) if FLAGS.plot_histograms: for i, ep in enumerate(self.model.endpoints): sumvar['activations_{}'.format( ep)] = endpoint_activations[i] # sumvar.extend(endpoint_activations) self.model.summarize(sumvar) except Exception as e: print('failed to write', e) pass else: print( '{0}: control finished {1}:[ current_distance: {2:0.3f}, average_distance: {3:0.3f}, furthest point: {4:0.1f}, total loss: {5:0.3f}, control loss: {6:0.3e}, depth loss: {7:0.3e}, odom loss: {8:0.3e}, q loss: {9:0.3e}, world: {10}' .format( time.strftime('%H:%M'), self.run if not FLAGS.evaluate else self.run_eva, self.current_distance, self.average_distance if not FLAGS.evaluate else self.average_distance_eva, self.furthest_point, tlossm if not FLAGS.evaluate else tlossm_eva, clossm if not FLAGS.evaluate else clossm_eva, dlossm if not FLAGS.evaluate else dlossm_eva, olossm if not FLAGS.evaluate else olossm_eva, qlossm if not FLAGS.evaluate else qlossm_eva, self.world_name)) l_file = open(self.logfile, 'a') tag = 'train' if FLAGS.evaluate: tag = 'val' l_file.write( '{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}\n'.format( self.run if not FLAGS.evaluate else self.run_eva, self.current_distance, self.average_distance if not FLAGS.evaluate else self.average_distance_eva, self.furthest_point, tlossm, clossm, dlossm, olossm, qlossm, tag, self.world_name)) l_file.close() self.accumlosses = {} self.maxy = -10 self.current_distance = 0 self.last_pose = [] self.nfc_images = [] self.nfc_poses = [] self.furthest_point = 0 if FLAGS.lstm and not FLAGS.evaluate: self.replay_buffer.new_run() self.world_name = '' if self.run % 10 == 0 and not FLAGS.evaluate: # Save a checkpoint every 20 runs. self.model.save(self.logfolder) self.state = [] if not FLAGS.evaluate: self.run += 1 else: self.run_eva += 1 # wait for gzserver to be killed gzservercount = 1 while gzservercount > 0: #print('gzserver: ',gzservercount) gzservercount = os.popen("ps -Af").read().count('gzserver') time.sleep(0.1) sys.stdout.flush()
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) self.critic_network = CriticNetwork( state_size=environment.observation_space.shape[0], action_size=environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self, observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer, BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch, [BATCH_SIZE, 1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate( next_state_batch) q_value_batch = self.critic_network.target_evaluate( next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) / BATCH_SIZE self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action + self.exploration_noise.noise(), self.environment.action_space.low, self.environment.action_space.high) def set_feedback(self, observation, action, reward, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append( (self.state, action, reward, next_state, done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
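This class, like every other variant in this collection, draws its exploration noise from an OUNoise object that is reset at the end of each episode, but the class itself is never shown. A minimal Ornstein-Uhlenbeck sketch follows; the parameter values (mu, theta, sigma) are assumptions, not taken from any of the snippets:

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.reset()

    def reset(self):
        # start every episode from the long-run mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state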
class MaDDPG: def __init__(self, num_agents, state_dim, action_dim): # track training times self.time_step = 0 # use set session use GPU #self.sess = tf.InteractiveSession() self.sess = tf.Session(config=tf.ConfigProto( log_device_placement=True)) self.num_agents = num_agents self.state_dim = state_dim self.action_dim = action_dim self.agents = self.create_multi_agents(self.sess, num_agents, self.state_dim, self.action_dim) # make sure create Criticnetwork later, summarise mean Q value inside self.critic = CriticNetwork(self.sess, state_dim, action_dim) self.exploration_noise = OUNoise((self.num_agents, action_dim)) self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # for store checkpoint self.saver = tf.train.Saver() def train(self): minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.zeros((BATCH_SIZE, self.num_agents, self.state_dim)) action_batch = np.zeros((BATCH_SIZE, self.num_agents, self.action_dim)) reward_batch = np.zeros((BATCH_SIZE, self.num_agents)) next_state_batch = np.zeros( (BATCH_SIZE, self.num_agents, self.state_dim)) done_batch = np.zeros((BATCH_SIZE)) for ii in range(BATCH_SIZE): state_batch[ii, :, :] = minibatch[ii][0] action_batch[ii, :, :] = minibatch[ii][1] reward_batch[ii, :] = minibatch[ii][2] next_state_batch[ii, :, :] = minibatch[ii][3] done_batch[ii] = minibatch[ii][4] # calculate Gt batch next_action_batch = self.target_actions(next_state_batch) q_value_batch = self.critic.target_q(next_state_batch, next_action_batch) gt = np.zeros((BATCH_SIZE, self.num_agents)) for ii in range(BATCH_SIZE): if done_batch[ii]: gt[ii, :] = reward_batch[ii, :] else: gt[ii, :] = reward_batch[ii, :] + GAMMA * q_value_batch[ii, :] #update critic by minimizing the loss self.critic.train(gt, state_batch, action_batch) # update policy using the sampling gradients actions_for_grad = self.actions(state_batch) q_gradients_batch = self.critic.gradients(state_batch, actions_for_grad) self.train_agents(q_gradients_batch, state_batch) # update critic target network self.critic.update_target() # update actor target self.update_agents_target() def summary(self, record_num): if self.replay_buffer.count() > SUMMARY_BATCH_SIZE: mini_batch = self.replay_buffer.popn(SUMMARY_BATCH_SIZE) state_batch = np.zeros( (SUMMARY_BATCH_SIZE, self.num_agents, self.state_dim)) for ii in range(SUMMARY_BATCH_SIZE): state_batch[ii, :, :] = mini_batch[ii][0] actions_for_summary = self.actions(state_batch) self.critic.write_summaries(state_batch, actions_for_summary, record_num) def update_agents_target(self): for agent in self.agents: agent.update_target() def train_agents(self, gradients_batch, state_batch): # gradients_batch = [batchsize* agents* action_dim] # state_batch = [batchsize* agents * state_dim ] for ii in range(self.num_agents): grad = gradients_batch[:, ii, :] state = state_batch[:, ii, :] self.agents[ii].train(grad, state) def create_multi_agents(self, sess, num_agents, state_dim, action_dim): agents = [] nets = None for ii in range(num_agents): agent_name = 'agent' + str(ii) agents.append( ActorNetwork(sess, state_dim, action_dim, agent_name, nets)) nets = agents[-1].nets return agents def add_agents(self, add_num): for ii in range(add_num): #self.num_agents+=1 agent_name = 'agent' + str(self.num_agents) self.agents.append( ActorNetwork(self.sess, self.state_dim, self.action_dim, agent_name, self.agents[-1].nets)) # the agents' name is from 0-num_agents-1 self.num_agents += 1 # if add a new agent then reset the noise and replay buffer self.exploration_noise = 
OUNoise((self.num_agents, self.action_dim)) #self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.replay_buffer.erase() # re-create a saver # the new saver will contains all the savable variables. # otherwise only contains the initially created agents self.saver = tf.train.Saver() # reset the time step # self.time_step = 0 def action( self, state ): # here is action, for one state on agent, not batch_sized actions # state = [num_agents * state_dim] # actions = [num_agents * action_dim] action = np.zeros((self.num_agents, self.action_dim)) for ii in range(self.num_agents): action[ii, :] = self.agents[ii].action(state[ii, :]) return action def actions(self, state_batch): #state = batch_size*numOfagents*state_dim #actions = batch_size*numOfagents*action_dim batch_size = state_batch.shape[0] actions = np.zeros((batch_size, self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].actions(state_batch[:, ii, :]) return actions def target_actions(self, state_batch): # the state size is batch_size* num_agents * state_dimension actions = np.zeros( (state_batch.shape[0], self.num_agents, self.action_dim)) for ii in range(self.num_agents): actions[:, ii, :] = self.agents[ii].target_actions(state_batch[:, ii, :]) return actions def noise_action(self, state): action = self.action(state) # clip the action, action \in [-1,+1] return np.clip(action + self.exploration_noise.noise(), -1, 1) def close_session(self): self.sess.close() def perceive(self, state, action, reward, next_state, done): # store {st,at,Rt+1,st+1} self.replay_buffer.add(state, action, reward, next_state, done) if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % SAVE_STEPS == 0: self.save_network() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset() def load_network(self): checkpoint = tf.train.get_checkpoint_state("saved_network") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print("Successfully loaded:", checkpoint.model_checkpoint_path) else: print('Could not find old network weights') def save_network(self): # do not processing under Dropbox # exit drop box then run print('save network...', self.time_step) self.saver.save(self.sess, 'saved_network/' + 'network', global_step=self.time_step)
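The update_target() calls on the critic and on each agent above are typically implemented as a soft (Polyak) update that moves the target parameters slowly toward the trained ones. A framework-agnostic numpy sketch of that rule, with TAU as an assumed hyperparameter:

import numpy as np

TAU = 0.001  # assumed soft-update rate

def soft_update(target_params, source_params, tau=TAU):
    # target <- tau * source + (1 - tau) * target, applied per parameter tensor
    return [tau * s + (1.0 - tau) * t for t, s in zip(target_params, source_params)]

# usage sketch with dummy parameter tensors
target = [np.zeros((2, 2))]
source = [np.ones((2, 2))]
target = soft_update(target, source)   # every entry becomes 0.001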
class DDPG: def __init__(self): # Make sure all the directories exist if not tf.gfile.Exists(TFLOG_PATH): tf.gfile.MakeDirs(TFLOG_PATH) if not tf.gfile.Exists(EXPERIENCE_PATH): tf.gfile.MakeDirs(EXPERIENCE_PATH) if not tf.gfile.Exists(NET_SAVE_PATH): tf.gfile.MakeDirs(NET_SAVE_PATH) # Initialize our session self.session = tf.Session() self.graph = self.session.graph with self.graph.as_default(): # View the state batches self.visualize_input = VISUALIZE_BUFFER if self.visualize_input: self.viewer = CostmapVisualizer() # Hardcode input size and action size self.height = 86 self.width = self.height self.depth = 4 self.action_dim = 2 # Initialize the current action and the old action and old state for setting experiences self.old_state = np.zeros((self.width, self.height, self.depth), dtype='int8') self.old_action = np.ones(2, dtype='float') self.network_action = np.zeros(2, dtype='float') self.noise_action = np.zeros(2, dtype='float') self.action = np.zeros(2, dtype='float') # Initialize the grad inverter object to keep the action bounds self.grad_inv = GradInverter(A0_BOUNDS, A1_BOUNDS, self.session) # Make sure the directory for the data files exists if not tf.gfile.Exists(DATA_PATH): tf.gfile.MakeDirs(DATA_PATH) # Initialize summary writers to plot variables during training self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(TFLOG_PATH) # Initialize actor and critic networks self.actor_network = ActorNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) self.critic_network = CriticNetwork(self.height, self.action_dim, self.depth, self.session, self.summary_writer) # Initialize the saver to save the network params self.saver = tf.train.Saver() # initialize the experience data manger self.data_manager = DataManager(BATCH_SIZE, EXPERIENCE_PATH, self.session) # Uncomment if collecting a buffer for the autoencoder # self.buffer = deque() # Should we load the pre-trained params? # If so: Load the full pre-trained net # Else: Initialize all variables the overwrite the conv layers with the pretrained filters if PRE_TRAINED_NETS: self.saver.restore(self.session, NET_LOAD_PATH) else: self.session.run(tf.initialize_all_variables()) tf.train.start_queue_runners(sess=self.session) time.sleep(1) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim, MU, THETA, SIGMA) self.noise_flag = True # Initialize time step self.training_step = 0 # Flag: don't learn the first experience self.first_experience = True # After the graph has been filled add it to the summary writer self.summary_writer.add_graph(self.graph) def train(self): # Check if the buffer is big enough to start training if self.data_manager.enough_data(): # get the next random batch from the data manger state_batch, \ action_batch, \ reward_batch, \ next_state_batch, \ is_episode_finished_batch = self.data_manager.get_next_batch() state_batch = np.divide(state_batch, 100.0) next_state_batch = np.divide(next_state_batch, 100.0) # Are we visualizing the first state batch for debugging? 
# If so: We have to scale up the values for grey scale before plotting if self.visualize_input: state_batch_np = np.asarray(state_batch) state_batch_np = np.multiply(state_batch_np, -100.0) state_batch_np = np.add(state_batch_np, 100.0) self.viewer.set_data(state_batch_np) self.viewer.run() self.visualize_input = False # Calculate y for the td_error of the critic y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch, next_action_batch) for i in range(0, BATCH_SIZE): if is_episode_finished_batch[i]: y_batch.append([reward_batch[i]]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Now that we have the y batch, let's train the critic self.critic_network.train(y_batch, state_batch, action_batch) # Get the action batch so we can calculate the action gradient with it # Then get the action gradient batch and adapt the gradient with the gradient inverting method action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.get_action_gradient(state_batch, action_batch_for_gradients) q_gradient_batch = self.grad_inv.invert(q_gradient_batch, action_batch_for_gradients) # Now we can train the actor self.actor_network.train(q_gradient_batch, state_batch) # Save model if necessary if self.training_step > 0 and self.training_step % SAVE_STEP == 0: self.saver.save(self.session, NET_SAVE_PATH, global_step=self.training_step) # Update time step self.training_step += 1 self.data_manager.check_for_enqueue() def get_action(self, state): # normalize the state state = state.astype(float) state = np.divide(state, 100.0) # Get the action self.action = self.actor_network.get_action(state) # Are we using noise? if self.noise_flag: # scale noise down to 0 at training step 3000000 if self.training_step < MAX_NOISE_STEP: self.action += (MAX_NOISE_STEP - self.training_step) / MAX_NOISE_STEP * self.exploration_noise.noise() # if the action value lies outside of the action bounds, rescale the action vector if self.action[0] < A0_BOUNDS[0] or self.action[0] > A0_BOUNDS[1]: self.action *= np.fabs(A0_BOUNDS[0]/self.action[0]) if self.action[1] < A1_BOUNDS[0] or self.action[1] > A1_BOUNDS[1]: self.action *= np.fabs(A1_BOUNDS[0]/self.action[1]) # Print the live Q value output for this action and state self.print_q_value(state, self.action) return self.action def set_experience(self, state, reward, is_episode_finished): # Make sure we're saving a new old_state for the first experience of every episode if self.first_experience: self.first_experience = False else: self.data_manager.store_experience_to_file(self.old_state, self.old_action, reward, state, is_episode_finished) # Uncomment if collecting data for the auto_encoder # experience = (self.old_state, self.old_action, reward, state, is_episode_finished) # self.buffer.append(experience) if is_episode_finished: self.first_experience = True self.exploration_noise.reset() # Save the old state and old action for the next experience self.old_state = state self.old_action = self.action def print_q_value(self, state, action): string = "-" q_value = self.critic_network.evaluate([state], [action]) stroke_pos = int(30 * q_value[0][0] + 30) if stroke_pos < 0: stroke_pos = 0 elif stroke_pos > 60: stroke_pos = 60 print '[' + stroke_pos * string + '|' + (60-stroke_pos) * string + ']', "Q: ", q_value[0][0], \ "\tt: ", self.training_step
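The GradInverter object used in train() keeps the actions inside their bounds by rescaling the critic's action gradients (the "inverting gradients" idea of Hausknecht and Stone) rather than by hard clipping. A hedged numpy sketch of that rule, independent of the actual GradInverter implementation above:

import numpy as np

def invert_gradients(grads, actions, a_min, a_max):
    """Scale each gradient by how much room the action still has before its bound.

    grads, actions: arrays of shape (batch, action_dim)
    a_min, a_max:   arrays of shape (action_dim,)
    """
    grads = np.asarray(grads, dtype=float)
    actions = np.asarray(actions, dtype=float)
    width = a_max - a_min
    # gradients pushing an action up are scaled by the distance to its upper bound,
    # gradients pushing it down by the distance to its lower bound
    up = (a_max - actions) / width
    down = (actions - a_min) / width
    return np.where(grads >= 0, grads * up, grads * down)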
def s2l(): #Randomly initialize critic,actor,target critic, target actor network and replay buffer num_states = feature_size #num_states = env.observation_space.shape[0] num_actions = num_controls print ("Number of States:", num_states) print ("Number of Actions:", num_actions) action_space_high=[1.5] #[0.0,0.0,0.0] action_space_low=[0.03] #[0.5,0.5,0.5] print ("Action space highest values", action_space_high) print ("Action space lowest values:", action_space_low) robot=RoboControl() #while True: # #robot.check() # robot.publish_control([1]) # robot.reset() agent = DDPG(is_batch_norm,num_states,num_actions,action_space_high,action_space_low) exploration_noise = OUNoise(num_actions) counter=0 total_reward=0 print ("Number of Rollouts per episode:", num_rollouts) print ("Number of Steps per roll out:", steps) reward_st = np.array([0]) #saving reward eval_metric_st= np.array([0]) reward_st_all = np.array([0]) #saving reward after every step activity_obj=Vid_Feature() demo_vid_array=demo_array_extractor(demo_folder) demo_features=activity_obj.feature_extractor(demo_vid_array) frame_obj=Frame_Feature() #camera_obj= Camera() camera_obj= CameraSub() for episode in range(num_episodes): print ("==== Starting episode no:",episode,"====","\n") robot.reset() # Reset env in the begining of each episode obs_img=camera_obj.camera_subscribe() # Get the observation #obs_img=np.array(misc.imresize(obs_img,[112,112,3])) observation =np.array(frame_obj.frame_feature_extractor(obs_img)) observation=observation.reshape(-1) reward_per_episode = 0 for t in range(num_rollouts): reward_per_rollout=0 vid_robo_=[] for i in range(steps): x = observation action = agent.evaluate_actor(np.reshape(x,[1,num_states])) noise = exploration_noise.noise() action = action[0] + noise #Select action according to current policy and exploration noise print ('Action at episode-',episode,'rollout-',t, 'step-', i ," :",action) robot.publish_control(action) obs_robo=camera_obj.camera_subscribe() # Get the observation #obs_robo=misc.imresize(obs_robo,[112,112,3]) vid_robo_.append(obs_robo) observation=np.array(frame_obj.frame_feature_extractor(np.array(obs_robo))) observation=observation.reshape(-1) #pasue() if(i==15): vid_robo=np.array(vid_robo_) robo_features=activity_obj.feature_extractor(vid_robo) reward=-(distance(demo_features,robo_features)) reward=np.array(reward) print('reward: ',reward) else: reward=0 reward=np.array(reward) print('reward: ',reward) # Storing reward after every rollout reward_st_all = np.append(reward_st_all,reward) np.savetxt('reward_all.txt',reward_st_all, newline="\n") #add s_t,s_t+1,action,reward to experience memory agent.add_experience(x,observation,action,reward,False) reward_per_rollout+=reward counter+=1 #train critic and actor network if counter > start_training: agent.train() print ('\n\n') #Saving policy if ((episode%100)==0 and t==num_rollouts-1): print('saving policy...........................!') agent.save_actor(episode) reward_per_episode+=reward_per_rollout #check if episode ends: print ('EPISODE: ',episode,' Total Reward: ',reward_per_episode) print ("Printing reward to file") exploration_noise.reset() #reinitializing random noise for action exploration reward_st = np.append(reward_st,reward_per_episode) np.savetxt('episode_reward.txt',reward_st, fmt='%f', newline="\n") print ('\n\n') total_reward+=reward_per_episode print ("Average reward per episode {}".format(total_reward / num_episodes))
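The reward in s2l() is the negative distance between the activity features of the demonstration video and of the robot's video. A minimal sketch of such a distance-based reward; the function name matches the call above, but the Euclidean metric is an assumption:

import numpy as np

def distance(demo_features, robo_features):
    # assumed Euclidean distance between the two (flattened) feature vectors
    demo = np.asarray(demo_features, dtype=float).ravel()
    robo = np.asarray(robo_features, dtype=float).ravel()
    return np.linalg.norm(demo - robo)

# reward used above: the closer the robot's features are to the demo, the higher the reward
# reward = -distance(demo_features, robo_features)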
class DDPG: """docstring for DDPG""" def __init__(self, env, DIRECTORY): self.batch_size = BATCH_SIZE self.replay_start_size = REPLAY_START_SIZE # self.sub_batch_size = BATCH_SIZE / n_gpu self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=False)) self.trace_length = TRACE_LENGTH self.temp_abstract = TEMP_ABSTRACT self.actor_network = ActorNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) self.critic_network = CriticNetwork(self.sess, BATCH_SIZE, self.state_dim, self.action_dim, self.temp_abstract, DIRECTORY) # initialize replay buffer max_len_trajectory = self.environment.spec.timestep_limit + 1 # trace_length self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE, DIRECTORY, max_len_trajectory, self.actor_network.last_epi) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) ### self.diff = 0. self.discounting_mat_dict = {} ### def state_initialiser(self, shape, mode='g'): if mode == 'z': #Zero initial = np.zeros(shape=shape) elif mode == 'g': #Gaussian # initial = stats.truncnorm.rvs(a=-0.02/0.01,b=0.02/0.01,loc=0.,scale=0.01,size=shape) initial = np.random.normal(loc=0., scale=1. / float(shape[1]), size=shape) else: # May do some adaptive initialiser can be built in later raise NotImplementedError return initial def train(self, time_step): #,time_step): ###1) Get-batch data for opt minibatch, trace_length = self.replay_buffer.get_batch( self.batch_size, self.trace_length, time_step) #, self.trace_length) try: state_trace_batch = np.stack(minibatch[:, :, 2].ravel()).reshape( self.batch_size, trace_length, self.state_dim) action_trace_batch = np.stack(minibatch[:, :, 3].ravel()).reshape( self.batch_size, trace_length, self.action_dim) next_state_batch = np.stack(minibatch[:, -1, 6].ravel()).reshape( self.batch_size, 1, self.state_dim) next_state_trace_batch = np.concatenate( [state_trace_batch, next_state_batch], axis=1) reward_trace_batch = np.stack(minibatch[:, :, 4].ravel()).reshape( self.batch_size, trace_length, 1) done_trace_batch = np.stack(minibatch[:, :, 7].ravel()).reshape( self.batch_size, trace_length, 1) except Exception as e: print(str(e)) raise ###2) Painfully initialise initial memories of LSTMs: not super-efficient, but no error guaranteed from tf's None-type zero-state problem init_actor_hidden1_cORm_batch = self.state_initialiser( shape=(self.batch_size, self.actor_network.rnn_size), mode='z') actor_init_h_batch = ( init_actor_hidden1_cORm_batch, init_actor_hidden1_cORm_batch ) #((init_hidden1_cORm_batch,init_hidden1_cORm_batch),(init_actor_hidden2_cORm_batch,init_actor_hidden2_cORm_batch)) init_critic_hidden1_cORm_batch = self.state_initialiser( shape=(self.batch_size, self.critic_network.rnn_size), mode='z') critic_init_h_batch = ( init_critic_hidden1_cORm_batch, init_critic_hidden1_cORm_batch ) #,(init_critic_hidden3_cORm_batch,init_critic_hidden3_cORm_batch)) ### self.dt_list = np.zeros(shape=(15, )) self.dt_list[-1] = time.time() if trace_length <= OPT_LENGTH: target_actor_init_h_batch = actor_init_h_batch target_critic_init_h_batch = critic_init_h_batch pass else: ### memory stuff actor_init_h_batch = self.actor_network.action( 
state_trace_batch[:, :-OPT_LENGTH, :], actor_init_h_batch, mode=1) target_actor_init_h_batch = actor_init_h_batch critic_init_h_batch = self.critic_network.evaluation( state_trace_batch[:, :-OPT_LENGTH, :], action_trace_batch[:, :-OPT_LENGTH, :], critic_init_h_batch, mode=1) target_critic_init_h_batch = critic_init_h_batch state_trace_batch = state_trace_batch[:, -OPT_LENGTH:, :] next_state_trace_batch = next_state_trace_batch[:, -(OPT_LENGTH + 1):, :] action_trace_batch = action_trace_batch[:, -OPT_LENGTH:, :] reward_trace_batch = reward_trace_batch[:, -OPT_LENGTH:, :] done_trace_batch = done_trace_batch[:, -OPT_LENGTH:, :] self.dt_list[0] = time.time() - np.sum(self.dt_list) ###3) Obtain target output next_action_batch = self.actor_network.target_action( next_state_trace_batch, init_temporal_hidden_cm_batch=target_actor_init_h_batch) self.dt_list[1] = time.time() - np.sum(self.dt_list) next_action_trace_batch = np.concatenate( [action_trace_batch, np.expand_dims(next_action_batch, axis=1)], axis=1) self.dt_list[2] = time.time() - np.sum(self.dt_list) target_lastQ_batch = self.critic_network.target_q_trace( next_state_trace_batch, next_action_trace_batch, init_temporal_hidden_cm_batch=target_critic_init_h_batch) self.dt_list[3] = time.time() - np.sum(self.dt_list) # Control the length of time-step for gradient if trace_length <= OPT_LENGTH: update_length = np.minimum( trace_length, OPT_LENGTH // 1) #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2) else: update_length = OPT_LENGTH // 1 #//denom: 2(opt1) #1(opt0) #OPT_LENGTH(opt2) target_lastQ_batch_masked = target_lastQ_batch * ( 1. - done_trace_batch[:, -1]) rQ = np.concatenate([ np.squeeze(reward_trace_batch[:, -update_length:], axis=-1), target_lastQ_batch_masked ], axis=1) self.dt_list[4] = time.time() - np.sum(self.dt_list) try: discounting_mat = self.discounting_mat_dict[update_length] except KeyError: discounting_mat = np.zeros(shape=(update_length, update_length + 1), dtype=np.float) for i in range(update_length): discounting_mat[i, :i] = 0. discounting_mat[i, i:] = GAMMA**np.arange(0., -i + update_length + 1) discounting_mat = np.transpose(discounting_mat) self.discounting_mat_dict[update_length] = discounting_mat try: y_trace_batch = np.expand_dims(np.matmul(rQ, discounting_mat), axis=-1) except Exception as e: print('?') raise self.dt_list[5] = time.time() - np.sum(self.dt_list) ###4)Train Critic: get next_action, target_q, then optimise critic_grad = self.critic_network.train( y_trace_batch, update_length, state_trace_batch, action_trace_batch, init_temporal_hidden_cm_batch=critic_init_h_batch) self.dt_list[6] = time.time() - np.sum(self.dt_list) ###5) Train Actor: while updated critic, we declared the dQda. 
Hence sess,run(dQda*dadParam_actor), then optimise actor for i in range(update_length): actor_init_h_batch_trace = (np.expand_dims(actor_init_h_batch[0], axis=1), np.expand_dims(actor_init_h_batch[1], axis=1)) critic_init_h_batch_trace = (np.expand_dims(critic_init_h_batch[0], axis=1), np.expand_dims(critic_init_h_batch[1], axis=1)) if i == 0: actor_init_h_batch_stack = actor_init_h_batch_trace critic_init_h_batch_stack = critic_init_h_batch_trace else: actor_init_h_batch_stack = (np.concatenate( (actor_init_h_batch_stack[0], actor_init_h_batch_trace[0]), axis=1), np.concatenate( (actor_init_h_batch_stack[1], actor_init_h_batch_trace[1]), axis=1)) critic_init_h_batch_stack = ( np.concatenate((critic_init_h_batch_stack[0], critic_init_h_batch_trace[0]), axis=1), np.concatenate((critic_init_h_batch_stack[1], critic_init_h_batch_trace[1]), axis=1)) action_trace_batch_for_gradients, actor_init_h_batch = self.actor_network.action_trace( np.expand_dims(state_trace_batch[:, i], 1), init_temporal_hidden_cm_batch=actor_init_h_batch) critic_init_h_batch = self.critic_network.evaluation_trace( np.expand_dims(state_trace_batch[:, i], 1), np.expand_dims(action_trace_batch[:, i], 1), init_temporal_hidden_cm_batch=critic_init_h_batch) if i == 0: action_trace_batch_for_gradients_stack = action_trace_batch_for_gradients else: action_trace_batch_for_gradients_stack = np.concatenate( (action_trace_batch_for_gradients_stack, action_trace_batch_for_gradients), axis=1) self.dt_list[7] = time.time() - np.sum(self.dt_list) state_trace_batch_stack = np.reshape( state_trace_batch, (self.batch_size * update_length, 1, self.state_dim)) action_trace_batch_stack = np.reshape( action_trace_batch, (self.batch_size * update_length, 1, self.action_dim)) action_trace_batch_for_gradients_stack = np.reshape( action_trace_batch_for_gradients_stack, (self.batch_size * update_length, 1, self.action_dim)) actor_init_h_batch_stack = (np.reshape( actor_init_h_batch_stack[0], (self.batch_size * update_length, self.actor_network.rnn_size)), np.reshape( actor_init_h_batch_stack[1], (self.batch_size * update_length, self.actor_network.rnn_size))) critic_init_h_batch_stack = (np.reshape( critic_init_h_batch_stack[0], (self.batch_size * update_length, self.critic_network.rnn_size)), np.reshape( critic_init_h_batch_stack[1], (self.batch_size * update_length, self.critic_network.rnn_size))) q_gradient_trace_batch = self.critic_network.gradients( 1, state_trace_batch_stack, action_trace_batch_for_gradients_stack, init_temporal_hidden_cm_batch=critic_init_h_batch_stack) self.dt_list[8] = time.time() - np.sum(self.dt_list) # Update the actor policy using the sampled gradient: actor_grad = self.actor_network.train( q_gradient_trace_batch, 1, state_trace_batch_stack, action_trace_batch_stack, init_temporal_hidden_cm_batch=actor_init_h_batch_stack) self.dt_list[9] = time.time() - np.sum(self.dt_list) # Update the target networks via EMA & Indicators # self.critic_network.update_target() self.dt_list[10] = time.time() - np.sum(self.dt_list) # self.actor_network.update_target() self.dt_list[11] = time.time() - np.sum(self.dt_list) # actor_diff = self.actor_network.get_diff() self.dt_list[12] = time.time() - np.sum(self.dt_list) # critic_diff = self.critic_network.get_diff() self.dt_list[13] = time.time() - np.sum(self.dt_list) self.dt_list = np.delete(self.dt_list, -1) return actor_grad, critic_grad, # actor_diff, actor_grad, critic_diff, critic_grad def action(self, state_trace, init_hidden_cm, epi, noisy=True): # Select action a_t according to 
the current policy and exploration noise action, last_hidden_cm = self.actor_network.action([state_trace], init_hidden_cm, mode=2) if noisy: noise = self.exploration_noise.noise() #epi) return action + noise, last_hidden_cm #, dt#, np.linalg.norm(noise) else: return action, last_hidden_cm def evaluation(self, state_trace, action_trace, action_last, init_hidden_cm): return self.critic_network.evaluation([state_trace], [action_trace], action_last, init_hidden_cm, mode=2) #q_value, last_hidden_cm # def perceive(self,actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,time_step,epi): def perceive(self, state, action, reward, next_state, done, time_step, epi): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer # self.replay_buffer.add(actor_init_hidden_cm,critic_last_hidden_cm,state,action,reward,next_state,done,epi) done = float(done) self.replay_buffer.add(state, action, reward, next_state, done, epi, time_step) # Store transitions to replay start size then start training if (self.replay_buffer.num_experiences > REPLAY_START_SIZE): # Non-zero diff should be found self.actor_grad, self.critic_grad = self.train(time_step) # self.actor_diff, self.actor_grad, self.critic_diff, self.critic_grad = self.train(time_step) else: # Zero diff as is not trained # self.actor_diff = 0. self.actor_grad = 0. # self.critic_diff = 0. self.critic_grad = 0. # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
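The discounting_mat built in train() converts the per-step rewards plus the bootstrapped last Q-value into an n-step return for every position of the optimised trace with a single matrix product. A small numpy sketch of the same construction (the GAMMA value is assumed), with a check against the direct recursion:

import numpy as np

GAMMA = 0.99  # assumed discount factor

def nstep_targets(rewards, last_q, gamma=GAMMA):
    """rewards: (batch, L) rewards of the optimised trace; last_q: (batch, 1) bootstrap value.
    Returns (batch, L) targets y_i = r_i + gamma*r_{i+1} + ... + gamma^(L-i)*last_q."""
    batch, L = rewards.shape
    rQ = np.concatenate([rewards, last_q], axis=1)     # (batch, L + 1)
    mat = np.zeros((L, L + 1))
    for i in range(L):
        mat[i, i:] = gamma ** np.arange(L + 1 - i)     # gamma^0 ... gamma^(L - i)
    return rQ @ mat.T                                  # (batch, L)

# quick check against the one-step recursion at the last position
r = np.array([[1.0, 2.0, 3.0]]); q = np.array([[10.0]])
y = nstep_targets(r, q)
assert np.isclose(y[0, -1], 3.0 + GAMMA * 10.0)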
class DDPG: """docstring for DDPG""" def __init__(self, sess, data_fname): self.name = 'DDPG' # Randomly initialize actor network and critic network # with both their target networks self.name = 'DDPG' # name for uploading results # Randomly initialize actor network and critic network # with both their target networks self.state_dim = Hp.state_dim self.action_dim = Hp.action_dim print(self.state_dim, self.action_dim) self.sess = sess self.state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.target_state_input = [ tf.placeholder(tf.float32, shape=(None, None, Hp.n_coord)) for _ in xrange(Hp.categories) ] #tf.placeholder("float",[None,self.state_dim]) self.state_network = StateEnc(self.sess, self.state_input, self.target_state_input) state_batch = self.state_network.encoding next_state_batch = self.state_network.target_encoding weights, biases, w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 = self.state_network.get_parameters( ) state_network_params = weights + biases + [ w_i2h0, w_h2h0, w_b0, w_i2h1, w_h2h1, w_b1, w_i2h2, w_h2h2, w_b2 ] self.actor_network = ActorNetwork(self.sess, Hp.n_hidden, self.action_dim, self.state_input, state_batch, next_state_batch, state_network_params) self.critic_network = CriticNetwork(self.sess, Hp.n_hidden, self.action_dim, state_batch, next_state_batch) # initialize replay buffer self.replay_buffer = ReplayBuffer(Hp.REPLAY_BUFFER_SIZE, data_fname) self.summary_str2 = None # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatches = self.replay_buffer.get_batch(Hp.batch_size * Hp.N_TRAIN) print("######### TRAINING #############") for k in range(Hp.N_TRAIN): minibatch = minibatches[k * Hp.batch_size:(k + 1) * Hp.batch_size] state_batch_r = np.asarray([data[0] for data in minibatch]) state_batch = [] for j in range(Hp.categories): new_cat = np.stack(state_batch_r[:, j], axis=0) state_batch.append(new_cat) #state_batch = [np.expand_dims(state_batch, axis=1)] action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch_r = np.asarray([data[3] for data in minibatch]) next_state_batch = [] for j in range(Hp.categories): new_cat = np.stack(next_state_batch_r[:, j], axis=0) next_state_batch.append(new_cat) #next_state_batch = [np.expand_dims(next_state_batch, axis=1)] done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [Hp.batch_size, self.action_dim]) next_action_batch = self.actor_network.target_actions( self.target_state_input, next_state_batch) q_value_batch = self.critic_network.target_q( self.target_state_input, next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + Hp.GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [Hp.batch_size, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, self.state_input, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions( self.state_input, state_batch) q_gradient_batch = self.critic_network.gradients( self.state_input, state_batch, 
action_batch_for_gradients) self.summary_str2 = self.actor_network.train( q_gradient_batch, self.state_input, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() self.state_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) print("no noise ", action) return np.clip( action + self.exploration_noise.noise() * np.array([-17.0, 17.0, 900.0]), [-35.0, 0.0, 0.0], [0.0, 35.0, 2000.0]) def action(self, state): state = [np.expand_dims(el, axis=0) for el in state] action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > Hp.REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class DDPGAgent: def __init__(self, env): self.sess = tf.InteractiveSession() #self.params = loadparams() # ??? self.env = env self.n_states = env.observation_space.shape[0] self.n_actions = env.action_space.shape[0] self.low = self.env.action_space.low self.high = self.env.action_space.high self.actor_network = ActorNetwork(self.sess, self.n_states, self.n_actions) self.trainable_var_count = self.actor_network.get_trainable_var_count() self.critic_network = CriticNetwork(self.sess, self.n_states, self.n_actions, \ self.actor_network, self.trainable_var_count) self.replay_buffer = ReplayBuffer(BUFFER_SIZE) #params['buffer_size']??? self.exploration_noise = OUNoise(self.n_actions) # self.noise = Noise() self.gamma = GAMMA self.sess.run(tf.global_variables_initializer()) def getNoisyAction(self, current_state): current_state = np.reshape(current_state, (1, self.n_states)) # print ("current_state =", np.shape(current_state)) action = self.actor_network.predict(current_state) return np.clip(action + self.exploration_noise.noise(), self.low, self.high) def getAction(self, current_state): return self.actor_network.predict( \ np.reshape(current_state, (1, self.n_states))) def observe(self, state, action, reward, state_, done): self.replay_buffer.add(state, action[0], reward, state_, done) # batch = tf.concat([batch, (state,action,reward,state_)]) # axis??? if (self.replay_buffer.count > 500): batch = self.replay_buffer.sampleBatch(BATCH_SIZE) self.updateActorAndCritic(batch) if done: self.exploration_noise.reset() def updateActorAndCritic(self, batch): # states, actions, rewards, states_, dones = zip(*batch) states = np.asarray([data[0] for data in batch]) actions = np.asarray([data[1] for data in batch]) rewards = np.asarray([data[2] for data in batch]) states_ = np.asarray([data[3] for data in batch]) dones = np.asarray([data[4] for data in batch]) current_batch_size = BATCH_SIZE states = np.reshape(states, (current_batch_size, self.n_states)) # print("actions shape----------", np.shape(actions)) # actions = np.reshape(actions, (current_batch_size, self.n_actions)) states_ = np.reshape(states_, (current_batch_size, self.n_states)) actions_ = self.actor_network.predict_target(states_) y_batch = [] q_batch = [] yi =[] for i in range(current_batch_size): if dones[i]: yi = rewards[i] else: yi = rewards[i] + \ self.gamma * self.critic_network.predict_target( \ np.reshape(states_[i], (1, self.n_states)), \ np.reshape(actions[i],(1, self.n_actions))) y_batch.append(yi) y_batch = np.reshape(y_batch,(current_batch_size,1)) # print("critic update begins") self.critic_network.update(y_batch, states, actions) # print("critic update ends") # print("action batch begins") action_batch_for_gradient = self.actor_network.predict(states) # print("action batch ends") # action_batch_for_gradient = np.reshape( \ # action_batch_for_gradient,(current_batch_size, 1)) # print("q batch gradient begins") q_gradient_batch = self.critic_network.get_action_gradient(states, action_batch_for_gradient) # print("q batch gradient done") # q_gradient_batch = np.reshape( \ # q_gradient_batch,(current_batch_size,1)) # print("actor update begins") self.actor_network.update(states, q_gradient_batch) # print("actor update ends") def save(self): self.critic_network.save()
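The per-sample loop that assembles y_batch here (and in most of the other agents) is the standard TD target y = r + gamma * Q'(s', a'), with bootstrapping suppressed on terminal transitions; it can be written as a single vectorized expression. A small numpy sketch, assuming dones is a 0/1 array:

import numpy as np

def td_targets(rewards, q_next, dones, gamma=0.99):
    """rewards, dones: shape (batch,); q_next: shape (batch, 1) target-critic values.
    y = r                      if the episode ended at this transition
    y = r + gamma * Q'(s',a')  otherwise"""
    rewards = np.asarray(rewards, dtype=float).reshape(-1, 1)
    dones = np.asarray(dones, dtype=float).reshape(-1, 1)
    return rewards + gamma * q_next * (1.0 - dones)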
def main(): experiment = 'quadruped-robot-v0' #specify environments here backupNameFile = "quadruped_robot_0" backupPathFile = "storage/" + backupNameFile bFullPath = os.path.join( os.path.split(os.path.abspath(__file__))[0], backupPathFile) env = gym.make(experiment) steps = env.spec.timestep_limit #steps per episode assert isinstance(env.observation_space, Box), "observation space must be continuous" assert isinstance(env.action_space, Box), "action space must be continuous" #Randomly initialize critic,actor,target critic, target actor network and replay buffer global agent agent = DDPG(env, is_batch_norm) exploration_noise = OUNoise(env.action_space.shape[0]) counter = 0 reward_per_episode = 0 total_reward = 0 num_states = env.observation_space.shape[0] num_actions = env.action_space.shape[0] print("Number of States:", num_states) print("Number of Actions:", num_actions) print("Number of Steps per episode:", steps) #saving reward: reward_st = np.array([0]) for i in range(episodes): print("==== Starting episode no:", i, "====", "\n") observation = env.reset() reward_per_episode = 0 for t in range(steps): #rendering environmet (optional) env.render() x = observation action = agent.evaluate_actor(np.reshape(x, [1, num_states])) noise = exploration_noise.noise() action = action[ 0] + noise #Select action according to current policy and exploration noise # print ("Action at step", t ," :",action,"\n") observation, reward, done, info = env.step(action) #add s_t,s_t+1,action,reward to experience memory agent.add_experience(x, observation, action, reward, done) #train critic and actor network if counter > 64: agent.train() reward_per_episode += reward counter += 1 #check if episode ends: if (done or (t == steps - 1)): # print ('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode) # print ("Printing reward to file") exploration_noise.reset( ) #reinitializing random noise for action exploration reward_st = np.append(reward_st, reward_per_episode) np.savetxt('episode_reward.txt', reward_st, newline="\n") print('\n\n') break # Save some episodes # print(episodes) # if (episodes == 10): # with open(bFullPath+"_EP_"+episodes+".pkl", 'wb') as file: # pickle.dump(agent, file) # pickle.dump_session(bFullPath+"_EP_"+episodes+".pkl") # print ('SAVE EPISODE ',episodes) # break; total_reward += reward_per_episode print("Average reward per episode {}".format(total_reward / episodes))
class DDPG: """docstring for DDPG""" def __init__(self, env): mx.random.seed(seed) np.random.seed(seed) self.env = env if flg_gpu: self.ctx = mx.gpu(0) else: self.ctx = mx.cpu() self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.ddpgnet = DDPGNet(self.state_dim, self.action_dim) self.exploration_noise = OUNoise(self.action_dim) self.replay_buffer = ReplayBuffer(memory_size) self.batch_size = batch_size self.ddpgnet.init() self.train_step = 0 def train(self): # print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(self.batch_size) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [self.batch_size, self.action_dim]) # Calculate y_batch next_qvals = self.ddpgnet.get_target_q(next_state_batch).asnumpy() y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * next_qvals[i][0]) y_batch = np.resize(y_batch, [self.batch_size, 1]) # Update critic by minimizing the loss L self.ddpgnet.update_critic(state_batch, action_batch, y_batch) # Update actor by maxmizing Q self.ddpgnet.update_actor(state_batch) self.train_step += 1 # update target networks self.ddpgnet.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise state = np.reshape(state, (1, self.state_dim)) action = self.ddpgnet.get_step_action(state) return action + self.exploration_noise.noise() def action(self, state): state = np.reshape(state, (1, self.state_dim)) action = self.ddpgnet.get_step_action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > memory_start_size: self.train() # if self.time_step % 10000 == 0: # self.actor_network.save_network(self.time_step) # self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
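Most of the agents above depend on a ReplayBuffer class with add / get_batch / count (and occasionally erase) methods that is never shown. A minimal deque-based sketch matching that interface; the capacity handling is an assumption:

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = deque(maxlen=buffer_size)   # oldest transitions are dropped first

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def get_batch(self, batch_size):
        # sample without replacement once enough transitions are stored
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def count(self):
        return len(self.buffer)

    def erase(self):
        self.buffer.clear()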
class RDPG: """docstring for RDPG""" def __init__(self, env): self.name = 'RDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) self.saver = tf.train.Saver() def train(self): # Sample a random minibatch of N sequences from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # Construct histories observations = [] next_observations = [] actions = [] rewards = [] dones = [] for each in minibatch: for i in range(1, len(each.observations)): observations.append(self.pad(each.observations[0:i])) next_observations.append(self.pad(each.observations[1:i + 1])) actions.append(each.actions[0:i - 1]) rewards.append(each.rewards[0:i]) if i == len(each.observations) - 1: dones.append(True) else: dones.append(False) # Calculate y_batch next_action_batch = self.actor_network.target_action(observations) q_value_batch = self.critic_network.target_q( next_observations, [self.pad(a + [na]) for (a, na) in zip(actions, next_action_batch)]) y_batch = [] for i in range(len(observations)): if dones[i]: y_batch.append(rewards[i][-1]) else: y_batch.append(rewards[i][-1] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [len(observations), 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, observations, [self.pad(i) for i in actions]) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(observations) q_gradient_batch = self.critic_network.gradients( observations, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, observations) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def save_model(self, path, episode): self.saver.save(self.sess, path + "model.ckpt", episode) def noise_action(self, history): # Select action a_t according to a sequence of observation and action action = self.actor_network.action(history) return action + self.exploration_noise.noise() def action(self, history): action = self.actor_network.action(history) return action def perceive(self, history, done): # Store the history sequence in the replay buffer self.replay_buffer.add(history) # Store history to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() # Re-initialize the random process when an episode ends if done: self.exploration_noise.reset() def pad(self, input): dim = len(input[0]) return input + [[0] * dim] * (1000 - len(input))
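RDPG trains on whole observation histories of varying length, so pad() zero-fills them to a fixed maximum length before batching. A small sketch of the same padding idea with the maximum length passed in explicitly rather than hard-coded to 1000:

def pad_history(history, max_len, value=0.0):
    """history: list of per-step vectors (lists); returns a list of length max_len."""
    dim = len(history[0])
    padding = [[value] * dim] * (max_len - len(history))
    return list(history) + padding

# usage sketch: two real steps followed by two zero-filled steps
h = [[0.1, 0.2], [0.3, 0.4]]
padded = pad_history(h, max_len=4)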
class DDPG: def __init__(self, state_dim, state_channel, action_dim): self.state_dim = state_dim self.state_channel = state_channel self.action_dim = action_dim self.sess = tf.InteractiveSession() self.state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.target_state_input = tf.placeholder('float', [None, state_dim, state_dim, state_channel]) self.action_input = tf.placeholder('float', [None, action_dim]) self.actor_network = ActorNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.state_channel, self.action_dim) # create network self.actor_network.create_network(self.state_input) self.critic_network.create_q_network(self.state_input, self.actor_network.action_output) # create target network self.actor_network.create_target_network(self.target_state_input) self.critic_network.create_target_q_network(self.target_state_input, self.actor_network.target_action_output) # create training method self.actor_network.create_training_method(self.critic_network.q_value_output) self.critic_network.create_training_method() self.sess.run(tf.initialize_all_variables()) self.actor_network.update_target() self.critic_network.update_target() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.exploration_noise = OUNoise(self.action_dim) self.dir_path = os.path.dirname(os.path.realpath(__file__)) + '/models_ddpg' if not os.path.exists(self.dir_path): os.mkdir(self.dir_path) # for log self.reward_input = tf.placeholder(tf.float32) tf.scalar_summary('reward', self.reward_input) self.time_input = tf.placeholder(tf.float32) tf.scalar_summary('living_time', self.time_input) self.summary_op = tf.merge_all_summaries() self.summary_writer = tf.train.SummaryWriter(self.dir_path + '/log', self.sess.graph) self.episode_reward = 0.0 self.episode_start_time = 0.0 self.time_step = 1 self.saver = tf.train.Saver(tf.all_variables()) self.load_time_step() self.load_network() return def train(self): action_dim = self.action_dim minibatch = self.replay_buffer.get_batch(BATCH_SIZE) # sample BATCH_SIZE from replay_buffer state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # if action_dim = 1, it's a number not a array action_batch = np.resize(action_batch, [BATCH_SIZE, action_dim]) # calculate y_batch via target network next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q_value(next_state_batch, next_action_batch) y_batch = [] for i in range(BATCH_SIZE): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # print np.shape(reward_batch), np.shape(y_batch) # train actor network self.actor_network.train(state_batch) # train critic network self.critic_network.train(y_batch, state_batch, action_batch) # update target network self.actor_network.update_target() self.critic_network.update_target() return def noise_action(self, state): action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def _record_log(self, reward, living_time): summary_str = self.sess.run(self.summary_op, feed_dict={ 
self.reward_input: reward, self.time_input: living_time }) self.summary_writer.add_summary(summary_str, self.time_step) return def perceive(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) if self.episode_start_time == 0.0: self.episode_start_time = time.time() # for testing # self.time_step += 1 # if self.time_step == 100: # print '--------------------------------' # self.replay_buffer.save_to_pickle() # return self.episode_reward += reward living_time = time.time() - self.episode_start_time if self.time_step % 1000 == 0 or done: self._record_log(self.episode_reward, living_time) if self.replay_buffer.size() > REPLAY_START_SIZE: self.train() if self.time_step % 100000 == 0: self.save_network() if done: print '===============reset noise=========================' self.exploration_noise.reset() self.episode_reward = 0.0 self.episode_start_time = time.time() self.time_step += 1 return def load_time_step(self): if not os.path.exists(self.dir_path): return files = os.listdir(self.dir_path) step_list = [] for filename in files: if ('meta' in filename) or ('-' not in filename): continue step_list.append(int(filename.split('-')[-1])) step_list = sorted(step_list) if len(step_list) == 0: return self.time_step = step_list[-1] + 1 return def load_network(self): checkpoint = tf.train.get_checkpoint_state(self.dir_path) if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print 'Successfully loaded:', checkpoint.model_checkpoint_path else: print 'Could not find old network weights' return def save_network(self): print 'save actor-critic network...', self.time_step self.saver.save(self.sess, self.dir_path + '/ddpg', global_step=self.time_step) return
def main(): experiment = 'MountainCarContinuous-v0' env = gym.make(experiment) steps = env.spec.timestep_limit assert isinstance(env.observation_space, Box) assert isinstance(env.action_space, Box) agent = DDPG(env, is_batch_norm) #the agent is created before the loop, so the weights are carried over between episodes #in other words, a single model is trained over the whole run exploration_noise = OUNoise(env.action_space.shape[0]) reward_per_episode = 0 total_reward = 0 counter = 0 num_states = env.observation_space.shape[0] - 1 num_actions = env.action_space.shape[0] #dimensions of the state and of the action print 'Number of States:', num_states print 'Number of Actions:', num_actions print 'Number of steps per episode:', steps if is_exploration: print("\nExploration phase for {} steps. ".format(exploration_steps)) e_steps = 0 while e_steps < exploration_steps: s = env.reset() one_step = 0 done = False exploration_noise.reset() exp = [] while not done: a = exploration_noise.noise() ss, r, done, _ = env.step(a) exp.append((s[:-1], a, ss[:-1], r, done)) s = ss one_step += 1 if one_step > 998: break agent.add_experience(exp) e_steps += 1 reward_st = np.array([0]) #used to store the reward of every episode for i in xrange(episodes): #loop 1000 times in total print '====starting episode no:', i, '====', '\n' observation = env.reset() #the environment is reset at every episode, but the model parameters are not re-initialized reward_per_episode = 0 LSTM_SIZE = 40 statec_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE)) stateh_t1 = np.zeros((BATCH_SIZE, LSTM_SIZE)) exp = [] for t in xrange(steps): #env.render() x = [observation[0:num_states]] x = np.reshape(x * BATCH_SIZE, [BATCH_SIZE, num_states]) actor, statec_t1, stateh_t1 = agent.evaluate_actor( x, statec_t1, stateh_t1) noise = exploration_noise.noise() #ra = random.random() if (i < 500): action = actor[0] + noise else: action = actor[0] observation, reward, done, info = env.step(action) #print 'Action at step',t,':',action,'reward:',reward,'\n' exp.append((x, action, observation[0:num_states], reward, done)) if counter > 64: agent.train() counter += 1 reward_per_episode += reward if (done or (t == steps - 1)): #the episode has ended agent.add_experience(exp) print 'EPISODE:', i, 'Steps', t, 'Total Reward:', reward_per_episode print 'Printing reward to file' exploration_noise.reset() reward_st = np.append(reward_st, reward_per_episode) np.savetxt('episode_reward.txt', reward_st, newline='\n') print '\n\n' break total_reward += reward_per_episode #accumulated to compute the average reward below print "Average reward per episode {}".format(total_reward / episodes)
class DDPG: """docstring for DDPG""" def __init__(self, state_space, action_dim): self.name = 'DDPG' # name for uploading results self.sess = tf.Session() # Randomly initialize actor network and critic network # with both their target networks self.state_space = state_space self.action_dim = action_dim # 1 self.ac_network = ActorCriticNetwork(self.sess, self.state_space, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Get Q target label # maxQ(s',a') q_value_batch = self.ac_network.target_q(next_state_batch) # Calculate target maxQ(s,a): y = reward + GAMMA * maxQ(s',a') y_batch = [] batch_size = len(minibatch) for i in range(batch_size): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [batch_size, 1]) # Update eval critic network by minimizing the loss L cost = self.ac_network.train_critic(y_batch, state_batch, action_batch) print('step_%d critic cost:' % self.ac_network.time_step, cost) # Update eval actor policy using the sampled gradient: self.ac_network.train_actor(state_batch) # Update the target networks self.ac_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.ac_network.actions(state) return action[0] + self.exploration_noise.noise() def action(self, state): action = self.ac_network.actions([state]) return action[0] def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset() def sparse_tensor(self, state_batch, state_space): row = len(state_batch) indices = [] for r in range(row): indices += [(r, c) for c in state_batch[r]] values = [1.0 for i in range(len(indices))] return tf.SparseTensorValue(indices=indices, values=values, dense_shape=[row, state_space])
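sparse_tensor() above encodes each state as a bag of active feature indices and hands TensorFlow the (indices, values, dense_shape) triple of the corresponding one-hot batch. The same encoding can be built and inspected with SciPy; a hedged sketch for illustration only:

import numpy as np
from scipy.sparse import csr_matrix

def sparse_one_hot(state_batch, state_space):
    """state_batch: list of lists of active feature indices; returns a (batch, state_space) sparse matrix."""
    rows, cols = [], []
    for r, active in enumerate(state_batch):
        for c in active:
            rows.append(r)
            cols.append(c)
    data = np.ones(len(rows), dtype=np.float32)
    return csr_matrix((data, (rows, cols)), shape=(len(state_batch), state_space))

# usage sketch: two states over a 6-dimensional discrete feature space
m = sparse_one_hot([[0, 3], [2, 4, 5]], state_space=6)
print(m.toarray())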
a_t = np.reshape(a_t,[1,N_ACTIONS]) r_t = np.reshape(r_t,[1,1]) if t == 0: #initializing history at time t = 0 h_t = np.hstack([o_t,a_t,r_t]) else: h_t = np.append(h_t,np.hstack([o_t,a_t,r_t]),axis = 0) reward_per_episode += r_t #appending history: o_t = o_t1 if (done or (t == STEPS-1)): print('EPISODE: ',i,' Steps: ',t,' Total Reward: ',reward_per_episode) print("Printing reward to file") exploration_noise.reset() #reinitializing random noise for action exploration reward_st = np.append(reward_st,reward_per_episode) np.savetxt('episode_reward.txt',reward_st, newline="\n") print('\n\n') agent.add_to_replay(h_t,i) break if i == 0: #store episodes: R.append(h_t) #R = np.zeros([1,STEPS,NUM_ACTIONS+NUM_OUTPUTS+1]) #R = np.append(R,np.reshape(h_t,[1,STEPS,NUM_ACTIONS+NUM_OUTPUTS+1]),axis = 0) #R = np.delete(R, (0), axis=0) #Initializing a zero array of the right size and deleting it back afterwards else: R.append(h_t)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, state_size_full, action_size_full, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.state_size_full = state_size_full self.action_size_full = action_size_full self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(hyperparameters.device) self.actor_target = Actor(state_size, action_size, random_seed).to(hyperparameters.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=hyperparameters.LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size_full, action_size_full, random_seed).to(hyperparameters.device) self.critic_target = Critic(state_size_full, action_size_full, random_seed).to(hyperparameters.device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=hyperparameters.LR_CRITIC, weight_decay=hyperparameters.WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) def act(self, state, eps, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(hyperparameters.device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += eps * self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
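This agent reads its device and optimizer settings from a hyperparameters module that is not included here; a plausible sketch of such a module (every value below is an assumption, not the original configuration) is:

# hyperparameters.py -- illustrative values only; the original settings are not shown in this section.
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

LR_ACTOR = 1e-4       # actor learning rate
LR_CRITIC = 1e-3      # critic learning rate
WEIGHT_DECAY = 0.0    # L2 regularization for the critic optimizer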
class DDPG: """docstring for DDPG""" def __init__(self, env, results_file): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim) self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) results_file.write(ActorNetwork.get_settings()) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch, [BATCH_SIZE, 1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients( state_batch, action_batch_for_gradients) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self, state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action + self.exploration_noise.noise() def action(self, state): action = self.actor_network.action(state) return action def perceive(self, state, action, reward, next_state, done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state, action, reward, next_state, done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
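Given the noise_action/perceive interface of this class, a typical driving loop looks like the sketch below (the environment name, episode count, and results path are placeholders, and the classic four-value Gym step API used throughout this document is assumed):

# Sketch of a training loop around the DDPG class above (placeholder names and counts).
import gym

def run(env_name='Pendulum-v0', episodes=100, results_path='results.txt'):
    env = gym.make(env_name)
    with open(results_path, 'w') as results_file:
        agent = DDPG(env, results_file)
        for episode in range(episodes):
            state = env.reset()
            done = False
            while not done:
                action = agent.noise_action(state)              # exploratory action
                next_state, reward, done, _ = env.step(action)
                agent.perceive(state, action, reward, next_state, done)  # store and, once warm, train
                state = next_state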
def trainer(epochs=1000, MINIBATCH_SIZE=40, GAMMA = 0.99, epsilon=1.0, min_epsilon=0.01, BUFFER_SIZE=10000, train_indicator=True, render=False): with tf.Session() as sess: # configuring environment env = gym.make(ENV_NAME) # configuring the random processes np.random.seed(RANDOM_SEED) tf.set_random_seed(RANDOM_SEED) env.seed(RANDOM_SEED) # info of the environment to pass to the agent state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] action_bound = np.float64(10) # I choose this number since the mountain continuos does not have a boundary # Creating agent ruido = OUNoise(action_dim, mu = 0.4) # this is the Ornstein-Uhlenbeck Noise actor = ActorNetwork(sess, state_dim, action_dim, action_bound, ACTOR_LEARNING_RATE, TAU, DEVICE) critic = CriticNetwork(sess, state_dim, action_dim, CRITIC_LEARNING_RATE, TAU, actor.get_num_trainable_vars(), DEVICE) sess.run(tf.global_variables_initializer()) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) goal = 0 max_state = -1. try: critic.recover_critic() actor.recover_actor() print('********************************') print('models restored succesfully') print('********************************') except: pass # print('********************************') # print('Failed to restore models') # print('********************************') for i in range(epochs): state = env.reset() state = np.hstack(state) ep_reward = 0 ep_ave_max_q = 0 done = False step = 0 max_state_episode = -1 epsilon -= (epsilon/EXPLORE) epsilon = np.maximum(min_epsilon,epsilon) while (not done): if render: env.render() #print('step', step) # 1. get action with actor, and add noise action_original = actor.predict(np.reshape(state,(1,state_dim))) # + (10. / (10. + i))* np.random.randn(1) action = action_original + max(epsilon,0)*ruido.noise() # remove comment if you want to see a step by step update # print(step,'a',action_original, action,'s', state[0], 'max state', max_state_episode) # 2. take action, see next state and reward : next_state, reward, done, info = env.step(action) if train_indicator: # 3. Save in replay buffer: replay_buffer.add(np.reshape(state, (actor.s_dim,)), np.reshape(action, (actor.a_dim,)), reward, done, np.reshape(next_state, (actor.s_dim,))) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: # 4. sample random minibatch of transitions: s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets # 5. Train critic Network (states,actions, R + gamma* V(s', a')): # 5.1 Get critic prediction = V(s', a') # the a' is obtained using the actor prediction! or in other words : a' = actor(s') target_q = critic.predict_target(s2_batch, actor.predict_target(s2_batch)) # 5.2 get y_t where: y_i = [] for k in range(MINIBATCH_SIZE): if t_batch[k]: y_i.append(r_batch[k]) else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # 5.3 Train Critic! predicted_q_value, _ = critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) ep_ave_max_q += np.amax(predicted_q_value) # 6 Compute Critic gradient (depends on states and actions) # 6.1 therefore I first need to calculate the actions the current actor would take. 
a_outs = actor.predict(s_batch) # 6.2 I calculate the gradients grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() state = next_state if next_state[0] > max_state_episode: max_state_episode = next_state[0] ep_reward = ep_reward + reward step += 1 if done: ruido.reset() if state[0] > 0.45: #print('****************************************') #print('got it!') #print('****************************************') goal += 1 if max_state_episode > max_state: max_state = max_state_episode print('th',i+1,'n steps', step,'R:', round(ep_reward,3),'Epsilon', round(epsilon,3),'Efficiency', round(100.*((goal)/(i+1.)),3) ) # print('Efficiency', 100.*((goal)/(i+1.))) print('*************************') print('now we save the model') critic.save_critic() actor.save_actor() print('model saved successfully') print('*************************')
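The per-sample loop that builds y_i implements the usual Bellman target y = r + γ·Q'(s', μ'(s')) with termination masking; the same computation in vectorised numpy form (a sketch, using the minibatch arrays already defined above) is:

# Vectorised form of the target computation in steps 5.1-5.2 above.
import numpy as np

def bellman_targets(r_batch, t_batch, target_q, gamma=0.99):
    r = np.asarray(r_batch, dtype=np.float64).reshape(-1)
    terminal = np.asarray(t_batch, dtype=bool).reshape(-1)
    q_next = np.asarray(target_q, dtype=np.float64).reshape(-1)
    # y = r for terminal transitions, y = r + gamma * Q'(s', a') otherwise.
    y = r + gamma * q_next * (~terminal)
    return y.reshape(-1, 1)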
class DDPG: """docstring for DDPG""" def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env # Randomly initialize actor network and critic network # with both their target networks self.state_dim = env.observation_space.shape[0] self.action_dim = env.action_space.shape[0] self.sess = tf.InteractiveSession() self.actor_network = ActorNetwork(self.sess,self.state_dim,self.action_dim) self.critic_network = CriticNetwork(self.sess,self.state_dim,self.action_dim) # initialize replay buffer self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(self.action_dim) def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = self.replay_buffer.get_batch(BATCH_SIZE) state_batch = np.asarray([data[0] for data in minibatch]) action_batch = np.asarray([data[1] for data in minibatch]) reward_batch = np.asarray([data[2] for data in minibatch]) next_state_batch = np.asarray([data[3] for data in minibatch]) done_batch = np.asarray([data[4] for data in minibatch]) # for action_dim = 1 action_batch = np.resize(action_batch,[BATCH_SIZE,self.action_dim]) # Calculate y_batch next_action_batch = self.actor_network.target_actions(next_state_batch) q_value_batch = self.critic_network.target_q(next_state_batch,next_action_batch) y_batch = [] for i in range(len(minibatch)): if done_batch[i]: y_batch.append(reward_batch[i]) else : y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) y_batch = np.resize(y_batch,[BATCH_SIZE,1]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.actions(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients) self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.actor_network.action(state) return action+self.exploration_noise.noise() def action(self,state): action = self.actor_network.action(state) return action def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.train() #if self.time_step % 10000 == 0: #self.actor_network.save_network(self.time_step) #self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
class PilotNode(object): """Node to listen to ROS topics like depth, rgb input and supervised control. The node also publishes to pilot control and predicted depth for visualization. """ def __init__(self, FLAGS, model, logfolder): print('initialize pilot node') self.FLAGS=FLAGS # Initialize fields self.logfolder = logfolder f=open(os.path.join(self.logfolder,'tf_log'),'a') f.write(self.FLAGS.log_tag) f.write('\n') f.close() self.model = model self.ready=False self.finished=True self.training=False self.last_pose=[] # previous pose, used for accumulative distance self.world_name = '' self.runs={'train':0, 'test':0} # number of online training run (used for averaging) # self.accumlosses = {} # gather losses and info over the run in a dictionary self.current_distance=0 # accumulative distance travelled from beginning of run used at evaluation self.furthest_point=0 # furthest point reached from spawning point at the beginning of run self.average_distances={'train':0, 'test':0} # running average over different runs self.target_control = [] # field to keep the latest supervised control self.target_depth = [] # field to keep the latest supervised depth self.nfc_images =[] #used by n_fc networks for building up concatenated frames self.exploration_noise = OUNoise(4, 0, self.FLAGS.ou_theta,1) if not self.FLAGS.dont_show_depth: self.depth_pub = rospy.Publisher('/depth_prediction', numpy_msg(Floats), queue_size=1) self.action_pub=rospy.Publisher('/nn_vel', Twist, queue_size=1) self.model.reset_metrics() rospy.Subscriber('/nn_start', Empty, self.ready_callback) rospy.Subscriber('/nn_stop', Empty, self.finished_callback) # extract imitation loss from supervised velocity rospy.Subscriber('/supervised_vel', Twist, self.supervised_callback) self.start_time = 0 self.imitation_loss=[] self.depth_prediction=[] self.depth_loss=[] self.driving_duration=-1 self.skip_frames = 0 self.img_index = 0 self.fsm_index = 0 if rospy.has_param('rgb_image'): image_topic=rospy.get_param('rgb_image') if 'compressed' in image_topic: rospy.Subscriber(image_topic, CompressedImage, self.compressed_image_callback) else: rospy.Subscriber(image_topic, Image, self.image_callback) if rospy.has_param('depth_image'): depth_topic = rospy.get_param('depth_image') if 'scan' in depth_topic: rospy.Subscriber(depth_topic, LaserScan, self.scan_depth_callback) else: rospy.Subscriber(depth_topic, Image, self.depth_callback) if not self.FLAGS.real: # initialize the replay buffer self.replay_buffer = ReplayBuffer(self.FLAGS, self.FLAGS.random_seed) self.accumloss = 0 if rospy.has_param('gt_info'): rospy.Subscriber(rospy.get_param('gt_info'), Odometry, self.gt_callback) # Add some lines to debug delays: self.time_im_received=[] self.time_ctr_send=[] rospy.init_node('pilot', anonymous=True) def ready_callback(self,msg): """ callback function that makes DNN policy starts the ready flag is set on 1 (for 3s)""" if not self.ready and self.finished: print('Neural control activated.') self.ready = True self.start_time = rospy.get_time() self.finished = False self.exploration_noise.reset() # choose one speed for this flight self.FLAGS.speed=self.FLAGS.speed + (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_x, self.FLAGS.sigma_x) if rospy.has_param('evaluate'): self.FLAGS.evaluate = rospy.get_param('evaluate') print '--> set evaluate to: {0} with speed {1}'.format(self.FLAGS.evaluate, self.FLAGS.speed) if rospy.has_param('skip_frames'): self.skip_frames = rospy.get_param('skip_frames') print '--> set skip_frames to: 
{0}'.format(self.skip_frames) if rospy.has_param('world_name') : self.world_name = rospy.get_param('world_name') time.sleep(1) # wait one second, otherwise create_dataset can't follow... def gt_callback(self, data): """Callback function that keeps track of positions for logging""" if not self.ready or self.training: return current_pos=[data.pose.pose.position.x, data.pose.pose.position.y, data.pose.pose.position.z] if len(self.last_pose)!= 0: self.current_distance += np.sqrt((self.last_pose[0,3]-current_pos[0])**2+(self.last_pose[1,3]-current_pos[1])**2) self.furthest_point=max([self.furthest_point, np.sqrt(current_pos[0]**2+current_pos[1]**2)]) # Get pose (rotation and translation) [DEPRECATED: USED FOR ODOMETRY] quaternion = (data.pose.pose.orientation.x, data.pose.pose.orientation.y, data.pose.pose.orientation.z, data.pose.pose.orientation.w) self.last_pose = transformations.quaternion_matrix(quaternion) # orientation of current frame relative to global frame self.last_pose[0:3,3]=current_pos def process_rgb(self, msg): """ Convert RGB serial data to opencv image of correct size""" try: # Convert your ROS Image message to OpenCV2 # changed to normal RGB order as i ll use matplotlib and PIL instead of opencv img =bridge.imgmsg_to_cv2(msg, 'rgb8') except CvBridgeError as e: print(e) else: img = img[::2,::5,:] size = self.model.input_size[1:] img = sm.resize(img,size,mode='constant').astype(float) return img def process_rgb_compressed(self, msg): """ Convert RGB serial data to opencv image of correct size""" # if not self.ready or self.finished: return [] try: img = bridge.compressed_imgmsg_to_cv2(msg, desired_encoding='passthrough') except CvBridgeError as e: print(e) else: # 308x410 to 128x128 img = img[::2,::3,:] size = self.model.input_size[1:] img = sm.resize(img,size,mode='constant').astype(float) return img def process_depth(self, msg): """ Convert depth serial data to opencv image of correct size""" # if not self.ready or self.finished: return [] try: # Convert your ROS Image message to OpenCV2 de = bridge.imgmsg_to_cv2(msg, desired_encoding='passthrough')#gets float of 32FC1 depth image except CvBridgeError as e: print(e) else: de = de[::6,::8] shp=de.shape # # assume that when value is not a number it is due to a too large distance (set to 5m) # # values can be nan for when they are closer than 0.5m but than the evaluate node should # # kill the run anyway. de=np.asarray([ e*1.0 if not np.isnan(e) else 5 for e in de.flatten()]).reshape(shp) # clipping nans: dur: 0.010 size = (55,74) # print 'DEPTH: min: ',np.amin(de),' and max: ',np.amax(de) de = sm.resize(de,size,order=1,mode='constant', preserve_range=True) return de def process_scan(self, msg): """Preprocess serial scan: clip horizontal field of view, clip at 1's and ignore 0's, smooth over 4 bins.""" # field of view should follow camera: # wide-angle camera: -60 to 60. # normal camera: -35 to 35. 
ranges=[1 if r > 1 or r==0 else r for r in msg.ranges] # clip left 45degree range from 0:45 reversed with right 45degree range from the last 45: ranges=list(reversed(ranges[:self.FLAGS.field_of_view/2]))+list(reversed(ranges[-self.FLAGS.field_of_view/2:])) # add some smoothing by averaging over 4 neighboring bins ranges = [sum(ranges[i*self.FLAGS.smooth_scan:i*self.FLAGS.smooth_scan+self.FLAGS.smooth_scan])/self.FLAGS.smooth_scan for i in range(int(len(ranges)/self.FLAGS.smooth_scan))] # make it a numpy array de = np.asarray(ranges).reshape((1,-1)) # if list(de.shape) != self.model.output_size: # reshape if necessary # de = sm.resize(de,self.model.output_size,order=1,mode='constant', preserve_range=True) return de def compressed_image_callback(self, msg): """ Process serial image data with process_rgb and concatenate frames if necessary""" im = self.process_rgb_compressed(msg) if len(im)!=0: self.process_input(im) def image_callback(self, msg): """ Process serial image data with process_rgb and concatenate frames if necessary""" self.time_im_received.append(time.time()) im = self.process_rgb(msg) if len(im)!=0: if 'nfc' in self.FLAGS.network: # when features are concatenated, multiple images should be kept. self.nfc_images.append(im) if len(self.nfc_images) < self.FLAGS.n_frames: return else: # concatenate last n-frames im = np.concatenate(np.asarray(self.nfc_images[-self.FLAGS.n_frames:]),axis=2) self.nfc_images = self.nfc_images[-self.FLAGS.n_frames+1:] # concatenate last n-1-frames self.process_input(im) def depth_callback(self, msg): im = self.process_depth(msg) if len(im)!=0 and self.FLAGS.auxiliary_depth: self.target_depth = im def scan_depth_callback(self, msg): im = self.process_scan(msg) if len(im)!=0: self.depth = im # calculate depth loss on the fly if len(self.depth_prediction) != 0: # print("pred: {0} trg: {1}".format(self.depth_prediction, self.depth)) self.depth_loss.append(np.mean((self.depth_prediction - self.depth.flatten())**2)) def process_input(self, im): """Process the inputs: images, targets, auxiliary tasks Predict control based on the inputs. Plot auxiliary predictions. Fill replay buffer. """ # skip a number of frames to lower the actual control rate # independently of the image frame rate if self.skip_frames != 0: self.img_index+=1 if self.img_index % (self.skip_frames+1) != 0: return aux_depth=[] # variable to keep predicted depth trgt = -100. inpt=im if self.FLAGS.evaluate: ### EVALUATE trgt=np.array([[self.target_control[5]]]) if len(self.target_control) != 0 else [] trgt_depth = np.array([copy.deepcopy(self.target_depth)]) if len(self.target_depth) !=0 and self.FLAGS.auxiliary_depth else [] control, aux_results = self.model.forward([inpt], auxdepth= not self.FLAGS.dont_show_depth,targets=trgt, depth_targets=trgt_depth) if not self.FLAGS.dont_show_depth and self.FLAGS.auxiliary_depth and len(aux_results)>0: aux_depth = aux_results['d'] else: ###TRAINING # Get necessary labels, if label is missing wait... 
def check_field(target_name): if len (target_name) == 0: # print('Waiting for target {}'.format(target_name)) return False else: return True if not check_field(self.target_control): return else: trgt = self.target_control[5] if self.FLAGS.auxiliary_depth: if not check_field(self.target_depth): return else: trgt_depth = copy.deepcopy(self.target_depth) control, aux_results = self.model.forward([inpt], auxdepth=not self.FLAGS.dont_show_depth) if not self.FLAGS.dont_show_depth and self.FLAGS.auxiliary_depth: aux_depth = aux_results['d'] ### SEND CONTROL control = control[0] # print control if trgt != -100 and not self.FLAGS.evaluate: # policy mixing with self.FLAGS.alpha action = trgt if np.random.binomial(1, self.FLAGS.alpha**(self.runs['train']+1)) else control else: action = control msg = Twist() msg.linear.x = self.FLAGS.speed if self.FLAGS.noise == 'ou': noise = self.exploration_noise.noise() msg.linear.y = (not self.FLAGS.evaluate)*noise[1]*self.FLAGS.sigma_y msg.linear.z = (not self.FLAGS.evaluate)*noise[2]*self.FLAGS.sigma_z msg.angular.z = max(-1,min(1,action+(not self.FLAGS.evaluate)*self.FLAGS.sigma_yaw*noise[3])) elif self.FLAGS.noise == 'uni': # msg.linear.x = self.FLAGS.speed + (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_x, self.FLAGS.sigma_x) msg.linear.y = (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_y, self.FLAGS.sigma_y) msg.linear.z = (not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_z, self.FLAGS.sigma_z) msg.angular.z = max(-1,min(1,action+(not self.FLAGS.evaluate)*np.random.uniform(-self.FLAGS.sigma_yaw, self.FLAGS.sigma_yaw))) else: raise IOError( 'Type of noise is unknown: {}'.format(self.FLAGS.noise)) # if np.abs(msg.angular.z) > 0.3: msg.linear.x = 0. if np.abs(msg.angular.z) > 0.3 and self.FLAGS.break_and_turn: msg.linear.x = 0. + np.random.binomial(1, 0.1) self.action_pub.publish(msg) self.time_ctr_send.append(time.time()) ### keep track of imitation loss on the fly if len(self.target_control) != 0: self.imitation_loss.append((self.target_control[5]-action)**2) if not self.FLAGS.dont_show_depth and len(aux_depth) != 0 and not self.finished: aux_depth = aux_depth.flatten() self.depth_pub.publish(aux_depth) aux_depth = [] # ADD EXPERIENCE REPLAY if not self.FLAGS.evaluate and trgt != -100: experience={'state':im, 'action':action, 'trgt':trgt} if self.FLAGS.auxiliary_depth: experience['target_depth']=trgt_depth self.replay_buffer.add(experience) # print("added experience: {0} vs {1}".format(action, trgt)) def supervised_callback(self, data): """Get target control from the /supervised_vel node""" # print 'received control' if not self.ready: return self.target_control = [data.linear.x, data.linear.y, data.linear.z, data.angular.x, data.angular.y, data.angular.z] def finished_callback(self,msg): """When run is finished: sample 10 batches from the replay buffer, apply gradient descent on the model, write log file and checkpoints away """ if self.ready and not self.finished: print('neural control deactivated. 
@ time: {}'.format(time.time())) self.ready=False self.finished=True if self.start_time!=0: self.driving_duration = rospy.get_time() - self.start_time # Train model from experience replay: # Train the model with batchnormalization out of the image callback loop depth_predictions = [] losses_train = {} if self.replay_buffer.size()>self.FLAGS.batch_size and not self.FLAGS.evaluate: for b in range(min(int(self.replay_buffer.size()/self.FLAGS.batch_size), 10)): inputs, targets, aux_info = self.replay_buffer.sample_batch(self.FLAGS.batch_size) if b==0: if self.FLAGS.plot_depth and self.FLAGS.auxiliary_depth: depth_predictions = tools.plot_depth(inputs, aux_info['target_depth'].reshape(-1,55,74)) depth_targets=[] if self.FLAGS.auxiliary_depth: depth_targets=aux_info['target_depth'].reshape(-1,55,74) losses = self.model.backward(inputs,targets[:].reshape(-1,1),depth_targets) for k in losses.keys(): try: losses_train[k].append(losses[k]) except: losses_train[k]=[losses[k]] # Gather all info to build a proper summary and string of results k='train' if not self.FLAGS.evaluate else 'test' self.average_distances[k]= self.average_distances[k]-self.average_distances[k]/(self.runs[k]+1) self.average_distances[k] = self.average_distances[k]+self.current_distance/(self.runs[k]+1) self.runs[k]+=1 sumvar={} result_string='{0}: run {1}'.format(time.strftime('%H:%M'),self.runs[k]) vals={'current':self.current_distance, 'furthest':self.furthest_point} for d in ['current', 'furthest']: name='Distance_{0}_{1}'.format(d,'train' if not self.FLAGS.evaluate else 'test') if len(self.world_name)!=0: name='{0}_{1}'.format(name,self.world_name) sumvar[name]=vals[d] result_string='{0}, {1}:{2}'.format(result_string, name, vals[d]) for k in losses_train.keys(): name={'total':'Loss_train_total'} sumvar[name[k]]=np.mean(losses_train[k]) result_string='{0}, {1}:{2}'.format(result_string, name[k], np.mean(losses_train[k])) # get all metrics of this episode and add them to var results = self.model.get_metrics() for k in results.keys(): sumvar[k] = results[k] result_string='{0}, {1}:{2}'.format(result_string, k, results[k]) if self.FLAGS.plot_depth and self.FLAGS.auxiliary_depth: sumvar["depth_predictions"]=depth_predictions # add driving duration (collision free) if self.driving_duration != -1: result_string='{0}, driving_duration: {1:0.3f}'.format(result_string, self.driving_duration) sumvar['driving_time']=self.driving_duration # add imitation loss if len(self.imitation_loss)!=0: result_string='{0}, imitation_loss: {1:0.3}'.format(result_string, np.mean(self.imitation_loss)) sumvar['imitation_loss']=np.mean(self.imitation_loss) # add depth loss if len(self.depth_loss)!=0: result_string='{0}, depth_loss: {1:0.3f}, depth_loss_var: {2:0.3f}'.format(result_string, np.mean(self.depth_loss), np.var(self.depth_loss)) sumvar['depth_loss']=np.mean(self.depth_loss) if len(self.time_ctr_send) > 10 and len(self.time_im_received) > 10: # calculate control-rates and rgb-rates from differences avg_ctr_rate = 1/np.mean([self.time_ctr_send[i+1]-self.time_ctr_send[i] for i in range(len(self.time_ctr_send)-1)]) std_ctr_delays = np.std([self.time_ctr_send[i+1]-self.time_ctr_send[i] for i in range(len(self.time_ctr_send)-1)]) avg_im_rate = 1/np.mean([self.time_im_received[i+1]-self.time_im_received[i] for i in range(1,len(self.time_im_received)-1)]) #skip first image delay as network still needs to 'startup' std_im_delays = np.std([self.time_ctr_send[i+1]-self.time_ctr_send[i] for i in range(len(self.time_ctr_send)-1)]) result_string='{0}, 
control_rate: {1:0.3f}, image_rate: {2:0.3f}, control_delay_std: {3:0.3f}, image_delay_std: {4:0.3f} '.format(result_string, avg_ctr_rate, avg_im_rate, std_ctr_delays, std_im_delays) try: self.model.summarize(sumvar) except Exception as e: print('failed to write', e) pass else: print(result_string) # ! Note: tf_log is used by evaluate_model train_model and train_and_evaluate_model in simulation_supervised/scripts # Script starts next run once this file is updated. try: f=open(os.path.join(self.logfolder,'tf_log'),'a') f.write(result_string) f.write('\n') f.close() except Exception as e: print('failed to write txt tf_log {}'.format(e)) print('retry after sleep 60') time.sleep(60) f=open(os.path.join(self.logfolder,'tf_log'),'a') f.write(result_string) f.write('\n') f.close() # self.accumlosses = {} self.current_distance = 0 self.last_pose = [] self.nfc_images = [] self.furthest_point = 0 self.world_name = '' if self.runs['train']%20==0 and not self.FLAGS.evaluate: # Save a checkpoint every 20 runs. self.model.save(self.logfolder) print('model saved [run {0}]'.format(self.runs['train'])) self.time_im_received=[] self.time_ctr_send=[] self.model.reset_metrics() self.start_time=0 self.imitation_loss=[] self.depth_loss=[] self.driving_duration=-1 self.img_index=0 self.fsm_index = 0
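The control-rate and image-rate statistics at the end of finished_callback are computed inline from the stored timestamps; isolated as a small helper (a sketch, not part of the original node), the same computation reads:

# Sketch: average rate (Hz) and inter-arrival jitter (s) from a list of time.time() stamps.
import numpy as np

def rate_and_jitter(timestamps):
    deltas = np.diff(np.asarray(timestamps, dtype=np.float64))
    if len(deltas) == 0:
        return 0.0, 0.0
    return 1.0 / deltas.mean(), deltas.std()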
s_dim1 = env.s_dim a_dim1 = env.a_dim a_bound1 = env.a_bound ddpg = DDPG(a_dim1, s_dim1, a_bound1, MAP_DIM, att_dim=32) exploration_noise = OUNoise(a_dim1) # control exploration t1 = time.time() replay_num = 0 env.set_map_seed(187) for i in range(MAX_EPISODES): t_start = time.time() sd = i * 3 + 100 m_sd, s, gm, loc = env.set_state_seed(sd) lm = env.get_local_map(loc) exploration_noise.reset() ep_reward = 0 ave_dw = 0 j = 0 r = 0 for j in range(MAX_EP_STEPS): # Add exploration noise a = ddpg.choose_action(s, gm, lm) ave_dw += np.linalg.norm(a) a += exploration_noise.noise() # add randomness to action selection for exploration a = np.minimum(a_bound1, np.maximum(-a_bound1, a)) a[0:4] /= max(np.linalg.norm(a[0:4]), 1e-8) s_, loc_, r, done = env.step(a)
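The two lines that post-process the noisy action first clip every component to the action bound and then rescale the first four components to unit norm; as a standalone helper (a sketch of the same two steps) this is:

# Sketch of the action post-processing used in the loop above.
import numpy as np

def postprocess_action(a, a_bound):
    a = np.clip(a, -a_bound, a_bound)                 # keep every component inside [-a_bound, a_bound]
    a[0:4] /= max(np.linalg.norm(a[0:4]), 1e-8)       # unit-normalise the first four components
    return a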
class DDPG(): """Reinforcement Learning agent , learning using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.08 self.exploration_sigma = 0.15 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.95 # discount factor 0.99 self.tau = 0.001 # for soft update of target parameters 0.01 # Score tracker and learning parameters self.total_reward = None self.count = 0 self.score = 0 self.best_score = -np.inf self.last_state = None def reset_episode(self): self.total_reward = None self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): if self.total_reward: self.total_reward += reward else: self.total_reward = reward self.count += 1 # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" states = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(states)[0] # add some noise for exploration return list(action + self.noise.sample()) def learn(self, experiences): """Update policy and value parameters using given batch of reward tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted actions of next-state and Q values from target models actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) # track best score self.score = self.total_reward / float( self.count) if self.count else -np.inf if self.best_score < self.score: self.best_score = self.score def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
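The soft_update at the end blends each weight array with a factor tau; a tiny numeric example (made-up values, purely for illustration) shows how slowly the target tracks the local network:

# Illustration of the soft update theta_target <- tau * theta_local + (1 - tau) * theta_target.
import numpy as np

tau = 0.001
local = [np.array([1.0, 2.0]), np.array([[3.0]])]
target = [np.zeros(2), np.zeros((1, 1))]

target = [tau * l + (1.0 - tau) * t for l, t in zip(local, target)]
# target is now [array([0.001, 0.002]), array([[0.003]])]; after many updates the target
# weights converge toward the local weights with a lag of roughly 1/tau steps.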
class DDPG: """docstring for DDPG""" def __init__(self, environment): self.name = 'DDPG' # name for uploading results self.environment = environment # Randomly initialize actor network and critic network # with both their target networks self.actor_network = ActorNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) self.critic_network = CriticNetwork(state_size = environment.observation_space.shape[0],action_size = environment.action_space.shape[0]) # initialize replay buffer self.replay_buffer = deque() # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration_noise = OUNoise(environment.action_space.shape[0]) # Initialize time step self.time_step = 0 def set_init_observation(self,observation): # receive initial observation state self.state = observation def train(self): # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch,[BATCH_SIZE,1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.target_evaluate(next_state_batch) q_value_batch = self.critic_network.target_evaluate(next_state_batch,next_action_batch) for i in range(0,BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i]) # Update critic by minimizing the loss L self.critic_network.train(y_batch,state_batch,action_batch) # Update the actor policy using the sampled gradient: action_batch_for_gradients = self.actor_network.evaluate(state_batch) q_gradient_batch = self.critic_network.gradients(state_batch,action_batch_for_gradients)/BATCH_SIZE self.actor_network.train(q_gradient_batch,state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target() def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action+self.exploration_noise.noise(),self.environment.action_space.low,self.environment.action_space.high) def set_feedback(self,observation,action,reward,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer next_state = observation self.replay_buffer.append((self.state,action,reward,next_state,done)) # Update current state self.state = next_state # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() if self.time_step % 10000 == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # Re-iniitialize the random process when an episode ends if done: self.exploration_noise.reset()
def s2l(): #Randomly initialize critic,actor,target critic, target actor network and replay buffer num_states = feature_size #num_states = env.observation_space.shape[0] num_actions = env.action_space.shape[0] print("Number of States:", num_states) print("Number of Actions:", num_actions) agent = DDPG(env, is_batch_norm, num_states, num_actions) exploration_noise = OUNoise(env.action_space.shape[0]) counter = 0 total_reward = 0 print("Number of Rollouts per episode:", num_rollouts) print("Number of Steps per roll out:", steps) reward_st = np.array([0]) #saving reward eval_metric_st = np.array([0]) reward_st_all = np.array([0]) #saving reward after every step frame_obj = Frame_Feature() #activity_obj=Vid_Feature() demo_vid_array = demo_array_extractor(demo_folder) demo_features = frame_obj.video_feature_extractor(demo_vid_array) for episode in range(num_episodes): print("==== Starting episode no:", episode, "====", "\n") env.reset() # Reset env in the begining of each episode env.render() obs_img = env.render(mode='rgb_array') # Get the observation obs_img = np.array(misc.imresize(obs_img, [112, 112, 3])) observation = np.array(frame_obj.frame_feature_extractor(obs_img)) observation = observation.reshape(-1) reward_per_episode = 0 for t in range(num_rollouts): reward_per_rollout = 0 vid_robo_ = [] for i in range(steps): x = observation action = agent.evaluate_actor(np.reshape(x, [1, num_states])) noise = exploration_noise.noise() action = action[ 0] + noise #Select action according to current policy and exploration noise print('Action at episode-', episode, 'rollout-', t, 'step-', i, " :", action) _, _, done, info = env.step(action) env.render() obs_robo_ = env.render(mode='rgb_array') # Get the observation obs_robo = misc.imresize(obs_robo_, [112, 112, 3]) vid_robo_.append(obs_robo) observation = np.array( frame_obj.frame_feature_extractor(np.array(obs_robo))) observation = observation.reshape(-1) #pasue() if (i == 15): vid_robo = np.array(vid_robo_) robo_features = frame_obj.video_feature_extractor(vid_robo) reward = -(distance(demo_features, robo_features)) reward = np.array(reward) print('reward: ', reward) else: reward = 0 reward = np.array(reward) print('reward: ', reward) # Printing eval_metric after every rollout eval_metric = np.array(env.get_eval()) eval_metric = eval_metric.reshape(-1) print('Distance to goal:', eval_metric) eval_metric_st = np.append(eval_metric_st, eval_metric) np.savetxt('eval_metric_per_step.txt', eval_metric_st, newline="\n") # Storing reward after every rollout reward_st_all = np.append(reward_st_all, reward) np.savetxt('reward_all.txt', reward_st_all, newline="\n") #add s_t,s_t+1,action,reward to experience memory agent.add_experience(x, observation, action, reward, False) reward_per_rollout += reward counter += 1 #train critic and actor network if counter > start_training: agent.train() print('\n\n') #Saving policy if ((episode % 50) == 0 and t == num_rollouts - 1): print('saving policy...........................!') agent.save_actor(episode) reward_per_episode += reward_per_rollout #check if episode ends: print('EPISODE: ', episode, ' Total Reward: ', reward_per_episode) print("Printing reward to file") exploration_noise.reset( ) #reinitializing random noise for action exploration reward_st = np.append(reward_st, reward_per_episode) np.savetxt('episode_reward.txt', reward_st, fmt='%f', newline="\n") print('\n\n') total_reward += reward_per_episode print("Average reward per episode {}".format(total_reward / num_episodes))
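s2l scores each rollout with distance(demo_features, robo_features), whose definition is not shown in this section; a plausible sketch, assuming a plain Euclidean distance between flattened feature vectors, would be:

# Sketch of a feature-distance reward term (assumed Euclidean; the original metric is defined elsewhere).
import numpy as np

def distance(demo_features, robo_features):
    d = np.asarray(demo_features, dtype=np.float64).ravel()
    r = np.asarray(robo_features, dtype=np.float64).ravel()
    return float(np.linalg.norm(d - r))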
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def load(self, model_dir, agent_id): # Load Actor and Critic network weights self.actor_local.load_state_dict( torch.load( os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id)))) self.critic_local.load_state_dict( torch.load( os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))) def save(self, model_dir, agent_id): # Save Actor and Critic network weights torch.save( self.actor_local.state_dict(), os.path.join(model_dir, 'agent_{0}_actor.pth'.format(agent_id))) torch.save( self.critic_local.state_dict(), os.path.join(model_dir, 'agent_{0}_critic.pth'.format(agent_id)))
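A Gym-style loop around this PyTorch agent could look like the sketch below (environment name, episode counts, and seed are placeholders; the classic four-value step API is assumed):

# Sketch of an episode loop driving the Agent defined above (placeholder environment and counts).
import gym

def train(env_name='Pendulum-v0', n_episodes=200, max_steps=1000, seed=2):
    env = gym.make(env_name)
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.shape[0],
                  random_seed=seed)
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        agent.reset()                                   # reset the OU noise between episodes
        score = 0.0
        for _ in range(max_steps):
            action = agent.act(state)                   # noisy action from the local actor
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
    return scores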
class AgentDDPG: def __init__(self, env, state_size, action_size): self.env = env self.replay_memory = deque() self.actor_network = actor_network.ActorNetwork( state_size, action_size) self.critic_network = critic_network.CriticNetwork( state_size, action_size) self.ou_noise = OUNoise(action_size) self.time_step = 0 def set_state(self, obs): self.state = obs def get_action(self): # Select action a_t according to the current policy and exploration noise action = self.actor_network.get_action(self.state) return np.clip(action + self.ou_noise.noise(), self.env.action_space.low, self.env.action_space.high) def set_feedback(self, obs, action, reward, done): next_state = obs self.replay_memory.append( (self.state, action, reward, next_state, done)) self.state = next_state self.time_step += 1 if len(self.replay_memory) > config.MEMORY_SIZE: self.replay_memory.popleft() # Store transitions to replay start size then start training if self.time_step > config.OBSERVATION_STEPS: self.train() if self.time_step % config.SAVE_EVERY_X_STEPS == 0: self.actor_network.save_network(self.time_step) self.critic_network.save_network(self.time_step) # reinit the random process when an episode ends if done: self.ou_noise.reset() def train(self): minibatch = random.sample(self.replay_memory, config.MINI_BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] next_state_batch = [data[3] for data in minibatch] action_batch = np.resize(action_batch, [config.MINI_BATCH_SIZE, 1]) # Calculate y y_batch = [] next_action_batch = self.actor_network.get_target_action_batch( next_state_batch) q_value_batch = self.critic_network.get_target_q_batch( next_state_batch, next_action_batch) for i in range(0, config.MINI_BATCH_SIZE): done = minibatch[i][4] if done: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + config.FUTURE_REWARD_DISCOUNT * q_value_batch[i]) y_batch = np.array(y_batch) y_batch = np.reshape(y_batch, [len(y_batch), 1]) # Update critic by minimizing the loss self.critic_network.train(y_batch, state_batch, action_batch) # Update the actor policy using the sampled gradient: action_batch = self.actor_network.get_action_batch(state_batch) q_gradient_batch = self.critic_network.get_gradients( state_batch, action_batch) self.actor_network.train(q_gradient_batch, state_batch) # Update the target networks self.actor_network.update_target() self.critic_network.update_target()
class DDPG: def __init__(self, env): self.name = 'DDPG' # name for uploading results self.environment = env state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] # Initialize time step self.time_step = 0 # initialize replay buffer self.replay_buffer = deque() # initialize networks self.create_networks_and_training_method(state_dim,action_dim) self.sess = tf.InteractiveSession() self.sess.run(tf.initialize_all_variables()) # loading networks self.saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state("saved_networks") if checkpoint and checkpoint.model_checkpoint_path: self.saver.restore(self.sess, checkpoint.model_checkpoint_path) print "Successfully loaded:", checkpoint.model_checkpoint_path else: print "Could not find old network weights" global summary_writer summary_writer = tf.train.SummaryWriter('~/logs',graph=self.sess.graph) def create_networks_and_training_method(self,state_dim,action_dim): theta_p = networks.theta_p(state_dim,action_dim) theta_q = networks.theta_q(state_dim,action_dim) target_theta_p,target_update_p = self.exponential_moving_averages(theta_p,TAU) target_theta_q,target_update_q = self.exponential_moving_averages(theta_q,TAU) self.state = tf.placeholder(tf.float32,[None,state_dim],'state') self.action_test = networks.policy_network(self.state,theta_p) # Initialize a random process the Ornstein-Uhlenbeck process for action exploration self.exploration = OUNoise(action_dim) noise = self.exploration.noise() self.action_exploration = self.action_test + noise q = networks.q_network(self.state,self.action_test,theta_q) # policy optimization mean_q = tf.reduce_mean(q) weight_decay_p = tf.add_n([L2_POLICY * tf.nn.l2_loss(var) for var in theta_p]) loss_p = -mean_q + weight_decay_p optim_p = tf.train.AdamOptimizer(P_LEARNING_RATE) grads_and_vars_p = optim_p.compute_gradients(loss_p, var_list=theta_p) optimize_p = optim_p.apply_gradients(grads_and_vars_p) with tf.control_dependencies([optimize_p]): self.train_p = tf.group(target_update_p) # q optimization self.action_train = tf.placeholder(tf.float32,[None,action_dim],'action_train') self.reward = tf.placeholder(tf.float32,[None],'reward') self.next_state = tf.placeholder(tf.float32,[None,state_dim],'next_state') self.done = tf.placeholder(tf.bool,[None],'done') q_train = networks.q_network(self.state,self.action_train,theta_q) next_action = networks.policy_network(self.next_state,theta=target_theta_p) next_q = networks.q_network(self.next_state,next_action,theta=target_theta_q) q_target = tf.stop_gradient(tf.select(self.done,self.reward,self.reward + GAMMA * next_q)) # q loss q_error = tf.reduce_mean(tf.square(q_target - q_train)) weight_decay_q = tf.add_n([L2_Q * tf.nn.l2_loss(var) for var in theta_q]) loss_q = q_error + weight_decay_q optim_q = tf.train.AdamOptimizer(Q_LEARNING_RATE) grads_and_vars_q = optim_q.compute_gradients(loss_q, var_list=theta_q) optimize_q = optim_q.apply_gradients(grads_and_vars_q) with tf.control_dependencies([optimize_q]): self.train_q = tf.group(target_update_q) tf.scalar_summary("loss_q",loss_q) tf.scalar_summary("loss_p",loss_p) tf.scalar_summary("q_mean",mean_q) global merged_summary_op merged_summary_op = tf.merge_all_summaries() def train(self): #print "train step",self.time_step # Sample a random minibatch of N transitions from replay buffer minibatch = random.sample(self.replay_buffer,BATCH_SIZE) state_batch = [data[0] for data in minibatch] action_batch = [data[1] for data in minibatch] reward_batch = [data[2] for data in minibatch] 
next_state_batch = [data[3] for data in minibatch] done_batch = [data[4] for data in minibatch] _,_,summary_str = self.sess.run([self.train_p,self.train_q,merged_summary_op],feed_dict={ self.state:state_batch, self.action_train:action_batch, self.reward:reward_batch, self.next_state:next_state_batch, self.done:done_batch }) summary_writer.add_summary(summary_str,self.time_step) # save network every 1000 iterations if self.time_step % 1000 == 0: self.saver.save(self.sess, 'saved_networks/' + 'network' + '-ddpg', global_step = self.time_step) def noise_action(self,state): # Select action a_t according to the current policy and exploration noise action = self.sess.run(self.action_exploration,feed_dict={ self.state:[state] })[0] return np.clip(action,self.environment.action_space.low,self.environment.action_space.high) def action(self,state): action = self.sess.run(self.action_test,feed_dict={ self.state:[state] })[0] return np.clip(action,self.environment.action_space.low,self.environment.action_space.high) def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.append((state,action,reward,next_state,done)) # Update time step self.time_step += 1 # Limit the replay buffer size if len(self.replay_buffer) > REPLAY_BUFFER_SIZE: self.replay_buffer.popleft() # Store transitions to replay start size then start training if self.time_step > REPLAY_START_SIZE: self.train() # Re-initialize the random process when an episode ends if done: self.exploration.reset() # f fan-in size def exponential_moving_averages(self,theta, tau=0.001): ema = tf.train.ExponentialMovingAverage(decay=1 - tau) update = ema.apply(theta) # also creates shadow vars averages = [ema.average(x) for x in theta] return averages, update
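This variant maintains its target networks with tf.train.ExponentialMovingAverage(decay=1 - tau), which is the same rule as the explicit soft updates used elsewhere in this document; written out by hand (a sketch, independent of the TensorFlow graph above):

# The EMA shadow update with decay = 1 - tau is exactly the DDPG soft update.
import numpy as np

def ema_update(target_params, source_params, tau=0.001):
    # shadow <- (1 - tau) * shadow + tau * var  ==  theta_target <- tau * theta_local + (1 - tau) * theta_target
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]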