class Layer():

    def __init__(self, layer_number, FLAGS, env, sess, agent_params):
        self.layer_number = layer_number
        self.FLAGS = FLAGS
        self.sess = sess

        # Set time limit for each layer. If the agent uses only 1 layer, the time limit is the max
        # number of low-level actions allowed in the episode (i.e., env.max_actions).
        if FLAGS.layers > 1:
            self.time_limit = FLAGS.time_scale
        else:
            self.time_limit = env.max_actions

        self.current_state = None
        self.goal = None

        # Initialize replay buffer. The variables below determine its size.

        # Ceiling on buffer size
        self.buffer_size_ceiling = 10**7

        # Number of full episodes stored in replay buffer
        self.episodes_to_store = agent_params["episodes_to_store"]

        # Number of transitions to serve as replay goals during goal replay
        self.num_replay_goals = 3

        # Number of transitions created for each attempt (i.e., action replay + goal replay + subgoal testing)
        if self.layer_number == 0:
            self.trans_per_attempt = (1 + self.num_replay_goals) * self.time_limit
        else:
            self.trans_per_attempt = (1 + self.num_replay_goals) * self.time_limit + int(self.time_limit/3)

        # Buffer size = transitions per attempt * attempts per episode * number of episodes stored
        self.buffer_size = min(self.trans_per_attempt * self.time_limit**(self.FLAGS.layers-1 - self.layer_number) * self.episodes_to_store, self.buffer_size_ceiling)

        # self.buffer_size = 10000000
        self.batch_size = 1024
        self.replay_buffer = ExperienceBuffer(self.buffer_size, self.batch_size)

        # Create buffer to store not yet finalized goal replay transitions
        self.temp_goal_replay_storage = []

        # Initialize actor and critic networks
        self.actor = Actor(sess, env, self.batch_size, self.layer_number, FLAGS)
        self.critic = Critic(sess, env, self.layer_number, FLAGS)

        # Parameter determines degree of noise added to actions during training
        # self.noise_perc = noise_perc
        if self.layer_number == 0:
            self.noise_perc = agent_params["atomic_noise"]
        else:
            self.noise_perc = agent_params["subgoal_noise"]

        # Flag indicating when the layer has run out of attempts to achieve its goal.
        # This is important for subgoal testing.
        self.maxed_out = False

        self.subgoal_penalty = agent_params["subgoal_penalty"]

    # Add noise to provided action
    def add_noise(self, action, env):

        # Noise added will be a percentage of the action range
        if self.layer_number == 0:
            action_bounds = env.action_bounds
            action_offset = env.action_offset
        else:
            action_bounds = env.subgoal_bounds_symmetric
            action_offset = env.subgoal_bounds_offset

        assert len(action) == len(action_bounds), "Action bounds must have same dimension as action"
        assert len(action) == len(self.noise_perc), "Noise percentage vector must have same dimension as action"

        # Add noise to action and ensure it remains within bounds
        for i in range(len(action)):
            action[i] += np.random.normal(0, self.noise_perc[i] * action_bounds[i])
            action[i] = max(min(action[i], action_bounds[i]+action_offset[i]), -action_bounds[i]+action_offset[i])

        return action

    # Select random action
    def get_random_action(self, env):

        if self.layer_number == 0:
            action = np.zeros((env.action_dim))
        else:
            action = np.zeros((env.subgoal_dim))

        # Each dimension of the random action should take some value in the dimension's range
        for i in range(len(action)):
            if self.layer_number == 0:
                action[i] = np.random.uniform(-env.action_bounds[i] + env.action_offset[i], env.action_bounds[i] + env.action_offset[i])
            else:
                action[i] = np.random.uniform(env.subgoal_bounds[i][0], env.subgoal_bounds[i][1])

        return action

    # Select action using an epsilon-greedy policy
    def choose_action(self, agent, env, subgoal_test):

        # If in testing mode or testing subgoals, action is the output of the actor network without noise
        if agent.FLAGS.test or subgoal_test:
            return self.actor.get_action(np.reshape(self.current_state,(1,len(self.current_state))), np.reshape(self.goal,(1,len(self.goal))))[0], "Policy", subgoal_test

        else:
            if np.random.random_sample() > 0.2:
                # Choose noisy action
                action = self.add_noise(self.actor.get_action(np.reshape(self.current_state,(1,len(self.current_state))), np.reshape(self.goal,(1,len(self.goal))))[0], env)
                action_type = "Noisy Policy"

            # Otherwise, choose random action
            else:
                action = self.get_random_action(env)
                action_type = "Random"

            # Determine whether to test upcoming subgoal
            if np.random.random_sample() < agent.subgoal_test_perc:
                next_subgoal_test = True
            else:
                next_subgoal_test = False

            return action, action_type, next_subgoal_test

    # Create action replay transition by evaluating hindsight action given original goal
    def perform_action_replay(self, hindsight_action, next_state, goal_status):

        # Determine reward (0 if goal achieved, -1 otherwise) and finished boolean. The finished
        # boolean is used for determining the target for Q-value updates.
        if goal_status[self.layer_number]:
            reward = 0
            finished = True
        else:
            reward = -1
            finished = False

        # Transition will take the form [old state, hindsight_action, reward, next_state, goal, terminate boolean, None]
        transition = [self.current_state, hindsight_action, reward, next_state, self.goal, finished, None]
        # print("AR Trans: ", transition)

        # Add action replay transition to layer's replay buffer
        self.replay_buffer.add(np.copy(transition))

    # Create initial goal replay transitions
    def create_prelim_goal_replay_trans(self, hindsight_action, next_state, env, total_layers):

        # Create transition evaluating hindsight action for some goal to be determined in the future.
        # The goal will ultimately be selected from states the layer has traversed through.
        # Transition will be in the form [old state, hindsight action, reward = None, next state,
        # goal = None, finished = None, next state projected to subgoal/end goal space].
        if self.layer_number == total_layers - 1:
            hindsight_goal = env.project_state_to_end_goal(env.sim, next_state)
        else:
            hindsight_goal = env.project_state_to_subgoal(env.sim, next_state)

        transition = [self.current_state, hindsight_action, None, next_state, None, None, hindsight_goal]
        # print("\nPrelim GR A: ", transition)

        self.temp_goal_replay_storage.append(np.copy(transition))

        """
        # Designer can create some additional goal replay transitions. For instance, higher level
        # transitions can be replayed with the subgoal achieved in hindsight as the original goal.
        if self.layer_number > 0:
            transition_b = [self.current_state, hindsight_action, 0, next_state, hindsight_goal, True, None]
            # print("\nGoal Replay B: ", transition_b)
            self.replay_buffer.add(np.copy(transition_b))
        """

    # Return reward given provided goal and goal achieved in hindsight
    def get_reward(self, new_goal, hindsight_goal, goal_thresholds):

        assert len(new_goal) == len(hindsight_goal) == len(goal_thresholds), "Goal, hindsight goal, and goal thresholds do not have same dimensions"

        # If the difference in any dimension is greater than threshold, goal not achieved
        for i in range(len(new_goal)):
            if np.absolute(new_goal[i] - hindsight_goal[i]) > goal_thresholds[i]:
                return -1

        # Else goal is achieved
        return 0

    # Finalize goal replay by filling in goal, reward, and finished boolean for the preliminary
    # goal replay transitions created before
    def finalize_goal_replay(self, goal_thresholds):

        # Choose transitions to serve as goals during goal replay. The last transition will always be used.
        num_trans = len(self.temp_goal_replay_storage)

        num_replay_goals = self.num_replay_goals
        # If there are fewer transitions than the ordinary number of replay goals, lower the number of replay goals
        if num_trans < self.num_replay_goals:
            num_replay_goals = num_trans

        """
        if self.layer_number == 1:
            print("\n\nPerforming Goal Replay\n\n")
            print("Num Trans: ", num_trans, ", Num Replay Goals: ", num_replay_goals)
        """

        indices = np.zeros((num_replay_goals))
        indices[:num_replay_goals-1] = np.random.randint(num_trans, size=num_replay_goals-1)
        indices[num_replay_goals-1] = num_trans - 1
        indices = np.sort(indices)

        # if self.layer_number == 1:
        #     print("Selected Indices: ", indices)

        # For each selected transition, update the goal dimension of the selected transition and all
        # prior transitions by using the next state of the selected transition as the new goal.
        # Given the new goal, update the reward and finished boolean as well.
        for i in range(len(indices)):
            trans_copy = np.copy(self.temp_goal_replay_storage)

            # if self.layer_number == 1:
            #     print("GR Iteration: %d, Index %d" % (i, indices[i]))

            new_goal = trans_copy[int(indices[i])][6]
            # for index in range(int(indices[i])+1):
            for index in range(num_trans):

                # Update goal to new goal
                trans_copy[index][4] = new_goal

                # Update reward
                trans_copy[index][2] = self.get_reward(new_goal, trans_copy[index][6], goal_thresholds)

                # Update finished boolean based on reward
                if trans_copy[index][2] == 0:
                    trans_copy[index][5] = True
                else:
                    trans_copy[index][5] = False

                # Add finished transition to replay buffer
                # if self.layer_number == 1:
                #     print("\nNew Goal: ", new_goal)
                #     print("Upd Trans %d: " % index, trans_copy[index])

                self.replay_buffer.add(trans_copy[index])

        # Clear storage for preliminary goal replay transitions at end of goal replay
        self.temp_goal_replay_storage = []

    # Create transition penalizing subgoal if necessary. The target Q-value when this transition is
    # used will ignore the next state because the finished boolean = True. Change the finished
    # boolean to False if you would like the subgoal penalty to depend on the next state.
    def penalize_subgoal(self, subgoal, next_state, high_level_goal_achieved):

        transition = [self.current_state, subgoal, self.subgoal_penalty, next_state, self.goal, True, None]

        self.replay_buffer.add(np.copy(transition))

    # Determine whether layer is finished training
    def return_to_higher_level(self, max_lay_achieved, agent, env, attempts_made):

        # Return to higher level if (i) a higher level goal has been reached, (ii) the episode has
        # maxed out its time steps (env.max_actions), (iii) not testing and the layer is out of
        # attempts, or (iv) testing, the layer is not the highest level, and the layer is out of
        # attempts. NOTE: during testing, the highest level will continue to output subgoals until
        # either (i) the maximum number of episode time steps is reached or (ii) the end goal has
        # been achieved.

        # Return to previous level when any higher level goal achieved. NOTE: if not testing and the
        # agent achieves the end goal, training will continue until out of time (i.e., out of time
        # steps or the highest level runs out of attempts). This allows the agent to experience
        # being around the end goal.
        if max_lay_achieved is not None and max_lay_achieved >= self.layer_number:
            return True

        # Return when out of time
        elif agent.steps_taken >= env.max_actions:
            return True

        # Return when layer has maxed out attempts
        elif not agent.FLAGS.test and attempts_made >= self.time_limit:
            return True

        # NOTE: During testing, agent will have env.max_actions attempts to achieve goal
        elif agent.FLAGS.test and self.layer_number < agent.FLAGS.layers-1 and attempts_made >= self.time_limit:
            return True

        else:
            return False

    # Learn to achieve goals with actions belonging to the appropriate time scale.
    # "goal_array" contains the goal states for the current layer and all higher layers.
    def train(self, agent, env, subgoal_test=False, episode_num=None):

        # print("\nTraining Layer %d" % self.layer_number)

        # Set layer's current state and new goal state
        self.goal = agent.goal_array[self.layer_number]
        self.current_state = agent.current_state

        # Reset flag indicating whether layer has run out of attempts. This will be used for subgoal testing.
        self.maxed_out = False

        # Display all subgoals if visualizing training and current layer is bottom layer
        if self.layer_number == 0 and agent.FLAGS.show and agent.FLAGS.layers > 1:
            env.display_subgoals(agent.goal_array)
            # env.sim.data.mocap_pos[3] = env.project_state_to_end_goal(env.sim, self.current_state)
            # print("Subgoal Pos: ", env.sim.data.mocap_pos[1])

        # Current layer has self.time_limit attempts to reach its goal state.
        attempts_made = 0

        while True:

            # Select action to achieve goal state using epsilon-greedy policy, or greedy policy if in test mode
            action, action_type, next_subgoal_test = self.choose_action(agent, env, subgoal_test)

            """
            if self.layer_number == agent.FLAGS.layers - 1:
                # print("\nLayer %d Action: " % self.layer_number, action)
                print("Q-Value: ", self.critic.get_Q_value(np.reshape(self.current_state,(1,len(self.current_state))), np.reshape(self.goal,(1,len(self.goal))), np.reshape(action,(1,len(action)))))
            """

            # If next layer is not bottom level, propose subgoal for next layer to achieve and
            # determine whether that subgoal should be tested
            if self.layer_number > 0:

                # The proposed action of this layer simply becomes the goal of the layer below
                agent.goal_array[self.layer_number - 1] = action

                goal_status, max_lay_achieved = agent.layers[self.layer_number - 1].train(agent, env, next_subgoal_test, episode_num)

            # If layer is bottom level, execute low-level action
            else:
                next_state = env.execute_action(action)

                # Increment steps taken
                agent.steps_taken += 1
                # print("Num Actions Taken: ", agent.steps_taken)

                if agent.steps_taken >= env.max_actions:
                    print("Out of actions (Steps: %d)" % agent.steps_taken)

                agent.current_state = next_state

                # Determine whether any of the goals from any layer was achieved and, if applicable,
                # the highest layer whose goal was achieved
                goal_status, max_lay_achieved = agent.check_goals(env)

            attempts_made += 1

            # Print if goal from current layer has been achieved
            if goal_status[self.layer_number]:
                if self.layer_number < agent.FLAGS.layers - 1:
                    print("SUBGOAL ACHIEVED")
                print("\nEpisode %d, Layer %d, Attempt %d Goal Achieved" % (episode_num, self.layer_number, attempts_made))
                print("Goal: ", self.goal)
                if self.layer_number == agent.FLAGS.layers - 1:
                    print("Hindsight Goal: ", env.project_state_to_end_goal(env.sim, agent.current_state))
                else:
                    print("Hindsight Goal: ", env.project_state_to_subgoal(env.sim, agent.current_state))

            # Perform hindsight learning using action actually executed (low-level action or hindsight subgoal)
            if self.layer_number == 0:
                hindsight_action = action
            else:
                # If subgoal action was achieved by layer below, use this as hindsight action
                if goal_status[self.layer_number-1]:
                    hindsight_action = action
                # Otherwise, use subgoal that was achieved in hindsight
                else:
                    hindsight_action = env.project_state_to_subgoal(env.sim, agent.current_state)

            # Next, create hindsight transitions if not testing
            if not agent.FLAGS.test:

                # Create action replay transition by evaluating hindsight action given current goal
                self.perform_action_replay(hindsight_action, agent.current_state, goal_status)

                # Create preliminary goal replay transitions. The goal and reward in these transitions
                # will be finalized when this layer has run out of attempts or the goal has been achieved.
                self.create_prelim_goal_replay_trans(hindsight_action, agent.current_state, env, agent.FLAGS.layers)

                # Penalize subgoals if subgoal testing and subgoal was missed by lower layers after
                # the maximum number of attempts
                if self.layer_number > 0 and next_subgoal_test and agent.layers[self.layer_number-1].maxed_out:
                    self.penalize_subgoal(action, agent.current_state, goal_status[self.layer_number])

            # Print summary of transition
            if agent.FLAGS.verbose:

                print("\nEpisode %d, Training Layer %d, Attempt %d" % (episode_num, self.layer_number, attempts_made))
                # print("Goal Array: ", agent.goal_array, "Max Lay Achieved: ", max_lay_achieved)
                print("Old State: ", self.current_state)
                print("Hindsight Action: ", hindsight_action)
                print("Original Action: ", action)
                print("Next State: ", agent.current_state)
                print("Goal: ", self.goal)
                if self.layer_number == agent.FLAGS.layers - 1:
                    print("Hindsight Goal: ", env.project_state_to_end_goal(env.sim, agent.current_state))
                else:
                    print("Hindsight Goal: ", env.project_state_to_subgoal(env.sim, agent.current_state))
                print("Goal Status: ", goal_status, "\n")
                print("All Goals: ", agent.goal_array)

            # Update state of current layer
            self.current_state = agent.current_state

            # Return to previous level to receive next subgoal if applicable
            # if self.return_to_higher_level(max_lay_achieved, agent, env, attempts_made):
            if (max_lay_achieved is not None and max_lay_achieved >= self.layer_number) or agent.steps_taken >= env.max_actions or attempts_made >= self.time_limit:

                if self.layer_number == agent.FLAGS.layers-1:
                    print("HL Attempts Made: ", attempts_made)

                # If goal was not achieved after max number of attempts, set maxed out flag to true
                if attempts_made >= self.time_limit and not goal_status[self.layer_number]:
                    self.maxed_out = True
                    # print("Layer %d Out of Attempts" % self.layer_number)

                # If not testing, finish goal replay by filling in missing goal and reward values
                # before returning to prior level.
                if not agent.FLAGS.test:
                    if self.layer_number == agent.FLAGS.layers - 1:
                        goal_thresholds = env.end_goal_thresholds
                    else:
                        goal_thresholds = env.subgoal_thresholds

                    self.finalize_goal_replay(goal_thresholds)

                # Under certain circumstances, the highest layer will not seek a new end goal
                if self.return_to_higher_level(max_lay_achieved, agent, env, attempts_made):
                    return goal_status, max_lay_achieved

    # Update actor and critic networks
    def learn(self, num_updates):

        for _ in range(num_updates):

            # Update weights of non-target networks
            if self.replay_buffer.size >= self.batch_size:
                old_states, actions, rewards, new_states, goals, is_terminals = self.replay_buffer.get_batch()

                self.critic.update(old_states, actions, rewards, new_states, goals, self.actor.get_action(new_states, goals), is_terminals)
                action_derivs = self.critic.get_gradients(old_states, goals, self.actor.get_action(old_states, goals))
                self.actor.update(old_states, goals, action_derivs)
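# The ExperienceBuffer class used by Layer above is not shown in this snippet. A minimal sketch
# consistent with the interface the Layer relies on (constructor taking a max size and batch size,
# an add() method, a size attribute, and a get_batch() that unpacks the 7-element HAC transitions)
# might look like the following. This is an assumption about the missing class, not the original
# implementation.
import numpy as np

class ExperienceBuffer:

    def __init__(self, max_buffer_size, batch_size):
        self.size = 0
        self.max_buffer_size = max_buffer_size
        self.batch_size = batch_size
        self.experiences = []

    def add(self, experience):
        # Each experience: [state, action, reward, next_state, goal, is_terminal, hindsight_goal]
        self.experiences.append(experience)
        self.size += 1

        # Evict oldest transitions once the ceiling is exceeded
        while self.size > self.max_buffer_size:
            self.experiences.pop(0)
            self.size -= 1

    def get_batch(self):
        # Sample indices with replacement and unpack the first six transition fields
        dist = np.random.randint(0, high=self.size, size=self.batch_size)
        states, actions, rewards = [], [], []
        new_states, goals, is_terminals = [], [], []
        for i in dist:
            states.append(self.experiences[i][0])
            actions.append(self.experiences[i][1])
            rewards.append(self.experiences[i][2])
            new_states.append(self.experiences[i][3])
            goals.append(self.experiences[i][4])
            is_terminals.append(self.experiences[i][5])
        return states, actions, rewards, new_states, goals, is_terminals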
def train(sess, env, actor, actor_target, critic, critic_target):

    # Arguments
    init_episode = args['init_episode']
    epsilon_rate = args['epsilon_rate']
    max_episodes = args['max_episodes']
    max_steps = args['max_steps']
    batch_size = args['batch_size']
    buffer_size = args['buffer_size']
    gamma = args['gamma']
    seed = args['seed']
    tau = args['tau']

    # Restore model or run tensorflow init
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer()) if init_episode == 0 else saver.restore(
        sess, os.path.join(args['resources'], "network") + "/" + args['file'] + "_model")

    # Add tensorboard visualization
    # $ tensorboard --logdir=./resources/logdir
    # https://localghost:6006
    if args['tensorboard']:
        logdir = os.path.join(args['resources'], 'logdir')
        if not os.path.exists(logdir):
            os.mkdir(logdir)
        tf.summary.FileWriter(logdir, sess.graph)

    # Init experience buffer
    buffer = ExperienceBuffer(buffer_size, seed)

    counter = 0
    epsilon = 1.0
    transfer_network_params_every = 500
    relaunch_torcs_every = 1
    save_networks_every = 50
    epsilon_decay = calculate_epsilon_decay(max_episodes, epsilon_rate)

    ###############################################################
    for i in range(init_episode, max_episodes):

        # Relaunch TORCS every N episodes due to a memory leak error
        ob = env.reset(relaunch=True) if np.mod(i, relaunch_torcs_every) == 0 else env.reset()

        # Init reward counter and Q max value.
        reward_total = 0
        q_max = 0

        # Decrease noise every episode
        # epsilon -= epsilon_decay

        # Get environment state
        state_t0 = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
             ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        ###############################################################
        for j in range(max_steps):
            t0 = time()
            counter += 1

            # Action noise: decrease noise (epsilon) for each episode to get better results
            action = actor.predict(np.reshape(state_t0, (1, actor.state_size)))
            action[0, :] += noise(x=action[0, :], epsilon_decay=epsilon)

            # For the first episodes (i < 10), force full acceleration.
            if (i < 10):
                action[0][0] = 0.0
                action[0][1] = 1.0
                action[0][2] = 0.0

            # Run a step and get data from the environment.
            ob, reward, fin, info = env.step(action[0])

            # Update target networks
            if ((counter % transfer_network_params_every) == 0):
                transfer_network_params(sess, actor, actor_target, tau)
                transfer_network_params(sess, critic, critic_target, tau)
                print("\n***************************************************************************************")
                print("---------------------------- UPDATE TARGET NETWORK PARAMS -----------------------------")
                print("*************************************{:2.2E}**************************************************\n"
                      .format(time() - t0))
            else:
                state_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

                # Add new experience to buffer.
                buffer.append(np.reshape(state_t0, (actor.state_size, )),
                              np.reshape(action, (actor.action_size, )), reward,
                              fin, np.reshape(state_t1, (actor.state_size, )))

                if len(buffer) > batch_size:
                    states_t0_batch, actions_batch, rewards_batch, fin_batch, states_t1_batch = buffer.get_batch(batch_size)

                    #************************************************#
                    ## Calculate target_q to train the critic network ##
                    #************************************************#

                    # Predict target q: predict with the target actor and target critic and combine both.
                    actor_q_target = actor_target.predict(states_t1_batch)
                    _critic_q_target = critic_target.predict(states_t1_batch, actor_q_target)

                    # Calculate target q with gamma and rewards: Bellman equation.
                    critic_q_target = [
                        x if fin_batch[i] else (x + gamma * _critic_q_target[i])
                        for i, x in enumerate(rewards_batch)
                    ]

                    # Train critic network with target Q values.
                    critic_q_predicted, _ = critic.train(
                        states_t0_batch, actions_batch,
                        np.reshape(critic_q_target, (batch_size, 1)))

                    # Get max Q value predicted by critic.
                    q_max += np.amax(critic_q_predicted)

                    #******************************************************************#
                    ## Calculate Q actions and get gradients to train the actor network ##
                    #******************************************************************#

                    # Train actor network with critic gradients.
                    actor_action_predict = actor.predict(states_t0_batch)
                    gradients = critic.calculate_gradients(states_t0_batch, actor_action_predict)
                    actor.train(states_t0_batch, gradients[0])

                    # Compute actor loss: MSE
                    # _actor_action_predict = actor.predict(states_t0_batch)
                    # actor_loss = actor.calculate_loss(actor_action_predict, _actor_action_predict)
                    # print("a-loss: ", actor_loss, "c-loss", critic_loss)

                state_t0 = state_t1
                reward_total += reward

                print('| Buffer {:7} | Episode: {:5d} | Step: {:5d} | Steer: {:2.2f}\t| Accel: {:2.2f}\t| Brake: {:2.2f}\t| Reward: {:6d} | Qmax: {:6d} | Time: {:2.2E}'
                      .format(len(buffer), i, j, action[0][0], action[0][1],
                              action[0][2], int(reward_total),
                              int(q_max / float(j + 1)), time() - t0))

            ###############################################################
            if fin:
                print('| Reward: {:d} | Episode: {:d} | Q-max: {:.4f}'.format(
                    int(reward_total), i, (q_max / float(j))))
                with open(os.path.join(args['resources'], "data") + "/" + args['file'] + "_train.txt", "a") as f:
                    f.write(str(i) + " " + str(j) + " " + str(reward_total) + " " + str(q_max / float(j + 1)) + "\n")
                break

        ###############################################################
        if ((i % save_networks_every) == 0 and i > 1):
            saver.save(sess,
                       os.path.join(args['resources'], "network") + "/" + args['file'] + "_model")
            print("--------------------------------- SAVED MODEL ------------------------------------")
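# transfer_network_params() is called in the training loop above but not defined in this snippet.
# A plausible sketch, assuming each network exposes its trainable variables as network.network_params
# and that the update is the usual Polyak soft update theta_target <- tau*theta + (1-tau)*theta_target.
# The assign ops are built on the fly here for simplicity; a real TF1 implementation would normally
# build them once at graph-construction time.
def transfer_network_params(sess, network, target_network, tau):
    ops = [
        target_var.assign(tau * var + (1.0 - tau) * target_var)
        for var, target_var in zip(network.network_params,
                                   target_network.network_params)
    ]
    sess.run(ops)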
class DoubleQNetworkTrainer(Trainer):

    def __init__(self,
                 network,
                 sess,
                 enabled=config.is_training,
                 scope=config.dir_name,
                 discount_factor=config.discount_factor,
                 batch_size=config.batch_size,
                 max_buffer_size=config.max_buffer_size,
                 tau=config.tau):
        super(DoubleQNetworkTrainer, self).__init__(discount_factor)
        self._network = network
        self.sess = sess
        self._batch_size = batch_size
        self._tau = tau
        if enabled:
            self._target_network = Network(network.shape, sess, load_model=False)
            self._target_ops = self._update_target_graph(
                tf.trainable_variables(scope=scope))
            self._experience_buffer = ExperienceBuffer(max_buffer_size)

    def _update_target_graph(self, tf_vars):
        total_vars = len(tf_vars)
        op_holder = []
        for (idx, var) in enumerate(tf_vars[0:total_vars // 2]):
            op_holder.append(tf_vars[idx + total_vars // 2].assign(
                (var.value() * self._tau) +
                ((1 - self._tau) * tf_vars[idx + total_vars // 2].value())))
        return op_holder

    def _update_target(self):
        for op in self._target_ops:
            self.sess.run(op)

    def train_network(self, state, action, reward, next_state, is_done):
        self._experience_buffer.add(
            np.reshape(np.array([state, action, reward, next_state, is_done]), [1, 5]))

        if len(self._experience_buffer) >= self._batch_size:
            train_batch = self._experience_buffer.sample(self._batch_size)
            '''
            Q1 is the best action for the next state, predicted by the main network.
            Q2 holds the action values for the next state, predicted by the target network.
            double_q is the value of Q1's action looked up in Q2.
            target_q is the training target: the observed reward plus the discounted double_q value.
            end_multiplier ensures that if an action ended the episode, its Q value is only
            affected by the reward and not by double_q.
            '''
            Q1 = self.sess.run(self._network.predict,
                               feed_dict={
                                   self._network.layers[0]: np.vstack(train_batch[:, 3])
                               })
            Q2 = self.sess.run(self._target_network.layers[-1],
                               feed_dict={
                                   self._target_network.layers[0]: np.vstack(train_batch[:, 3])
                               })
            end_multiplier = 1 - train_batch[:, 4]
            double_q = Q2[range(self._batch_size), Q1]
            target_q = train_batch[:, 2] + (self._discount_factor * double_q * end_multiplier)

            # Update the network with our target values
            _ = self.sess.run(self._network.train_step,
                              feed_dict={
                                  self._network.layers[0]: np.vstack(train_batch[:, 0]),
                                  self._network.target_q: target_q,
                                  self._network.actions: train_batch[:, 1]
                              })
            self._update_target()
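# A minimal usage sketch for DoubleQNetworkTrainer, assuming a gym-style environment and an
# epsilon-greedy act() helper on the network; both the env interface and network.act() are
# assumptions here, not part of the class above.
def run_episode(env, network, trainer, epsilon=0.1):
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        action = network.act(state, epsilon)           # assumed epsilon-greedy helper
        next_state, reward, done, _ = env.step(action)
        # Each transition is handed to the trainer, which buffers it and trains
        # once enough samples have accumulated.
        trainer.train_network(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
    return total_reward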
def training(params, eparams, netsize):
    '''
    training is the main method used to train an agent.

    Before running the main training loop, several objects need to be initialised. First, a neural
    network is initialised based on imports from the dgn_model file. Then, a single agent and an
    environment are initialised. In this training code, the agent plays against itself, but is
    unaware of this fact.

    In the training loop, the agent selects optimal actions with probability epsilon (an element of
    params), and randomises otherwise. It then observes transition dynamics and receives rewards for
    these actions. These transitions are learned by the neural network provided to the agent.

    INPUT:
    params....A dictionary of hyperparameters
    eparams...Another dictionary of parameters determining the economic environment
    netsize...The size of the two hidden layers in the neural network. This is an important
              parameter that I want to present results for when it is being varied.

    OUTPUT:
    agent...An instance of class Agent that is equipped with a trained neural network.
            The agent can then be tested by calling the file testing.py
    '''

    Experience = collections.namedtuple(
        'Experience',
        field_names=['state', 'action', 'reward', 'done', 'new_state'])

    BATCH_SIZE = params['batch_size']
    REPLAY_SIZE = params['replay_size']
    REPLAY_START_SIZE = params['replay_start_size']
    LEARNING_RATE = params['learning_rate']
    SYNC_TARGET_FRAMES = params['sync_target_frames']
    EPSILON_DECAY_LAST_FRAME = params['epsilon_decay_last_frame']
    EPSILON_START = params['epsilon_start']
    EPSILON_FINAL = params['epsilon_final']
    nA = params['nA']
    dO_a = params['dO_a']
    FRAMES = params['frames']
    NODES = params['nodes']
    SEED = params['seed']
    PATH = params['path']
    GAMMA = params['gamma']

    A0 = eparams['a0']
    MU = eparams['mu']
    FIRMLIST = eparams['firmlist']

    NODES = netsize
    print(params)

    # PyTorch setup
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')  # CPU when GPU is not available <-> device agnostic
    torch.manual_seed(SEED)
    if use_cuda:
        torch.cuda.manual_seed(SEED)

    # Neural network model:
    net = Net(dO_a, nA, NODES).to(device)
    print(net)
    tgt_net = Net(dO_a, nA, NODES).to(device)  # Prediction target.
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    buffer = ExperienceBuffer(REPLAY_SIZE)

    # Reinforcement learning environment
    firm0 = random.sample(FIRMLIST, 1)[0]
    firm1 = random.sample(FIRMLIST, 1)[0]
    env = ContBertrand(firm0, firm1)

    # RL agent
    agent = Agent(env, buffer, net, tgt_net, optimizer)

    # Write output statistics to tensorboard
    writer = SummaryWriter(comment="-")

    # Initialize variables
    env.seed(SEED)  # TODO: is this used?
    torch.manual_seed(SEED)
    frame_idx = 0
    ep_idx = 0
    epsilon = EPSILON_START

    firmlist_cartesian = itertools.product(FIRMLIST, FIRMLIST)
    firmlist = []
    for element in firmlist_cartesian:
        firmlist.append(element)

    # Training - main loop
    firm0 = random.sample(FIRMLIST, 1)[0]
    firm1 = random.sample(FIRMLIST, 1)[0]

    # Make econ variables
    # dict_key = str((firm0, firm1))
    # nash_action = eparams['nash_actions'][dict_key]
    # monopoly_action = eparams['monopoly_actions'][dict_key]
    # colab_action = eparams['colab_actions'][dict_key]
    # nash_profit = profit(nash_action, A0, MU, firm0, firm1, nA)
    # monopoly_profit = profit(monopoly_action, A0, MU, firm0, firm1, nA)
    # colab_profit = profit(colab_action, A0, MU, firm0, firm1, nA)

    # Initiate new env and amTFT agent
    s_next = env.reset(firm0, firm1)
    done = False

    for t in range(1, FRAMES):
        if done:  # episode ends with probability gamma
            # Save episodal reward
            mean_pg = np.mean(agent.total_pg)
            writer.add_scalar("Agent_avg_profit", mean_pg, ep_idx)
            agent.total_pg = []
            ep_idx += 1

            # Randomize the firms
            firm0 = random.sample(FIRMLIST, 1)[0]
            firm1 = random.sample(FIRMLIST, 1)[0]

            # Make econ variables
            # dict_key = str((firm0, firm1))
            # nash_action = eparams['nash_actions'][dict_key]
            # monopoly_action = eparams['monopoly_actions'][dict_key]
            # colab_action = eparams['colab_actions'][dict_key]
            # nash_profit = profit(nash_action, A0, MU, firm0, firm1, nA)
            # monopoly_profit = profit(monopoly_action, A0, MU, firm0, firm1, nA)
            # colab_profit = profit(colab_action, A0, MU, firm0, firm1, nA)

            # Initiate new env and amTFT agent
            s_next = env.reset(firm0, firm1)
            done = False

        frame_idx += 1
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        if frame_idx > ((EPSILON_FINAL + FRAMES) / 2):
            epsilon = 0

        s = s_next
        # TODO: this used to be simply net. SHOULD IT NOT BE agent.net?
        action0 = agent.act(s[np.array([0, 1, 4, 5])], epsilon, device=device.type)
        action1 = agent.act(s[np.array([2, 3, 4, 5])], epsilon, device=device.type)  # LARGEST CHANGE
        s_next, reward_n, done, _ = env.step(action0, action1)

        exp = Experience(s[np.array([0, 1, 4, 5])], action0, reward_n[0], done,
                         s_next[np.array([0, 1, 4, 5])])
        exp1 = Experience(s[np.array([2, 3, 4, 5])], action1, reward_n[1], done,
                          s_next[np.array([2, 3, 4, 5])])  # ANOTHER CHANGE
        agent.exp_buffer.append(exp)
        agent.exp_buffer.append(exp1)

        if reward_n is not None:  # TODO: add for both firms
            reward = reward_n[0]
            pg = reward
            # pg = profit_gain(reward, nash_profit, colab_profit)[0]  # important to index here
            agent.total_pg.append(pg)

        if len(agent.exp_buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            # Update target network
            agent.tgt_net.load_state_dict(agent.net.state_dict())

        batch = agent.exp_buffer.sample(BATCH_SIZE)
        agent.optimizer.zero_grad()
        loss_t = calc_loss(batch, agent.net, agent.tgt_net, device=device)
        loss_t.backward()

        # Gradient clipping
        for param in agent.net.parameters():
            param.grad.clamp_(-1, 1)

        agent.optimizer.step()
        writer.add_scalar("loss", loss_t, frame_idx)

        if frame_idx % 100_000 == 0:
            print(frame_idx)

        if frame_idx % 500_000 == 0:
            print(frame_idx)
            torch.save(
                {
                    'agent_state_dict': agent.net.state_dict(),
                    'optimizer_dict': agent.optimizer.state_dict(),
                    'epsilon': epsilon,
                    'frame_idx': frame_idx,
                    'env_state': s_next
                }, str(frame_idx) + PATH)
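# calc_loss() is used in the training loop above but not defined in this snippet. A standard DQN
# loss consistent with how it is called (batch, online net, target net, device) is sketched below.
# The exact batch layout is an assumption based on the Experience namedtuple defined in training(),
# and the default gamma value is a placeholder.
import numpy as np
import torch
import torch.nn as nn

def calc_loss(batch, net, tgt_net, device='cpu', gamma=0.99):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(np.array(states), dtype=torch.float32).to(device)
    next_states_v = torch.tensor(np.array(next_states), dtype=torch.float32).to(device)
    actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
    rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
    done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

    # Q(s, a) for the actions actually taken
    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # Bootstrap from the target network; terminal states contribute no future value
        next_state_values = tgt_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0

    expected_values = rewards_v + gamma * next_state_values
    return nn.MSELoss()(state_action_values, expected_values)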
def collect_entropy_policies(env, epochs, T, MODEL_DIR=''):

    video_dir = 'videos/' + args.exp_name

    direct = os.getcwd() + '/data/'
    experiment_directory = direct + args.exp_name
    print(experiment_directory)

    print(sys.argv)
    if not os.path.exists(experiment_directory):
        os.makedirs(experiment_directory)

    f = open(experiment_directory + '/args', 'w')
    f.write(' '.join(sys.argv))
    f.flush()

    indexes = [1, 5, 10, 15]
    states_visited_indexes = [0, 5, 10, 15]

    states_visited_cumulative = []
    states_visited_cumulative_baseline = []

    running_avg_p = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent = 0
    running_avg_ent_xy = 0

    running_avg_p_baseline = np.zeros(shape=(tuple(ant_utils.num_states)))
    running_avg_p_baseline_xy = np.zeros(shape=(tuple(ant_utils.num_states_2d)))
    running_avg_ent_baseline = 0
    running_avg_ent_baseline_xy = 0

    pct_visited = []
    pct_visited_baseline = []
    pct_visited_xy = []
    pct_visited_xy_baseline = []

    running_avg_entropies = []
    running_avg_entropies_xy = []
    running_avg_ps_xy = []
    avg_ps_xy = []

    running_avg_entropies_baseline = []
    running_avg_entropies_baseline_xy = []
    running_avg_ps_baseline_xy = []
    avg_ps_baseline_xy = []

    policies = []
    distributions = []
    initial_state = init_state(env)

    prebuf = ExperienceBuffer()
    env.reset()
    for t in range(10000):
        action = env.action_space.sample()
        obs, reward, done, _ = env.step(action)
        prebuf.store(get_state(env, obs))
        if done:
            env.reset()
            done = False

    prebuf.normalize()
    normalization_factors = prebuf.normalization_factors
    utils.log_statement(normalization_factors)
    prebuf = None
    if not args.gaussian:
        normalization_factors = []

    reward_fn = np.zeros(shape=(tuple(ant_utils.num_states)))

    for i in range(epochs):
        utils.log_statement("*** ------- EPOCH %d ------- ***" % i)

        # Clear initial state if applicable.
        if not args.initial_state:
            initial_state = []
        else:
            utils.log_statement(initial_state)
        utils.log_statement("max reward: " + str(np.max(reward_fn)))

        logger_kwargs = setup_logger_kwargs("model%02d" % i, data_dir=experiment_directory)

        # Learn policy that maximizes current reward function.
        print("Learning new oracle...")
        seed = random.randint(1, 100000)
        sac = AntSoftActorCritic(lambda: gym.make(args.env),
                                 reward_fn=reward_fn,
                                 xid=i + 1,
                                 seed=seed,
                                 gamma=args.gamma,
                                 ac_kwargs=dict(hidden_sizes=[args.hid] * args.l),
                                 logger_kwargs=logger_kwargs,
                                 normalization_factors=normalization_factors)

        # The first policy is random
        if i == 0:
            sac.soft_actor_critic(epochs=0)
        else:
            sac.soft_actor_critic(epochs=args.episodes,
                                  initial_state=initial_state,
                                  start_steps=args.start_steps)
        policies.append(sac)

        p, _ = sac.test_agent(T, normalization_factors=normalization_factors)
        distributions.append(p)
        weights = utils.get_weights(distributions)

        epoch = 'epoch_%02d' % (i)
        if args.render:
            if i < 10:
                sac.record(T=args.record_steps, n=1,
                           video_dir=video_dir + '/baseline/' + epoch,
                           on_policy=False)
            sac.record(T=args.record_steps, n=1,
                       video_dir=video_dir + '/entropy/' + epoch,
                       on_policy=True)

        # Execute the cumulative average policy thus far.
        # Estimate distribution and entropy.
        print("Executing mixed policy...")
        average_p, average_p_xy, initial_state, states_visited, states_visited_xy = \
            execute_average_policy(env, policies, T, weights,
                                   reward_fn=reward_fn,
                                   norm=normalization_factors,
                                   initial_state=initial_state,
                                   n=args.n,
                                   render=args.render,
                                   video_dir=video_dir + '/mixed/' + epoch,
                                   epoch=i,
                                   record_steps=args.record_steps)

        print("Calculating maxEnt entropy...")
        round_entropy = entropy(average_p.ravel())
        round_entropy_xy = entropy(average_p_xy.ravel())

        # Update running averages for maxEnt.
        print("Updating maxEnt running averages...")
        running_avg_ent = running_avg_ent * (i) / float(i + 1) + round_entropy / float(i + 1)
        running_avg_ent_xy = running_avg_ent_xy * (i) / float(i + 1) + round_entropy_xy / float(i + 1)
        running_avg_p *= (i) / float(i + 1)
        running_avg_p += average_p / float(i + 1)
        running_avg_p_xy *= (i) / float(i + 1)
        running_avg_p_xy += average_p_xy / float(i + 1)

        # Update reward function
        print("Update reward function")
        eps = 1 / np.sqrt(ant_utils.total_state_space)
        if args.cumulative:
            reward_fn = grad_ent(running_avg_p)
        else:
            reward_fn = 1.
            average_p += eps
            reward_fn /= average_p
        average_p = None  # delete big array

        # (save for plotting)
        running_avg_entropies.append(running_avg_ent)
        running_avg_entropies_xy.append(running_avg_ent_xy)
        if i in indexes:
            running_avg_ps_xy.append(np.copy(running_avg_p_xy))
            avg_ps_xy.append(np.copy(average_p_xy))

        print("Collecting baseline experience....")
        p_baseline, p_baseline_xy, states_visited_baseline, states_visited_xy_baseline = \
            sac.test_agent_random(T, normalization_factors=normalization_factors, n=args.n)

        plotting.states_visited_over_time(states_visited, states_visited_baseline, i)
        plotting.states_visited_over_time(states_visited_xy, states_visited_xy_baseline, i, ext='_xy')

        # Save for cumulative plot.
        if i in states_visited_indexes:
            # Average over a whole bunch of rollouts.
            # Slow: so only do this when needed.
            print("Averaging unique xy states visited....")
            states_visited_xy = compute_states_visited_xy(
                env, policies, norm=normalization_factors, T=T, n=args.n, N=args.avg_N)
            states_visited_xy_baseline = compute_states_visited_xy(
                env, policies, norm=normalization_factors, T=T, n=args.n, N=args.avg_N,
                initial_state=initial_state, baseline=True)
            states_visited_cumulative.append(states_visited_xy)
            states_visited_cumulative_baseline.append(states_visited_xy_baseline)

        print("Compute baseline entropy....")
        round_entropy_baseline = entropy(p_baseline.ravel())
        round_entropy_baseline_xy = entropy(p_baseline_xy.ravel())

        # Update baseline running averages.
        print("Updating baseline running averages...")
        running_avg_ent_baseline = running_avg_ent_baseline * (i) / float(i + 1) + round_entropy_baseline / float(i + 1)
        running_avg_ent_baseline_xy = running_avg_ent_baseline_xy * (i) / float(i + 1) + round_entropy_baseline_xy / float(i + 1)

        running_avg_p_baseline *= (i) / float(i + 1)
        running_avg_p_baseline += p_baseline / float(i + 1)
        running_avg_p_baseline_xy *= (i) / float(i + 1)
        running_avg_p_baseline_xy += p_baseline_xy / float(i + 1)

        p_baseline = None

        # (save for plotting)
        running_avg_entropies_baseline.append(running_avg_ent_baseline)
        running_avg_entropies_baseline_xy.append(running_avg_ent_baseline_xy)
        if i in indexes:
            running_avg_ps_baseline_xy.append(np.copy(running_avg_p_baseline_xy))
            avg_ps_baseline_xy.append(np.copy(p_baseline_xy))

        utils.log_statement(average_p_xy)
        utils.log_statement(p_baseline_xy)

        # Calculate percent of state space visited.
        pct = np.count_nonzero(running_avg_p) / float(running_avg_p.size)
        pct_visited.append(pct)
        pct_xy = np.count_nonzero(running_avg_p_xy) / float(running_avg_p_xy.size)
        pct_visited_xy.append(pct_xy)

        pct_baseline = np.count_nonzero(running_avg_p_baseline) / float(running_avg_p_baseline.size)
        pct_visited_baseline.append(pct_baseline)
        pct_xy_baseline = np.count_nonzero(running_avg_p_baseline_xy) / float(running_avg_p_baseline_xy.size)
        pct_visited_xy_baseline.append(pct_xy_baseline)

        # Print round summary.
        col_headers = ["", "baseline", "maxEnt"]
        col1 = [
            "round_entropy_xy", "running_avg_ent_xy", "round_entropy",
            "running_avg_ent", "% state space xy", "% total state space"
        ]
        col2 = [
            round_entropy_baseline_xy, running_avg_ent_baseline_xy,
            round_entropy_baseline, running_avg_ent_baseline,
            pct_xy_baseline, pct_baseline
        ]
        col3 = [
            round_entropy_xy, running_avg_ent_xy, round_entropy,
            running_avg_ent, pct_xy, pct
        ]
        table = tabulate(np.transpose([col1, col2, col3]),
                         col_headers,
                         tablefmt="fancy_grid",
                         floatfmt=".4f")
        utils.log_statement(table)

        # Plot from round.
        plotting.heatmap(running_avg_p_xy, average_p_xy, i)
        plotting.heatmap1(running_avg_p_baseline_xy, i)

        if i == states_visited_indexes[3]:
            plotting.states_visited_over_time_multi(
                states_visited_cumulative, states_visited_cumulative_baseline,
                states_visited_indexes)

    # Save final expert weights to use with the trained oracles.
    weights_file = experiment_directory + '/policy_weights'
    np.save(weights_file, weights)

    # Cumulative plots.
    plotting.running_average_entropy(running_avg_entropies, running_avg_entropies_baseline)
    plotting.running_average_entropy(running_avg_entropies_xy, running_avg_entropies_baseline_xy, ext='_xy')
    plotting.heatmap4(running_avg_ps_xy, running_avg_ps_baseline_xy, indexes, ext="cumulative")
    plotting.heatmap4(avg_ps_xy, avg_ps_baseline_xy, indexes, ext="epoch")
    plotting.percent_state_space_reached(pct_visited, pct_visited_baseline, ext='_total')
    plotting.percent_state_space_reached(pct_visited_xy, pct_visited_xy_baseline, ext="_xy")

    return policies
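# grad_ent() is referenced in the reward-function update above but not defined in this snippet.
# In this max-entropy exploration setup the reward is the gradient of the entropy of the state
# visitation distribution, i.e. roughly -log p(s). The sketch below is an assumption consistent
# with how reward_fn is used (elementwise over the discretized state space, with a small epsilon
# for numerical stability); it is not the original helper.
import numpy as np

def grad_ent(p, eps=1e-8):
    # d/dp [ -p log p ] = -log p - 1, applied elementwise to the visitation histogram
    return -np.log(p + eps) - 1.0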
args.device = torch.device('cuda' if use_cuda else 'cpu')
args.gamma = args.gamma**args.act_every
args.num_steps = int(args.num_steps / args.act_every)
args.options = options

try:
    mp.set_start_method('forkserver', force=True)
    # print("forkserver init")
except RuntimeError:
    pass

processes = []

# Buffer used for sharing rollouts from actors to learner
experience_buffer = ExperienceBuffer(args.batch_size)
p = mp.Process(target=experience_buffer.listening)
p.start()
processes.append(p)

# Getting action dim and observation dim from Env
env = Env(args, device='cpu', options=options, dummy=True)
observation_dim = env.observation_dim
args.action_dim = env.action_dim
env.close()
print('Observation Space: {} / Action Dim: {}'.format(observation_dim, args.action_dim))

# Initializing shared memory used between workers and learner that contains the actor parameters
shared_state_dict = Policy(args.action_dim)
if args.load_model is not None:
def main():
    global UPDATE_FREQ

    tf.reset_default_graph()
    env = gym.make('Freeway-v0')
    num_actions = env.action_space.n
    env_shape = env.observation_space.shape
    image_shape = [env_shape[0], env_shape[1], env_shape[2]]

    # Init our double DQN
    main_q_network = Qnetwork(image_shape, FCL_SIZE, num_actions)
    target_q_network = Qnetwork(image_shape, FCL_SIZE, num_actions)

    # Check ckpt dirs
    check_or_create_dirs()

    global_ops, saver, target_ops = init_tf_variables()

    ep = START_EP
    step_decrease = (START_EP - END_EP) / REDUCE_EP_STEPS

    ex_buffer = ExperienceBuffer()
    j_list = []
    r_list = []
    total_steps = 0

    with tf.Session() as sess:
        sess.run(global_ops)
        if LOAD_MODEL:
            print('LOADING MODEL ... ')
            ckpt = tf.train.get_checkpoint_state(PATH)
            saver.restore(sess, ckpt.model_checkpoint_path)

        for episode in range(NUM_EPISODES):
            episode_buffer = ExperienceBuffer()
            observation = env.reset()
            done = False
            reward_all = 0
            j = 0

            while j < MAX_STEPS:
                j += 1
                env.render()

                action = ep_greedy(sess, ep, total_steps, main_q_network, observation)
                new_observation, reward, done, info = env.step(action)
                total_steps += 1

                episode_results = np.array([observation, action, reward, new_observation, done])
                episode_experience = np.reshape(episode_results, [1, 5])
                episode_buffer.add(episode_experience)
                # print("accumulated episode experience %d" % len(episode_buffer.buffer))

                ep = decrease_ep(total_steps, ep, step_decrease)
                # print("epsilon %f" % ep)
                # print(total_steps)

                if total_steps > PRE_TRAIN_STEPS:
                    if total_steps % (UPDATE_FREQ) == 0:
                        train_batch = ex_buffer.sample(BATCH_SIZE)

                        # Take our new observation to perform the double DQN
                        q1 = sess.run(main_q_network.predict,
                                      feed_dict={
                                          main_q_network.image_inputs: np.stack(train_batch[:, 3])
                                      })
                        q2 = sess.run(target_q_network.q_output,
                                      feed_dict={
                                          target_q_network.image_inputs: np.stack(train_batch[:, 3])
                                      })
                        end_multiplier = -(train_batch[:, 4] - 1)

                        # NOTE: choose q_values from the target q network using the main q network's actions
                        double_q = q2[range(BATCH_SIZE), q1]
                        target_q = train_batch[:, 2] + (GAMMA * double_q * end_multiplier)

                        # NOTE: update our main q network after calculating the q target from double q
                        _ = sess.run(main_q_network.train_op,
                                     feed_dict={
                                         main_q_network.image_inputs: np.stack(train_batch[:, 0]),
                                         main_q_network.target_q: target_q,
                                         main_q_network.actions: train_batch[:, 1]
                                     })

                        # NOTE: we update our target network separately
                        update_target(sess, target_ops)

                reward_all += reward
                observation = new_observation

                if done == True:
                    break

            ex_buffer.add(episode_buffer.buffer)
            j_list.append(j)
            r_list.append(reward_all)

            if episode % 1000 == 0:
                saver.save(sess, PATH + '/model-' + str(episode) + '.ckpt')
                print("saved model at %d" % episode)

            if len(r_list) % 10 == 0:
                print(total_steps, np.mean(r_list[-10:]), ep)

        saver.save(sess, PATH + '/model-' + str(episode) + '.ckpt')

    print("Percent of successful episodes: " + str(sum(r_list) / NUM_EPISODES) + "%")
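# ep_greedy() and decrease_ep() are called in main() above but not defined in this snippet.
# Plausible sketches are given below, assuming the usual epsilon-greedy selection and linear
# annealing after PRE_TRAIN_STEPS. NUM_ACTIONS is a hypothetical constant standing in for the
# environment's action count; the shape of the predict feed is also an assumption.
import numpy as np

def ep_greedy(sess, ep, total_steps, q_network, observation):
    # Random action with probability ep (and always random during pre-training)
    if np.random.rand() < ep or total_steps < PRE_TRAIN_STEPS:
        return np.random.randint(0, NUM_ACTIONS)  # NUM_ACTIONS is assumed to exist
    # Otherwise take the greedy action from the main Q network
    return sess.run(q_network.predict,
                    feed_dict={q_network.image_inputs: [observation]})[0]

def decrease_ep(total_steps, ep, step_decrease):
    # Linearly anneal epsilon toward END_EP once pre-training is over
    if total_steps > PRE_TRAIN_STEPS and ep > END_EP:
        ep -= step_decrease
    return ep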
def playAndLearn(agentNet, targetNet, player):
    expBuffer = ExperienceBuffer(exp_capacity, PRIO_ALPHA)
    expUnroller = ExperienceUnroller(EXP_UNROLL_STEPS, GAMMA)
    qGamma = GAMMA**(EXP_UNROLL_STEPS + 1)
    device = AgentNet.device

    writer = SummaryWriter(logdir=os.path.join(
        'tensorboard', datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

    reportInterval = 2000
    sampleCountTotal = 0
    sampleLastReport = initial_exp_gathering
    episodeCountTotal = 0
    timeLastReport = time.perf_counter()
    epsilon = epsilon_initial
    beta = PRIO_BETA_INITIAL

    lossFunc = torch.nn.MSELoss()
    optim = torch.optim.Adam(agentNet.parameters(), lr=learning_rate)

    lossAcc = 0
    lossCnt = 0
    epLen = 0
    episodeLengths = collections.deque(maxlen=20)
    evalScMin = evalScAvg = evalScMax = evalToMin = evalToAvg = evalToMax = 0

    try:
        while True:
            if sampleCountTotal > initial_exp_gathering:
                part = min(1.0, (sampleCountTotal - initial_exp_gathering) / epsilon_decay_time)
                epsilon = epsilon_initial - epsilon_decay_amount * part
                beta = min(1.0, PRIO_BETA_INITIAL + (1 - PRIO_BETA_INITIAL) *
                           (sampleCountTotal - initial_exp_gathering) / PRIO_BETA_RISE_TIME)

            s, a, r, term, s1 = player.makeTurn(epsilon)
            sampleCountTotal += 1
            epLen += 1

            if term:
                episodeCountTotal += 1
                episodeLengths.append(epLen)
                epLen = 0
                game.reset()

            s, a, r, term, s1 = expUnroller.add(s, a, r, term, s1)
            if s is not None:
                expBuffer.add(s, a, r, term, s1)

            if expBuffer.count() < initial_exp_gathering:
                timeLastReport = time.perf_counter()
                continue

            if sampleCountTotal - sampleLastReport > reportInterval:
                timeCur = time.perf_counter()
                speed = (sampleCountTotal - sampleLastReport) / (timeCur - timeLastReport)
                timeLastReport = timeCur
                sampleLastReport += reportInterval
                epLengthAvg = np.mean(episodeLengths)

                writer.add_scalar('Training/Eps', epsilon, sampleCountTotal)
                writer.add_scalar('Training/Speed', speed, sampleCountTotal)
                writer.add_scalar('Training/Episode Length', epLengthAvg, sampleCountTotal)
                if lossCnt > 0:
                    writer.add_scalar('Training/Loss', lossAcc / lossCnt, sampleCountTotal)

                print(f'Played {sampleLastReport} steps ({episodeCountTotal} episodes) '
                      f'({speed:8.2f} samples/s): Average steps {epLengthAvg:7.2f}, '
                      f'Evaluation score {evalScMin:2}, {evalScAvg:4.1f}, {evalScMax:2}, '
                      f'total {evalToMin:5}, {evalToAvg:7.1f}, {evalToMax:5}')
                lossAcc = lossCnt = 0

            if sampleCountTotal % evaluation_interval == 0:
                evalScMin, evalScAvg, evalScMax, evalToMin, evalToAvg, evalToMax = playSomeGames(
                    Game2048(), agentNet, EVAL_GAMES)
                writer.add_scalar('Evaluation/Eval Score Avg', evalScAvg, sampleCountTotal)
                writer.add_scalar('Evaluation/Eval Score Min', evalScMin, sampleCountTotal)
                writer.add_scalar('Evaluation/Eval Score Max', evalScMax, sampleCountTotal)
                writer.add_scalar('Evaluation/Eval Total Score Avg', evalToAvg, sampleCountTotal)
                writer.add_scalar('Evaluation/Eval Total Score Min', evalToMin, sampleCountTotal)
                writer.add_scalar('Evaluation/Eval Total Score Max', evalToMax, sampleCountTotal)

            if sampleCountTotal % target_sync_interval == 0:
                targetNet.load_state_dict(agentNet.state_dict())

            if sampleCountTotal % 2 == 0:
                continue

            optim.zero_grad()
            states, actions, rewards, terms, newStates, idxs, weights = expBuffer.sample(batch_size, beta)

            states_t = agentNet.prepareInputs(states)
            newStates_t = agentNet.prepareInputs(newStates)
            actions_t = torch.from_numpy(actions).to(device)
            rewards_t = torch.from_numpy(rewards).to(device)
            terms_t = torch.from_numpy(terms).to(device)
            weights_t = torch.from_numpy(weights).to(device)

            stateActionQs = agentNet(states_t)
            stateActionQs = torch.gather(stateActionQs, 1, actions_t.unsqueeze(-1)).squeeze(-1)

            nextActions = agentNet(newStates_t).max(1)[1]
            nextStateQs = targetNet(newStates_t).gather(1, nextActions.unsqueeze(-1)).squeeze(-1)
            nextStateQs[terms_t] = 0.0
            nextStateQs = nextStateQs.detach()

            totalRewards_t = nextStateQs * qGamma + rewards_t
            losses_t = weights_t * (stateActionQs - totalRewards_t)**2
            loss = losses_t.mean()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(agentNet.parameters(), GRAD_CLIP)
            optim.step()

            expBuffer.updatePriorities(idxs, losses_t.data.cpu().numpy() + 1e-5)

            lossAcc += loss.item()
            lossCnt += 1

    except KeyboardInterrupt:
        print(f'Playing stopped after {sampleCountTotal} steps ({episodeCountTotal} episodes).')

    torch.save(
        agentNet.state_dict(),
        os.path.join('models',
                     datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') +
                     f' SC {sampleCountTotal}'))
    writer.close()
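# ExperienceUnroller is used in playAndLearn() above but not defined in this snippet. A minimal
# n-step return accumulator consistent with its add() interface (returns a completed transition
# once enough steps have been buffered, otherwise Nones) might look like the sketch below; it is
# an assumption, not the original class, and it simplifies episode-end handling.
import collections

class ExperienceUnroller:

    def __init__(self, steps, gamma):
        self.steps = steps
        self.gamma = gamma
        self.queue = collections.deque()

    def add(self, s, a, r, term, s1):
        # Fold the new reward, discounted by its distance, into every pending transition
        # and point each pending transition at the newest next-state / terminal flag.
        for i, (qs, qa, qr, _, _) in enumerate(self.queue):
            self.queue[i] = (qs, qa, qr + r * self.gamma ** (len(self.queue) - i), term, s1)
        self.queue.append((s, a, r, term, s1))

        if term:
            # Episode over: return the oldest pending transition and drop the rest
            # (a fuller implementation would flush them all over subsequent calls).
            out = self.queue.popleft()
            self.queue.clear()
            return out

        if len(self.queue) > self.steps:
            # The oldest transition now carries steps+1 rewards, matching
            # qGamma = GAMMA**(EXP_UNROLL_STEPS + 1) used for bootstrapping above.
            return self.queue.popleft()

        return None, None, None, None, None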
def updateTarget(op_holder, sess):
    for op in op_holder:
        sess.run(op)


tf.logging.set_verbosity(tf.logging.INFO)
tf.reset_default_graph()

mainQN = QNetwork(sensor_size, action_size, h_size, l_rate, init_value)
targetQN = QNetwork(sensor_size, action_size, h_size, l_rate, init_value)

init = tf.initialize_all_variables()
saver = tf.train.Saver()
trainables = tf.trainable_variables()
targetOps = updateTargetGraph(trainables, tau)

myBuffer = ExperienceBuffer(total_size, resolution, buffersize)
env = VrepEnvironment(motor_speed, turn_speed, resolution, reset_distance,
                      pub_rate, dvs_queue, resize_factor, crop)

# Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE) / anneling_steps

jList = []
rList = []
total_steps = 0

# Make a path for the model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)
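# updateTargetGraph() is called above but not defined in this snippet. The usual TF1 construction,
# which pairs each main-network variable with the target-network variable that follows it in the
# trainables list and soft-updates it with tau (the same pattern as _update_target_graph in the
# DoubleQNetworkTrainer class earlier), is sketched below as an assumption:
def updateTargetGraph(tf_vars, tau):
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[0:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        # Soft update: target <- tau * main + (1 - tau) * target
        op_holder.append(target_var.assign(
            tau * var.value() + (1.0 - tau) * target_var.value()))
    return op_holder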