def __init__(self, policy_net_path, value_net_path, time_limit=20):
    self.time_limit = time_limit
    self.game = None
    self.root = None

    policy_model = PolicyNet('./train/', '/val/')
    value_model = ValueNet('./train/', '/val/')

    # build the policy network in its own graph and session
    g_policy = tf.Graph()
    with g_policy.as_default():
        self.policy_board = tf.placeholder(dtype=tf.float32)
        self.p_is_training = tf.placeholder(dtype=tf.bool)
        self.policy_out = policy_model.inference(self.policy_board,
                                                 is_training=self.p_is_training)
        self.policy_loader = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.policy_sess = tf.Session(config=config)

        print('load policy model:', policy_net_path)
        self.policy_loader.restore(self.policy_sess, policy_net_path)

    # build the value network in a second, independent graph and session
    g_value = tf.Graph()
    with g_value.as_default():
        self.value_board = tf.placeholder(dtype=tf.float32, shape=(None, 19, 19, 21))
        self.v_is_training = tf.placeholder(dtype=tf.bool)
        _, self.value_out = value_model.inference(self.value_board, self.v_is_training)
        self.value_loader = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.value_sess = tf.Session(config=config)

        print('load value model:', value_net_path)
        self.value_loader.restore(self.value_sess, value_net_path)
def main():
    net = PolicyNet()
    net.read_weights_from_file('./weights/policy_2.0_2017-12-04T21:08:08.381535')

    player2 = RandomPlayer()
    player1 = MCTPlayer()
    game = Game(player1, player2)

    print()
    game.play(log=True)
def __init__(self, args, debug=False):
    # actor, critics (twin Q networks), value network and its target copy
    self.policy_net = PolicyNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM, ROW_DIM * COLUMN_DIM, 128)
    self.value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM, ROW_DIM * COLUMN_DIM, 128)
    self.q_value_net1 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM, ROW_DIM * COLUMN_DIM, 128)
    self.q_value_net2 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM, ROW_DIM * COLUMN_DIM, 128)
    self.target_value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM, ROW_DIM * COLUMN_DIM, 128)

    # in debug mode a checkpoint path can be passed directly; otherwise use the CLI argument
    if debug:
        if type(debug) == str:
            self.load_net(debug)
    else:
        if args.input_network:
            self.load_net(args.input_network)

    self.soft_q_optimizer1 = optim.Adam(self.q_value_net1.parameters(), lr=args.q_lr)
    self.soft_q_optimizer2 = optim.Adam(self.q_value_net2.parameters(), lr=args.q_lr)
    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.value_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.policy_lr)

    self.q_criterion1 = nn.MSELoss()
    self.q_criterion2 = nn.MSELoss()
    self.value_criterion = nn.MSELoss()

    self.to_cuda()
    self.args = args
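# NOTE: the constructor above creates a separate target_value_net, but the snippet does not
# show how it is kept in sync with value_net. The sketch below is NOT taken from this code;
# it is a generic Polyak/soft-update helper of the kind typically used with such a target
# network. The function name `soft_update` and the tau value are assumptions.
import torch

def soft_update(target_net, source_net, tau=0.005):
    """Polyak-average source parameters into the target network (hypothetical helper)."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)

# Possible usage after each value-network update step:
# soft_update(agent.target_value_net, agent.value_net, tau=0.005)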
def eval_node(self, node, game, is_value=True, width=8):
    board_mtx = game.boards[-1].board_mtx

    if is_value:
        # evaluate the position with the value network
        t0 = time.time()
        value_query = ValueNet.preprocess_board(
            board_mtx,
            {'next_to_play': game.next_to_play,
             'ko_state:': game.ko_state[-1],
             'current_move': game.current_moves[-1]},
            random=False, contain_liberty=True)
        value_query = np.asarray([value_query], dtype=np.float32)
        t1 = time.time()

        black_win_rate, = self.value_sess.run(
            [self.value_out],
            feed_dict={self.value_board: value_query, self.v_is_training: False})
        black_win_rate = black_win_rate.reshape((1,))[0]
        t2 = time.time()
        print('TIME', t1 - t0, t2 - t1)

        node.black_win_rate = black_win_rate
    else:
        # expand the node with the policy network's top moves
        label_y = {'next_to_play': game.next_to_play,
                   'ko_state:': game.ko_state[-1],
                   'current_move': game.current_moves[-1]}
        policy_query = PolicyNet.preprocess_board(board_mtx, label_y,
                                                  random=False, contain_liberty=True)
        policy_query = np.asarray([policy_query], dtype=np.float32)

        p, = self.policy_sess.run(self.policy_out,
                                  feed_dict={self.policy_board: policy_query,
                                             self.p_is_training: False})

        # softmax over the 19x19 logits
        probs = np.reshape(p, (19, 19))
        probs -= np.max(probs)
        probs = np.exp(probs) / np.sum(np.exp(probs))

        # take the `width` highest-probability coordinates
        ids = np.dstack(np.unravel_index(np.argsort(probs.ravel()), (19, 19)))[0]
        ids = ids[::-1][:width, :]
        moves = [([move[0], move[1]], probs[move[0]][move[1]]) for move in ids]

        # keep only legal placements
        node.moves = [move for move in moves if game.legal_place(*move[0])]
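# NOTE: a standalone check (not part of the snippet above) of the indexing trick eval_node
# uses to turn the flattened argsort output back into (row, col) board coordinates. The toy
# probability values and the variable names here are made up for illustration.
import numpy as np

probs = np.zeros((19, 19))
probs[3, 16] = 0.7
probs[15, 2] = 0.2

# same pattern as eval_node: sort the flattened board, map flat indices back to
# (row, col) pairs, then take the best `width` entries from the end
ids = np.dstack(np.unravel_index(np.argsort(probs.ravel()), (19, 19)))[0]
top2 = ids[::-1][:2, :]
print(top2.tolist())  # [[3, 16], [15, 2]]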
for t in range(len(rewards)):
    total_discounted_reward = 0
    discount = 1
    for k in range(t, len(rewards)):
        total_discounted_reward += rewards[k] * discount
        discount *= discount_factor
        # Don't count rewards from subsequent rounds
        if rewards[k] != 0:
            break
    discounted_rewards[t] = total_discounted_reward
return discounted_rewards


env = gym.make('Pong-v4')
pongNet = PolicyNet(hidden_layer_size, learning_rate, checkpoints_dir)
if load_checkpoint:
    pongNet.load_checkpoint()

batch_feature_vector = []  # Vector of state, action, and reward
smoothed_reward = None
episode_count = 1

while True:
    print("Starting episode {}".format(episode_count))

    episode_done = False
    episode_reward_sum = 0

    round_num = 1
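# NOTE: a standalone hand check (not part of the training script above) of the discounting
# loop. Because the inner loop breaks on the first non-zero reward, each Pong round is
# discounted independently. The reward list and discount_factor below are example values.
import numpy as np

rewards = [0, 0, 1, 0, -1]   # two rounds: a point won after 3 frames, then one lost after 2
discount_factor = 0.99

discounted_rewards = np.zeros(len(rewards))
for t in range(len(rewards)):
    total, discount = 0.0, 1.0
    for k in range(t, len(rewards)):
        total += rewards[k] * discount
        discount *= discount_factor
        if rewards[k] != 0:   # stop at the end of the round, as in the loop above
            break
    discounted_rewards[t] = total

print(discounted_rewards)  # [ 0.9801  0.99  1.  -0.99  -1. ]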
class PPOAgent():
    def __init__(self, state_size, action_size, lr=1e-3, gamma=0.99,
                 clipping_epsilon=0.1, ppo_epochs=10, minibatch_size=64,
                 rollout_length=1000, gae_lambda=0.95):
        self.lr = lr
        self.clipping_epsilon = clipping_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.rollout_length = rollout_length
        self.policy = PolicyNet(state_size, action_size)
        self.value_estimator = ValueNet(state_size)
        self.rollout = Rollout(gamma=gamma, gae_lambda=gae_lambda)

    def start_episode(self):
        self.episode_rewards = []
        self.rollout.start_rollout()

    def act(self, state):
        # Check if the rollout is full and needs processing
        if len(self.rollout) == self.rollout_length:
            self.learn()
            self.rollout.start_rollout()

        # Derive the action distribution from the policy network
        means, sigmas = self.policy(state)
        action_distribution = torch.distributions.Normal(means, sigmas)
        action = action_distribution.sample()
        action_log_prob = action_distribution.log_prob(action)

        # Derive the state value estimate from the value network
        state_value = self.value_estimator(state).squeeze()

        # Record the decision and return the sampled action
        self.rollout.record_decision(state, state_value, action, action_log_prob)
        return action

    def finish_episode(self):
        self.learn()

    def record_outcome(self, reward):
        self.episode_rewards.append(reward)
        self.rollout.record_outcome(reward)

    def average_episode_return(self):
        return sum([r.mean().item() for r in self.episode_rewards])

    def get_current_policy_probs(self, states, actions):
        # For the given state/action pairs, create a distribution from the policy and get the log probs
        means, sigmas = self.policy(states)
        action_distribution = torch.distributions.Normal(means, sigmas)
        current_policy_log_probs = action_distribution.log_prob(actions)

        # Sum the log probs over the action dimensions
        current_policy_log_probs = current_policy_log_probs.sum(-1)
        return torch.exp(current_policy_log_probs)

    def learn(self):
        (states, actions, future_returns, normalised_advantages, original_policy_probs) = \
            self.rollout.flatten_trajectories()

        # Run through the PPO epochs
        policy_optimiser = optim.Adam(self.policy.parameters(), lr=self.lr, eps=1e-5)
        value_estimator_optimiser = optim.Adam(self.value_estimator.parameters(),
                                               lr=self.lr, eps=1e-5)
        for ppo_epoch in range(self.ppo_epochs):
            # Sample the trajectories randomly in mini-batches
            for indices in random_sample(np.arange(states.shape[0]), self.minibatch_size):
                # Slice the rollout using the sampled indices
                states_sample = states[indices]
                actions_sample = actions[indices]
                future_returns_sample = future_returns[indices]
                normalised_advantages_sample = normalised_advantages[indices]
                original_policy_probs_sample = original_policy_probs[indices]

                # Use the current policy to get the probabilities for the sampled states and actions.
                # These weight the likelihoods, allowing reuse of the rollout.
                current_policy_probs_sample = self.get_current_policy_probs(
                    states_sample, actions_sample)

                # Define the PPO surrogate and clip it to get the policy loss
                sampling_ratio = current_policy_probs_sample / original_policy_probs_sample
                clipped_ratio = torch.clamp(sampling_ratio,
                                            1 - self.clipping_epsilon,
                                            1 + self.clipping_epsilon)
                clipped_surrogate = torch.min(
                    sampling_ratio * normalised_advantages_sample,
                    clipped_ratio * normalised_advantages_sample)
                policy_loss = -torch.mean(clipped_surrogate)

                # Define the value estimator loss
                state_values_sample = self.value_estimator(states_sample).squeeze()
                value_estimator_loss = nn.MSELoss()(state_values_sample,
                                                    future_returns_sample)

                # Update the value estimator
                value_estimator_optimiser.zero_grad()
                value_estimator_loss.backward()
                nn.utils.clip_grad_norm_(self.value_estimator.parameters(), 0.75)
                value_estimator_optimiser.step()

                # Update the policy
                policy_optimiser.zero_grad()
                policy_loss.backward()
                nn.utils.clip_grad_norm_(self.policy.parameters(), 0.75)
                policy_optimiser.step()
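# NOTE: PPOAgent.learn relies on a `random_sample` helper that is not included in the snippet
# above. The sketch below is an assumption about its behaviour (shuffle the indices and yield
# them in mini-batches, with a smaller final batch for any remainder), not the original helper.
import numpy as np

def random_sample(indices, batch_size):
    """Yield shuffled index batches of size batch_size (assumed behaviour)."""
    indices = np.asarray(np.random.permutation(indices))
    batches = indices[:len(indices) // batch_size * batch_size].reshape(-1, batch_size)
    for batch in batches:
        yield batch
    remainder = len(indices) % batch_size
    if remainder:
        yield indices[-remainder:]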
ACTION_DIM = env.action_space.shape[0]
INPUT_DIM = env.observation_space.shape[0]

# disable GPU memory usage here
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# create the summary writer here
summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train"))

# run everything on the CPU
with tf.device("/cpu:0"):
    # keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    global_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM, name="global_actor")
    global_critic_net = AdvantageValueNet(HIDDEN_LAYER, name="global_critic")

    # dummy placeholders used to connect the global networks and create their variables
    tmp_x = tf.placeholder(dtype=tf.float32, shape=(BATCH_SIZE, INPUT_DIM), name="tmp_x")
    tmp_a = tf.placeholder(dtype=tf.float32, shape=(BATCH_SIZE, ACTION_DIM), name="tmp_a")

    global_average_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM, name="global_Average_actor")

    _, tmp_policy = global_actor_net(tmp_x)
    _ = global_critic_net(tmp_x, tmp_a, tmp_policy)
def __init__(self):
    self.policy_net = PolicyNet()
    self.eval_net = EvalNet()
class Agent():
    def __init__(self, name=-1, environment=None, global_counter=0,
                 average_actor_net=None, co_var=0.3, summary_writer=None,
                 saver=None, optimizer=None, flags=None):
        self.name = "acer_agent_" + str(name)
        self.memory = Memory(5000)  # each worker has its own memory

        # all the flags variables here
        self.FLAGS = flags

        # for dumping info about this agent
        # self.file_dump = open("./dump/" + self.name + "_dump", 'w', 0)

        # average net copied
        self.average_actor_net = average_actor_net

        # if a shared optimizer is given use that, else create its own
        if optimizer is None:
            self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.FLAGS.lr)
        else:
            self.optimizer = optimizer

        # env here
        self.env = environment
        self.ACTION_DIM = self.env.action_space.shape[0]
        self.INPUT_DIM = self.env.observation_space.shape[0]

        # summary, saver, checkpointing
        self.summary_writer = summary_writer
        self.saver = saver
        if summary_writer is not None:
            self.checkpoint_path = os.path.abspath(
                os.path.join(summary_writer.get_logdir(), "../checkpoints/model"))

        # diagonal covariance for the policy
        self.co_var = co_var

        # counters
        self.local_counter = itertools.count()
        self.global_counter = global_counter  # next(self.global_counter)

        # loss functions and optimizers are defined in build_graph
        self.build_graph()

    def build_graph(self):
        """Builds the local graph."""
        # placeholders for inputs
        HIDDEN_LAYER = self.FLAGS.feature_layer_size
        self.x_i = tf.placeholder(dtype=tf.float32, shape=(None, self.INPUT_DIM), name="x_i")
        self.a_i = tf.placeholder(dtype=tf.float32, shape=(None, self.ACTION_DIM), name="a_i")
        self.q_opc = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="q_opc")
        self.q_ret = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="q_ret")

        self.c = self.FLAGS.c  # truncation threshold constant

        self.actor_net = PolicyNet(HIDDEN_LAYER, self.ACTION_DIM,
                                   name=self.name + "_actor", co_var=self.co_var)
        self.critic_net = AdvantageValueNet(HIDDEN_LAYER, name=self.name + "_critic")

        self.policy_xi_stats, self.policy_xi_dist = self.actor_net(self.x_i)
        self.val_xi, self.adv_xi_ai = self.critic_net(self.x_i, self.a_i, self.policy_xi_dist)

        # sample a' now
        self.a_i_ = tf.reshape(self.policy_xi_dist.sample(1), shape=[-1, self.ACTION_DIM])
        _, self.adv_xi_ai_ = self.critic_net(self.x_i, self.a_i_,
                                             self.policy_xi_dist)  # val will be the same

        _, self.average_policy_xi_dist = self.average_actor_net(self.x_i)  # can this be done better?

        self.prob_a_i = tf.reshape(self.policy_xi_dist.prob(self.a_i), shape=[-1, 1]) + 1e-8
        self.prob_a_i_ = tf.reshape(self.policy_xi_dist.prob(self.a_i_), shape=[-1, 1]) + 1e-8

        self.log_prob_a_i = tf.log(self.prob_a_i)
        self.log_prob_a_i_ = tf.log(self.prob_a_i_)

        # behaviour distribution u, used for the 1-step a_i', p_i, p_i'
        self.u_i = tf.placeholder(dtype=tf.float32, shape=(None, self.ACTION_DIM))
        self.u_i_dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=self.u_i, scale_diag=tf.ones_like(self.u_i) * self.co_var)

        self.u_i_prob_a_i = tf.reshape(self.u_i_dist.prob(self.a_i), shape=[-1, 1]) + 1e-8
        self.u_i_prob_a_i_ = tf.reshape(self.u_i_dist.prob(self.a_i_), shape=[-1, 1]) + 1e-8

        # importance weights rho = pi(a|x) / mu(a|x)
        self.p_i = tf.divide(self.prob_a_i, self.u_i_prob_a_i)
        self.p_i_ = tf.divide(self.prob_a_i_, self.u_i_prob_a_i_)

        # take care of NaNs in the importance sampling weights (might be an extra step)
        self.p_i = tf.where(tf.is_nan(self.p_i), tf.zeros_like(self.p_i), self.p_i)
        self.p_i_ = tf.where(tf.is_nan(self.p_i_), tf.zeros_like(self.p_i_), self.p_i_)

        self.c_i = tf.minimum(1., tf.pow(self.p_i, 1.0 / self.ACTION_DIM))

        # for verifying that params are getting synched
        self.local_actor_vars = self.actor_net.local_params()
        self.global_actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                   'global_actor')
        self.local_critic_vars = self.critic_net.local_params()
        self.global_critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                    'global_critic')

        # sync ops from the global networks
        self.sync_local_actor_op = self.actor_net.update_local_params_op('global_actor')
        self.sync_local_critic_op = self.critic_net.update_local_params_op('global_critic')

        # soft update of the average network
        self.soft_update_average_actor_op = self.average_actor_net.soft_update_from_target_params(
            'global_actor', self.FLAGS.tau)

        # get gradients from the local network using local losses
        g1 = tf.reshape(
            tf.gradients((self.log_prob_a_i * (self.q_opc - self.val_xi)),
                         self.policy_xi_stats, name=self.name + "g1_grads"),
            shape=[-1, self.ACTION_DIM])
        g2 = (self.adv_xi_ai_ - self.val_xi) * tf.reshape(
            tf.gradients((self.log_prob_a_i_), self.policy_xi_stats,
                         name=self.name + "g2_grads"),
            shape=[-1, self.ACTION_DIM])

        # truncated importance sampling with bias correction
        self.g = tf.minimum(self.c, self.p_i) * g1 + \
            tf.nn.relu(1 - tf.divide(self.c, self.p_i_)) * g2

        # gradient of the KL between the average policy and the current policy
        self.k = tf.reshape(
            tf.gradients(
                tf.contrib.distributions.kl_divergence(self.average_policy_xi_dist,
                                                       self.policy_xi_dist),
                self.policy_xi_stats),
            shape=[-1, self.ACTION_DIM])

        self.kg = tf.reduce_sum(tf.multiply(self.g, self.k), 1, keep_dims=True)
        # print "kg", self.kg
        self.k2 = tf.reduce_sum(tf.multiply(self.k, self.k), 1, keep_dims=True)

        # trust-region projection of g
        self.reg_g = self.g - tf.maximum(
            tf.zeros_like(self.g),
            tf.divide((self.kg - self.FLAGS.delta), self.k2)) * self.k

        # take gradients wrt the local params
        self.actor_grads = tf.gradients(self.policy_xi_stats, self.local_actor_vars,
                                        grad_ys=-self.reg_g, name="actor_grads")

        # for ti, tj in zip(self.actor_grads, self.global_actor_vars):
        #     print ti, "\n", tj, "\n", "==========="

        # apply local gradients to the global network
        self.actor_train_op = self.optimizer.apply_gradients(
            zip(self.actor_grads, self.global_actor_vars),
            global_step=tf.train.get_global_step())

        # critic loss function and updates; gradients are taken wrt local variables
        self.critic_loss_1 = ((self.q_ret - self.adv_xi_ai) ** 2.0) / 2.0

        self.v_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))
        # self.v_trunc = tf.minimum(self.p_i, 1.0) * (self.q_ret - self.adv_xi_ai) + self.val_xi
        self.critic_loss_2 = ((self.v_target - self.val_xi) ** 2.0) / 2.0
        self.critic_loss = self.critic_loss_1 + self.critic_loss_2

        # apply local gradients to the global network
        self.critic_grads = tf.gradients(self.critic_loss, self.local_critic_vars)
        self.critic_train_op = self.optimizer.apply_gradients(
            zip(self.critic_grads, self.global_critic_vars),
            global_step=tf.train.get_global_step())

        # critic summaries op
        critic_grads_summary = []
        for grad, var in zip(self.critic_grads, self.local_critic_vars):
            critic_grads_summary.append(tf.summary.histogram(var.name + '/gradient', grad))
            critic_grads_summary.append(tf.summary.histogram(var.name + '/weight', var))

        self.critic_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_critic_mean_loss_Q", tf.reduce_mean(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critic_mean_loss_V", tf.reduce_mean(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critic_sum_loss_Q", tf.reduce_sum(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critic_sum_loss_V", tf.reduce_sum(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critic_mean_loss", tf.reduce_mean(self.critic_loss)),
            tf.summary.scalar(self.name + "_critic_sum_loss", tf.reduce_sum(self.critic_loss)),
            tf.summary.histogram(self.name + "_val_target", self.v_target),
            tf.summary.histogram(self.name + "_val_pred", self.val_xi),
            tf.summary.histogram(self.name + "_Q_pred", self.adv_xi_ai),
            tf.summary.histogram(self.name + "_Q_ret", self.q_ret),
            tf.summary.histogram(self.name + "_Q_opc", self.q_opc),
        ] + critic_grads_summary)

        # actor summaries op
        actor_grads_summary = []
        for grad, var in zip(self.actor_grads, self.local_actor_vars):
            actor_grads_summary.append(tf.summary.histogram(var.name + '/gradient', grad))
            actor_grads_summary.append(tf.summary.histogram(var.name + '/weight', var))

        self.actor_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_actor_mean_loss_reg_g", tf.reduce_mean(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_mean_loss_reg_g", tf.reduce_mean(-self.reg_g)),
            tf.summary.scalar(self.name + "_actor_sum_loss_reg_g", tf.reduce_sum(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_sum_reg_g", tf.reduce_sum(-self.reg_g)),
            tf.summary.scalar(self.name + "_actor_sum_g", tf.reduce_sum(self.g)),
            tf.summary.scalar(self.name + "_actor_neg_sum_g", tf.reduce_sum(-self.g)),
            tf.summary.scalar(self.name + "_actor_mean_kl", tf.reduce_mean(self.k)),
            tf.summary.scalar(self.name + "_actor_sum_kl", tf.reduce_sum(self.k)),
            tf.summary.histogram(self.name + "_policy_stats", self.policy_xi_stats),
        ] + actor_grads_summary)

    def run(self, sess, coord):
        """Main method; the ACER algorithm runs via this method."""
        # use the provided session
        with sess.as_default(), sess.graph.as_default():
            try:
                # keep running the agent in a loop
                while not coord.should_stop():
                    # gather experiences with a random policy
                    for i in range(self.FLAGS.pure_exploration_steps):
                        eps_reward, eps_len, local_t, global_t = self.random_exploration_step(sess)

                    # a few episodes with the current policy
                    for i in range(self.FLAGS.current_policy_steps):
                        eps_reward, eps_len, local_t, global_t = self.current_policy_step(sess)

                    # train off-policy
                    for i in range(self.FLAGS.update_steps):
                        self.train_off_policy(sess)
            except tf.errors.CancelledError:
                return

    def train_off_policy(self, sess):
        """ACER algorithm updates happen here."""
        # sync the local nets from the global ones
        sess.run([self.sync_local_actor_op, self.sync_local_critic_op])

        # sample a trajectory from the replay memory
        traj = self.memory.get_trajectory(self.FLAGS.k_steps)
        k = len(traj)

        # empty lists to store targets
        q_ret_list = []
        q_opc_list = []
        states = []
        actions = []
        mu_dist = []
        val_s = []

        # if the last transition is not terminal, bootstrap from the value function
        Q_ret = 0.0
        if not traj[-1].done:
            Q_ret = sess.run(
                [self.val_xi],
                feed_dict={self.x_i: np.reshape(traj[-1].next_state, (1, -1))})[0][0, 0]
        Q_opc = Q_ret

        # reverse loop over the trajectory
        for transition in traj[::-1]:
            Q_ret = transition.reward + self.FLAGS.gamma * Q_ret
            Q_opc = transition.reward + self.FLAGS.gamma * Q_opc

            x_t = np.reshape(transition.state, (1, -1))
            a_t = np.reshape(transition.action, (1, -1))
            u_t = np.reshape(transition.distribution, (1, -1))

            # add to the minibatch
            q_ret_list.append(Q_ret)
            q_opc_list.append(Q_opc)
            states.append(x_t)
            actions.append(a_t)
            mu_dist.append(u_t)

            # get estimates from the current function approximators
            v_t, c_t, p_t, q_t = sess.run(
                [self.val_xi, self.c_i, self.p_i, self.adv_xi_ai],
                feed_dict={self.x_i: x_t, self.a_i: a_t, self.u_i: u_t})

            # add the target V_pi
            val_s.append((min(p_t[0, 0], 1.0) * (Q_ret - q_t[0, 0])) + v_t[0, 0])

            # update the recursive targets
            Q_ret = c_t[0, 0] * (Q_ret - q_t[0, 0]) + v_t[0, 0]
            Q_opc = (Q_opc - q_t[0, 0]) + v_t[0, 0]

        # create the mini-batch here
        opt_feed_dict = {
            self.x_i: np.asarray(states).reshape(-1, self.INPUT_DIM),
            self.a_i: np.asarray(actions).reshape(-1, self.ACTION_DIM),
            self.q_opc: np.asarray(q_opc_list).reshape(-1, 1),
            self.q_ret: np.asarray(q_ret_list).reshape(-1, 1),
            self.u_i: np.asarray(mu_dist).reshape(-1, self.ACTION_DIM),
            self.v_target: np.asarray(val_s).reshape(-1, 1),
        }

        # train the global estimators using local gradients
        _, _, global_step, critic_summaries, actor_summaries = sess.run(
            [self.actor_train_op, self.critic_train_op,
             tf.contrib.framework.get_global_step(),
             self.critic_summary_op, self.actor_summary_op],
            feed_dict=opt_feed_dict)

        # write summaries
        if self.summary_writer is not None:
            self.summary_writer.add_summary(critic_summaries, global_step)
            self.summary_writer.add_summary(actor_summaries, global_step)
            self.summary_writer.flush()

        # update the average policy network, and that's it
        _ = sess.run([self.soft_update_average_actor_op])

    def random_exploration_step(self, sess):
        """Follow a uniform random policy to gather experiences and add them to the replay memory."""
        episode_reward = 0.0
        episode_len = 0  # number of actions

        # placeholder distribution for the random policy
        random_policy = np.zeros((1, self.ACTION_DIM))

        # reset the environment at the start of each episode
        state = self.env.reset()
        for t in range(self.FLAGS.max_episode_len):
            action = self.env.action_space.sample()  # random action
            next_state, reward, done, info = self.env.step(action)  # next state, reward, terminal

            # insert into memory with a uniform distribution over actions
            self.memory.add(
                Transition(state=state, action=action, reward=reward, done=done,
                           distribution=random_policy, next_state=next_state))

            # accumulate rewards
            episode_reward += reward
            episode_len += 1
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            # update the state
            state = next_state
            if done:
                # print("Episode finished after {} timesteps".format(t + 1))
                break

        return episode_reward, episode_len, local_t, global_t

    def current_policy_step(self, sess, add_to_mem=True):
        """Follow the current policy network, gather a trajectory and (optionally) add it to the
        replay memory. Returns the reward and length for this episode."""
        episode_reward = 0.0
        episode_len = 0  # number of actions

        # reset the environment at the start of each episode
        state = self.env.reset()
        for t in range(self.FLAGS.max_episode_len):
            # take an action according to the current policy
            action, policy_stats = sess.run(
                [self.a_i_, self.policy_xi_stats],
                feed_dict={self.x_i: np.array([state])})
            action = np.reshape(action, (self.ACTION_DIM,))

            next_state, reward, done, info = self.env.step(action)  # next state, reward, terminal

            # insert into memory together with the current policy statistics
            if add_to_mem:  # can also remove this and it still works (optimisation)
                self.memory.add(
                    Transition(state=state, action=action, reward=reward, done=done,
                               distribution=policy_stats, next_state=next_state))

            # accumulate rewards
            episode_reward += reward
            episode_len += 1
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            # update the state
            state = next_state
            if done:
                # print("Episode finished after {} timesteps".format(t + 1))
                break

        return episode_reward, episode_len, local_t, global_t

    def evaluate_policy(self, sess, eval_every=3600, coord=None):
        """Periodically follow the current policy (without adding to memory), record videos and
        log evaluation summaries."""
        self.video_dir = os.path.join(self.summary_writer.get_logdir(), "../videos")
        self.video_dir = os.path.abspath(self.video_dir)
        try:
            os.makedirs(self.video_dir)
        except Exception:
            pass

        self.env._max_episode_steps = self.FLAGS.max_episode_len
        self.env = Monitor(self.env, directory=self.video_dir,
                           video_callable=lambda x: True, resume=True)

        with sess.as_default(), sess.graph.as_default():
            try:
                while not coord.should_stop():
                    # sync the actor
                    global_step, _ = sess.run([
                        tf.contrib.framework.get_global_step(),
                        self.sync_local_actor_op])

                    # run one evaluation episode
                    eps_reward, eps_len, _, global_t = self.current_policy_step(sess, add_to_mem=False)

                    # add summaries
                    if self.summary_writer is not None:
                        episode_summary = tf.Summary()
                        episode_summary.value.add(simple_value=eps_reward,
                                                  tag=self.name + "/total_reward")
                        episode_summary.value.add(simple_value=eps_len,
                                                  tag=self.name + "/episode_length")
                        self.summary_writer.add_summary(episode_summary, global_step)

                        episode_summary_frame = tf.Summary()
                        episode_summary_frame.value.add(simple_value=eps_reward,
                                                        tag=self.name + "/frame/total_reward")
                        episode_summary_frame.value.add(simple_value=eps_len,
                                                        tag=self.name + "/frame/episode_length")
                        self.summary_writer.add_summary(episode_summary_frame, global_t)

                        self.summary_writer.flush()

                    if self.saver is not None:
                        self.saver.save(sess, self.checkpoint_path)

                    tf.logging.info(
                        "Eval results at step {}: total_reward {}, episode_length {}".format(
                            global_step, eps_reward, eps_len))
                    tf.logging.info("Total steps taken so far: {}".format(self.global_counter))

                    # sleep until the next evaluation cycle
                    time.sleep(eval_every)

                    # for stopping once
                    # coord.request_stop()
                    # return
            except tf.errors.CancelledError:
                return
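# NOTE: a standalone sketch (not part of the Agent class above) of the per-step target
# recursion that train_off_policy performs inside its reverse loop, stripped of the
# TensorFlow session calls. The function name and argument names are made up; r is the
# reward, q = Q(x_t, a_t), v = V(x_t), rho is the importance weight p_i and c is the
# truncated weight c_i = min(1, rho ** (1 / ACTION_DIM)).
def acer_backward_step(Q_ret, Q_opc, r, gamma, q, v, c, rho):
    # discounted targets for this step (what gets appended to q_ret_list / q_opc_list)
    Q_ret = r + gamma * Q_ret
    Q_opc = r + gamma * Q_opc
    # target for the value head, as in the val_s list above
    v_target = min(rho, 1.0) * (Q_ret - q) + v
    # recursion carried back to the previous time step
    Q_ret_prev = c * (Q_ret - q) + v
    Q_opc_prev = (Q_opc - q) + v
    return Q_ret, Q_opc, v_target, Q_ret_prev, Q_opc_prev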