def __init__(self, policy_net_path, value_net_path, time_limit=20):
    self.time_limit = time_limit
    self.game = None
    self.root = None

    policy_model = PolicyNet('./train/', '/val/')
    value_model = ValueNet('./train/', '/val/')

    # Build the policy network in its own graph, with its own session.
    g_policy = tf.Graph()
    with g_policy.as_default():
        self.policy_board = tf.placeholder(dtype=tf.float32)
        self.p_is_training = tf.placeholder(dtype=tf.bool)
        self.policy_out = policy_model.inference(
            self.policy_board, is_training=self.p_is_training)
        self.policy_loader = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.policy_sess = tf.Session(config=config)
        print('load policy model:', policy_net_path)
        self.policy_loader.restore(self.policy_sess, policy_net_path)

    # Build the value network in a separate graph and session.
    g_value = tf.Graph()
    with g_value.as_default():
        self.value_board = tf.placeholder(dtype=tf.float32,
                                          shape=(None, 19, 19, 21))
        self.v_is_training = tf.placeholder(dtype=tf.bool)
        _, self.value_out = value_model.inference(self.value_board,
                                                  self.v_is_training)
        self.value_loader = tf.train.Saver()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.value_sess = tf.Session(config=config)
        print('load value model:', value_net_path)
        self.value_loader.restore(self.value_sess, value_net_path)

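A minimal inference sketch for the two sessions built above, assuming the board has already been encoded into the (None, 19, 19, 21) feature planes the value placeholder expects; the helper names predict_policy and predict_value are illustrative, not from the source:

def predict_policy(player, board_features):
    # Run the policy head in its own session/graph, inference mode.
    return player.policy_sess.run(
        player.policy_out,
        feed_dict={player.policy_board: board_features,
                   player.p_is_training: False})

def predict_value(player, board_features):
    # board_features: np.ndarray of shape (batch, 19, 19, 21)
    return player.value_sess.run(
        player.value_out,
        feed_dict={player.value_board: board_features,
                   player.v_is_training: False})
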
def __init__(self, args, debug=False):
    # Soft actor-critic style setup: policy, state-value, twin Q networks,
    # plus a target value network for stable bootstrapping.
    self.policy_net = PolicyNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                ROW_DIM * COLUMN_DIM, 128)
    self.value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                              ROW_DIM * COLUMN_DIM, 128)
    self.q_value_net1 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                  ROW_DIM * COLUMN_DIM, 128)
    self.q_value_net2 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                  ROW_DIM * COLUMN_DIM, 128)
    self.target_value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                     ROW_DIM * COLUMN_DIM, 128)

    # In debug mode a string is treated as a checkpoint path to load;
    # otherwise fall back to the network given on the command line.
    if debug:
        if isinstance(debug, str):
            self.load_net(debug)
    else:
        if args.input_network:
            self.load_net(args.input_network)

    self.soft_q_optimizer1 = optim.Adam(self.q_value_net1.parameters(), lr=args.q_lr)
    self.soft_q_optimizer2 = optim.Adam(self.q_value_net2.parameters(), lr=args.q_lr)
    self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=args.value_lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=args.policy_lr)

    self.q_criterion1 = nn.MSELoss()
    self.q_criterion2 = nn.MSELoss()
    self.value_criterion = nn.MSELoss()

    self.to_cuda()
    self.args = args

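The target_value_net above is typically kept in step with value_net by Polyak averaging rather than hard copies; a minimal sketch, assuming both are torch nn.Modules (the helper name soft_update and the tau value are illustrative, not from the source):

def soft_update(target_net, source_net, tau=0.005):
    # Blend source parameters into the target: theta' <- tau*theta + (1-tau)*theta'
    for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)
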
def main():
    net = PolicyNet()
    net.read_weights_from_file('./weights/policy_2.0_2017-12-04T21:08:08.381535')
    player2 = RandomPlayer()
    player1 = MCTPlayer()
    game = Game(player1, player2)
    print()
    game.play(log=True)

def __init__(self, state_size, action_size, lr=1e-3, gamma=0.99,
             clipping_epsilon=0.1, ppo_epochs=10, minibatch_size=64,
             rollout_length=1000, gae_lambda=0.95):
    self.lr = lr
    self.clipping_epsilon = clipping_epsilon
    self.ppo_epochs = ppo_epochs
    self.minibatch_size = minibatch_size
    self.rollout_length = rollout_length
    self.policy = PolicyNet(state_size, action_size)
    self.value_estimator = ValueNet(state_size)
    self.rollout = Rollout(gamma=gamma, gae_lambda=gae_lambda)

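Here clipping_epsilon is the epsilon in PPO's clipped surrogate objective. A minimal sketch of that loss, assuming torch tensors of per-step log-probabilities and GAE advantages already gathered by the Rollout (the function name is illustrative, not from the source):

import torch

def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clipping_epsilon=0.1):
    # Probability ratio r_t = pi_new(a|s) / pi_old(a|s), computed in log space.
    ratios = torch.exp(new_log_probs - old_log_probs)
    unclipped = ratios * advantages
    clipped = torch.clamp(ratios, 1.0 - clipping_epsilon, 1.0 + clipping_epsilon) * advantages
    # PPO maximizes the elementwise minimum; negate it to get a loss to minimize.
    return -torch.min(unclipped, clipped).mean()
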
    for t in range(len(rewards)):
        total_discounted_reward = 0
        discount = 1
        for k in range(t, len(rewards)):
            total_discounted_reward += rewards[k] * discount
            discount *= discount_factor
            # Don't count rewards from subsequent rounds
            if rewards[k] != 0:
                break
        discounted_rewards[t] = total_discounted_reward
    return discounted_rewards


env = gym.make('Pong-v4')
pongNet = PolicyNet(hidden_layer_size, learning_rate, checkpoints_dir)
if load_checkpoint:
    pongNet.load_checkpoint()

batch_feature_vector = []  # Vector of state, action, and reward
smoothed_reward = None
episode_count = 1

while True:
    print("Starting episode {}".format(episode_count))

    episode_done = False
    episode_reward_sum = 0
    round_num = 1

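For intuition: Pong's reward is zero until a point is scored, so the break makes each round's terminal reward discount backwards only within its own round. A self-contained check of the loop above; the function name and sample values are illustrative:

def discount_per_round(rewards, discount_factor=0.5):
    # Mirror of the loop above, for a quick hand check.
    discounted = [0.0] * len(rewards)
    for t in range(len(rewards)):
        total, discount = 0.0, 1.0
        for k in range(t, len(rewards)):
            total += rewards[k] * discount
            discount *= discount_factor
            if rewards[k] != 0:  # round ended here
                break
        discounted[t] = total
    return discounted

# Two rounds: a win at t=2, a loss at t=4.
print(discount_per_round([0, 0, 1, 0, -1]))  # [0.25, 0.5, 1.0, -0.5, -1.0]
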
ACTION_DIM = env.action_space.shape[0]
INPUT_DIM = env.observation_space.shape[0]

# disable GPU memory usage here
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# create the summary writer here
summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train"))

# run everything on the CPU
with tf.device("/cpu:0"):
    # Keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    global_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM, name="global_actor")
    global_critic_net = AdvantageValueNet(HIDDEN_LAYER, name="global_critic")

    # Call the networks once on dummy placeholders so their variables
    # are created before the workers sync against them.
    tmp_x = tf.placeholder(dtype=tf.float32, shape=(BATCH_SIZE, INPUT_DIM),
                           name="tmp_x")
    tmp_a = tf.placeholder(dtype=tf.float32, shape=(BATCH_SIZE, ACTION_DIM),
                           name="tmp_a")

    global_average_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM,
                                         name="global_Average_actor")

    _, tmp_policy = global_actor_net(tmp_x)
    _ = global_critic_net(tmp_x, tmp_a, tmp_policy)

def __init__(self):
    self.policy_net = PolicyNet()
    self.eval_net = EvalNet()

def build_graph(self):
    """Builds a local graph."""
    # placeholders for inputs
    HIDDEN_LAYER = self.FLAGS.feature_layer_size
    self.x_i = tf.placeholder(dtype=tf.float32, shape=(None, self.INPUT_DIM), name="x_i")
    self.a_i = tf.placeholder(dtype=tf.float32, shape=(None, self.ACTION_DIM), name="a_i")
    self.q_opc = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="q_opc")
    self.q_ret = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="q_ret")
    self.c = self.FLAGS.c  # truncation threshold constant

    self.actor_net = PolicyNet(HIDDEN_LAYER, self.ACTION_DIM,
                               name=self.name + "_actor", co_var=self.co_var)
    self.critic_net = AdvantageValueNet(HIDDEN_LAYER, name=self.name + "_critic")

    self.policy_xi_stats, self.policy_xi_dist = self.actor_net(self.x_i)
    self.val_xi, self.adv_xi_ai = self.critic_net(self.x_i, self.a_i, self.policy_xi_dist)

    # sample a' from the current policy
    self.a_i_ = tf.reshape(self.policy_xi_dist.sample(1), shape=[-1, self.ACTION_DIM])
    _, self.adv_xi_ai_ = self.critic_net(self.x_i, self.a_i_,
                                         self.policy_xi_dist)  # val will be the same

    _, self.average_policy_xi_dist = self.average_actor_net(self.x_i)  # can this be done better?

    self.prob_a_i = tf.reshape(self.policy_xi_dist.prob(self.a_i), shape=[-1, 1]) + 1e-8
    self.prob_a_i_ = tf.reshape(self.policy_xi_dist.prob(self.a_i_), shape=[-1, 1]) + 1e-8
    self.log_prob_a_i = tf.log(self.prob_a_i)
    self.log_prob_a_i_ = tf.log(self.prob_a_i_)

    # behaviour-policy distribution for the 1-step importance weights p_i, p_i_
    self.u_i = tf.placeholder(dtype=tf.float32, shape=(None, self.ACTION_DIM))
    self.u_i_dist = tf.contrib.distributions.MultivariateNormalDiag(
        loc=self.u_i, scale_diag=tf.ones_like(self.u_i) * self.co_var)
    self.u_i_prob_a_i = tf.reshape(self.u_i_dist.prob(self.a_i), shape=[-1, 1]) + 1e-8
    self.u_i_prob_a_i_ = tf.reshape(self.u_i_dist.prob(self.a_i_), shape=[-1, 1]) + 1e-8

    self.p_i = tf.divide(self.prob_a_i, self.u_i_prob_a_i)
    self.p_i_ = tf.divide(self.prob_a_i_, self.u_i_prob_a_i_)

    # take care of NaNs in the importance-sampling weights (might be an extra step)
    self.p_i = tf.where(tf.is_nan(self.p_i), tf.zeros_like(self.p_i), self.p_i)
    self.p_i_ = tf.where(tf.is_nan(self.p_i_), tf.zeros_like(self.p_i_), self.p_i_)

    self.c_i = tf.minimum(1., tf.pow(self.p_i, 1.0 / self.ACTION_DIM))

    # for verifying that the parameters are getting synced
    self.local_actor_vars = self.actor_net.local_params()
    self.global_actor_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'global_actor')
    self.local_critic_vars = self.critic_net.local_params()
    self.global_critic_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'global_critic')

    # sync ops from the global networks
    self.sync_local_actor_op = self.actor_net.update_local_params_op('global_actor')
    self.sync_local_critic_op = self.critic_net.update_local_params_op('global_critic')

    # soft-update the average network towards the global actor
    self.soft_update_average_actor_op = self.average_actor_net.soft_update_from_target_params(
        'global_actor', self.FLAGS.tau)

    # get gradients from the local network using local losses
    g1 = tf.reshape(tf.gradients(self.log_prob_a_i * (self.q_opc - self.val_xi),
                                 self.policy_xi_stats, name=self.name + "g1_grads"),
                    shape=[-1, self.ACTION_DIM])
    g2 = (self.adv_xi_ai_ - self.val_xi) * tf.reshape(
        tf.gradients(self.log_prob_a_i_, self.policy_xi_stats,
                     name=self.name + "g2_grads"),
        shape=[-1, self.ACTION_DIM])
    # truncated importance sampling with bias correction
    self.g = (tf.minimum(self.c, self.p_i) * g1 +
              tf.nn.relu(1 - tf.divide(self.c, self.p_i_)) * g2)

    # gradient of KL(average policy || current policy) w.r.t. the policy stats
    self.k = tf.reshape(tf.gradients(
        tf.contrib.distributions.kl_divergence(self.average_policy_xi_dist,
                                               self.policy_xi_dist),
        self.policy_xi_stats), shape=[-1, self.ACTION_DIM])

    self.kg = tf.reduce_sum(tf.multiply(self.g, self.k), 1, keep_dims=True)
    self.k2 = tf.reduce_sum(tf.multiply(self.k, self.k), 1, keep_dims=True)
    # trust-region projection of g against the KL gradient k
    self.reg_g = self.g - tf.maximum(
        tf.zeros_like(self.g),
        tf.divide(self.kg - self.FLAGS.delta, self.k2)) * self.k

    # take gradients w.r.t. the local params
    self.actor_grads = tf.gradients(self.policy_xi_stats, self.local_actor_vars,
                                    grad_ys=-self.reg_g, name="actor_grads")

    # apply local gradients to the global network
    self.actor_train_op = self.optimizer.apply_gradients(
        zip(self.actor_grads, self.global_actor_vars),
        global_step=tf.train.get_global_step())

    # critic loss and updates; gradients are taken w.r.t. local variables
    self.critic_loss_1 = ((self.q_ret - self.adv_xi_ai) ** 2.0) / 2.0
    self.v_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))  # value target
    self.critic_loss_2 = ((self.v_target - self.val_xi) ** 2.0) / 2.0
    self.critic_loss = self.critic_loss_1 + self.critic_loss_2

    # apply local gradients to the global network
    self.critic_grads = tf.gradients(self.critic_loss, self.local_critic_vars)
    self.critic_train_op = self.optimizer.apply_gradients(
        zip(self.critic_grads, self.global_critic_vars),
        global_step=tf.train.get_global_step())

    # critic summaries op
    critic_grads_summary = []
    for grad, var in zip(self.critic_grads, self.local_critic_vars):
        critic_grads_summary.append(
            tf.summary.histogram(var.name + '/gradient', grad))
        critic_grads_summary.append(
            tf.summary.histogram(var.name + '/weight', var))

    self.critic_summary_op = tf.summary.merge([
        tf.summary.scalar(self.name + "_critic_mean_loss_Q", tf.reduce_mean(self.critic_loss_1)),
        tf.summary.scalar(self.name + "_critic_mean_loss_V", tf.reduce_mean(self.critic_loss_2)),
        tf.summary.scalar(self.name + "_critic_sum_loss_Q", tf.reduce_sum(self.critic_loss_1)),
        tf.summary.scalar(self.name + "_critic_sum_loss_V", tf.reduce_sum(self.critic_loss_2)),
        tf.summary.scalar(self.name + "_critic_mean_loss", tf.reduce_mean(self.critic_loss)),
        tf.summary.scalar(self.name + "_critic_sum_loss", tf.reduce_sum(self.critic_loss)),
        tf.summary.histogram(self.name + "_val_target", self.v_target),
        tf.summary.histogram(self.name + "_val_pred", self.val_xi),
        tf.summary.histogram(self.name + "_Q_pred", self.adv_xi_ai),
        tf.summary.histogram(self.name + "_Q_ret", self.q_ret),
        tf.summary.histogram(self.name + "_Q_opc", self.q_opc),
    ] + critic_grads_summary)

    # actor summaries op
    actor_grads_summary = []
    for grad, var in zip(self.actor_grads, self.local_actor_vars):
        actor_grads_summary.append(
            tf.summary.histogram(var.name + '/gradient', grad))
        actor_grads_summary.append(
            tf.summary.histogram(var.name + '/weight', var))

    self.actor_summary_op = tf.summary.merge([
        tf.summary.scalar(self.name + "_actor_mean_loss_reg_g", tf.reduce_mean(self.reg_g)),
        tf.summary.scalar(self.name + "_actor_neg_mean_loss_reg_g", tf.reduce_mean(-self.reg_g)),
        tf.summary.scalar(self.name + "_actor_sum_loss_reg_g", tf.reduce_sum(self.reg_g)),
        tf.summary.scalar(self.name + "_actor_neg_sum_reg_g", tf.reduce_sum(-self.reg_g)),
        tf.summary.scalar(self.name + "_actor_sum_g", tf.reduce_sum(self.g)),
        tf.summary.scalar(self.name + "_actor_neg_sum_g", tf.reduce_sum(-self.g)),
        tf.summary.scalar(self.name + "_actor_mean_kl", tf.reduce_mean(self.k)),
        tf.summary.scalar(self.name + "_actor_sum_kl", tf.reduce_sum(self.k)),
        tf.summary.histogram(self.name + "_policy_stats", self.policy_xi_stats),
    ] + actor_grads_summary)
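
For reference, g and reg_g above implement ACER's truncated importance sampling with bias correction and its trust-region projection; in the code's notation, with p_i the ratio between the current policy pi and the behaviour policy mu, and delta = FLAGS.delta:

\rho_i = \frac{\pi(a_i \mid x_i)}{\mu(a_i \mid x_i)}, \qquad
g = \min(c, \rho_i)\,\nabla \log\pi(a_i \mid x_i)\,\big(Q^{\mathrm{opc}} - V(x_i)\big)
  + \Big[1 - \frac{c}{\rho'_i}\Big]_{+}\,\big(Q(x_i, a'_i) - V(x_i)\big)\,\nabla \log\pi(a'_i \mid x_i)

k = \nabla\,\mathrm{KL}\big(\pi_{\mathrm{avg}} \,\|\, \pi\big), \qquad
\mathrm{reg\_g} = g - \max\!\left(0,\; \frac{k^{\top} g - \delta}{\lVert k \rVert^{2}}\right) k

Both gradients are taken with respect to the policy's distribution statistics (policy_xi_stats) and then pushed back to the actor weights via grad_ys=-reg_g.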