Example #1
    def __init__(self, policy_net_path, value_net_path, time_limit=20):
        self.time_limit = time_limit
        self.game = None
        self.root = None
        policy_model = PolicyNet('./train/', '/val/')
        value_model = ValueNet('./train/', '/val/')

        g_policy = tf.Graph()
        with g_policy.as_default():
            self.policy_board = tf.placeholder(dtype=tf.float32)
            self.p_is_training = tf.placeholder(dtype=tf.bool)
            self.policy_out = policy_model.inference(
                self.policy_board, is_training=self.p_is_training)
            self.policy_loader = tf.train.Saver()

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.policy_sess = tf.Session(config=config)
            print('load policy model:', policy_net_path)
            self.policy_loader.restore(self.policy_sess, policy_net_path)

        g_value = tf.Graph()
        with g_value.as_default():
            self.value_board = tf.placeholder(dtype=tf.float32,
                                              shape=(None, 19, 19, 21))
            self.v_is_training = tf.placeholder(dtype=tf.bool)
            _, self.value_out = value_model.inference(self.value_board,
                                                      self.v_is_training)
            self.value_loader = tf.train.Saver()

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.value_sess = tf.Session(config=config)
            print('load value model:', value_net_path)
            self.value_loader.restore(self.value_sess, value_net_path)
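The policy and value models above are built in two separate tf.Graph instances, so each tf.train.Saver restores only the variables of its own network and the two checkpoints cannot interfere. As a minimal sketch (not code from the original project), the restored sessions could then be queried as shown below; the helper names are hypothetical, and the (1, 19, 19, 21) input shape follows the value-net placeholder defined above:

import numpy as np

def query_value(player, value_query):
    # value_query: float32 array of shape (1, 19, 19, 21), matching the
    # value_board placeholder; is_training is fed False for inference.
    return player.value_sess.run(
        player.value_out,
        feed_dict={player.value_board: value_query,
                   player.v_is_training: False})

def query_policy(player, policy_query):
    # policy_query: float32 array holding the policy-net input planes.
    return player.policy_sess.run(
        player.policy_out,
        feed_dict={player.policy_board: policy_query,
                   player.p_is_training: False})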
Example #3
def main():
    net = PolicyNet()

    net.read_weights_from_file('./weights/policy_2.0_2017-12-04T21:08:08.381535')
    player2 = RandomPlayer()
    player1 = MCTPlayer()
    game = Game(player1, player2)
    print()
    game.play(log=True)
Example #4
    def __init__(self, args, debug=False):
        self.policy_net = PolicyNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                    ROW_DIM * COLUMN_DIM, 128)
        self.value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                  ROW_DIM * COLUMN_DIM, 128)
        self.q_value_net1 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                      ROW_DIM * COLUMN_DIM, 128)
        self.q_value_net2 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                      ROW_DIM * COLUMN_DIM, 128)
        self.target_value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                         ROW_DIM * COLUMN_DIM, 128)

        if debug:
            if isinstance(debug, str):
                self.load_net(debug)
        else:
            if args.input_network:
                self.load_net(args.input_network)

            self.soft_q_optimizer1 = optim.Adam(self.q_value_net1.parameters(),
                                                lr=args.q_lr)
            self.soft_q_optimizer2 = optim.Adam(self.q_value_net2.parameters(),
                                                lr=args.q_lr)
            self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                              lr=args.value_lr)
            self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                               lr=args.policy_lr)
            self.q_criterion1 = nn.MSELoss()
            self.q_criterion2 = nn.MSELoss()
            self.value_criterion = nn.MSELoss()

        self.to_cuda()
        self.args = args
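The constructor above keeps a target_value_net alongside value_net; in soft actor-critic style training the target copy is usually refreshed with a Polyak (soft) update after each value-net step. A small sketch of such an update follows; the function name and the tau value are illustrative assumptions, not code from the original repository.

import torch

def soft_update(target_net, source_net, tau=0.005):
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(),
                                       source_net.parameters()):
            target_param.mul_(1.0 - tau).add_(param, alpha=tau)

# e.g. after each value-net optimisation step:
# soft_update(self.target_value_net, self.value_net)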
Example #5
    def __init__(self,
                 state_size,
                 action_size,
                 lr=1e-3,
                 gamma=0.99,
                 clipping_epsilon=0.1,
                 ppo_epochs=10,
                 minibatch_size=64,
                 rollout_length=1000,
                 gae_lambda=0.95):
        self.lr = lr
        self.clipping_epsilon = clipping_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.rollout_length = rollout_length

        self.policy = PolicyNet(state_size, action_size)
        self.value_estimator = ValueNet(state_size)
        self.rollout = Rollout(gamma=gamma, gae_lambda=gae_lambda)
Example #6
    def eval_node(self, node, game, is_value=True, width=8):
        board_mtx = game.boards[-1].board_mtx
        if is_value:

            t0 = time.time()
            value_query = ValueNet.preprocess_board(
                board_mtx, {
                    'next_to_play': game.next_to_play,
                    'ko_state:': game.ko_state[-1],
                    'current_move': game.current_moves[-1]
                },
                random=False,
                contain_liberty=True)
            value_query = np.asarray([value_query], dtype=np.float32)
            t1 = time.time()
            black_win_rate, = self.value_sess.run([self.value_out],
                                                  feed_dict={
                                                      self.value_board:
                                                      value_query,
                                                      self.v_is_training: False
                                                  })

            black_win_rate = black_win_rate.reshape((1, ))[0]
            t2 = time.time()
            print('TIME', t1 - t0, t2 - t1)

            node.black_win_rate = black_win_rate

        else:
            label_y = {
                'next_to_play': game.next_to_play,
                'ko_state:': game.ko_state[-1],
                'current_move': game.current_moves[-1]
            }
            policy_query = PolicyNet.preprocess_board(board_mtx,
                                                      label_y,
                                                      random=False,
                                                      contain_liberty=True)
            policy_query = np.asarray([policy_query], dtype=np.float32)

            p, = self.policy_sess.run(self.policy_out,
                                      feed_dict={
                                          self.policy_board: policy_query,
                                          self.p_is_training: False
                                      })
            probs = np.reshape(p, (19, 19))
            probs -= np.max(probs)
            probs = np.exp(probs) / np.sum(np.exp(probs))

            ids = np.dstack(
                np.unravel_index(np.argsort(probs.ravel()), (19, 19)))[0]
            ids = ids[::-1][:width, :]
            moves = [([move[0], move[1]], probs[move[0]][move[1]])
                     for move in ids]
            node.moves = [move for move in moves if game.legal_place(*move[0])]
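eval_node either attaches a value estimate (node.black_win_rate) or a pruned, probability-weighted move list (node.moves) to the given node. A hypothetical sketch of how a tree-search expansion step might drive it is shown below; the node and game objects are assumed to come from the surrounding project, and only the eval_node signature is taken from the code above.

def expand_and_evaluate(player, node, game, width=8):
    # Ask the policy net for the top candidate moves first...
    player.eval_node(node, game, is_value=False, width=width)
    # ...then ask the value net for a position evaluation.
    player.eval_node(node, game, is_value=True)
    return node.moves, node.black_win_rate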
Example #8
    for t in range(len(rewards)):
        total_discounted_reward = 0
        discount = 1
        for k in range(t, len(rewards)):
            total_discounted_reward += rewards[k] * discount
            discount *= discount_factor
            # Don't count rewards from subsequent rounds
            if rewards[k] != 0:
                break
        discounted_rewards[t] = total_discounted_reward
    return discounted_rewards


env = gym.make('Pong-v4')

pongNet = PolicyNet(hidden_layer_size, learning_rate, checkpoints_dir)
if load_checkpoint:
    pongNet.load_checkpoint()

batch_feature_vector = []  #Vector of state, action, and reward
smoothed_reward = None
episode_count = 1

while True:
    print("Starting episode {}".format(episode_count))

    episode_done = False
    episode_reward_sum = 0

    round_num = 1
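The snippet above begins mid-function; a self-contained reconstruction of the reward-discounting helper it appears to belong to is sketched below. The function name, signature, and array initialisation are assumptions based on the loop body, not the original source.

import numpy as np

def discount_rewards(rewards, discount_factor=0.99):
    # Hypothetical reconstruction of the helper the loop above comes from.
    discounted_rewards = np.zeros(len(rewards))
    for t in range(len(rewards)):
        total_discounted_reward = 0
        discount = 1
        for k in range(t, len(rewards)):
            total_discounted_reward += rewards[k] * discount
            discount *= discount_factor
            # Don't count rewards from subsequent rounds; Pong only gives
            # a non-zero reward when a point is scored.
            if rewards[k] != 0:
                break
        discounted_rewards[t] = total_discounted_reward
    return discounted_rewards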
Example #9
class PPOAgent():
    def __init__(self,
                 state_size,
                 action_size,
                 lr=1e-3,
                 gamma=0.99,
                 clipping_epsilon=0.1,
                 ppo_epochs=10,
                 minibatch_size=64,
                 rollout_length=1000,
                 gae_lambda=0.95):
        self.lr = lr
        self.clipping_epsilon = clipping_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.rollout_length = rollout_length

        self.policy = PolicyNet(state_size, action_size)
        self.value_estimator = ValueNet(state_size)
        self.rollout = Rollout(gamma=gamma, gae_lambda=gae_lambda)

    def start_episode(self):
        self.episode_rewards = []
        self.rollout.start_rollout()

    def act(self, state):

        # Check if the rollout is full and needs processing
        if len(self.rollout) == self.rollout_length:
            self.learn()
            self.rollout.start_rollout()

        # Derive action distribution from policy network
        means, sigmas = self.policy(state)
        action_distribution = torch.distributions.Normal(means, sigmas)
        action = action_distribution.sample()
        action_log_prob = action_distribution.log_prob(action)

        # Derive state value estimate from value network
        state_value = self.value_estimator(state).squeeze()

        # Record decision and return sampled action
        self.rollout.record_decision(state, state_value, action,
                                     action_log_prob)
        return action

    def finish_episode(self):
        self.learn()

    def record_outcome(self, reward):
        self.episode_rewards.append(reward)
        self.rollout.record_outcome(reward)

    def average_episode_return(self):
        return sum([r.mean().item() for r in self.episode_rewards])

    def get_current_policy_probs(self, states, actions):

        # For the given state/action pairs, create a distribution from the policy and get the log probs
        means, sigmas = self.policy(states)
        action_distribution = torch.distributions.Normal(means, sigmas)
        current_policy_log_probs = action_distribution.log_prob(actions)

        # Sum log probs over the possible actions
        current_policy_log_probs = current_policy_log_probs.sum(-1)

        return torch.exp(current_policy_log_probs)

    def learn(self):

        (states, actions, future_returns, normalised_advantages, original_policy_probs) = \
            self.rollout.flatten_trajectories()

        # Run through PPO epochs
        policy_optimiser = optim.Adam(self.policy.parameters(),
                                      lr=self.lr,
                                      eps=1e-5)
        value_estimator_optimiser = optim.Adam(
            self.value_estimator.parameters(), lr=self.lr, eps=1e-5)
        for ppo_epoch in range(self.ppo_epochs):

            # Sample the trajectories randomly in mini-batches
            for indices in random_sample(np.arange(states.shape[0]),
                                         self.minibatch_size):

                # Sample using sample indices
                states_sample = states[indices]
                actions_sample = actions[indices]
                future_returns_sample = future_returns[indices]
                normalised_advantages_sample = normalised_advantages[indices]
                original_policy_probs_sample = original_policy_probs[indices]

                # Use the current policy to get the probabilities for the sampled states and actions
                # We use these to weight the likelihoods, allowing reuse of the rollout
                current_policy_probs_sample = self.get_current_policy_probs(
                    states_sample, actions_sample)

                # Define PPO surrogate and clip to get the policy loss
                sampling_ratio = current_policy_probs_sample / original_policy_probs_sample
                clipped_ratio = torch.clamp(sampling_ratio,
                                            1 - self.clipping_epsilon,
                                            1 + self.clipping_epsilon)
                clipped_surrogate = torch.min(
                    sampling_ratio * normalised_advantages_sample,
                    clipped_ratio * normalised_advantages_sample)
                policy_loss = -torch.mean(clipped_surrogate)

                # Define value estimator loss
                state_values_sample = self.value_estimator(
                    states_sample).squeeze()
                value_estimator_loss = nn.MSELoss()(state_values_sample,
                                                    future_returns_sample)

                # Update value estimator
                value_estimator_optimiser.zero_grad()
                value_estimator_loss.backward()
                nn.utils.clip_grad_norm_(self.value_estimator.parameters(),
                                         0.75)
                value_estimator_optimiser.step()

                # Update policy
                policy_optimiser.zero_grad()
                policy_loss.backward()
                nn.utils.clip_grad_norm_(self.policy.parameters(), 0.75)
                policy_optimiser.step()
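A hypothetical driver loop for the PPOAgent class above is sketched below: start_episode, act, record_outcome, and finish_episode are the methods defined in the class, while the environment object and the numpy-to-tensor conversion (classic gym step API assumed) are illustrative assumptions.

import torch

def run_episodes(agent, env, num_episodes=100, max_steps=1000):
    for _ in range(num_episodes):
        state = env.reset()
        agent.start_episode()
        for _ in range(max_steps):
            state_t = torch.as_tensor(state, dtype=torch.float32)
            action = agent.act(state_t)
            # classic gym API assumed: (next_state, reward, done, info)
            state, reward, done, _ = env.step(action.numpy())
            agent.record_outcome(torch.as_tensor(reward, dtype=torch.float32))
            if done:
                break
        agent.finish_episode()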
Example #10
ACTION_DIM = env.action_space.shape[0]
INPUT_DIM = env.observation_space.shape[0]

# disable GPU usage here (hide all CUDA devices so everything runs on the CPU)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# create the summary writer here
summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train"))

# to run stuff on cpu
with tf.device("/cpu:0"):

    # Keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    global_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM, name="global_actor")
    global_critic_net = AdvantageValueNet(HIDDEN_LAYER, name="global_critic")
    # run the placeholders through the global nets once so their variables are created
    tmp_x = tf.placeholder(dtype=tf.float32,
                           shape=(BATCH_SIZE, INPUT_DIM),
                           name="tmp_x")
    tmp_a = tf.placeholder(dtype=tf.float32,
                           shape=(BATCH_SIZE, ACTION_DIM),
                           name="tmp_a")

    global_average_actor_net = PolicyNet(HIDDEN_LAYER,
                                         ACTION_DIM,
                                         name="global_Average_actor")

    _, tmp_policy = global_actor_net(tmp_x)
    _ = global_critic_net(tmp_x, tmp_a, tmp_policy)
Example #11
    def __init__(self):
        self.policy_net = PolicyNet()
        self.eval_net = EvalNet()
Example #12
    def build_graph(self):
        """
        builds a local graph
        """
        # place holders for inputs here
        HIDDEN_LAYER = self.FLAGS.feature_layer_size

        self.x_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.INPUT_DIM),
                                  name="x_i")
        self.a_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM),
                                  name="a_i")
        self.q_opc = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_opc")
        self.q_ret = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_ret")
        self.c = self.FLAGS.c  # truncation threshold constant

        self.actor_net = PolicyNet(HIDDEN_LAYER,
                                   self.ACTION_DIM,
                                   name=self.name + "_actor",
                                   co_var=self.co_var)
        self.critic_net = AdvantageValueNet(HIDDEN_LAYER,
                                            name=self.name + "_critic")

        self.policy_xi_stats, self.policy_xi_dist = self.actor_net(self.x_i)

        self.val_xi, self.adv_xi_ai = self.critic_net(self.x_i, self.a_i,
                                                      self.policy_xi_dist)

        #sample a' now
        self.a_i_ = tf.reshape(self.policy_xi_dist.sample(1),
                               shape=[-1, self.ACTION_DIM])

        _, self.adv_xi_ai_ = self.critic_net(
            self.x_i, self.a_i_,
            self.policy_xi_dist)  # val will be the same for

        _, self.average_policy_xi_dist = self.average_actor_net(
            self.x_i)  # can this be done better ?

        self.prob_a_i = tf.reshape(self.policy_xi_dist.prob(self.a_i),
                                   shape=[-1, 1]) + 1e-8
        self.prob_a_i_ = tf.reshape(self.policy_xi_dist.prob(self.a_i_),
                                    shape=[-1, 1]) + 1e-8

        self.log_prob_a_i = tf.log(self.prob_a_i)
        self.log_prob_a_i_ = tf.log(self.prob_a_i_)

        # for predicting 1-step a_i', p_i, p_i',
        self.u_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM))

        self.u_i_dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=self.u_i, scale_diag=tf.ones_like(self.u_i) * self.co_var)

        self.u_i_prob_a_i = tf.reshape(self.u_i_dist.prob(self.a_i),
                                       shape=[-1, 1]) + 1e-8
        self.u_i_prob_a_i_ = tf.reshape(self.u_i_dist.prob(self.a_i_),
                                        shape=[-1, 1]) + 1e-8

        self.p_i = tf.divide(self.prob_a_i, self.u_i_prob_a_i)
        self.p_i_ = tf.divide(self.prob_a_i_, self.u_i_prob_a_i_)

        # take care of NaNs here, for importance sampling weights (might be an extra step)
        self.p_i = tf.where(tf.is_nan(self.p_i), tf.zeros_like(self.p_i),
                            self.p_i)
        self.p_i_ = tf.where(tf.is_nan(self.p_i_), tf.zeros_like(self.p_i_),
                             self.p_i_)

        self.c_i = tf.minimum(1., tf.pow(self.p_i, 1.0 / self.ACTION_DIM))

        # for verification about checking if params are getting synched
        self.local_actor_vars = self.actor_net.local_params()
        self.global_actor_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_actor')

        self.local_critic_vars = self.critic_net.local_params()
        self.global_critic_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_critic')

        # Sync ops from global
        self.sync_local_actor_op = self.actor_net.update_local_params_op(
            'global_actor')  # global actor
        self.sync_local_critic_op = self.critic_net.update_local_params_op(
            'global_critic')

        # soft update the average network
        self.soft_update_average_actor_op = self.average_actor_net.soft_update_from_target_params(
            'global_actor', self.FLAGS.tau)

        #Get gradients from local network using local losses

        g1 = tf.reshape(tf.gradients(
            (self.log_prob_a_i * (self.q_opc - self.val_xi)),
            self.policy_xi_stats,
            name=self.name + "g1_grads"),
                        shape=[-1, self.ACTION_DIM])
        g2 = (self.adv_xi_ai_ - self.val_xi) * tf.reshape(
            tf.gradients((self.log_prob_a_i_),
                         self.policy_xi_stats,
                         name=self.name + "g2_grads"),
            shape=[-1, self.ACTION_DIM])

        self.g = tf.minimum(self.c, self.p_i) * g1 + tf.nn.relu(
            1 - tf.divide(self.c, self.p_i_)) * g2

        self.k = tf.reshape(tf.gradients(
            tf.contrib.distributions.kl_divergence(self.average_policy_xi_dist,
                                                   self.policy_xi_dist),
            self.policy_xi_stats),
                            shape=[-1, self.ACTION_DIM])

        self.kg = tf.reduce_sum(tf.multiply(self.g, self.k), 1, keep_dims=True)

        #print "kg", self.kg

        self.k2 = tf.reduce_sum(tf.multiply(self.k, self.k), 1, keep_dims=True)

        self.reg_g = self.g - tf.maximum(
            tf.zeros_like(self.g),
            tf.divide((self.kg - self.FLAGS.delta), self.k2)) * self.k

        # take gradients wrt to the local params
        self.actor_grads = tf.gradients(self.policy_xi_stats,
                                        self.local_actor_vars,
                                        grad_ys=-self.reg_g,
                                        name="actor_grads")

        #for ti,tj in zip(self.actor_grads, self.global_actor_vars):
        #    print ti, "\n", tj , "\n", "==========="

        # apply local gradients to the global network
        self.actor_train_op = self.optimizer.apply_gradients(
            zip(self.actor_grads, self.global_actor_vars),
            global_step=tf.train.get_global_step())

        # critic loss function and updates

        # take gradient wrt to local variables
        self.critic_loss_1 = ((self.q_ret - self.adv_xi_ai)**2.0) / 2.0

        # for predicting 1-step a_i', p_i, p_i',
        self.v_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))

        #self.v_trunc = tf.minimum(self.p_i, 1.0) * (self.q_ret - self.adv_xi_ai) + self.val_xi
        self.critic_loss_2 = ((self.v_target - self.val_xi)**2.0) / 2.0

        self.critic_loss = self.critic_loss_1 + self.critic_loss_2

        #Apply local gradients to global network

        self.critic_grads = tf.gradients(self.critic_loss,
                                         self.local_critic_vars)

        self.critic_train_op = self.optimizer.apply_gradients(
            zip(self.critic_grads, self.global_critic_vars),
            global_step=tf.train.get_global_step())

        # critic_summaries op
        critic_grads_summary = []
        for grad, var in zip(self.critic_grads, self.local_critic_vars):
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.critic_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_critc_mean_loss_Q",
                              tf.reduce_mean(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critc_mean_loss_V",
                              tf.reduce_mean(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critc_sum_loss_Q",
                              tf.reduce_sum(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critc_sum_loss_V",
                              tf.reduce_sum(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critc_mean_loss",
                              tf.reduce_mean(self.critic_loss)),
            tf.summary.scalar(self.name + "_critc_sum_loss",
                              tf.reduce_sum(self.critic_loss)),
            tf.summary.histogram(self.name + "_val_target", self.v_target),
            tf.summary.histogram(self.name + "_val_pred", self.val_xi),
            tf.summary.histogram(self.name + "_Q_pred", self.adv_xi_ai),
            tf.summary.histogram(self.name + "_Q_ret", self.q_ret),
            tf.summary.histogram(self.name + "_Q_opc", self.q_opc),
        ] + critic_grads_summary)

        # actor summaries op

        actor_grads_summary = []
        for grad, var in zip(self.actor_grads, self.local_actor_vars):
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.actor_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_actor_mean_loss_reg_g",
                              tf.reduce_mean(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_mean_loss_reg_g",
                              tf.reduce_mean(-self.reg_g)),
            tf.summary.scalar(self.name + "_actor_sum_loss_reg_g",
                              tf.reduce_sum(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_sum_reg_g",
                              tf.reduce_sum(-self.reg_g)),
            tf.summary.scalar(self.name +
                              "_actor_sum_g", tf.reduce_sum(self.g)),
            tf.summary.scalar(self.name +
                              "_actor_neg_sum_g", tf.reduce_sum(-self.g)),
            tf.summary.scalar(self.name +
                              "_actor_mean_kl", tf.reduce_mean(self.k)),
            tf.summary.scalar(self.name +
                              "_actor_sum_kl", tf.reduce_sum(self.k)),
            tf.summary.histogram(self.name +
                                 "_policy_stats", self.policy_xi_stats),
        ] + actor_grads_summary)
Example #13
class Agent():
    def __init__(self,
                 name=-1,
                 environment=None,
                 global_counter=0,
                 average_actor_net=None,
                 co_var=0.3,
                 summary_writer=None,
                 saver=None,
                 optimizer=None,
                 flags=None):

        self.name = "acer_agent_" + str(name)  # name may be an int worker index
        self.memory = Memory(5000)  # each worker has its own memory

        # all the flags variables here
        self.FLAGS = flags

        # for dumping info about this agent
        # self.file_dump = open("./dump/" + self.name + "_dump", 'w', 0)

        # average net copied
        self.average_actor_net = average_actor_net

        # if shared optimizer given use that or else create its own
        if optimizer is None:
            self.optimizer = tf.train.RMSPropOptimizer(
                learning_rate=self.FLAGS.lr)
        else:
            self.optimizer = optimizer

        # env here
        self.env = environment

        self.ACTION_DIM = self.env.action_space.shape[0]
        self.INPUT_DIM = self.env.observation_space.shape[0]

        # summary, saver, checkpointing
        self.summary_writer = summary_writer
        self.saver = saver

        if summary_writer is not None:
            self.checkpoint_path = os.path.abspath(
                os.path.join(summary_writer.get_logdir(),
                             "../checkpoints/model"))

        # diagonal co var for policy
        self.co_var = co_var

        # counter
        self.local_counter = itertools.count()
        self.global_counter = global_counter  #next(self.global_counter)

        # loss function and optimizer in build graphs
        self.build_graph()

    def build_graph(self):
        """
        builds a local graph
        """
        # place holders for inputs here
        HIDDEN_LAYER = self.FLAGS.feature_layer_size

        self.x_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.INPUT_DIM),
                                  name="x_i")
        self.a_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM),
                                  name="a_i")
        self.q_opc = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_opc")
        self.q_ret = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_ret")
        self.c = self.FLAGS.c  # truncation threshold constant

        self.actor_net = PolicyNet(HIDDEN_LAYER,
                                   self.ACTION_DIM,
                                   name=self.name + "_actor",
                                   co_var=self.co_var)
        self.critic_net = AdvantageValueNet(HIDDEN_LAYER,
                                            name=self.name + "_critic")

        self.policy_xi_stats, self.policy_xi_dist = self.actor_net(self.x_i)

        self.val_xi, self.adv_xi_ai = self.critic_net(self.x_i, self.a_i,
                                                      self.policy_xi_dist)

        #sample a' now
        self.a_i_ = tf.reshape(self.policy_xi_dist.sample(1),
                               shape=[-1, self.ACTION_DIM])

        _, self.adv_xi_ai_ = self.critic_net(
            self.x_i, self.a_i_,
            self.policy_xi_dist)  # val will be the same for

        _, self.average_policy_xi_dist = self.average_actor_net(
            self.x_i)  # can this be done better ?

        self.prob_a_i = tf.reshape(self.policy_xi_dist.prob(self.a_i),
                                   shape=[-1, 1]) + 1e-8
        self.prob_a_i_ = tf.reshape(self.policy_xi_dist.prob(self.a_i_),
                                    shape=[-1, 1]) + 1e-8

        self.log_prob_a_i = tf.log(self.prob_a_i)
        self.log_prob_a_i_ = tf.log(self.prob_a_i_)

        # for predicting 1-step a_i', p_i, p_i',
        self.u_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM))

        self.u_i_dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=self.u_i, scale_diag=tf.ones_like(self.u_i) * self.co_var)

        self.u_i_prob_a_i = tf.reshape(self.u_i_dist.prob(self.a_i),
                                       shape=[-1, 1]) + 1e-8
        self.u_i_prob_a_i_ = tf.reshape(self.u_i_dist.prob(self.a_i_),
                                        shape=[-1, 1]) + 1e-8

        self.p_i = tf.divide(self.prob_a_i, self.u_i_prob_a_i)
        self.p_i_ = tf.divide(self.prob_a_i_, self.u_i_prob_a_i_)

        # take care of NaNs here, for importance sampling weights (might be an extra step)
        self.p_i = tf.where(tf.is_nan(self.p_i), tf.zeros_like(self.p_i),
                            self.p_i)
        self.p_i_ = tf.where(tf.is_nan(self.p_i_), tf.zeros_like(self.p_i_),
                             self.p_i_)

        self.c_i = tf.minimum(1., tf.pow(self.p_i, 1.0 / self.ACTION_DIM))

        # for verification about checking if params are getting synched
        self.local_actor_vars = self.actor_net.local_params()
        self.global_actor_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_actor')

        self.local_critic_vars = self.critic_net.local_params()
        self.global_critic_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_critic')

        # Sync ops from global
        self.sync_local_actor_op = self.actor_net.update_local_params_op(
            'global_actor')  # global actor
        self.sync_local_critic_op = self.critic_net.update_local_params_op(
            'global_critic')

        # soft update the average network
        self.soft_update_average_actor_op = self.average_actor_net.soft_update_from_target_params(
            'global_actor', self.FLAGS.tau)

        #Get gradients from local network using local losses

        g1 = tf.reshape(tf.gradients(
            (self.log_prob_a_i * (self.q_opc - self.val_xi)),
            self.policy_xi_stats,
            name=self.name + "g1_grads"),
                        shape=[-1, self.ACTION_DIM])
        g2 = (self.adv_xi_ai_ - self.val_xi) * tf.reshape(
            tf.gradients((self.log_prob_a_i_),
                         self.policy_xi_stats,
                         name=self.name + "g2_grads"),
            shape=[-1, self.ACTION_DIM])

        self.g = tf.minimum(self.c, self.p_i) * g1 + tf.nn.relu(
            1 - tf.divide(self.c, self.p_i_)) * g2

        self.k = tf.reshape(tf.gradients(
            tf.contrib.distributions.kl_divergence(self.average_policy_xi_dist,
                                                   self.policy_xi_dist),
            self.policy_xi_stats),
                            shape=[-1, self.ACTION_DIM])

        self.kg = tf.reduce_sum(tf.multiply(self.g, self.k), 1, keep_dims=True)

        #print "kg", self.kg

        self.k2 = tf.reduce_sum(tf.multiply(self.k, self.k), 1, keep_dims=True)

        self.reg_g = self.g - tf.maximum(
            tf.zeros_like(self.g),
            tf.divide((self.kg - self.FLAGS.delta), self.k2)) * self.k

        # take gradients wrt to the local params
        self.actor_grads = tf.gradients(self.policy_xi_stats,
                                        self.local_actor_vars,
                                        grad_ys=-self.reg_g,
                                        name="actor_grads")

        #for ti,tj in zip(self.actor_grads, self.global_actor_vars):
        #    print ti, "\n", tj , "\n", "==========="

        # apply local gradients to the global network
        self.actor_train_op = self.optimizer.apply_gradients(
            zip(self.actor_grads, self.global_actor_vars),
            global_step=tf.train.get_global_step())

        # critic loss function and updates

        # take gradient wrt to local variables
        self.critic_loss_1 = ((self.q_ret - self.adv_xi_ai)**2.0) / 2.0

        # for predicting 1-step a_i', p_i, p_i',
        self.v_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))

        #self.v_trunc = tf.minimum(self.p_i, 1.0) * (self.q_ret - self.adv_xi_ai) + self.val_xi
        self.critic_loss_2 = ((self.v_target - self.val_xi)**2.0) / 2.0

        self.critic_loss = self.critic_loss_1 + self.critic_loss_2

        #Apply local gradients to global network

        self.critic_grads = tf.gradients(self.critic_loss,
                                         self.local_critic_vars)

        self.critic_train_op = self.optimizer.apply_gradients(
            zip(self.critic_grads, self.global_critic_vars),
            global_step=tf.train.get_global_step())

        # critic_summaries op
        critic_grads_summary = []
        for grad, var in zip(self.critic_grads, self.local_critic_vars):
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.critic_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_critc_mean_loss_Q",
                              tf.reduce_mean(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critc_mean_loss_V",
                              tf.reduce_mean(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critc_sum_loss_Q",
                              tf.reduce_sum(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critc_sum_loss_V",
                              tf.reduce_sum(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critc_mean_loss",
                              tf.reduce_mean(self.critic_loss)),
            tf.summary.scalar(self.name + "_critc_sum_loss",
                              tf.reduce_sum(self.critic_loss)),
            tf.summary.histogram(self.name + "_val_target", self.v_target),
            tf.summary.histogram(self.name + "_val_pred", self.val_xi),
            tf.summary.histogram(self.name + "_Q_pred", self.adv_xi_ai),
            tf.summary.histogram(self.name + "_Q_ret", self.q_ret),
            tf.summary.histogram(self.name + "_Q_opc", self.q_opc),
        ] + critic_grads_summary)

        # actor summaries op

        actor_grads_summary = []
        for grad, var in zip(self.actor_grads, self.local_actor_vars):
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.actor_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_actor_mean_loss_reg_g",
                              tf.reduce_mean(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_mean_loss_reg_g",
                              tf.reduce_mean(-self.reg_g)),
            tf.summary.scalar(self.name + "_actor_sum_loss_reg_g",
                              tf.reduce_sum(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_sum_reg_g",
                              tf.reduce_sum(-self.reg_g)),
            tf.summary.scalar(self.name +
                              "_actor_sum_g", tf.reduce_sum(self.g)),
            tf.summary.scalar(self.name +
                              "_actor_neg_sum_g", tf.reduce_sum(-self.g)),
            tf.summary.scalar(self.name +
                              "_actor_mean_kl", tf.reduce_mean(self.k)),
            tf.summary.scalar(self.name +
                              "_actor_sum_kl", tf.reduce_sum(self.k)),
            tf.summary.histogram(self.name +
                                 "_policy_stats", self.policy_xi_stats),
        ] + actor_grads_summary)

    def run(self, sess, coord):
        """
        Main method, the ACER algorithm runs via this method

        """
        # use the prev session
        with sess.as_default(), sess.graph.as_default():
            # run stuff here
            try:
                # keep running the agents in a while loop
                while not coord.should_stop():

                    # gather experiences
                    for i in range(self.FLAGS.pure_exploration_steps):
                        eps_reward, eps_len, local_t, global_t = self.random_exploration_step(
                            sess)
                        # uniform random exploration (eps-greedy could be used here instead)

                    # then gather episodes with the current policy
                    for i in range(self.FLAGS.current_policy_steps):
                        eps_reward, eps_len, local_t, global_t = self.current_policy_step(
                            sess)

                    # train off policy
                    for i in range(self.FLAGS.update_steps):
                        self.train_off_policy(sess)

            except tf.errors.CancelledError:
                return

    def train_off_policy(self, sess):
        """
        ACER algorithm updates here 
        """

        # sync the local nets from the global
        sess.run([self.sync_local_actor_op, self.sync_local_critic_op])

        # sample trajectory from the replay memory
        traj = self.memory.get_trajectory(self.FLAGS.k_steps)
        k = len(traj)

        # empty list to store targets
        q_ret_list = []
        q_opc_list = []
        states = []
        actions = []
        mu_dist = []
        val_s = []

        # if last episode is not terminal state, use value function to get an estimate
        Q_ret = 0.0
        if not traj[-1].done:
            Q_ret = sess.run(
                [self.val_xi],
                feed_dict={self.x_i: np.reshape(traj[-1].next_state,
                                                (1, -1))})[0][0, 0]

        Q_opc = Q_ret

        # reverse loop
        for transition in traj[::-1]:
            Q_ret = transition.reward + self.FLAGS.gamma * Q_ret
            Q_opc = transition.reward + self.FLAGS.gamma * Q_opc

            #
            x_t = np.reshape(transition.state, (1, -1))
            a_t = np.reshape(transition.action, (1, -1))
            u_t = np.reshape(transition.distribution, (1, -1))

            # add to minibatch
            q_ret_list.append(Q_ret)
            q_opc_list.append(Q_opc)
            states.append(x_t)
            actions.append(a_t)
            mu_dist.append(u_t)

            # get estimates from existing function approximators
            v_t, c_t, p_t, q_t = sess.run(
                [self.val_xi, self.c_i, self.p_i, self.adv_xi_ai],
                feed_dict={
                    self.x_i: x_t,
                    self.a_i: a_t,
                    self.u_i: u_t
                })

            # add the target V_pi
            val_s.append((min(p_t[0, 0], 1.0) * (Q_ret - q_t[0, 0])) +
                         v_t[0, 0])

            # update again
            Q_ret = c_t[0, 0] * (Q_ret - q_t[0, 0]) + v_t[0, 0]
            Q_opc = (Q_opc - q_t[0, 0]) + v_t[0, 0]

        # create mini-batch here
        opt_feed_dict = {
            self.x_i: np.asarray(states).reshape(-1, self.INPUT_DIM),
            self.a_i: np.asarray(actions).reshape(-1, self.ACTION_DIM),
            self.q_opc: np.asarray(q_opc_list).reshape(-1, 1),
            self.q_ret: np.asarray(q_ret_list).reshape(-1, 1),
            self.u_i: np.asarray(mu_dist).reshape(-1, self.ACTION_DIM),
            self.v_target: np.asarray(val_s).reshape(-1, 1)
        }

        # Train the global estimators using local gradients
        _, _, global_step, critic_summaries, actor_summaries = sess.run(
            [
                self.actor_train_op, self.critic_train_op,
                tf.contrib.framework.get_global_step(), self.critic_summary_op,
                self.actor_summary_op
            ],
            feed_dict=opt_feed_dict)

        # Write summaries
        if self.summary_writer is not None:
            self.summary_writer.add_summary(critic_summaries, global_step)
            self.summary_writer.add_summary(actor_summaries, global_step)
            self.summary_writer.flush()

        # update the average policy network
        _ = sess.run([self.soft_update_average_actor_op])

        # that's it

    def random_exploration_step(self, sess):
        """
        follow a random uniform policy to gather experiences and add them to the replay memory
        """
        episode_reward = 0.0
        episode_len = 0  # number of actions taken

        # random policy
        random_policy = np.zeros((1, self.ACTION_DIM))

        #for each episode reset first
        state = self.env.reset()
        for t in range(self.FLAGS.max_episode_len):
            action = self.env.action_space.sample()  # random action

            next_state, reward, done, info = self.env.step(
                action)  # next state, reward, terminal

            # insert this in memory with a uniform distribution over actions

            self.memory.add(
                Transition(state=state,
                           action=action,
                           reward=reward,
                           done=done,
                           distribution=random_policy,
                           next_state=next_state))

            # accumulate rewards
            episode_reward += reward
            episode_len += 1

            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            # update the state
            state = next_state

            if done:
                # print("Episode finished after {} timesteps".format(t+1))
                break

        return episode_reward, episode_len, local_t, global_t

    def current_policy_step(self, sess, add_to_mem=True):
        """
        follow the current policy network and gather trajectories and update them in the replay memory
        
        return the reward for this epiosde here
        # plot the reward over the trajectories
        """

        episode_reward = 0.0
        episode_len = 0  # number of actions taken

        #for each episode reset first
        state = self.env.reset()

        for t in range(self.FLAGS.max_episode_len):

            # take action according to current policy

            action, policy_stats = sess.run(
                [self.a_i_, self.policy_xi_stats],
                feed_dict={self.x_i: np.array([state])})

            action = np.reshape(action, (self.ACTION_DIM, ))

            next_state, reward, done, info = self.env.step(
                action)  # next state, reward, terminal

            # insert this in memory with a uniform distribution over actions
            if add_to_mem:  # can also remove this and still work/optimization
                self.memory.add(
                    Transition(state=state,
                               action=action,
                               reward=reward,
                               done=done,
                               distribution=policy_stats,
                               next_state=next_state))

            # accumulate rewards
            episode_reward += reward
            episode_len += 1

            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            # update the state
            state = next_state

            if done:
                # print("Episode finished after {} timesteps".format(t+1))
                break

        return episode_reward, episode_len, local_t, global_t

    def evaluate_policy(self, sess, eval_every=3600, coord=None):
        """
        follow the current policy network and gather trajectories and update them in the replay memory
        
        return the reward for this epiosde here
        # plot the reward over the trajectories
        """

        self.video_dir = os.path.join(self.summary_writer.get_logdir(),
                                      "../videos")
        self.video_dir = os.path.abspath(self.video_dir)

        try:
            os.makedirs(self.video_dir)
        except Exception:
            pass

        self.env._max_episode_steps = self.FLAGS.max_episode_len
        self.env = Monitor(self.env,
                           directory=self.video_dir,
                           video_callable=lambda x: True,
                           resume=True)

        with sess.as_default(), sess.graph.as_default():
            # run stuff here

            try:

                while not coord.should_stop():

                    # sync the actor
                    global_step, _ = sess.run([
                        tf.contrib.framework.get_global_step(),
                        self.sync_local_actor_op
                    ])

                    #for each episode reset first
                    eps_reward, eps_len, _, global_t = self.current_policy_step(
                        sess, add_to_mem=False)

                    # Add summaries
                    if self.summary_writer is not None:
                        episode_summary = tf.Summary()
                        episode_summary.value.add(simple_value=eps_reward,
                                                  tag=self.name +
                                                  "/total_reward")
                        episode_summary.value.add(simple_value=eps_len,
                                                  tag=self.name +
                                                  "/episode_length")
                        self.summary_writer.add_summary(
                            episode_summary, global_step)

                        episode_summary_frame = tf.Summary()
                        episode_summary_frame.value.add(
                            simple_value=eps_reward,
                            tag=self.name + "/frame/total_reward")
                        episode_summary_frame.value.add(
                            simple_value=eps_len,
                            tag=self.name + "/frame/episode_length")
                        self.summary_writer.add_summary(
                            episode_summary_frame, global_t)

                        self.summary_writer.flush()

                    if self.saver is not None:
                        self.saver.save(sess, self.checkpoint_path)

                    tf.logging.info(
                        "Eval results at step {}: total_reward {}, episode_length {}"
                        .format(global_step, eps_reward, eps_len))
                    tf.logging.info("Total steps taken so far: {}".format(
                        self.global_counter))

                    # # Sleep until next evaluation cycle
                    time.sleep(eval_every)

                    # for stopping once
#                     coord.request_stop()
#                     return
            except tf.errors.CancelledError:
                return
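run(sess, coord) and evaluate_policy(...) above are written to be driven by a tf.train.Coordinator, typically one thread per worker agent plus one evaluator. A hypothetical launcher is sketched below; the agents themselves are assumed to have been constructed elsewhere with the arguments the class expects.

import threading
import tensorflow as tf

def launch_workers(worker_agents, eval_agent, sess, eval_every=3600):
    coord = tf.train.Coordinator()
    threads = []
    for agent in worker_agents:
        t = threading.Thread(target=lambda a=agent: a.run(sess, coord))
        t.start()
        threads.append(t)
    eval_thread = threading.Thread(
        target=lambda: eval_agent.evaluate_policy(sess,
                                                  eval_every=eval_every,
                                                  coord=coord))
    eval_thread.start()
    threads.append(eval_thread)
    coord.join(threads)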