Example #1
    def __init__(self, policy_net_path, value_net_path, time_limit=20):
        self.time_limit = time_limit
        self.game = None
        self.root = None
        policy_model = PolicyNet('./train/', '/val/')
        value_model = ValueNet('./train/', '/val/')

        g_policy = tf.Graph()
        with g_policy.as_default():
            self.policy_board = tf.placeholder(dtype=tf.float32)
            self.p_is_training = tf.placeholder(dtype=tf.bool)
            self.policy_out = policy_model.inference(
                self.policy_board, is_training=self.p_is_training)
            self.policy_loader = tf.train.Saver()

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.policy_sess = tf.Session(config=config)
            print('load policy model:', policy_net_path)
            self.policy_loader.restore(self.policy_sess, policy_net_path)

        g_value = tf.Graph()
        with g_value.as_default():
            self.value_board = tf.placeholder(dtype=tf.float32,
                                              shape=(None, 19, 19, 21))
            self.v_is_training = tf.placeholder(dtype=tf.bool)
            _, self.value_out = value_model.inference(self.value_board,
                                                      self.v_is_training)
            self.value_loader = tf.train.Saver()

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.value_sess = tf.Session(config=config)
            print('load value model:', value_net_path)
            self.value_loader.restore(self.value_sess, value_net_path)
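A minimal usage sketch for the two sessions loaded above, assuming the policy head consumes the same (N, 19, 19, 21) board encoding as the value head; the feature count and the placement of this method inside the same class are assumptions, not part of the snippet:

    def predict(self, board):
        # board: np.ndarray of shape (1, 19, 19, 21), float32 (assumed encoding)
        move_probs = self.policy_sess.run(
            self.policy_out,
            feed_dict={self.policy_board: board, self.p_is_training: False})
        win_estimate = self.value_sess.run(
            self.value_out,
            feed_dict={self.value_board: board, self.v_is_training: False})
        return move_probs, win_estimate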
Example #2
    def __init__(self, args, debug=False):
        self.policy_net = PolicyNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                    ROW_DIM * COLUMN_DIM, 128)
        self.value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                  ROW_DIM * COLUMN_DIM, 128)
        self.q_value_net1 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                      ROW_DIM * COLUMN_DIM, 128)
        self.q_value_net2 = QValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                      ROW_DIM * COLUMN_DIM, 128)
        self.target_value_net = ValueNet(NUM_OF_COLOR, ROW_DIM * COLUMN_DIM,
                                         ROW_DIM * COLUMN_DIM, 128)

        if debug:
            if isinstance(debug, str):
                self.load_net(debug)
        else:
            if args.input_network:
                self.load_net(args.input_network)

            self.soft_q_optimizer1 = optim.Adam(self.q_value_net1.parameters(),
                                                lr=args.q_lr)
            self.soft_q_optimizer2 = optim.Adam(self.q_value_net2.parameters(),
                                                lr=args.q_lr)
            self.value_optimizer = optim.Adam(self.value_net.parameters(),
                                              lr=args.value_lr)
            self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                               lr=args.policy_lr)
            self.q_criterion1 = nn.MSELoss()
            self.q_criterion2 = nn.MSELoss()
            self.value_criterion = nn.MSELoss()

        self.to_cuda()
        self.args = args
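Since this constructor builds both value_net and target_value_net, a soft (Polyak) target update is presumably applied during training; a minimal sketch, assuming a tau hyperparameter that does not appear in the snippet:

    def soft_update_target(self, tau=0.005):
        # blend the target value network toward the online one:
        # theta_target <- tau * theta + (1 - tau) * theta_target
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)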
Example #3
def main():
    net = PolicyNet()

    net.read_weights_from_file('./weights/policy_2.0_2017-12-04T21:08:08.381535')
    player2 = RandomPlayer()
    player1 = MCTPlayer()
    game = Game(player1, player2)
    print()
    game.play(log=True)
Example #4
    def __init__(self,
                 state_size,
                 action_size,
                 lr=1e-3,
                 gamma=0.99,
                 clipping_epsilon=0.1,
                 ppo_epochs=10,
                 minibatch_size=64,
                 rollout_length=1000,
                 gae_lambda=0.95):
        self.lr = lr
        self.clipping_epsilon = clipping_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.rollout_length = rollout_length

        self.policy = PolicyNet(state_size, action_size)
        self.value_estimator = ValueNet(state_size)
        self.rollout = Rollout(gamma=gamma, gae_lambda=gae_lambda)
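This constructor stores clipping_epsilon for PPO's clipped surrogate objective; a minimal sketch of that loss, assuming PyTorch tensors for the log-probabilities and advantages (the function itself is not part of the snippet):

    import torch

    def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, eps):
        # ratio pi_new(a|s) / pi_old(a|s), computed in log space for stability
        ratio = torch.exp(new_log_probs - old_log_probs)
        unclipped = ratio * advantages
        clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantages
        # PPO maximizes the minimum of the two terms; negate to get a loss to minimize
        return -torch.min(unclipped, clipped).mean()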
Example #5
    for t in range(len(rewards)):
        total_discounted_reward = 0
        discount = 1
        for k in range(t, len(rewards)):
            total_discounted_reward += rewards[k] * discount
            discount *= discount_factor
            # Don't count rewards from subsequent rounds
            if rewards[k] != 0:
                break
        discounted_rewards[t] = total_discounted_reward
    return discounted_rewards


env = gym.make('Pong-v4')

pongNet = PolicyNet(hidden_layer_size, learning_rate, checkpoints_dir)
if load_checkpoint:
    pongNet.load_checkpoint()

batch_feature_vector = []  # Vector of state, action, and reward
smoothed_reward = None
episode_count = 1

while True:
    print("Starting episode {}".format(episode_count))

    episode_done = False
    episode_reward_sum = 0

    round_num = 1
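The reward-discounting loop at the top of this example starts mid-function; a self-contained sketch of the same per-round discounting, with the function name, the NumPy buffer, and the default discount factor filled in as assumptions:

    import numpy as np

    def discount_rewards(rewards, discount_factor=0.99):
        discounted_rewards = np.zeros(len(rewards), dtype=np.float32)
        for t in range(len(rewards)):
            total_discounted_reward = 0.0
            discount = 1.0
            for k in range(t, len(rewards)):
                total_discounted_reward += rewards[k] * discount
                discount *= discount_factor
                # a nonzero reward ends the Pong round, so stop accumulating there
                if rewards[k] != 0:
                    break
            discounted_rewards[t] = total_discounted_reward
        return discounted_rewards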
Example #6
ACTION_DIM = env.action_space.shape[0]
INPUT_DIM = env.observation_space.shape[0]

# disable GPU usage here (hide all GPUs from TensorFlow)
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# create the summary writer here
summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train"))

# to run stuff on cpu
with tf.device("/cpu:0"):

    # Keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    global_actor_net = PolicyNet(HIDDEN_LAYER, ACTION_DIM, name="global_actor")
    global_critic_net = AdvantageValueNet(HIDDEN_LAYER, name="global_critic")
    # connecting stuff
    tmp_x = tf.placeholder(dtype=tf.float32,
                           shape=(BATCH_SIZE, INPUT_DIM),
                           name="tmp_x")
    tmp_a = tf.placeholder(dtype=tf.float32,
                           shape=(BATCH_SIZE, ACTION_DIM),
                           name="tmp_a")

    global_average_actor_net = PolicyNet(HIDDEN_LAYER,
                                         ACTION_DIM,
                                         name="global_Average_actor")

    _, tmp_policy = global_actor_net(tmp_x)
    _ = global_critic_net(tmp_x, tmp_a, tmp_policy)
Example #7
    def __init__(self):
        self.policy_net = PolicyNet()
        self.eval_net = EvalNet()
Example #8
    def build_graph(self):
        """
        builds a local graph
        """
        # place holders for inputs here
        HIDDEN_LAYER = self.FLAGS.feature_layer_size

        self.x_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.INPUT_DIM),
                                  name="x_i")
        self.a_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM),
                                  name="a_i")
        self.q_opc = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_opc")
        self.q_ret = tf.placeholder(dtype=tf.float32,
                                    shape=(None, 1),
                                    name="q_ret")
        self.c = self.FLAGS.c  # truncation threshold constant

        self.actor_net = PolicyNet(HIDDEN_LAYER,
                                   self.ACTION_DIM,
                                   name=self.name + "_actor",
                                   co_var=self.co_var)
        self.critic_net = AdvantageValueNet(HIDDEN_LAYER,
                                            name=self.name + "_critic")

        self.policy_xi_stats, self.policy_xi_dist = self.actor_net(self.x_i)

        self.val_xi, self.adv_xi_ai = self.critic_net(self.x_i, self.a_i,
                                                      self.policy_xi_dist)

        # sample a' from the current policy
        self.a_i_ = tf.reshape(self.policy_xi_dist.sample(1),
                               shape=[-1, self.ACTION_DIM])

        _, self.adv_xi_ai_ = self.critic_net(
            self.x_i, self.a_i_,
            self.policy_xi_dist)  # val will be the same for

        _, self.average_policy_xi_dist = self.average_actor_net(
            self.x_i)  # can this be done better ?

        self.prob_a_i = tf.reshape(self.policy_xi_dist.prob(self.a_i),
                                   shape=[-1, 1]) + 1e-8
        self.prob_a_i_ = tf.reshape(self.policy_xi_dist.prob(self.a_i_),
                                    shape=[-1, 1]) + 1e-8

        self.log_prob_a_i = tf.log(self.prob_a_i)
        self.log_prob_a_i_ = tf.log(self.prob_a_i_)

        # for predicting 1-step a_i', p_i, p_i',
        self.u_i = tf.placeholder(dtype=tf.float32,
                                  shape=(None, self.ACTION_DIM))

        self.u_i_dist = tf.contrib.distributions.MultivariateNormalDiag(
            loc=self.u_i, scale_diag=tf.ones_like(self.u_i) * self.co_var)

        self.u_i_prob_a_i = tf.reshape(self.u_i_dist.prob(self.a_i),
                                       shape=[-1, 1]) + 1e-8
        self.u_i_prob_a_i_ = tf.reshape(self.u_i_dist.prob(self.a_i_),
                                        shape=[-1, 1]) + 1e-8

        self.p_i = tf.divide(self.prob_a_i, self.u_i_prob_a_i)
        self.p_i_ = tf.divide(self.prob_a_i_, self.u_i_prob_a_i_)

        # take care of NaNs here, for importance sampling weights (might be an extra step)
        self.p_i = tf.where(tf.is_nan(self.p_i), tf.zeros_like(self.p_i),
                            self.p_i)
        self.p_i_ = tf.where(tf.is_nan(self.p_i_), tf.zeros_like(self.p_i_),
                             self.p_i_)

        self.c_i = tf.minimum(1., tf.pow(self.p_i, 1.0 / self.ACTION_DIM))

        # for verification about checking if params are getting synched
        self.local_actor_vars = self.actor_net.local_params()
        self.global_actor_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_actor')

        self.local_critic_vars = self.critic_net.local_params()
        self.global_critic_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global_critic')

        # Sync ops from global
        self.sync_local_actor_op = self.actor_net.update_local_params_op(
            'global_actor')  # global actor
        self.sync_local_critic_op = self.critic_net.update_local_params_op(
            'global_critic')

        # soft update the average network
        self.soft_update_average_actor_op = self.average_actor_net.soft_update_from_target_params(
            'global_actor', self.FLAGS.tau)

        # Get gradients from the local network using local losses

        g1 = tf.reshape(tf.gradients(
            (self.log_prob_a_i * (self.q_opc - self.val_xi)),
            self.policy_xi_stats,
            name=self.name + "g1_grads"),
                        shape=[-1, self.ACTION_DIM])
        g2 = (self.adv_xi_ai_ - self.val_xi) * tf.reshape(
            tf.gradients((self.log_prob_a_i_),
                         self.policy_xi_stats,
                         name=self.name + "g2_grads"),
            shape=[-1, self.ACTION_DIM])

        self.g = tf.minimum(self.c, self.p_i) * g1 + tf.nn.relu(
            1 - tf.divide(self.c, self.p_i_)) * g2

        self.k = tf.reshape(tf.gradients(
            tf.contrib.distributions.kl_divergence(self.average_policy_xi_dist,
                                                   self.policy_xi_dist),
            self.policy_xi_stats),
                            shape=[-1, self.ACTION_DIM])

        self.kg = tf.reduce_sum(tf.multiply(self.g, self.k), 1, keep_dims=True)

        #print "kg", self.kg

        self.k2 = tf.reduce_sum(tf.multiply(self.k, self.k), 1, keep_dims=True)

        self.reg_g = self.g - tf.maximum(
            tf.zeros_like(self.g),
            tf.divide((self.kg - self.FLAGS.delta), self.k2)) * self.k

        # take gradients wrt to the local params
        self.actor_grads = tf.gradients(self.policy_xi_stats,
                                        self.local_actor_vars,
                                        grad_ys=-self.reg_g,
                                        name="actor_grads")

        #for ti,tj in zip(self.actor_grads, self.global_actor_vars):
        #    print ti, "\n", tj , "\n", "==========="

        # apply local gradients to the global network
        self.actor_train_op = self.optimizer.apply_gradients(
            zip(self.actor_grads, self.global_actor_vars),
            global_step=tf.train.get_global_step())

        # critic loss function and updates

        # take gradient wrt to local variables
        self.critic_loss_1 = ((self.q_ret - self.adv_xi_ai)**2.0) / 2.0

        # value target for the critic, fed from the rollout
        self.v_target = tf.placeholder(dtype=tf.float32, shape=(None, 1))

        #self.v_trunc = tf.minimum(self.p_i, 1.0) * (self.q_ret - self.adv_xi_ai) + self.val_xi
        self.critic_loss_2 = ((self.v_target - self.val_xi)**2.0) / 2.0

        self.critic_loss = self.critic_loss_1 + self.critic_loss_2

        # Apply local gradients to the global network

        self.critic_grads = tf.gradients(self.critic_loss,
                                         self.local_critic_vars)

        self.critic_train_op = self.optimizer.apply_gradients(
            zip(self.critic_grads, self.global_critic_vars),
            global_step=tf.train.get_global_step())

        # critic_summaries op
        critic_grads_summary = []
        for grad, var in zip(self.critic_grads, self.local_critic_vars):
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            critic_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.critic_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_critic_mean_loss_Q",
                              tf.reduce_mean(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critic_mean_loss_V",
                              tf.reduce_mean(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critic_sum_loss_Q",
                              tf.reduce_sum(self.critic_loss_1)),
            tf.summary.scalar(self.name + "_critic_sum_loss_V",
                              tf.reduce_sum(self.critic_loss_2)),
            tf.summary.scalar(self.name + "_critic_mean_loss",
                              tf.reduce_mean(self.critic_loss)),
            tf.summary.scalar(self.name + "_critic_sum_loss",
                              tf.reduce_sum(self.critic_loss)),
            tf.summary.histogram(self.name + "_val_target", self.v_target),
            tf.summary.histogram(self.name + "_val_pred", self.val_xi),
            tf.summary.histogram(self.name + "_Q_pred", self.adv_xi_ai),
            tf.summary.histogram(self.name + "_Q_ret", self.q_ret),
            tf.summary.histogram(self.name + "_Q_opc", self.q_opc),
        ] + critic_grads_summary)

        # actor summaries op

        actor_grads_summary = []
        for grad, var in zip(self.actor_grads, self.local_actor_vars):
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/gradient', grad))
            actor_grads_summary.append(
                tf.summary.histogram(var.name + '/weight', var))

        self.actor_summary_op = tf.summary.merge([
            tf.summary.scalar(self.name + "_actor_mean_loss_reg_g",
                              tf.reduce_mean(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_mean_loss_reg_g",
                              tf.reduce_mean(-self.reg_g)),
            tf.summary.scalar(self.name + "_actor_sum_loss_reg_g",
                              tf.reduce_sum(self.reg_g)),
            tf.summary.scalar(self.name + "_actor_neg_sum_reg_g",
                              tf.reduce_sum(-self.reg_g)),
            tf.summary.scalar(self.name +
                              "_actor_sum_g", tf.reduce_sum(self.g)),
            tf.summary.scalar(self.name +
                              "_actor_neg_sum_g", tf.reduce_sum(-self.g)),
            tf.summary.scalar(self.name +
                              "_actor_mean_kl", tf.reduce_mean(self.k)),
            tf.summary.scalar(self.name +
                              "_actor_sum_kl", tf.reduce_sum(self.k)),
            tf.summary.histogram(self.name +
                                 "_policy_stats", self.policy_xi_stats),
        ] + actor_grads_summary)
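The q_ret and q_opc placeholders in this graph are fed with targets computed from each rollout; below is a minimal NumPy sketch of one plausible backward recursion in the ACER style, with every name, the trajectory layout, and the terminal bootstrap treated as assumptions rather than the original author's code:

    import numpy as np

    def retrace_targets(rewards, values, q_values, rho, action_dim, gamma=0.99):
        # rewards[t], values[t] = V(x_t), q_values[t] = Q(x_t, a_t),
        # rho[t] = pi(a_t | x_t) / mu(a_t | x_t); trajectory assumed terminal at the end
        T = len(rewards)
        q_ret = np.zeros(T, dtype=np.float32)
        q_opc = np.zeros(T, dtype=np.float32)
        ret_carry = 0.0  # c * (Q_ret - Q) + V from the step after t
        opc_carry = 0.0
        for t in reversed(range(T)):
            q_ret[t] = rewards[t] + gamma * ret_carry
            q_opc[t] = rewards[t] + gamma * opc_carry
            c = min(1.0, rho[t] ** (1.0 / action_dim))  # truncated weight, as with c_i above
            ret_carry = c * (q_ret[t] - q_values[t]) + values[t]
            opc_carry = (q_opc[t] - q_values[t]) + values[t]
        return q_ret, q_opc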