Example #1
    def __init__(self, env, args):

        #self.video_dir = os.path.join(summary_writer.get_logdir(), "../videos")
        #self.video_dir = os.path.abspath(args.video_dir)

        self.args = args
        self.env = env
        self.summary_writer = None 

        # define environment
        ob_space = env.observation_space.shape
        ac_space = env.action_space.n

        worker_device = "/job:worker/task:{}/cpu:0".format(args.task)
        with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(ob_space, ac_space)
                self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32),
                                                trainable=False)
        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.policy = pi = LSTMPolicy(ob_space, ac_space)
                pi.global_step = self.global_step

        # copy weights from the parameter server to the local model
        self.sync = tf.group(*[v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list)])
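A note on the `sync` op above: it is a group of assign ops that overwrites every local variable with its global counterpart before a rollout. The pure-Python sketch below (illustrative only, toy values, not from this repository) mirrors what running that op does conceptually.

import copy

global_vars = {"w": [1.0, 2.0], "b": [0.5]}   # stands in for self.network.var_list
local_vars = {"w": [0.0, 0.0], "b": [0.0]}    # stands in for pi.var_list

# mirrors tf.group(*[v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list)])
for name in local_vars:
    local_vars[name] = copy.deepcopy(global_vars[name])

print(local_vars)  # {'w': [1.0, 2.0], 'b': [0.5]}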
Example #2
def main(env, snapshot, visualise):
    env = create_env(env, client_id=0, remotes=1)
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)

    last_state = env.reset()
    # state = last_state
    last_features = policy.get_initial_features()
    length = 0
    rewards = 0
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    saver = tf.train.Saver(variables_to_save)
    with tf.Session() as sess:
        # Restore variables from disk.
        # saver.restore(sess, "train/model.ckpt-361814.data-00000-of-00001")
        # saver.restore(sess, "train/model.ckpt-361814")
        # saver.restore(sess, "/tmp/neonrace/train/model.ckpt-361714")
        saver.restore(sess, snapshot)
        while True:
            terminal_end = False

            fetched = policy.act(last_state, *last_features)
            action, value_, features = fetched[0], fetched[1], fetched[2:]
            # state, reward, terminal, info = env.step(action.argmax())
            action_n = action.argmax()

            # state, reward, terminal, info = env.step(default_action)
            state, reward, terminal, info = env.step(action_n)
            if visualise:
                env.render()
            # env.render() # I need to visualize it during testing
            print('length: %d, rewards: %f' % (length, rewards))

            length += 1
            rewards += reward

            last_state = state
            last_features = features

            if terminal:
                terminal_end = True
                print("Episode finished. Sum of rewards: %d. Length: %d" %
                      (rewards, length))
                length = 0
                rewards = 0
                break
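The loop above always takes the argmax of the fetched action. As a hypothetical aside (toy numbers, not from the snippet; in this code base `action` may already be a sampled one-hot vector, in which case argmax simply recovers its index), here is the difference between greedy selection and sampling from the action probabilities.

import numpy as np

probs = np.array([0.1, 0.7, 0.2])                        # assumed: action probabilities for one step
greedy_action = probs.argmax()                           # deterministic, what the loop above does
sampled_action = np.random.choice(len(probs), p=probs)   # stochastic alternative
print(greedy_action, sampled_action)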
Example #3
    def __init__(self, env, task, visualise):
        self.env = env
        self.task = task
        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n)
                pi.global_step = self.global_step
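Example #3 only sets up the shared ("global") and per-worker ("local") copies of the policy. The small standalone sketch below (hypothetical device strings, not part of the example) shows what `tf.train.replica_device_setter(1, worker_device=...)` does: variables get pinned to the single parameter-server task while ordinary ops stay on the worker device.

import tensorflow as tf

setter = tf.train.replica_device_setter(1, worker_device="/job:worker/task:0/cpu:0")
with tf.device(setter):
    v = tf.Variable(0, name="v")     # a variable: placed on /job:ps/task:0
    c = tf.constant(1, name="c")     # a plain op: placed on the worker device
print(v.device, c.device)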
Example #4
    def _build_net(self):
        self.network = pi = LSTMPolicy(self.obs_shape, self.numaction,
                                       self.designHead)

        with tf.variable_scope("predictor"):
            self.ap_network = predictor = StateActionPredictor(
                self.obs_shape, self.numaction, self.designHead)

        self.ac = tf.placeholder(tf.float32, [None, self.numaction], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")
        log_prob_tf = tf.nn.log_softmax(pi.logits)
        prob_tf = tf.nn.softmax(pi.logits)
        pi_loss = -tf.reduce_mean(
            tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv)
        vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r))
        entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
        self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[
            'ENTROPY_BETA']

        # compute gradients
        grads = tf.gradients(self.loss, pi.var_list)

        # computing predictor loss
        self.predloss = constants['PREDICTION_LR_SCALE'] * (
            predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) +
            predictor.forwardloss * constants['FORWARD_LOSS_WT'])
        predgrads = tf.gradients(self.predloss, predictor.var_list)

        # clip gradients
        grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
        grads_and_vars = list(zip(grads, self.network.var_list))

        predgrads, _ = tf.clip_by_global_norm(predgrads,
                                              constants['GRAD_NORM_CLIP'])
        pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list))
        grads_and_vars = grads_and_vars + pred_grads_and_vars

        opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
        self.train_op = tf.group(opt.apply_gradients(grads_and_vars))
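Both gradient lists above pass through `tf.clip_by_global_norm` before being zipped with their variables. A NumPy illustration with toy gradients (assumed values, not from the code) of what that clipping computes:

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]   # global norm = sqrt(9 + 16 + 144) = 13
clip_norm = 6.5
global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
scale = min(1.0, clip_norm / global_norm)               # only shrink, never amplify
clipped = [g * scale for g in grads]
print(global_norm, clipped)                             # 13.0, every gradient halved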
Example #5
    def __init__(self, env, task):
        """
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
should be computed.
"""

        self.env = env
        self.task = task
        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n)
                pi.global_step = self.global_step

            self.ac = tf.placeholder(tf.float32, [None, env.action_space.n],
                                     name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")

            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            # the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            bs = tf.to_float(tf.shape(pi.x)[0])
            self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

            # 20 represents the number of "local steps":  the number of timesteps
            # we run the policy before we update the parameters.
            # The larger local steps is, the lower is the variance in our policy gradients estimate
            # on the one hand;  but on the other hand, we get less frequent parameter updates, which
            # slows down learning.  In this code, we found that making local steps be much
            # smaller than 20 makes the algorithm more difficult to tune and to get to work.
            self.runner = RunnerThread(env, pi, 20)

            grads = tf.gradients(self.loss, pi.var_list)

            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss / bs)
                tf.summary.scalar("model/value_loss", vf_loss / bs)
                tf.summary.scalar("model/entropy", entropy / bs)
                tf.summary.image("model/state", pi.x)
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                self.summary_op = tf.summary.merge_all()

            else:
                tf.scalar_summary("model/policy_loss", pi_loss / bs)
                tf.scalar_summary("model/value_loss", vf_loss / bs)
                tf.scalar_summary("model/entropy", entropy / bs)
                tf.image_summary("model/state", pi.x)
                tf.scalar_summary("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                self.summary_op = tf.merge_all_summaries()

            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            # copy weights from the parameter server to the local model
            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ])

            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_steps = 0
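The comments above refer to `process_rollout` for computing the `adv` and `r` feeds. A standalone NumPy/SciPy sketch of the kind of computation such a helper typically performs (toy 3-step rollout; the actual helper is not shown here, so treat this as an assumption): discounted returns for the value target and generalized advantage estimates for the policy term.

import numpy as np
import scipy.signal

def discount(x, gamma):
    # discounted cumulative sum, e.g. [r0 + g*r1 + g^2*r2, r1 + g*r2, r2]
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]

rewards = np.array([0.0, 0.0, 1.0])        # toy 3-step rollout
values = np.array([0.1, 0.2, 0.5, 0.0])    # V(s_t) for each step plus a bootstrap value
gamma, lam = 0.99, 1.0

returns = discount(np.append(rewards, values[-1]), gamma)[:-1]  # feeds the `r` placeholder
deltas = rewards + gamma * values[1:] - values[:-1]             # TD residuals
adv = discount(deltas, gamma * lam)                             # feeds the `adv` placeholder
print(returns, adv)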
Example #6
    def __init__(self, gameState, task):
        """
        An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
        Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
        But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
        should be computed.

        """
        self.task = task
        self.gameState = gameState
        predictor = None
        numaction = ACTION_SIZE
        worker_device = "/job:worker/task:{}/cpu:0".format(task)

        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(OBSERVATION_SHAPE, numaction)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(OBSERVATION_SHAPE,
                                                     numaction)
                pi.global_step = self.global_step

            # Computing a3c loss: https://arxiv.org/abs/1506.02438
            self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")
            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)
            # 1) the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_mean(
                tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv)  # Eq (19)
            # 2) loss of value function: l2_loss = (x-y)^2/2
            vf_loss = 0.5 * tf.reduce_mean(
                tf.square(pi.vf - self.r))  # Eq (28)
            # 3) entropy to ensure randomness
            entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
            # final a3c loss: lr of critic is half of actor
            self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[
                'ENTROPY_BETA']

            # compute gradients
            grads = tf.gradients(
                self.loss * 20.0, pi.var_list
            )  # batchsize=20. Factored out to make hyperparams not depend on it.

            # computing predictor loss

            self.runner = RunnerThread(gameState, pi,
                                       constants['ROLLOUT_MAXLEN'], predictor)

            # storing summaries
            bs = tf.to_float(tf.shape(pi.x)[0])
            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss)
                tf.summary.scalar("model/value_loss", vf_loss)
                tf.summary.scalar("model/entropy", entropy)
                tf.summary.image("model/state", pi.x)  # max_outputs=10
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))

                self.summary_op = tf.summary.merge_all()
            else:
                tf.scalar_summary("model/policy_loss", pi_loss)
                tf.scalar_summary("model/value_loss", vf_loss)
                tf.scalar_summary("model/entropy", entropy)
                tf.image_summary("model/state", pi.x)
                tf.scalar_summary("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm",
                                  tf.global_norm(pi.var_list))

                self.summary_op = tf.merge_all_summaries()

            # clip gradients
            grads, _ = tf.clip_by_global_norm(grads,
                                              constants['GRAD_NORM_CLIP'])
            grads_and_vars = list(zip(grads, self.network.var_list))

            # update global step by batch size
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            # TODO: make optimizer global shared, if needed
            print("Optimizer: Adam with lr: %f" % (constants['LEARNING_RATE']))
            print("Input observation shape: ", OBSERVATION_SHAPE)
            opt = tf.train.RMSPropOptimizer(constants['LEARNING_RATE'])
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)

            # copy weights from the parameter server to the local model
            sync_var_list = [
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ]

            self.sync = tf.group(*sync_var_list)

            # initialize extras
            self.summary_writer = None
            self.local_steps = 0
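The `entropy` term subtracted from the loss above rewards stochastic policies. A toy NumPy check (illustrative numbers only) shows why: a uniform distribution has the largest entropy, a near-deterministic one almost none, so `- ENTROPY_BETA * entropy` pushes the policy away from premature determinism.

import numpy as np

for probs in (np.ones(3) / 3, np.array([0.98, 0.01, 0.01])):
    entropy = -np.sum(probs * np.log(probs))
    print(probs, entropy)   # ~1.10 for uniform vs ~0.11 for near-deterministic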
Example #7
    def __init__(self,
                 env,
                 task,
                 visualise,
                 unsupType,
                 envWrap=False,
                 designHead='universe',
                 noReward=False):
        """
        An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
        Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
        But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
        should be computed.
        """
        self.task = task
        self.unsup = unsupType is not None
        self.envWrap = envWrap
        self.env = env

        predictor = None
        numaction = env.action_space.n
        worker_device = "/job:worker/task:{}/cpu:0".format(task)

        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          numaction, designHead)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                if self.unsup:
                    with tf.variable_scope("predictor"):
                        if 'state' in unsupType:
                            self.ap_network = StatePredictor(
                                env.observation_space.shape, numaction,
                                designHead, unsupType)
                        else:
                            self.ap_network = StateActionPredictor(
                                env.observation_space.shape, numaction,
                                designHead)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, numaction, designHead)
                pi.global_step = self.global_step
                if self.unsup:
                    with tf.variable_scope("predictor"):
                        if 'state' in unsupType:
                            self.local_ap_network = predictor = StatePredictor(
                                env.observation_space.shape, numaction,
                                designHead, unsupType)
                        else:
                            self.local_ap_network = predictor = StateActionPredictor(
                                env.observation_space.shape, numaction,
                                designHead)

            # Computing a3c loss: https://arxiv.org/abs/1506.02438
            self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")
            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)
            # 1) the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_mean(
                tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv)  # Eq (19)
            # 2) loss of value function: l2_loss = (x-y)^2/2
            vf_loss = 0.5 * tf.reduce_mean(
                tf.square(pi.vf - self.r))  # Eq (28)
            # 3) entropy to ensure randomness
            entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
            # final a3c loss: lr of critic is half of actor
            self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[
                'ENTROPY_BETA']

            # compute gradients
            grads = tf.gradients(
                self.loss * 20.0, pi.var_list
            )  # batchsize=20. Factored out to make hyperparams not depend on it.

            # computing predictor loss
            if self.unsup:
                if 'state' in unsupType:
                    self.predloss = constants[
                        'PREDICTION_LR_SCALE'] * predictor.forwardloss
                else:
                    self.predloss = constants['PREDICTION_LR_SCALE'] * (
                        predictor.invloss *
                        (1 - constants['FORWARD_LOSS_WT']) +
                        predictor.forwardloss * constants['FORWARD_LOSS_WT'])
                predgrads = tf.gradients(
                    self.predloss * 20.0, predictor.var_list
                )  # batchsize=20. Factored out to make hyperparams not depend on it.

                # do not backprop to policy
                if constants['POLICY_NO_BACKPROP_STEPS'] > 0:
                    grads = [
                        tf.scalar_mul(
                            tf.to_float(
                                tf.greater(
                                    self.global_step,
                                    constants['POLICY_NO_BACKPROP_STEPS'])),
                            grads_i) for grads_i in grads
                    ]

            self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'],
                                       visualise, predictor, envWrap, noReward)

            # storing summaries
            bs = tf.to_float(tf.shape(pi.x)[0])
            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss)
                tf.summary.scalar("model/value_loss", vf_loss)
                tf.summary.scalar("model/entropy", entropy)
                tf.summary.image("model/state", pi.x)  # max_outputs=10
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if self.unsup:
                    tf.summary.scalar("model/predloss", self.predloss)
                    if 'action' in unsupType:
                        tf.summary.scalar("model/inv_loss", predictor.invloss)
                        tf.summary.scalar("model/forward_loss",
                                          predictor.forwardloss)
                    tf.summary.scalar("model/predgrad_global_norm",
                                      tf.global_norm(predgrads))
                    tf.summary.scalar("model/predvar_global_norm",
                                      tf.global_norm(predictor.var_list))
                self.summary_op = tf.summary.merge_all()
            else:
                tf.scalar_summary("model/policy_loss", pi_loss)
                tf.scalar_summary("model/value_loss", vf_loss)
                tf.scalar_summary("model/entropy", entropy)
                tf.image_summary("model/state", pi.x)
                tf.scalar_summary("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if self.unsup:
                    tf.scalar_summary("model/predloss", self.predloss)
                    if 'action' in unsupType:
                        tf.scalar_summary("model/inv_loss", predictor.invloss)
                        tf.scalar_summary("model/forward_loss",
                                          predictor.forwardloss)
                    tf.scalar_summary("model/predgrad_global_norm",
                                      tf.global_norm(predgrads))
                    tf.scalar_summary("model/predvar_global_norm",
                                      tf.global_norm(predictor.var_list))
                self.summary_op = tf.merge_all_summaries()

            # clip gradients
            grads, _ = tf.clip_by_global_norm(grads,
                                              constants['GRAD_NORM_CLIP'])
            grads_and_vars = list(zip(grads, self.network.var_list))
            if self.unsup:
                predgrads, _ = tf.clip_by_global_norm(
                    predgrads, constants['GRAD_NORM_CLIP'])
                pred_grads_and_vars = list(
                    zip(predgrads, self.ap_network.var_list))
                grads_and_vars = grads_and_vars + pred_grads_and_vars

            # update global step by batch size
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            # TODO: make optimizer global shared, if needed
            print("Optimizer: ADAM with lr: %f" % (constants['LEARNING_RATE']))
            print("Input observation shape: ", env.observation_space.shape)
            opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)

            # copy weights from the parameter server to the local model
            sync_var_list = [
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ]
            if self.unsup:
                sync_var_list += [
                    v1.assign(v2) for v1, v2 in zip(predictor.var_list,
                                                    self.ap_network.var_list)
                ]
            self.sync = tf.group(*sync_var_list)

            # initialize extras
            self.summary_writer = None
            self.local_steps = 0
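For the curiosity variant, `self.predloss` mixes the predictor's inverse and forward losses with the weight `FORWARD_LOSS_WT` and scales the result by `PREDICTION_LR_SCALE`. A quick arithmetic sketch with assumed constant values (the real constants live in the project's `constants` dict and are not shown here):

PREDICTION_LR_SCALE = 10.0   # assumption: example value only
FORWARD_LOSS_WT = 0.2        # assumption: example value only

invloss, forwardloss = 0.7, 0.05   # hypothetical per-batch losses
predloss = PREDICTION_LR_SCALE * (invloss * (1 - FORWARD_LOSS_WT)
                                  + forwardloss * FORWARD_LOSS_WT)
print(predloss)   # 10 * (0.56 + 0.01) = 5.7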
Example #8
    def __init__(self, envs, workerid, target_task):
        """
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
should be computed.
"""

        self.env = envs
        self.num_tasks = num_tasks = 2  # only valid when the number of tasks equals 2.
        self.target_task = target_task
        self.aux_tasks_id = int(1 - target_task)
        self.workerid = workerid
        self.network = [None] * self.num_tasks
        self.local_logitProjnet = [None] * self.num_tasks
        self.local_network = [None] * self.num_tasks
        self.global_step = [None] * self.num_tasks
        self.logitProjnet = [None] * self.num_tasks
        self.T1 = [100000000, 3000000]  # [400, 5000] #[4000000, 6000000]
        self.T2 = [100000000, 4000000]  # [400, 5000] #[4000000, 6000000]
        pi = [None] * self.num_tasks
        worker_device = "/job:worker/task:{}/cpu:0".format(workerid)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global" + str(0)):  # Pong
                self.network[0] = LSTMPolicy(envs[0].observation_space.shape,
                                             envs[0].action_space.n)
                self.global_step[0] = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.zeros_initializer,
                    trainable=False)
            with tf.variable_scope("global" + str(1)):  #bowling
                self.network[1] = LSTMPolicy(envs[1].observation_space.shape,
                                             envs[1].action_space.n)
                self.global_step[1] = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.zeros_initializer,
                    trainable=False)
            with tf.variable_scope(
                    "globallogits0"):  # network for projection logits.
                self.logitProjnet[0] = logitsProj(envs[0].action_space.n)

            with tf.variable_scope(
                    "globallogits1"):  # network for projection logits.
                self.logitProjnet[1] = logitsProj(envs[1].action_space.n)

        with tf.device(worker_device):
            with tf.variable_scope("local" + str(0)):
                self.local_network[0] = pi[0] = LSTMPolicy(
                    envs[0].observation_space.shape, envs[0].action_space.n)
                pi[0].global_step = self.global_step[0]

            with tf.variable_scope("local" + str(1)):
                self.local_network[1] = pi[1] = LSTMPolicy(
                    envs[1].observation_space.shape, envs[1].action_space.n)
                pi[1].global_step = self.global_step[1]

            with tf.variable_scope("local" + "logits0"):
                self.local_logitProjnet[0] = logitsProj(envs[0].action_space.n)

            with tf.variable_scope("local" + "logits1"):
                self.local_logitProjnet[1] = logitsProj(envs[1].action_space.n)

            self.ac = [None] * num_tasks
            self.adv = [None] * num_tasks
            self.r = [None] * num_tasks
            log_prob_tf = [None] * num_tasks
            prob_tf = [None] * num_tasks
            pi_loss = [None] * num_tasks
            vf_loss = [None] * num_tasks
            entropy = [None] * num_tasks
            bs = [None] * num_tasks
            self.loss = [None] * num_tasks
            self.runner = [None] * num_tasks
            grads = [None] * num_tasks
            self.summary_op = [[None, None] for i in np.arange(num_tasks)]
            self.sync = [None] * num_tasks
            grads_and_vars = [None] * num_tasks
            self.inc_step = [None] * num_tasks
            opt = [None] * num_tasks
            self.train_op = [None] * num_tasks
            self.target_logits = [None] * num_tasks
            soft_p_temperature = [None] * num_tasks
            soft_t_temperature = [None] * num_tasks
            self.KD_trainop = [None] * num_tasks
            kl_loss = [None] * num_tasks
            grads_kd = [None] * num_tasks
            grads_and_vars_kd = [None] * num_tasks
            optkd = [None] * num_tasks
            self.sync_logits = [None] * num_tasks
            self.logits_stu = [None] * num_tasks
            soft_student_logits = [None] * num_tasks
            soft_teacher_logits = [None] * num_tasks
            self.proj_loss = [None] * num_tasks
            grad_logproj = [None] * num_tasks
            grads_and_vars_logproj = [None] * num_tasks
            optlgproj = [None] * num_tasks
            self.lgproj_trainop = [None] * num_tasks
            self.summary_op_proj = [None] * num_tasks
            for ii in np.arange(num_tasks):
                # start to build loss for target network
                self.ac[ii] = tf.placeholder(tf.float32,
                                             [None, envs[ii].action_space.n],
                                             name="ac" + str(ii))
                self.adv[ii] = tf.placeholder(tf.float32, [None],
                                              name="adv" + str(ii))
                self.r[ii] = tf.placeholder(tf.float32, [None],
                                            name="r" + str(ii))

                log_prob_tf[ii] = tf.nn.log_softmax(pi[ii].logits)
                prob_tf[ii] = tf.nn.softmax(pi[ii].logits)

                # the "policy gradients" loss:  its derivative is precisely the policy gradient
                # notice that self.ac is a placeholder that is provided externally.
                # adv will contain the advantages, as calculated in process_rollout
                pi_loss[ii] = -tf.reduce_sum(
                    tf.reduce_sum(log_prob_tf[ii] * self.ac[ii], [1]) *
                    self.adv[ii])

                # loss of value function
                vf_loss[ii] = 0.5 * tf.reduce_sum(
                    tf.square(pi[ii].vf - self.r[ii]))
                entropy[ii] = -tf.reduce_sum(prob_tf[ii] * log_prob_tf[ii])

                bs[ii] = tf.to_float(tf.shape(pi[ii].x)[0])
                self.loss[
                    ii] = pi_loss[ii] + 0.5 * vf_loss[ii] - entropy[ii] * 0.01

                # 20 represents the number of "local steps":  the number of timesteps
                # we run the policy before we update the parameters.
                # The larger local steps is, the lower is the variance in our policy gradients estimate
                # on the one hand;  but on the other hand, we get less frequent parameter updates, which
                # slows down learning.  In this code, we found that making local steps be much
                # smaller than 20 makes the algorithm more difficult to tune and to get to work.
                # name = "worker"+str(workerid)+"task"+str(ii)
                name = "task" + str(ii)
                self.runner[ii] = RunnerThread(envs[ii], pi[ii], 20, name)

                grads[ii] = tf.gradients(self.loss[ii], pi[ii].var_list)
                summaries1 = list()  # summary when it's target tasks
                summaries1.append(
                    tf.scalar_summary("model/policy_loss" + str(ii),
                                      pi_loss[ii] / bs[ii]))
                summaries1.append(
                    tf.scalar_summary("model/value_loss" + str(ii),
                                      vf_loss[ii] / bs[ii]))
                summaries1.append(
                    tf.scalar_summary("model/entropy" + str(ii),
                                      entropy[ii] / bs[ii]))
                summaries1.append(
                    tf.image_summary("model/state" + str(ii), pi[ii].x))
                summaries1.append(
                    tf.scalar_summary("model/grad_global_norm" + str(ii),
                                      tf.global_norm(grads[ii])))
                summaries1.append(
                    tf.scalar_summary("model/var_global_norm" + str(ii),
                                      tf.global_norm(pi[ii].var_list)))
                summaries1.append(
                    tf.histogram_summary("model/action_weight" + str(ii),
                                         prob_tf[ii]))

                summaries2 = list()  # summary when it's aux tasks.
                summaries2.append(
                    tf.histogram_summary("model/action_weight" + str(ii),
                                         prob_tf[ii]))
                summaries2.append(
                    tf.scalar_summary("model/entropy" + str(ii),
                                      entropy[ii] / bs[ii]))
                self.summary_op[ii][0] = tf.merge_summary(summaries1)
                self.summary_op[ii][1] = tf.merge_summary(summaries2)

                grads[ii], _ = tf.clip_by_global_norm(grads[ii], 40.0)

                # self.sync = [None] * self.num_tasks
                zipvars_lp = zip(pi[ii].var_list, self.network[ii].var_list)
                self.sync[ii] = tf.group(
                    *[v1.assign(v2) for v1, v2 in zipvars_lp])

                grads_and_vars[ii] = list(
                    zip(grads[ii], self.network[ii].var_list))
                self.inc_step[ii] = self.global_step[ii].assign_add(
                    tf.shape(pi[ii].x)[0])

                # each worker has a different set of adam optimizer parameters
                opt[ii] = tf.train.AdamOptimizer(1e-4)
                self.train_op[ii] = tf.group(
                    opt[ii].apply_gradients(grads_and_vars[ii]),
                    self.inc_step[ii])

                # knowledge distillation
                self.target_logits[ii] = tf.placeholder(
                    tf.float32, [None, envs[ii].action_space.n],
                    name="target_logits")  # logits from teacher
                Tao = 1.0  # temperature used for distillation.

                soft_p_temperature[ii] = tf.nn.softmax(
                    pi[ii].logits_fordistill)

                soft_t_temperature[ii] = tf.nn.softmax(
                    tf.truediv(self.target_logits[ii], Tao))

                kl_loss[ii] = tf.reduce_mean(
                    tf.reduce_sum(
                        soft_t_temperature[ii] * tf.log(1e-10 + tf.truediv(
                            soft_t_temperature[ii], soft_p_temperature[ii])),
                        1))

                grads_kd[ii] = tf.gradients(kl_loss[ii], pi[ii].var_list)
                grads_kd[ii], _ = tf.clip_by_global_norm(grads_kd[ii], 40.0)
                grads_and_vars_kd[ii] = list(
                    zip(grads_kd[ii], self.network[ii].var_list))
                optkd[ii] = tf.train.AdamOptimizer(1e-4)
                self.KD_trainop[ii] = optkd[ii].apply_gradients(
                    grads_and_vars_kd[ii])

                # learning logits projection
                zipvars_lp = zip(self.local_logitProjnet[ii].var_list,
                                 self.logitProjnet[ii].var_list)
                self.sync_logits[ii] = tf.group(
                    *[v1.assign(v2) for v1, v2 in zipvars_lp])
                # soft_student_logits = tf.nn.softmax(pi[target_task].logits)
                self.logits_stu[ii] = tf.placeholder(
                    tf.float32, [None, envs[ii].action_space.n])
                soft_student_logits[ii] = tf.nn.softmax(self.logits_stu[ii])
                soft_teacher_logits[ii] = tf.nn.softmax(
                    self.local_logitProjnet[ii].logits_out)
                self.proj_loss[ii] = proj_loss = tf.reduce_mean(
                    tf.reduce_sum(
                        soft_teacher_logits[ii] * tf.log(1e-10 + tf.truediv(
                            soft_teacher_logits[ii], soft_student_logits[ii])),
                        1))  # target task --> student
                grad_logproj[ii] = tf.gradients(
                    proj_loss, self.local_logitProjnet[ii].var_list)
                grad_logproj[ii], _ = tf.clip_by_global_norm(
                    grad_logproj[ii], 40.0)
                grads_and_vars_logproj[ii] = list(
                    zip(grad_logproj[ii], self.logitProjnet[ii].var_list))
                optlgproj[ii] = tf.train.AdamOptimizer(1e-4)
                self.lgproj_trainop[ii] = optlgproj[ii].apply_gradients(
                    grads_and_vars_logproj[ii])
                self.summary_op_proj[ii] = tf.scalar_summary(
                    "model/proj_loss" + str(ii), self.proj_loss[ii])

            self.summary_writer = None
            self.local_steps = 0
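The distillation term `kl_loss` above is the KL divergence between the softened teacher distribution and the student's softmax, at temperature `Tao = 1.0`. A self-contained NumPy version with toy logits (not taken from the example) shows the shape of the computation:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

teacher_logits = np.array([[3.0, 1.0, 0.2]])
student_logits = np.array([[2.0, 1.5, 0.5]])

p_t = softmax(teacher_logits / 1.0)   # Tao = 1.0, as in the snippet
p_s = softmax(student_logits)
kl = np.mean(np.sum(p_t * np.log(1e-10 + p_t / p_s), axis=1))
print(kl)   # small positive value; zero only when the two distributions match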
Example #9
    def __init__(self, env, task, visualise, learning_rate, meta, remotes,
                 num_trials, total_num_steps):
        """ (original comment)
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
should be computed.
"""
        self.env = env
        self.task = task
        self.remotes = remotes
        self.learning_rate = learning_rate
        self.num_trials = num_trials
        num_local_steps = 5  # t_max in the A3C paper: number of steps in the rollouts
        isBanditEnvironment = "Bandit" in env.env.spec.id  # boolean variable, is True if the environment is a Bandit environment
        if isBanditEnvironment:
            if 'Two' in env.env.spec.id:
                reward_range = 2
            elif 'Eleven' in env.env.spec.id:
                reward_range = 13

        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                if isBanditEnvironment:
                    self.network = LSTMPolicyBandit(
                        env.observation_space.shape, env.action_space.n,
                        reward_range)
                else:
                    self.network = LSTMPolicy(env.observation_space.shape,
                                              env.action_space.n)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False
                )  # Creates a global counter and initializes it to zero, unless resuming an existing training run

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                if isBanditEnvironment:
                    self.local_network = pi = LSTMPolicyBandit(
                        env.observation_space.shape, env.action_space.n,
                        reward_range)
                else:
                    self.local_network = pi = LSTMPolicy(
                        env.observation_space.shape, env.action_space.n)
                pi.global_step = self.global_step

            self.ac = tf.placeholder(
                tf.float32, [None, env.action_space.n], name="ac"
            )  # action, will contain the list of the action vectors at each step of the rollout ; placeholder called by A3C's process function in trainer.process(sess) in worker.py
            self.adv = tf.placeholder(
                tf.float32, [None], name="adv"
            )  # advantage, will contain the list of the advantages at each step of the rollout; placeholder called by A3C's process function in trainer.process(sess) in worker.py
            self.return_ = tf.placeholder(
                tf.float32, [None], name="return_"
            )  # return, will contain the return obtained after visiting each of the steps in the rollout; placeholder called by A3C's process function in trainer.process(sess) in worker.py

            log_prob_tf = tf.nn.log_softmax(
                pi.logits)  # the log probability of each action log(\pi(a|s))
            prob_tf = tf.nn.softmax(
                pi.logits)  # the probability of each action \pi(a|s)

            # (original comment) the "policy gradients" loss:  its derivative is precisely the policy gradient. Notice that self.ac is a placeholder that is provided externally.
            #
            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(
                tf.square(pi.vf - self.return_)
            )  # why not taking the sum of the squared values of self.adv directly ?
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)
            beta_entropy = (float(1) / total_num_steps) * tf.cast(
                tf.constant(total_num_steps) - self.global_step, tf.float32)

            bs = tf.to_float(tf.shape(
                pi.x)[0])  # bs = batch size = number of steps in the rollout

            self.loss = pi_loss + (0.05 * vf_loss) - (
                beta_entropy * entropy
            )  # why scale vf_loss again (by 0.05 here) when its definition already includes a factor of 0.5?

            # (Original comment)
            # num_local_steps represents the number of timesteps we run the policy before we update the parameters.
            # The larger local steps is, the lower is the variance in our policy gradients estimate on the one hand;  but on the other hand, we get less frequent parameter updates, which slows down learning.
            # In this code, we found that making local steps be much smaller than 20 makes the algorithm more difficult to tune and to get to work.
            # (My comment):
            # The original A3C paper uses num_local_step = 5 on Atari games, but it uses an action repeat of 4 (not present here), so the network is updated every 20 frames, as in the original universe-starter-agent

            self.runner = RunnerThread(
                env, pi, num_local_steps, visualise, meta, task, remotes,
                num_trials
            )  # Instance of the RunnerThread class defined above; num_local_steps is the maximum number of steps in a partial rollout.

            # computes the gradient of the loss function:
            grads = tf.gradients(self.loss, pi.var_list)

            # tensorboard:
            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss / bs)
                tf.summary.scalar("model/value_loss", vf_loss / bs)
                tf.summary.scalar("model/entropy", entropy / bs)
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if not isBanditEnvironment:
                    tf.summary.image("model/state", pi.x)
                self.summary_op = tf.summary.merge_all()
            else:
                tf.scalar_summary("model/policy_loss", pi_loss / bs)
                tf.scalar_summary("model/value_loss", vf_loss / bs)
                tf.scalar_summary("model/entropy", entropy / bs)
                tf.scalar_summary("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if not isBanditEnvironment:
                    tf.image_summary("model/state", pi.x)
                self.summary_op = tf.merge_all_summaries()

            # Create a list of (gradient, variable) pairs to feed into the Adam Optimizer (each variable will then be updated according to the paired gradient)
            grads, _ = tf.clip_by_global_norm(grads, 40.0)  # ?
            grads_and_vars = list(zip(grads, self.network.var_list))

            # copy weights from the parameter server to the local model
            self.sync = tf.group(
                *[
                    v1.assign(v2)
                    for v1, v2 in zip(pi.var_list, self.network.var_list)
                ]
            )  # replaces the values in pi.var_list with those from self.network.var_list (executed in the "process" function)

            # updates the global counter: adds (and assign) tf.shape(pi.x)[0] to the value of the variable self.global_step (initialise a zero), and inc_step takes this updtated value:
            inc_step = self.global_step.assign_add(
                tf.shape(pi.x)[0]
            )  # increments the global counter by the number of steps in the rollout (= batch size); called from the process function
            self.inc_step = inc_step  # so that we can call it directly from the inc_global_step method

            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(
                self.learning_rate
            )  # the default learning rate is 1e-4; it can be overridden with the argument -lr <new_value>
            self.train_op = tf.group(
                opt.apply_gradients(grads_and_vars), inc_step
            )  # tf.group creates an op that groups multiple operations (here, two operations)
            self.summary_writer = None
            self.local_steps = 0
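Unlike the fixed 0.01 coefficient in the other examples, this variant anneals the entropy weight: `beta_entropy` decays linearly from 1 to 0 as `global_step` approaches `total_num_steps`. A toy schedule (assumed step counts) makes the decay explicit:

total_num_steps = 1000000   # assumption: example value only
for global_step in (0, 250000, 500000, 1000000):
    beta = (1.0 / total_num_steps) * (total_num_steps - global_step)
    print(global_step, beta)   # 1.0, 0.75, 0.5, 0.0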
Example #10
    def __init__(self, env, task):
        self.env = env
        self.task = task
        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.zeros_initializer,
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n)
                pi.global_step = self.global_step

            self.ac = tf.placeholder(tf.float32, [None, env.action_space.n],
                                     name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")

            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            bs = tf.to_float(tf.shape(pi.x)[0])
            self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01
            self.runner = RunnerThread(env, pi, 20)

            grads = tf.gradients(self.loss, pi.var_list)

            tf.scalar_summary("model/policy_loss", pi_loss / bs)
            tf.scalar_summary("model/value_loss", vf_loss / bs)
            tf.scalar_summary("model/entropy", entropy / bs)
            tf.image_summary("model/state", pi.x)
            tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads))
            tf.scalar_summary("model/var_global_norm",
                              tf.global_norm(pi.var_list))

            self.summary_op = tf.merge_all_summaries()
            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ])

            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_steps = 0
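To make the `pi_loss` definition above concrete, here is a single-timestep NumPy check (toy logits, assumed advantage): it is the log-probability of the action actually taken, weighted by the advantage and negated.

import numpy as np

logits = np.array([[2.0, 0.5, -1.0]])   # one timestep, three actions
ac = np.array([[0.0, 1.0, 0.0]])        # one-hot action that was taken
adv = np.array([0.8])                   # advantage from process_rollout

log_prob = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))  # log_softmax
pi_loss = -np.sum(np.sum(log_prob * ac, axis=1) * adv)
print(pi_loss)   # ~1.39: an unlikely action with positive advantage yields a large loss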
Example #11
def inference(args):
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(
        args.log_dir, 'inference') if args.out_dir is None else args.out_dir

    with open(indir + "/checkpoint", "r") as f:
        first_line = f.readline().strip()
        print("first_line is : {}".format(first_line))
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    print("ckpt: {}".format(ckpt))

    # define environment
    env = create_icegame_env(outdir, args.env_id)
    num_actions = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, num_actions)
            policy.global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)
        # Variable names that start with "local" are not saved in checkpoints.
        variables_to_restore = [
            v for v in tf.global_variables() if not v.name.startswith("local")
        ]
        init_all_op = tf.global_variables_initializer()

        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  {} {}'.format(v.name, v.get_shape()))
        logger.info("Restored the trained model.")

        # summary of rewards
        action_writers = []
        summary_writer = tf.summary.FileWriter(outdir)
        for act_idx in range(num_actions):
            action_writers.append(
                tf.summary.FileWriter(
                    os.path.join(outdir, "action_{}".format(act_idx))))

        logger.info("Inference events directory: %s", outdir)
        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)

        with tf.Session() as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)

            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            loopsizes = []

            # All Episodes records
            for ep in range(args.num_episodes):
                """TODO: policy sampling strategy
                    random, greedy and sampled policy.
                """

                last_state = env.reset()

                # Episode records

                # running policy
                while True:
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[
                        0], fetched[1], fetched[2], fetched[3:]

                    #TODO: policy sampling strategy

                    # Greedy
                    stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    """TODO: Resonable Statistics are necessary
                    """

                    if info:
                        loopsize = info["Loop Size"]
                        looparea = info["Loop Area"]

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(ep),
                                      simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(ep),
                                      simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(ep),
                                      simple_value=float(value_[0]))

                    if info:
                        summary.value.add(tag='ep_{}/loop_size'.format(ep),
                                          simple_value=loopsize)
                        summary.value.add(tag='ep_{}/loop_area'.format(ep),
                                          simple_value=looparea)
                        loopsizes.append(loopsize)

                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()

                    summary = tf.Summary()
                    for ac_id in range(num_actions):
                        summary.value.add(tag='ep_{}/a_{}'.format(ep, ac_id),
                                          simple_value=float(
                                              prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()
                    """TODO:
                        1. Need a more concrete idea for playing the game during inference.
                        2. Save these values for post processing.
                    """
                    if terminal:
                        #if length >= timestep_limit:
                        #    last_state, _, _, _ = env.reset()

                        last_features = policy.get_initial_features(
                        )  # reset lstm memory
                        print(
                            "Episode finished. Sum of rewards: %.2f. Length: %d."
                            % (rewards, length))

                        length = 0
                        rewards = 0
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)

        # Count loop topology
        unique, counts = np.unique(loopsizes, return_counts=True)
        loopstatistics = dict(zip(unique, counts))
        print(loopstatistics)
        env.close()
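The checkpoint path above is recovered by hand-parsing the first line of the `checkpoint` file. As a possible alternative (an assumption about the standard TF1 API rather than a change to this code), `tf.train.latest_checkpoint` resolves the newest checkpoint in a directory directly:

import tensorflow as tf

# hypothetical directory; returns e.g. "train/model.ckpt-361814", or None if no checkpoint exists
ckpt = tf.train.latest_checkpoint("train")
print(ckpt)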
Example #12
    def __init__(self, env, task, visualise, sensor_pb, vision_pb):
        """
        An implementation of the A3C algorithm that is reasonably well-tuned
        for the VNC environments. Below, we will have a modest amount of
        complexity due to the way TensorFlow handles data parallelism. But
        overall, we'll define the model, specify its inputs, and describe how
        the policy gradients step should be computed.
        """

        self.env = env
        self.task = task
        self.visualise = visualise
        obs_shape = env.observation_shape
        worker_device = '/job:worker/task:{}/cpu:0'.format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope('global'):
                self.network = LSTMPolicy(obs_shape, env.action_space)
                self.global_step = tf.get_variable(
                    'global_step', [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(obs_shape,
                                                     env.action_space)
                pi.global_step = self.global_step

            self.ac = tf.placeholder(tf.float32, [None, env.action_space],
                                     name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")

            with gfile.FastGFile(sensor_pb, 'rb') as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                self.input_sns, self.output_sns = tf.import_graph_def(
                    graph_def,
                    return_elements=['lstm_1_input:0', 'output_node0:0'])

            with gfile.FastGFile(vision_pb, 'rb') as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                self.input_img, self.output_img = tf.import_graph_def(
                    graph_def, return_elements=['input_1:0', 'output_node0:0'])

            env.sensor_agent.input_tf = self.input_sns
            env.sensor_agent.output_tf = self.output_sns
            env.vision_agent.input_tf = self.input_img
            env.vision_agent.output_tf = self.output_img

            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            # the "policy gradients" loss: its derivative is precisely the
            # policy gradient notice that self.ac is a placeholder that is
            # provided externally. adv will contain the advantages, as
            # calculated in process_rollout
            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            bs = tf.to_float(tf.shape(pi.x)[0])
            self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

            grads = tf.gradients(self.loss, pi.var_list)

            tf.summary.scalar("model/policy_loss", pi_loss / bs)
            tf.summary.scalar("model/value_loss", vf_loss / bs)
            tf.summary.scalar("model/entropy", entropy / bs)
            tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
            tf.summary.scalar("model/var_global_norm",
                              tf.global_norm(pi.var_list))
            self.summary_op = tf.summary.merge_all()

            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            # copy weights from the parameter server to the local model
            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ])

            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_steps = 0
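
The trainer above only builds the graph; the update step itself is driven elsewhere. A minimal sketch of one A3C update for a trainer of this shape, assuming the surrounding project supplies `trainer.pull_batch_from_queue()` and a `process_rollout()` helper (neither is shown in this excerpt), with the LSTM state feeds omitted for brevity:

def run_update(trainer, sess, gamma=0.99):
    # copy weights from the parameter server to the local model, then apply one gradient step
    sess.run(trainer.sync)
    rollout = trainer.pull_batch_from_queue()          # one partial rollout from the runner thread
    batch = process_rollout(rollout, gamma=gamma)      # assumed helper: returns si, a, adv, r
    feed_dict = {
        trainer.local_network.x: batch.si,             # stacked observations
        trainer.ac: batch.a,                           # one-hot actions taken
        trainer.adv: batch.adv,                        # advantages
        trainer.r: batch.r,                            # discounted returns
    }
    sess.run([trainer.train_op, trainer.global_step], feed_dict=feed_dict)
    trainer.local_steps += 1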
Ejemplo n.º 13
0
    def __init__(self, env, env_id, task):
        """
        An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
        Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
        But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
        should be computed.
        """

        self.env = env
        self.task = task
        self.env_id = env_id
        from config import if_learning_v
        self.if_learning_v = if_learning_v

        from config import project, mode
        if (project == 'f') and (mode == 'on_line'):
            self.log_thread = True
        else:
            '''only log if this worker's task id maps to zero and the current cluster is the main cluster'''
            if (self.task % config.num_workers_global
                    == 0) and (config.cluster_current == config.cluster_main):
                self.log_thread = True
            else:
                self.log_thread = False

        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n, self.env_id)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.zeros_initializer(),
                    trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n,
                    self.env_id)
                pi.global_step = self.global_step

            # self.env_id = 'PongDeterministic-v3'
            self.ac = tf.placeholder(tf.float32, [None, env.action_space.n],
                                     name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")
            self.step_forward = tf.placeholder(tf.int32, [None],
                                               name="step_forward")
            if self.if_learning_v:
                self.v_lable = tf.placeholder(tf.float32, [None],
                                              name="v_lable")

            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            # the "policy gradients" loss:  its derivative is precisely the policy gradients
            # notice that self.ac is a placeholder that is provided externally.
            # ac will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))

            # -entropy loss
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            # v loss
            if self.if_learning_v:
                v_loss = 0.5 * tf.reduce_sum(tf.square(pi.v - self.v_lable))

            bs = tf.to_float(tf.shape(pi.x)[0])

            if self.if_learning_v:
                self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01 + 0.5 * v_loss
            else:
                self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

            # config.update_step represents the number of "local steps":  the number of timesteps
            # we run the policy before we update the parameters.
            # The larger the number of local steps, the lower the variance in our policy gradient
            # estimate; on the other hand, we get less frequent parameter updates, which
            # slows down learning.  In this code, we found that making local steps much
            # smaller than 20 makes the algorithm more difficult to tune and to get to work.
            self.runner = RunnerThread(env, env_id, pi, config.update_step,
                                       self.log_thread)

            grads = tf.gradients(self.loss, pi.var_list)

            tf.summary.scalar(self.env_id + "/model/policy_loss", pi_loss / bs)
            tf.summary.scalar(self.env_id + "/model/value_loss", vf_loss / bs)
            tf.summary.scalar(self.env_id + "/model/entropy", entropy / bs)
            tf.summary.scalar(self.env_id + "/model/grad_global_norm",
                              tf.global_norm(grads))
            tf.summary.scalar(self.env_id + "/model/var_global_norm",
                              tf.global_norm(pi.var_list))
            if self.if_learning_v:
                tf.summary.scalar(self.env_id + "/model/v_loss", v_loss / bs)

            self.summary_op = tf.summary.merge_all()
            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            # copy weights from the parameter server to the local model
            self.sync = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ])

            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(
                tf.shape(self.step_forward)[0])

            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_steps = 0
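
The comments above defer the advantage computation to `process_rollout`, which is not part of this excerpt. A minimal numpy sketch of what such a helper typically produces for `self.r` (discounted returns) and `self.adv` (generalized advantage estimates), assuming a one-step value bootstrap at the end of the rollout:

import numpy as np

def discount(x, gamma):
    # discounted cumulative sum computed backwards over the rollout
    out = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

def compute_targets(rewards, values, bootstrap_value, gamma=0.99, lambda_=1.0):
    rewards = np.asarray(rewards, dtype=np.float64)
    vpred = np.asarray(list(values) + [bootstrap_value], dtype=np.float64)
    returns = discount(np.append(rewards, bootstrap_value), gamma)[:-1]   # fed into self.r
    deltas = rewards + gamma * vpred[1:] - vpred[:-1]
    advantages = discount(deltas, gamma * lambda_)                        # fed into self.adv
    return returns, advantages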
Ejemplo n.º 14
0
def inference(args):
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(args.log_dir, 'player') if args.out_dir is None else args.out_dir

    with open(indir + "/checkpoint", "r") as f:
        first_line = f.readline().strip()
        print ("first_line is : {}".format(first_line))
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    print ("ckpt: {}".format(ckpt))

    # define environment
    env = create_icegame_env(outdir, args.env_id)
    num_actions = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, num_actions)
            policy.global_step = tf.get_variable("global_step", [], 
                    tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32), trainable=False)
        # Variable names that start with "local" are not saved in checkpoints.
        variables_to_restore = [v for v in tf.global_variables() if not v.name.startswith("local")]
        init_all_op = tf.global_variables_initializer()

        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  {} {}'.format(v.name, v.get_shape()))
        logger.info("Restored the trained model.")

        # summary of rewards
        action_writers = []
        summary_writer = tf.summary.FileWriter(outdir)
        for act_idx in range(num_actions):
            action_writers.append(tf.summary.FileWriter(
                os.path.join(outdir, "action_{}".format(act_idx))
            ))

        logger.info("Inference events directory: %s", outdir)
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)

        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps", sess.run(policy.global_step)/1000000.)

            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0

            # For plotting
            plt.ion()
            fig = plt.figure(num=None, figsize=(8, 8), dpi=92, facecolor='w', edgecolor='k')

            gs1 = gridspec.GridSpec(3, 3)
            gs1.update(left=0.05, right=0.85, wspace=0.15)
            ax1 = plt.subplot(gs1[:-1, :])
            ax2 = plt.subplot(gs1[-1, :-1])
            ax3 = plt.subplot(gs1[-1, -1])

            ax1.set_title("IceGame (Agent Lives: {}, UpTimes: {})".format(env.lives, env.sim.get_updated_counter()))

            ind = np.arange(num_actions)
            width = 0.20
            #action_legends = ["Up", "Down", "Left", "Right", "NextUp", "NextDown", "Metropolis"]
            action_legends = [">", "v", "<", "^", "", "", "Metro"]

            for ep in range(args.num_episodes):
                """TODO: policy sampling strategy
                    random, greedy and sampled policy.
                """

                last_state = env.reset()
                steps_rewards=[]
                steps_values=[]

                # running policy
                while True:
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[0], fetched[1], fetched[2], fetched[3:]

                    #TODO: policy sampling strategy

                    # Greedy
                    #print ("Prob of actions: {}".format(prob_action))
                    stepAct = action.argmax()
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    steps_rewards.append(rewards)
                    steps_values.append(value_)

                    if info:
                        loopsize = info["Loop Size"]
                        looparea = info["Loop Area"]

                    """Animation for State and Actions
                    """
                    ax2.clear()
                    ax2.bar(ind, prob_action)
                    ax2.set_xticks(ind + width / 2)
                    ax2.set_xticklabels(action_legends)

                    ax1.imshow(state[:, :, 2], 'Reds', interpolation="none", vmin=-1, vmax=1)
                    # with hist
                    #ax1.imshow(state[:,:,7], 'Reds', interpolation="None",  vmin=-1, vmax=1)
                    ax1.set_title("IceGame: (Agent Lives: {}, UpTimes: {})".format(env.lives, env.sim.get_updated_counter()))

                    ax3.clear()
                    ax3.plot(steps_rewards, linewidth=2)
                    ax3.plot(steps_values, linewidth=2)
                    #plt.savefig("records/{}.png".format(length))

                    plt.pause(0.20)

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(ep), simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(ep), simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(ep), simple_value=float(value_[0]))

                    if info:
                        summary.value.add(tag='ep_{}/loop_size'.format(ep), simple_value=loopsize)
                        summary.value.add(tag='ep_{}/loop_area'.format(ep), simple_value=looparea)

                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()

                    summary = tf.Summary()
                    for ac_id in range(num_actions):
                        summary.value.add(tag='ep_{}/a_{}'.format(ep, ac_id), simple_value=float(prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    """TODO:
                        1. Need more concrete idea for playing the game when interfering.
                        2. Save these values for post processing.
                    """
                    if terminal:
                        #if length >= timestep_limit:
                        #    last_state, _, _, _ = env.reset()

                        last_features = policy.get_initial_features()  # reset lstm memory
                        print("Episode finished. Sum of rewards: %.2f. Length: %d." % (rewards, length))

                        length = 0
                        rewards = 0
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        plt.savefig("GameScene.png")
        logger.info("Save the last scene to GameScene.png")
        env.close()
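
The checkpoint path in this example is recovered by hand-parsing the first line of the `checkpoint` file and slicing off the trailing quote. If that format ever changes, `tf.train.latest_checkpoint` is a more robust alternative; a sketch using the same `indir`, `saver`, and `sess` names as above:

ckpt = tf.train.latest_checkpoint(indir)   # e.g. '<indir>/model.ckpt-361814', or None
if ckpt is None:
    raise IOError("no checkpoint found in {}".format(indir))
saver.restore(sess, ckpt)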
Ejemplo n.º 15
0
    def __init__(self, env, task, visualise, test=False):
        """
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
should be computed.
"""

        self.env = env
        self.task = task
        self.meta_action_size = 32

        worker_device = "/job:worker/task:{}/cpu:0".format(task)
        if test:
            worker_device = "/job:eval/task:{}/cpu:0".format(task)
        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          env.action_space.n,
                                          self.meta_action_size)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                self.meta_network = MetaPolicy(env.observation_space.shape,
                                               self.meta_action_size)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, env.action_space.n,
                    self.meta_action_size)
                self.local_meta_network = meta_pi = MetaPolicy(
                    env.observation_space.shape, self.meta_action_size)
                pi.global_step = self.global_step

            self.ac = tf.placeholder(tf.float32, [None, env.action_space.n],
                                     name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")

            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            # the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_sum(
                tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))
            entropy = -tf.reduce_sum(prob_tf * log_prob_tf)

            bs = tf.to_float(tf.shape(pi.x)[0])
            self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

            self.visualise = visualise

            grads = tf.gradients(self.loss, pi.var_list)

            actor_summary = [
                tf.summary.scalar("model/policy_loss", pi_loss / bs),
                tf.summary.scalar("model/value_loss", vf_loss / bs),
                tf.summary.scalar("model/entropy", entropy / bs),
                tf.summary.image("model/state", pi.x),
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads)),
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
            ]

            self.summary_op = tf.summary.merge(actor_summary)

            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            # This is the sync op, which copies weights from the shared (global) network to the local one.
            self.sync = tf.group(*([
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ]))

            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])
            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)
            self.summary_writer = None
            self.local_steps = 0

            ###################################
            ########## META CONTROLLER ########
            ###################################
            self.meta_ac = tf.placeholder(tf.float32,
                                          [None, self.meta_action_size],
                                          name="meta_ac")
            self.meta_adv = tf.placeholder(tf.float32, [None], name="meta_adv")
            self.meta_r = tf.placeholder(tf.float32, [None], name="meta_r")

            meta_log_prob_tf = tf.nn.log_softmax(meta_pi.logits)
            meta_prob_tf = tf.nn.softmax(meta_pi.logits)

            meta_pi_loss = -tf.reduce_sum(
                tf.reduce_sum(meta_log_prob_tf * self.meta_ac, [1]) *
                self.meta_adv)
            meta_vf_loss = 0.5 * tf.reduce_sum(
                tf.square(meta_pi.vf - self.meta_r))

            # entropy
            meta_entropy = -tf.reduce_sum(meta_prob_tf * meta_log_prob_tf)
            meta_bs = tf.to_float(tf.shape(meta_pi.x)[0])

            self.meta_loss = meta_pi_loss + 0.5 * meta_vf_loss - meta_entropy * 0.01
            meta_grads = tf.gradients(self.meta_loss, meta_pi.var_list)
            meta_grads, _ = tf.clip_by_global_norm(meta_grads, 40.0)

            self.meta_sync = tf.group(*([
                v1.assign(v2)
                for v1, v2 in zip(meta_pi.var_list, self.meta_network.var_list)
            ]))

            meta_grads_and_vars = list(
                zip(meta_grads, self.meta_network.var_list))
            meta_opt = tf.train.AdamOptimizer(1e-4)
            self.meta_train_op = meta_opt.apply_gradients(meta_grads_and_vars)

            meta_summary = [
                tf.summary.scalar("meta_model/policy_loss",
                                  meta_pi_loss / meta_bs),
                tf.summary.scalar("meta_model/value_loss",
                                  meta_vf_loss / meta_bs),
                tf.summary.scalar("meta_model/entropy",
                                  meta_entropy / meta_bs),
                tf.summary.scalar("meta_model/grad_global_norm",
                                  tf.global_norm(meta_grads)),
                tf.summary.scalar("meta_model/var_global_norm",
                                  tf.global_norm(meta_pi.var_list))
            ]
            self.meta_summary_op = tf.summary.merge(meta_summary)
            self.beta = 0.75
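
Both the worker loss and the meta loss above subtract an entropy term with coefficient 0.01. A small numeric check of `-sum(p * log p)` shows why this acts as an exploration bonus: it is largest for a uniform policy and shrinks as the policy becomes deterministic.

import numpy as np

def entropy(p):
    # matches -tf.reduce_sum(prob_tf * log_prob_tf) for a single timestep
    p = np.asarray(p, dtype=np.float64)
    return float(-np.sum(p * np.log(p + 1e-12)))

print(entropy([0.25, 0.25, 0.25, 0.25]))   # ~1.386 (= log 4), maximal for 4 actions
print(entropy([0.97, 0.01, 0.01, 0.01]))   # ~0.168, nearly deterministic policy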
Ejemplo n.º 16
0
def inference(args):
    """
    Restores only the LSTMPolicy architecture and runs inference with it.
    """
    # get address of checkpoints
    indir = os.path.join(args.log_dir, 'train')
    outdir = os.path.join(
        args.log_dir, 'inference') if args.out_dir is None else args.out_dir
    with open(indir + '/checkpoint', 'r') as f:
        first_line = f.readline().strip()
    ckpt = first_line.split(' ')[-1].split('/')[-1][:-1]
    ckpt = ckpt.split('-')[-1]
    ckpt = indir + '/model.ckpt-' + ckpt

    # define environment
    if args.record:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat,
                         outdir=outdir)
    else:
        env = create_env(args.env_id,
                         client_id='0',
                         remotes=None,
                         envWrap=args.envWrap,
                         designHead=args.designHead,
                         record=True,
                         noop=args.noop,
                         acRepeat=args.acRepeat)
    numaction = env.action_space.n

    with tf.device("/cpu:0"):
        # define policy network
        with tf.variable_scope("global"):
            policy = LSTMPolicy(env.observation_space.shape, numaction,
                                args.designHead)
            policy.global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)

        # Variable names that start with "local" are not saved in checkpoints.
        if use_tf12_api:
            variables_to_restore = [
                v for v in tf.global_variables()
                if not v.name.startswith("local")
            ]
            init_all_op = tf.global_variables_initializer()
        else:
            variables_to_restore = [
                v for v in tf.all_variables() if not v.name.startswith("local")
            ]
            init_all_op = tf.initialize_all_variables()
        saver = FastSaver(variables_to_restore)

        # print trainable variables
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     tf.get_variable_scope().name)
        logger.info('Trainable vars:')
        for v in var_list:
            logger.info('  %s %s', v.name, v.get_shape())

        # summary of rewards
        action_writers = []
        if use_tf12_api:
            summary_writer = tf.summary.FileWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.summary.FileWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        else:
            summary_writer = tf.train.SummaryWriter(outdir)
            for ac_id in range(numaction):
                action_writers.append(
                    tf.train.SummaryWriter(
                        os.path.join(outdir, 'action_{}'.format(ac_id))))
        logger.info("Inference events directory: %s", outdir)

        config = tf.ConfigProto(allow_soft_placement=True,
                                log_device_placement=False)
        with tf.Session(config=config) as sess:
            logger.info("Initializing all parameters.")
            sess.run(init_all_op)
            logger.info("Restoring trainable global parameters.")
            saver.restore(sess, ckpt)
            logger.info("Restored model was trained for %.2fM global steps",
                        sess.run(policy.global_step) / 1000000.)
            #saving with meta graph:
            metaSaver = tf.train.Saver(variables_to_restore)
            metaSaver.save(
                sess, '/home/swagking0/noreward-rl/models/models_me/mario_me')

            last_state = env.reset()
            if args.render or args.record:
                env.render()
            last_features = policy.get_initial_features()  # reset lstm memory
            length = 0
            rewards = 0
            mario_distances = np.zeros((args.num_episodes, ))
            for i in range(args.num_episodes):
                print("Starting episode %d" % (i + 1))
                if args.recordSignal:
                    from PIL import Image
                    signalCount = 1
                    utils.mkdir_p(outdir + '/recordedSignal/ep_%02d/' % i)
                    Image.fromarray(
                        (255 * last_state[..., -1]).astype('uint8')).save(
                            outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                            (i, signalCount))

                if args.random:
                    print('I am random policy!')
                else:
                    if args.greedy:
                        print('I am greedy policy!')
                    else:
                        print('I am sampled policy!')
                while True:
                    # run policy
                    fetched = policy.act_inference(last_state, *last_features)
                    prob_action, action, value_, features = fetched[
                        0], fetched[1], fetched[2], fetched[3:]

                    # run environment: sampled one-hot 'action' (not greedy)
                    if args.random:
                        stepAct = np.random.randint(0,
                                                    numaction)  # random policy
                    else:
                        if args.greedy:
                            stepAct = prob_action.argmax()  # greedy policy
                        else:
                            stepAct = action.argmax()
                    # print(stepAct, prob_action.argmax(), prob_action)
                    state, reward, terminal, info = env.step(stepAct)

                    # update stats
                    length += 1
                    rewards += reward
                    last_state = state
                    last_features = features
                    if args.render or args.record:
                        env.render()
                    if args.recordSignal:
                        signalCount += 1
                        Image.fromarray(
                            (255 * last_state[..., -1]).astype('uint8')).save(
                                outdir + '/recordedSignal/ep_%02d/%06d.jpg' %
                                (i, signalCount))

                    # store summary
                    summary = tf.Summary()
                    summary.value.add(tag='ep_{}/reward'.format(i),
                                      simple_value=reward)
                    summary.value.add(tag='ep_{}/netreward'.format(i),
                                      simple_value=rewards)
                    summary.value.add(tag='ep_{}/value'.format(i),
                                      simple_value=float(value_[0]))
                    if 'NoFrameskip-v' in args.env_id:  # atari
                        summary.value.add(
                            tag='ep_{}/lives'.format(i),
                            simple_value=env.unwrapped.ale.lives())
                    summary_writer.add_summary(summary, length)
                    summary_writer.flush()
                    summary = tf.Summary()
                    for ac_id in range(numaction):
                        summary.value.add(tag='action_prob',
                                          simple_value=float(
                                              prob_action[ac_id]))
                        action_writers[ac_id].add_summary(summary, length)
                        action_writers[ac_id].flush()

                    timestep_limit = env.spec.tags.get(
                        'wrapper_config.TimeLimit.max_episode_steps')
                    if timestep_limit is None:
                        timestep_limit = env.spec.timestep_limit
                    if terminal or length >= timestep_limit:
                        if length >= timestep_limit or not env.metadata.get(
                                'semantics.autoreset'):
                            last_state = env.reset()
                        last_features = policy.get_initial_features(
                        )  # reset lstm memory
                        print(
                            "Episode finished. Sum of rewards: %.2f. Length: %d."
                            % (rewards, length))
                        if 'distance' in info:
                            print('Mario Distance Covered:', info['distance'])
                            mario_distances[i] = info['distance']
                        length = 0
                        rewards = 0
                        if args.render or args.record:
                            env.render()
                        if args.recordSignal:
                            signalCount += 1
                            Image.fromarray(
                                (255 *
                                 last_state[..., -1]).astype('uint8')).save(
                                     outdir +
                                     '/recordedSignal/ep_%02d/%06d.jpg' %
                                     (i, signalCount))
                        break

        logger.info('Finished %d true episodes.', args.num_episodes)
        if 'distance' in info:
            print('Mario Distances:', mario_distances)
            np.save(outdir + '/distances.npy', mario_distances)
        env.close()
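
The timestep-limit lookup in this example relies on the older `env.spec.tags` layout. On newer gym releases the limit is exposed directly as `spec.max_episode_steps`; a guarded sketch that covers both layouts:

timestep_limit = getattr(env.spec, 'max_episode_steps', None)
if timestep_limit is None:   # fall back to the older tag-based layout
    timestep_limit = env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps')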
Ejemplo n.º 17
0
    def __init__(self, env, task, visualise):
        """
An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
But overall, we'll define the model, specify its inputs, and describe how the policy gradients step
should be computed.
parameters:
    1. env: environment
    2. task: task id
    3. network: the global LSTMPolicy
    4. global_step: variable that tracks global steps
    5. local_network: local copy of the LSTMPolicy
    6. ac: placeholder for the one-hot actions taken
    7. adv: placeholder for the advantages, one value per timestep
    8. r: placeholder for the discounted returns
    9. loss: the total loss value
    10. runner: RunnerThread that collects rollouts
"""

        self.env = env
        self.task = task    # task id specifying which worker this is
        worker_device = "/job:worker/task:{}/cpu:0".format(task)  # device for this single worker
        with tf.device(tf.train.replica_device_setter(1, worker_device=worker_device)):
            with tf.variable_scope("global"):
                # the input shape is 128 x 200 x 1; the action size is env.action_space.n
                self.network = LSTMPolicy(env.observation_space.shape, env.action_space.n)
                self.global_step = tf.get_variable("global_step", [], tf.int32, initializer=tf.constant_initializer(0, dtype=tf.int32),
                                                   trainable=False)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                # observation shape 128 x 200, action space size 7 when using Neon Race
                self.local_network = pi = LSTMPolicy(env.observation_space.shape, env.action_space.n)
                # give the local policy a handle to the shared global step
                pi.global_step = self.global_step
            
            # placeholders for the training inputs
            self.ac = tf.placeholder(tf.float32, [None, env.action_space.n], name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")
            
            # probability of actions
            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)

            # the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = - tf.reduce_sum(tf.reduce_sum(log_prob_tf * self.ac, [1]) * self.adv)

            # loss of value function
            vf_loss = 0.5 * tf.reduce_sum(tf.square(pi.vf - self.r))   # sum of squared errors between the predicted value and the discounted return
            entropy = - tf.reduce_sum(prob_tf * log_prob_tf)

            bs = tf.to_float(tf.shape(pi.x)[0])
            # this is the total loss function
            self.loss = pi_loss + 0.5 * vf_loss - entropy * 0.01

            # 20 represents the number of "local steps":  the number of timesteps
            # we run the policy before we update the parameters.
            # The larger the number of local steps, the lower the variance in our policy gradient
            # estimate; on the other hand, we get less frequent parameter updates, which
            # slows down learning.  In this code, we found that making local steps much
            # smaller than 20 makes the algorithm more difficult to tune and to get to work.
            # t_max = 20: look ahead 20 steps before each update
            self.runner = RunnerThread(env, pi, 20, visualise)


            grads = tf.gradients(self.loss, pi.var_list)

            #save summary
            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss / bs)
                tf.summary.scalar("model/value_loss", vf_loss / bs)
                tf.summary.scalar("model/entropy", entropy / bs)
                tf.summary.image("model/state", pi.x)
                tf.summary.scalar("model/grad_global_norm", tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm", tf.global_norm(pi.var_list))
                self.summary_op = tf.summary.merge_all()

            else:
                tf.scalar_summary("model/policy_loss", pi_loss / bs)
                tf.scalar_summary("model/value_loss", vf_loss / bs)
                tf.scalar_summary("model/entropy", entropy / bs)
                tf.image_summary("model/state", pi.x)
                tf.scalar_summary("model/grad_global_norm", tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm", tf.global_norm(pi.var_list))
                self.summary_op = tf.merge_all_summaries()

            # perform gradient clipping
            # https://hackernoon.com/gradient-clipping-57f04f0adae
            grads, _ = tf.clip_by_global_norm(grads, 40.0)

            # copy weights from the parameter server to the local model
            self.sync = tf.group(*[v1.assign(v2) for v1, v2 in zip(pi.var_list, self.network.var_list)])

            # this builds a list of (gradient, variable) pairs
            grads_and_vars = list(zip(grads, self.network.var_list))
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            opt = tf.train.AdamOptimizer(1e-4)
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars), inc_step)
            self.summary_writer = None
            self.local_steps = 0
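
The gradient-clipping comment above links to a longer write-up; as a compact reference, the rescaling rule applied by `tf.clip_by_global_norm` can be reproduced in a few lines of numpy (a sketch of the math, not the TF implementation):

import numpy as np

def clip_by_global_norm(grads, clip_norm=40.0):
    # if the joint L2 norm of all gradients exceeds clip_norm,
    # scale every gradient by clip_norm / global_norm
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    if global_norm > clip_norm:
        grads = [g * (clip_norm / global_norm) for g in grads]
    return grads, global_norm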
Ejemplo n.º 18
0
def init(env):
    policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)
    # Load this from training snapshot
    return policy
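
The `init` stub above leaves the snapshot loading as a comment. A minimal sketch of how it could be completed, following the restore pattern used by the inference examples in this collection (the `snapshot_path` argument and the explicit `sess` are additions for illustration):

def init(env, snapshot_path, sess):
    with tf.variable_scope("global"):
        policy = LSTMPolicy(env.observation_space.shape, env.action_space.n)
    # variables whose names start with "local" are not saved in checkpoints
    variables_to_restore = [v for v in tf.global_variables()
                            if not v.name.startswith("local")]
    saver = tf.train.Saver(variables_to_restore)
    saver.restore(sess, snapshot_path)   # e.g. a path like 'train/model.ckpt-<step>'
    return policy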