Example #1
    def _build_net(self):
        # policy network (actor-critic)
        self.network = pi = LSTMPolicy(self.obs_shape, self.numaction,
                                       self.designHead)

        with tf.variable_scope("predictor"):
            # prediction network: inverse and forward models that provide
            # the curiosity signal
            self.ap_network = predictor = StateActionPredictor(
                self.obs_shape, self.numaction, self.designHead)

        # placeholders for the actions taken, advantages, and discounted returns
        self.ac = tf.placeholder(tf.float32, [None, self.numaction], name="ac")
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")
        log_prob_tf = tf.nn.log_softmax(pi.logits)
        prob_tf = tf.nn.softmax(pi.logits)
        # policy-gradient loss: log-prob of the chosen action, weighted by advantage
        pi_loss = -tf.reduce_mean(
            tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv)
        # value-function loss: l2 distance to the discounted return
        vf_loss = 0.5 * tf.reduce_mean(tf.square(pi.vf - self.r))
        # entropy bonus to keep the policy stochastic
        entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
        # final a3c loss: the critic is weighted at half the actor's rate
        self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[
            'ENTROPY_BETA']

        # compute gradients
        grads = tf.gradients(self.loss, pi.var_list)

        # predictor loss: weighted blend of inverse- and forward-model losses
        self.predloss = constants['PREDICTION_LR_SCALE'] * (
            predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) +
            predictor.forwardloss * constants['FORWARD_LOSS_WT'])
        predgrads = tf.gradients(self.predloss, predictor.var_list)

        # clip gradients
        grads, _ = tf.clip_by_global_norm(grads, constants['GRAD_NORM_CLIP'])
        grads_and_vars = list(zip(grads, self.network.var_list))

        predgrads, _ = tf.clip_by_global_norm(predgrads,
                                              constants['GRAD_NORM_CLIP'])
        pred_grads_and_vars = list(zip(predgrads, self.ap_network.var_list))
        grads_and_vars = grads_and_vars + pred_grads_and_vars

        opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
        self.train_op = tf.group(opt.apply_gradients(grads_and_vars))
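
The loss assembled above is the standard A3C objective: a policy-gradient term weighted by the advantage, a value-function L2 term at half weight, and an entropy bonus scaled by ENTROPY_BETA. A minimal NumPy sketch of the same arithmetic on a made-up batch of two (all values below are illustrative, not taken from the repository):

import numpy as np

logits = np.array([[1.0, 0.0], [0.5, 0.5]])  # toy policy logits (batch of 2, 2 actions)
ac     = np.array([[1.0, 0.0], [0.0, 1.0]])  # one-hot actions actually taken
adv    = np.array([0.8, -0.2])               # advantages from the rollout
r      = np.array([1.0, 0.0])                # discounted returns
vf     = np.array([0.7, 0.1])                # value-function predictions

log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))  # log_softmax
prob = np.exp(log_prob)
pi_loss = -np.mean((log_prob * ac).sum(axis=1) * adv)  # policy-gradient term
vf_loss = 0.5 * np.mean(np.square(vf - r))             # value term
entropy = -np.mean((prob * log_prob).sum(axis=1))      # exploration bonus
beta = 0.01                                  # stands in for constants['ENTROPY_BETA']
loss = pi_loss + 0.5 * vf_loss - entropy * beta
print(pi_loss, vf_loss, entropy, loss)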
Example #2
        print('--------------------------------')

    def get_available_action(self, check=True):
        # gather the moves that pass check_action (or every move when check
        # is False) and sample one uniformly at random
        acts = []
        for a in ["W", "A", "S", "D"]:
            if self.check_action(a) or (not check):
                acts.append(a)
        return random.choice(acts)
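
In isolation, get_available_action is just a filter-then-sample pattern. A standalone sketch with check_action stubbed out (its real definition is not shown in this snippet):

import random

def get_available_action(check_action, check=True):
    # keep the moves that pass the check (or all of them when check is False)
    acts = [a for a in ["W", "A", "S", "D"] if check_action(a) or not check]
    return random.choice(acts)  # raises IndexError if no move is available

print(get_available_action(lambda a: a in ("W", "D")))  # prints 'W' or 'D'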


testing = False
ACTS = ['W', 'A', 'S', 'D']
l = 42                                         # side length of the square state
s = Simulator([l, l], [2, 2], 0)               # environment
rl = RL([None, l, l, 1], 4)                    # learner: l x l x 1 inputs, 4 actions
sap = SAP([l, l, 1], 4, load_from_file=False)  # state-action predictor
rl.set_session(sap.sess)                       # both models share one session
sQ = None
s1 = s.get_state()                             # initial state


batch = []
xs = []
ys = []
tempys = []
for i in range(1000000000):

    e = 5 / ((i / 5000) + 10)  # exploration rate: 0.5 at i=0, decaying toward 0
    """if i > 1000000:
        if i > 8000:
            time.sleep(0.5)
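
The exploration rate e = 5 / ((i / 5000) + 10) computed at the top of the loop is a hyperbolic decay: it starts at 0.5 and falls toward 0 as the step counter i grows. A quick check of its values:

for i in (0, 5000, 50000, 500000):
    e = 5 / ((i / 5000) + 10)
    print(i, round(e, 4))  # 0.5, 0.4545, 0.25, 0.0455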
Example #3
    def __init__(self,
                 env,
                 task,
                 visualise,
                 unsupType,
                 envWrap=False,
                 designHead='universe',
                 noReward=False):
        """
        An implementation of the A3C algorithm that is reasonably well-tuned for the VNC environments.
        Below, we will have a modest amount of complexity due to the way TensorFlow handles data parallelism.
        But overall, we'll define the model, specify its inputs, and describe how the policy-gradient step
        should be computed.
        """
        self.task = task
        self.unsup = unsupType is not None
        self.envWrap = envWrap
        self.env = env

        predictor = None
        numaction = env.action_space.n
        worker_device = "/job:worker/task:{}/cpu:0".format(task)

        with tf.device(
                tf.train.replica_device_setter(1,
                                               worker_device=worker_device)):
            with tf.variable_scope("global"):
                self.network = LSTMPolicy(env.observation_space.shape,
                                          numaction, designHead)
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)
                if self.unsup:
                    with tf.variable_scope("predictor"):
                        if 'state' in unsupType:
                            self.ap_network = StatePredictor(
                                env.observation_space.shape, numaction,
                                designHead, unsupType)
                        else:
                            self.ap_network = StateActionPredictor(
                                env.observation_space.shape, numaction,
                                designHead)

        with tf.device(worker_device):
            with tf.variable_scope("local"):
                self.local_network = pi = LSTMPolicy(
                    env.observation_space.shape, numaction, designHead)
                pi.global_step = self.global_step
                if self.unsup:
                    with tf.variable_scope("predictor"):
                        if 'state' in unsupType:
                            self.local_ap_network = predictor = StatePredictor(
                                env.observation_space.shape, numaction,
                                designHead, unsupType)
                        else:
                            self.local_ap_network = predictor = StateActionPredictor(
                                env.observation_space.shape, numaction,
                                designHead)

            # Computing a3c loss: https://arxiv.org/abs/1506.02438
            self.ac = tf.placeholder(tf.float32, [None, numaction], name="ac")
            self.adv = tf.placeholder(tf.float32, [None], name="adv")
            self.r = tf.placeholder(tf.float32, [None], name="r")
            log_prob_tf = tf.nn.log_softmax(pi.logits)
            prob_tf = tf.nn.softmax(pi.logits)
            # 1) the "policy gradients" loss:  its derivative is precisely the policy gradient
            # notice that self.ac is a placeholder that is provided externally.
            # adv will contain the advantages, as calculated in process_rollout
            pi_loss = -tf.reduce_mean(
                tf.reduce_sum(log_prob_tf * self.ac, 1) * self.adv)  # Eq (19)
            # 2) loss of value function: l2_loss = (x-y)^2/2
            vf_loss = 0.5 * tf.reduce_mean(
                tf.square(pi.vf - self.r))  # Eq (28)
            # 3) entropy to ensure randomness
            entropy = -tf.reduce_mean(tf.reduce_sum(prob_tf * log_prob_tf, 1))
            # final a3c loss: lr of critic is half of actor
            self.loss = pi_loss + 0.5 * vf_loss - entropy * constants[
                'ENTROPY_BETA']

            # compute gradients
            grads = tf.gradients(
                self.loss * 20.0, pi.var_list
            )  # scaled by the batch size (20) so other hyperparameters don't depend on it

            # computing predictor loss
            if self.unsup:
                if 'state' in unsupType:
                    self.predloss = constants[
                        'PREDICTION_LR_SCALE'] * predictor.forwardloss
                else:
                    self.predloss = constants['PREDICTION_LR_SCALE'] * (
                        predictor.invloss *
                        (1 - constants['FORWARD_LOSS_WT']) +
                        predictor.forwardloss * constants['FORWARD_LOSS_WT'])
                predgrads = tf.gradients(
                    self.predloss * 20.0, predictor.var_list
                )  # scaled by the batch size (20) so other hyperparameters don't depend on it

                # do not backprop to policy
                if constants['POLICY_NO_BACKPROP_STEPS'] > 0:
                    grads = [
                        tf.scalar_mul(
                            tf.to_float(
                                tf.greater(
                                    self.global_step,
                                    constants['POLICY_NO_BACKPROP_STEPS'])),
                            grads_i) for grads_i in grads
                    ]
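                    # the multiplier is 0.0 while global_step <= POLICY_NO_BACKPROP_STEPS
                    # and 1.0 afterwards, so the policy gradients are zeroed out (only the
                    # predictor trains) during the warm-up phase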

            self.runner = RunnerThread(env, pi, constants['ROLLOUT_MAXLEN'],
                                       visualise, predictor, envWrap, noReward)

            # storing summaries
            bs = tf.to_float(tf.shape(pi.x)[0])
            if use_tf12_api:
                tf.summary.scalar("model/policy_loss", pi_loss)
                tf.summary.scalar("model/value_loss", vf_loss)
                tf.summary.scalar("model/entropy", entropy)
                tf.summary.image("model/state", pi.x)  # max_outputs=10
                tf.summary.scalar("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.summary.scalar("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if self.unsup:
                    tf.summary.scalar("model/predloss", self.predloss)
                    if 'action' in unsupType:
                        tf.summary.scalar("model/inv_loss", predictor.invloss)
                        tf.summary.scalar("model/forward_loss",
                                          predictor.forwardloss)
                    tf.summary.scalar("model/predgrad_global_norm",
                                      tf.global_norm(predgrads))
                    tf.summary.scalar("model/predvar_global_norm",
                                      tf.global_norm(predictor.var_list))
                self.summary_op = tf.summary.merge_all()
            else:
                tf.scalar_summary("model/policy_loss", pi_loss)
                tf.scalar_summary("model/value_loss", vf_loss)
                tf.scalar_summary("model/entropy", entropy)
                tf.image_summary("model/state", pi.x)
                tf.scalar_summary("model/grad_global_norm",
                                  tf.global_norm(grads))
                tf.scalar_summary("model/var_global_norm",
                                  tf.global_norm(pi.var_list))
                if self.unsup:
                    tf.scalar_summary("model/predloss", self.predloss)
                    if 'action' in unsupType:
                        tf.scalar_summary("model/inv_loss", predictor.invloss)
                        tf.scalar_summary("model/forward_loss",
                                          predictor.forwardloss)
                    tf.scalar_summary("model/predgrad_global_norm",
                                      tf.global_norm(predgrads))
                    tf.scalar_summary("model/predvar_global_norm",
                                      tf.global_norm(predictor.var_list))
                self.summary_op = tf.merge_all_summaries()

            # clip gradients
            grads, _ = tf.clip_by_global_norm(grads,
                                              constants['GRAD_NORM_CLIP'])
            grads_and_vars = list(zip(grads, self.network.var_list))
            if self.unsup:
                predgrads, _ = tf.clip_by_global_norm(
                    predgrads, constants['GRAD_NORM_CLIP'])
                pred_grads_and_vars = list(
                    zip(predgrads, self.ap_network.var_list))
                grads_and_vars = grads_and_vars + pred_grads_and_vars

            # update global step by batch size
            inc_step = self.global_step.assign_add(tf.shape(pi.x)[0])

            # each worker has a different set of adam optimizer parameters
            # TODO: make optimizer global shared, if needed
            print("Optimizer: ADAM with lr: %f" % (constants['LEARNING_RATE']))
            print("Input observation shape: ", env.observation_space.shape)
            opt = tf.train.AdamOptimizer(constants['LEARNING_RATE'])
            self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                     inc_step)

            # copy weights from the parameter server to the local model
            sync_var_list = [
                v1.assign(v2)
                for v1, v2 in zip(pi.var_list, self.network.var_list)
            ]
            if self.unsup:
                sync_var_list += [
                    v1.assign(v2) for v1, v2 in zip(predictor.var_list,
                                                    self.ap_network.var_list)
                ]
            self.sync = tf.group(*sync_var_list)

            # initialize extras
            self.summary_writer = None
            self.local_steps = 0
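
Both examples blend the predictor's two losses the same way: the inverse-model and forward-model losses are mixed by FORWARD_LOSS_WT and rescaled by PREDICTION_LR_SCALE, which lets the predictor learn at a different effective rate than the policy. A self-contained sketch of that weighting (the loss values and constants below are illustrative, not the repository's):

inv_loss, fwd_loss = 0.9, 0.4        # toy inverse- and forward-model losses
FORWARD_LOSS_WT = 0.2                # illustrative blend weight
PREDICTION_LR_SCALE = 10.0           # illustrative scale factor
pred_loss = PREDICTION_LR_SCALE * (
    inv_loss * (1 - FORWARD_LOSS_WT) + fwd_loss * FORWARD_LOSS_WT)
print(pred_loss)                     # 10 * (0.9 * 0.8 + 0.4 * 0.2) = 8.0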