Example #1
class I2A(object):
    def __init__(self, config):
        self.config = config
        # for universe-starter-agent
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        # build some graph nodes
        self.inputs_s = tf.placeholder(
            tf.float32, [None] + [config.n_actions, config.rollout_length] +
            config.state_dims)
        self.inputs_r = tf.placeholder(
            tf.float32, [None] + [config.n_actions, config.rollout_length])
        self.X = tf.placeholder(tf.float32, [None] + config.state_dims)

        # instantiate the model free policy
        self.mf_policy = justCNN
        with tf.variable_scope('mf_policy', reuse=config.reuse):
            mf_feats = self.mf_policy(self.X)
            mf_feats = layers.flatten(mf_feats)
        # instantiate the rollout policy
        self.rollout_policy = Policy(natureCNN, config)
        with tf.variable_scope('rollout_policy', reuse=config.reuse):
            self.rp_logits, rp_pi, rp_actions, rp_vf = self.rollout_policy.forward(
                self.X)
        # instantiate the imagination core
        # we can only instantiate this once we have defined our rollout policy
        self.imagination_core = ImaginationCore(config, self.rollout_policy)
        # instantiate the encoder
        self.encoder = Encoder(justCNN, config)
        with tf.variable_scope('encoder', reuse=config.reuse):
            encodings = self.encoder.forward(self.inputs_s, self.inputs_r)
        aggregate = tf.reshape(
            encodings, shape=[-1, config.n_actions * config.hidden_dim])
        # we can experiment with how the two feature vectors are combined on the next
        # line: they can be concatenated, added, or multiplied (see the commented-out
        # alternatives below)
        i2a_inputs = tf.concat([aggregate, mf_feats], -1)
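        # Rough, commented-out sketches of the other two ways of combining the features
        # mentioned above; these are untested assumptions, not part of the original model.
        # Element-wise add/multiply requires `aggregate` and `mf_feats` to have the same
        # width, so mf_feats is first projected to config.n_actions * config.hidden_dim
        # (assuming `layers` here is tf.contrib.layers, which provides fully_connected).
        # mf_proj = layers.fully_connected(
        #     mf_feats, config.n_actions * config.hidden_dim, activation_fn=None)
        # i2a_inputs = tf.add(aggregate, mf_proj)       # element-wise sum
        # i2a_inputs = tf.multiply(aggregate, mf_proj)  # element-wise product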
        # instantiate the I2A policy
        self.i2a_policy = Policy(linear, config)
        with tf.variable_scope('i2a_policy', reuse=config.reuse):
            self.logits, self.pi, self.actions, self.vf = self.i2a_policy.forward(
                i2a_inputs)

        # TODO:
        # during training, run the actions sampled by the i2a policy through the rollout
        # policy and use their negative log likelihood under the rollout policy as an
        # auxiliary loss, in order to keep the KL between the rollout and i2a policies
        # small (see the sketch below)
        #
        # the reason we define a separate rollout policy and pass it into the env model,
        # rather than containing it within the env model, is that we also want to pass
        # states directly through the rollout policy during training
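        # A rough sketch of the distillation term described in the TODO above; it is an
        # untested assumption rather than the original training code. It pushes the
        # rollout policy's logits toward the actions sampled by the i2a policy, which is
        # one way to keep the KL between the two policies small (self.actions is assumed
        # to hold integer action indices, as in the commented-out validity test at the
        # bottom of this file).
        # distill_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #     labels=self.actions, logits=self.rp_logits)
        # self.distill_loss = tf.reduce_mean(distill_xent)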

    def act(self, state):
        # act() receives a batch of states of shape [-1, 84, 84, 4]
        sess = tf.get_default_session()
        rollouts_s, rollouts_r = self.rollout(state)
        # we should get something that is
        # rollouts_s: [-1, n_actions, rollout_length, 84, 84, 4]
        # rollouts_r: [-1, n_actions, rollout_length]
        logits, pi, actions, vf = sess.run(
            [self.logits, self.pi, self.actions, self.vf],
            feed_dict={
                self.inputs_s: rollouts_s,
                self.inputs_r: rollouts_r,
                self.X: state
            })
        # rollouts_s and rollouts_r are returned as well because, together with the
        # input frames, they are effectively the state the I2A policy conditions on
        return actions, vf, rollouts_s, rollouts_r

    def value(self, state):
        # value() receives a batch of states of shape [-1, 84, 84, 4]
        sess = tf.get_default_session()
        rollouts_s, rollouts_r = self.rollout(state)
        # we should get something that is
        # rollouts_s: [-1, n_actions, rollout_length, 84, 84, 4]
        # rollouts_r: [-1, n_actions, rollout_length]
        logits, pi, actions, vf = sess.run(
            [self.logits, self.pi, self.actions, self.vf],
            feed_dict={
                self.inputs_s: rollouts_s,
                self.inputs_r: rollouts_r,
                self.X: state
            })
        return vf

    def rollout(self, states):
        # initialize state for batch rollouts
        # this is of shape [-1, n_actions, 84, 84, 4]
        config = self.config
        states = np.expand_dims(states, axis=1)
        states = np.concatenate([states] * config.n_actions, axis=1)
        # roll everything out and put it in a placeholder
        rollouts_s = []
        rollouts_r = []
        for i in range(config.rollout_length):
            # on the first timestep, each rollout branch takes its own (distinct) action;
            # afterwards, each branch takes actions sampled from the rollout policy
            next_states, rewards = self.imagination_core.predict(states,
                                                                 init=i == 0)
            # add to our rollout
            rollouts_s.append(next_states)
            rollouts_r.append(rewards)
            # advance
            states = next_states
        # rollouts_s should be a list of objects of [-1, n_actions, 84, 84, 4]
        # we want to stack them so the new rollouts_s is [-1, rollout_length, n_actions, 84, 84, 4]
        rollouts_s = np.stack(rollouts_s, axis=1)
        rollouts_r = np.stack(rollouts_r,
                              axis=1).reshape(-1, config.rollout_length,
                                              config.n_actions)
        rollouts_s = np.transpose(rollouts_s, (0, 2, 1, 3, 4, 5))
        rollouts_r = np.transpose(rollouts_r, (0, 2, 1))
        return rollouts_s, rollouts_r
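
# A minimal, NumPy-only sketch (not from the original code) checking the axis
# bookkeeping in rollout() above: stacking the per-step outputs of shape
# [batch, n_actions, 84, 84, 4] on axis=1 yields
# [batch, rollout_length, n_actions, 84, 84, 4], and the transpose swaps the two middle
# axes so the result matches the [batch, n_actions, rollout_length, 84, 84, 4] layout of
# inputs_s. The concrete sizes (batch=2, n_actions=6, rollout_length=3) are illustrative.
import numpy as np

step_outputs = [np.zeros((2, 6, 84, 84, 4)) for _ in range(3)]  # one entry per imagined step
stacked = np.stack(step_outputs, axis=1)                        # (2, 3, 6, 84, 84, 4)
reordered = np.transpose(stacked, (0, 2, 1, 3, 4, 5))           # (2, 6, 3, 84, 84, 4)
assert reordered.shape == (2, 6, 3, 84, 84, 4)
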
Example #2
class I2A(object):
    def __init__(self, config):
        self.config = config
        # for universe-starter-agent
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)
        # build some graph nodes
        self.inputs_s = tf.placeholder(
            tf.float32, [None] + [config.n_actions, config.rollout_length] +
            config.state_dims)
        self.inputs_r = tf.placeholder(
            tf.float32, [None] + [config.n_actions, config.rollout_length])
        self.X = tf.placeholder(tf.float32, [None] + config.state_dims)

        # instantiate the model free policy
        self.mf_policy = justCNN
        with tf.variable_scope('mf_policy', reuse=config.reuse):
            mf_feats = self.mf_policy(self.X)
            mf_feats = layers.flatten(mf_feats)
        # instantiate the rollout policy
        self.rollout_policy = Policy(natureCNN, config)
        with tf.variable_scope('rollout_policy', reuse=config.reuse):
            self.rp_logits, rp_pi, rp_actions, rp_vf = self.rollout_policy.forward(
                self.X)
        # instantiate the imagination core
        # we can only instantiate this once we have defined our rollout policy
        self.imagination_core = ImaginationCore(config, self.rollout_policy)
        # instantiate the encoder
        self.encoder = Encoder(justCNN, config)
        with tf.variable_scope('encoder', reuse=config.reuse):
            encodings = self.encoder.forward(self.inputs_s, self.inputs_r)
        aggregate = tf.reshape(
            encodings, shape=[-1, config.n_actions * config.hidden_dim])
        # we can experiment with how the two feature vectors are combined on the next
        # line: they can be concatenated, added, or multiplied
        i2a_inputs = tf.concat([aggregate, mf_feats], -1)
        # instantiate the I2A policy
        self.i2a_policy = Policy(linear, config)
        with tf.variable_scope('i2a_policy', reuse=config.reuse):
            self.logits, self.pi, self.actions, self.vf = self.i2a_policy.forward(
                i2a_inputs)

        # TODO:
        # during training, run the actions sampled by the i2a policy through the rollout
        # policy and use their negative log likelihood under the rollout policy as an
        # auxiliary loss, in order to keep the KL between the rollout and i2a policies
        # small
        #
        # the reason we define a separate rollout policy and pass it into the env model,
        # rather than containing it within the env model, is that we also want to pass
        # states directly through the rollout policy during training

    def act(self, state):
        # act() receives a single state with a batch dimension: [1, 84, 84, 4]
        sess = tf.get_default_session()
        rollouts_s, rollouts_r = self.rollout(state)
        # we should get something that is
        # rollouts_s: [n_actions, rollout_length, 84, 84, 4]
        # rollouts_r: [n_actions, rollout_length]
        rollouts_s = np.expand_dims(rollouts_s, axis=0)
        rollouts_r = np.expand_dims(rollouts_r, axis=0)
        logits, pi, actions, vf = sess.run(
            [self.logits, self.pi, self.actions, self.vf],
            feed_dict={
                self.inputs_s: rollouts_s,
                self.inputs_r: rollouts_r,
                self.X: state
            })
        # rollouts_s and rollouts_r are returned as well because, together with the
        # input frame, they are effectively the state the I2A policy conditions on
        return actions, vf, rollouts_s, rollouts_r

    def value(self, state):
        sess = tf.get_default_session()
        rollouts_s, rollouts_r = self.rollout(state)
        # we should get something that is
        # rollouts_s: [n_actions, rollout_length, 84, 84, 4]
        # rollouts_r: [n_actions, rollout_length]
        rollouts_s = np.expand_dims(rollouts_s, axis=0)
        rollouts_r = np.expand_dims(rollouts_r, axis=0)
        logits, pi, actions, vf = sess.run(
            [self.logits, self.pi, self.actions, self.vf],
            feed_dict={
                self.inputs_s: rollouts_s,
                self.inputs_r: rollouts_r,
                self.X: state
            })
        return vf

    def rollout(self, state):
        # initialize state for batch rollouts
        # this is of shape [n_actions, 84, 84, 4]
        config = self.config
        states = np.concatenate([state] * config.n_actions, axis=0)
        # roll everything out and put it in a placeholder
        rollouts_s = []
        rollouts_r = []
        for i in range(config.rollout_length):
            # on the first timestep, each rollout branch takes its own (distinct) action;
            # afterwards, each branch takes actions sampled from the rollout policy
            next_states, rewards = self.imagination_core.predict(states,
                                                                 init=i == 0)
            # add to our rollout
            rollouts_s.append(next_states)
            rollouts_r.append(rewards)
            # advance
            states = next_states
        # you now have something of [rollout_length, n_actions, 84, 84, 4]
        rollouts_s = np.array(rollouts_s)
        rollouts_r = np.array(rollouts_r).reshape(config.rollout_length,
                                                  config.n_actions)
        rollouts_s = np.transpose(rollouts_s, (1, 0, 2, 3, 4))
        rollouts_r = np.transpose(rollouts_r, (1, 0))
        return rollouts_s, rollouts_r
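
# A minimal, NumPy-only sketch (not from the original code) of the shape handling in
# Example #2: rollout() works on a single observation, returns arrays of shape
# [n_actions, rollout_length, 84, 84, 4], and act()/value() add the batch dimension with
# expand_dims before feeding the placeholders. Sizes (n_actions=6, rollout_length=3) are
# illustrative.
import numpy as np

step_outputs = [np.zeros((6, 84, 84, 4)) for _ in range(3)]  # one entry per imagined step
stacked = np.array(step_outputs)                             # (3, 6, 84, 84, 4)
reordered = np.transpose(stacked, (1, 0, 2, 3, 4))           # (6, 3, 84, 84, 4)
batched = np.expand_dims(reordered, axis=0)                  # (1, 6, 3, 84, 84, 4)
assert batched.shape == (1, 6, 3, 84, 84, 4)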


# ###### TEST CODE
# class config():
# 	n_actions = 6
# 	state_dims = [84, 84, 4]
# 	channels = 4
# 	frame_dims = [84, 84]
# 	rollout_length = 3
# 	hidden_dim = 512
# 	lstm_layers = 1

# config = config()

# # ##### VALIDITY TEST #######
# i2a = I2A(config)

# real_rewards = tf.placeholder(tf.float32, [None])

# aux_policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=i2a.actions, logits=i2a.rp_logits)
# loss = tf.losses.mean_squared_error(real_rewards, i2a.vf) #+ aux_policy_loss
# opt = tf.train.AdamOptimizer(learning_rate=1e-3)
# train_op = opt.minimize(loss)

# sess = tf.Session()
# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

# state = np.random.random((1, 84, 84, 4))
# with sess.as_default():
# 	print(i2a.act(state))

# for i in range(64):
# 	rand1 = 4#np.random.random()
# 	rand2 = 4#np.random.random()
# 	state = np.ones((32, 84, 84, 4)) * rand1
# 	rollouts_s = np.ones((32, 6, 3, 84, 84, 4)) * 2 * rand1
# 	rollouts_r = np.ones((32, 6, 3)) * 3 * rand2
# 	rewards = np.ones((32)) * rand1 * rand2 * 10

# 	_, l = sess.run(
# 		[train_op, loss],
# 		feed_dict={
# 			i2a.inputs_s: rollouts_s,
# 			i2a.inputs_r: rollouts_r,
# 			i2a.X: state,
# 			real_rewards: rewards
# 		})
# 	print(l)

# rand1 = 4#np.random.random()
# rand2 = 4#np.random.random()
# state = np.ones((32, 84, 84, 4)) * rand1
# rollouts_s = np.ones((32, 6, 3, 84, 84, 4)) * 2 * rand1
# rollouts_r = np.ones((32, 6, 3)) * 3 * rand2
# rewards = np.ones((32)) * rand1 * rand2 * 10

# vf, l = sess.run(
# 	[i2a.vf, loss],
# 	feed_dict={
# 		i2a.inputs_s: rollouts_s,
# 		i2a.inputs_r: rollouts_r,
# 		i2a.X: state,
# 		real_rewards: rewards
# 	})

# print(vf, l)

# ######## ENCODER TEMPORAL AND ACTION SPACE DEPENDENCY TEST ########
# inputs = tf.placeholder(tf.float32, [None] + [config.n_actions, config.rollout_length] + config.state_dims)
# labels = tf.placeholder(tf.float32, [None, config.n_actions, 64])

# encoder = Encoder(natureCNN, config, 'haha')
# with tf.variable_scope('haha'):
# 	pred = encoder.forward(inputs)

# loss = tf.losses.mean_squared_error(labels, pred)
# opt = tf.train.AdamOptimizer(learning_rate=2e-3)
# train_op = opt.minimize(loss)

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

# for k in range(32):
# 	x = np.ones((32, 6, 15, 84, 84, 4))
# 	# truth = np.ones((192, 15, 512)) * 3
# 	truth = np.ones((32, config.n_actions, config.hidden_dim)) * 5
# 	# for i in range(15):
# 	# # for i in range(6):
# 	# 	truth[:, i, :] = i
# 	l, _, p = sess.run(
# 		[loss, train_op, pred],
# 		feed_dict={
# 			inputs: x,
# 			labels: truth
# 		})
# 	print(np.mean(p, axis=(0, 2)).tolist())
# 	print(l)

# x = np.ones((1, 6, 15, 84, 84, 4))
# # for i in range(15):
# for i in range(6):
# 	# x[:, :, :, :, :, :] = i * 3
# 	x[:, i, :, :, :, :] = i * 0.1
# p = sess.run(
# 	[pred],
# 	feed_dict={
# 		inputs: x
# 	})[0]

# print(np.mean(p, axis=(0, 2)).tolist())

# x = np.ones((1, 6, 15, 84, 84, 4))
# # for i in range(15):
# for i in range(15):
# 	x[:, :, i, :, :, :] = i * 0.1
# 	# x[:, i, :, :, :, :] = i
# p = sess.run(
# 	[pred],
# 	feed_dict={
# 		inputs: x
# 	})[0]

# print(np.mean(p, axis=(0, 2)).tolist())