    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x)
        return logits
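
Every snippet on this page calls an `fc` helper that is not shown here. Below is a minimal sketch of what such a fully connected layer could look like, inferred only from the call sites (`fc(x, scope, nh, init_scale=..., act=...)`); the defaults and the initializer are assumptions, not the project's actual implementation.

import numpy as np
import tensorflow as tf  # TF 1.x style, matching the snippets

def fc(x, scope, nh, init_scale=1.0, act=tf.nn.relu):
    # fully connected layer: act(x @ w + b); the ReLU default is an assumption
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable('w', [nin, nh],
                            initializer=tf.random_normal_initializer(
                                stddev=init_scale / np.sqrt(nin)))
        b = tf.get_variable('b', [nh],
                            initializer=tf.constant_initializer(0.0))
        return act(tf.matmul(x, w) + b)
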
    def __init__(self,
                 sess,
                 agent_id,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        self.agent_id = agent_id
        nbins = 11
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0]
                                      for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = []
            for k in range(len(ob_spaces)):
                if k == agent_id:
                    continue
                pi.append(
                    fc(h2, 'pi%d' % k, ac_spaces[k].shape[0] * nbins, act=lambda x: x))

        pi = tf.reshape(pi, [nbatch, nact, nbins])
        a0 = sample(pi, axis=2)
        self.initial_state = []  # not stateful

        def step(ob, obs, *_args, **_kwargs):
            a = sess.run(a0, {X: ob, X_v: obs})
            return a

        def transform(a):
            # map bin indices [0, nbins - 1] to continuous actions in [-1, 1]
            a = np.array(a, dtype=np.float32)
            a = (a - (nbins - 1) / 2) / (nbins - 1) * 2.0
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.step = step
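
The constructor above also relies on a `sample` helper, called as `sample(pi, axis=2)`. A plausible sketch using the Gumbel-max trick (common in baselines-style code); this is an assumption inferred from the call sites, not the project's own definition.

import tensorflow as tf  # TF 1.x

def sample(logits, axis=1):
    # Gumbel-max trick: adding Gumbel noise to the logits and taking the argmax
    # draws one sample from the categorical distribution along `axis`
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), axis=axis)
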
Example #3
    def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces,
                 nenv, nsteps, nstack, reuse=False, name='model'):
        nbins = 11
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0] for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0] for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact * nbins, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        pi = tf.reshape(pi, [nbatch, nact, nbins])
        a0 = sample(pi, axis=2)
        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            # output continuous actions within [-1, 1]
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            a = transform(a)
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        def transform(a):
            # map bin indices [0, nbins - 1] to continuous actions in [-1, 1]
            a = np.array(a, dtype=np.float32)
            a = (a - (nbins - 1) / 2) / (nbins - 1) * 2.0
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
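
A quick numeric check of the bin-to-action mapping performed by `transform` above with nbins = 11: bin 0 maps to -1.0, bin 5 to 0.0, and bin 10 to +1.0.

import numpy as np

nbins = 11
bins = np.arange(nbins, dtype=np.float32)
acts = (bins - (nbins - 1) / 2) / (nbins - 1) * 2.0
print(acts)  # [-1.  -0.8 -0.6 ... 0.6  0.8  1. ]
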
Example #4
    def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces,
                 nenv, nsteps, nstack, reuse=False, name='model'):
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0] for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0] for ac in ac_spaces]) - nact) * nstack)
        X = tf.placeholder(tf.float32, ob_shape)  # obs
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            h2 = fc(h1, 'fc2', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            pi = fc(h2, 'pi', nact, act=lambda x: x, init_scale=0.01)

        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            logstd = tf.get_variable('sigma', shape=[nact], dtype=tf.float32,
                                     initializer=tf.constant_initializer(0.0))
            logstd = tf.expand_dims(logstd, 0)
            std = tf.exp(logstd)
            std = tf.tile(std, [nbatch, 1])

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            h4 = fc(h3, 'fc4', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = pi + tf.random_normal(tf.shape(std), 0.0, 1.0) * std

        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.std = std
        self.logstd = logstd
        self.step = step
        self.value = value
        self.mean_std = tf.concat([pi, std], axis=1)
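
The Gaussian policy above exposes `pi` (mean), `std`, and `logstd`, but no log-density op. The sketch below shows the diagonal-Gaussian log-probability one would typically build on these tensors for a policy-gradient loss; `policy` and `actions_ph` are illustrative names, not part of the original class.

import numpy as np
import tensorflow as tf  # TF 1.x

def gaussian_log_prob(policy, actions_ph):
    # log N(a | mean, diag(std^2)), summed over action dimensions
    mean, std, logstd = policy.pi, policy.std, policy.logstd
    nact = tf.to_float(tf.shape(actions_ph)[1])
    return (-0.5 * tf.reduce_sum(tf.square((actions_ph - mean) / std), axis=1)
            - 0.5 * np.log(2.0 * np.pi) * nact
            - tf.reduce_sum(logstd, axis=1))
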
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)
        X = tf.placeholder(tf.float32, ob_shape, name='X')  # obs
        X_v = tf.placeholder(tf.float32, all_ob_shape, name='X_v')
        A_v = tf.placeholder(tf.float32, all_ac_shape, name='A_v')
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step(ob, obs, a_v, *_args, **_kwargs):
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
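
A usage sketch for the discrete policy above. The class name `CategoricalPolicy`, the gym spaces, and all sizes are illustrative assumptions (the page does not show the class header); the point is the shape of each feed: `ob` is this agent's stacked observation, `obs` the concatenation of all agents' observations, and `a_v` the other agents' one-hot actions.

import numpy as np
import tensorflow as tf  # TF 1.x
from gym import spaces

nenv, nsteps, nstack = 2, 1, 1
ob_spaces = [spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)] * 2
ac_spaces = [spaces.Discrete(3)] * 2

sess = tf.Session()
pol = CategoricalPolicy(sess, ob_spaces[0], ac_spaces[0], ob_spaces, ac_spaces,
                        nenv, nsteps, nstack, name='agent0')
sess.run(tf.global_variables_initializer())

ob = np.zeros((nenv * nsteps, 4), dtype=np.float32)    # own obs: (nbatch, 4)
obs = np.zeros((nenv * nsteps, 8), dtype=np.float32)   # all obs: (nbatch, 4 + 4)
a_v = np.zeros((nenv * nsteps, 3), dtype=np.float32)   # other agent's action: (nbatch, 3)
a, v, _ = pol.step(ob, obs, a_v)
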
Example #6
    def __init__(self, sess, ob_space, ac_space, ob_spaces, ac_spaces,
                 nenv, nsteps, nstack, reuse=False, name='model'):
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0] for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = tf.placeholder(tf.int32, (nbatch,))
        all_ac_shape = (nbatch, (sum([ac.n for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=actions)
        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts):
            log_prob = sess.run(self.log_prob, {X: ob, actions: acts})
            return log_prob.reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            if a_v is not None:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs, A_v: a_v})
            else:
                a, v = sess.run([a0, v0], {X: ob, X_v: obs})
            return a, v, []  # dummy state

        def value(ob, a_v, *_args, **_kwargs):
            if a_v is not None:
                return sess.run(v0, {X_v: ob, A_v: a_v})
            else:
                return sess.run(v0, {X_v: ob})

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step_log_prob = step_log_prob
        self.step = step
        self.value = value
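
On the `log_prob` op above: `-sparse_softmax_cross_entropy_with_logits(logits, a)` is exactly `log softmax(logits)[a]`, the log-probability of the taken action under the policy. A small numpy check of that identity (illustrative values only).

import numpy as np

logits = np.array([[1.0, 2.0, 0.5]], dtype=np.float32)
a = 1
log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
print(log_softmax[0, a])  # ~ -0.464, equal to minus the cross entropy for label 1
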
Example #7
    def __init__(self,
                 sess,
                 oppo_policy,
                 ob_space,
                 ac_space,
                 op_ac_n,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        # nstack always = 1
        self.oppo_policy = oppo_policy
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        op_ac_shape = (nbatch, op_ac_n * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = tf.placeholder(tf.int32, (nbatch,))
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)

        # oppo_a0 = [sample(_) for _ in self.oppo_policy.pi]
        oppo_a_list = self.oppo_policy.pi
        # (k, batch, act_nums) -> (batch, \sum_k(act_nums))
        oppo_a0 = tf.concat(oppo_a_list, axis=1)

        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs, not state(all obs)
        op_act_x = oppo_a0  # tf.placeholder(tf.float32, op_ac_shape)  # opponents' act
        X = tf.concat([obs_x, op_act_x], axis=1)  # input
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        # A_v = tf.concat([tf.expand_dims(actions, axis=1), op_act_x], axis=1)
        with tf.variable_scope('policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = fc(h2, 'pi', nact, act=lambda x: x)

        with tf.variable_scope('value_{}'.format(name), reuse=reuse):
            if len(ob_spaces) > 1:
                Y = tf.concat([X_v, A_v], axis=1)
            else:
                Y = X_v
            h3 = fc(Y, 'fc3', nh=256, init_scale=np.sqrt(2))
            h4 = fc(h3, 'fc4', nh=256, init_scale=np.sqrt(2))
            vf = fc(h4, 'v', 1, act=lambda x: x)

        self.log_prob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi, labels=actions)
        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts):
            # note: X is the concat of obs_x and op_act_x, so the array fed here
            # must already have X's full width (own obs + opponent actions)
            log_prob = sess.run(self.log_prob, {X: ob, actions: acts})
            return log_prob.reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            oppo_a = sess.run(oppo_a0, {oppo_policy.obs_x: ob})

            if a_v is not None:
                a, v = sess.run([a0, v0], {
                    obs_x: ob,
                    op_act_x: oppo_a,
                    X_v: obs,
                    A_v: a_v
                })
            else:
                a, v = sess.run([a0, v0], {
                    obs_x: ob,
                    op_act_x: oppo_a,
                    X_v: obs
                })
            return a, v, []  # dummy state

        def value(ob, obs, a_v, *_args, **_kwargs):
            oppo_a = sess.run(oppo_a0, {oppo_policy.obs_x: ob})

            if a_v is not None:
                return sess.run(v0, {X_v: obs, A_v: a_v, op_act_x: oppo_a})
            else:
                return sess.run(v0, {X_v: obs, op_act_x: oppo_a})

        self.obs_x = obs_x
        self.op_act_x = op_act_x
        self.X = obs_x
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.vf = vf
        self.step_log_prob = step_log_prob
        self.step = step
        self.value = value
        self.oppo_a = oppo_a0
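
Because the constructor above stores the opponent-action tensor as `self.oppo_a` and the opponent network's input as `self.oppo_policy.obs_x`, the predicted opponent actions can also be queried on their own. A hedged usage sketch; `pol`, `sess`, and `ob` are illustrative names.

# one session call through the opponent model's part of the graph
oppo_a = sess.run(pol.oppo_a, {pol.oppo_policy.obs_x: ob})
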
    def __init__(self,
                 sess,
                 agent_id,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        self.agent_id = agent_id
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.n
        actions = [
            tf.placeholder(tf.int32, (nbatch,))
            for _ in range(len(ob_spaces) - 1)
        ]
        all_ac_shape = (nbatch, (sum([ac.n
                                      for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=128, init_scale=np.sqrt(2))
            h2 = fc(h1, 'fc2', nh=128, init_scale=np.sqrt(2))
            pi = []
            for k in range(len(ob_spaces)):
                if k == agent_id:
                    continue
                pi.append(fc(h2, 'pi_%d' % k, ac_spaces[k].n, act=lambda x: x))
        self.log_prob = [
            -tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi[i],
                                                            labels=actions[i])
            for i in range(len(pi))
        ]
        a0 = [sample(_) for _ in pi]
        self.initial_state = []  # not stateful

        def step_log_prob(ob, acts_n):
            acts = [
                acts_n[i] for i in range(len(acts_n)) if i != self.agent_id
            ]
            feed_dict = {X: ob}
            feed_dict.update(zip(actions, acts))
            log_prob = sess.run(self.log_prob, feed_dict)
            # sess.run returns one array per opponent; stack before reshaping
            return np.asarray(log_prob).reshape([-1, 1])

        def step(ob, obs, a_v, *_args, **_kwargs):
            a = sess.run(a0, {X: ob, X_v: obs})
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.step_log_prob = step_log_prob
        self.step = step
    def __init__(self,
                 sess,
                 agent_id,
                 ob_space,
                 ac_space,
                 ob_spaces,
                 ac_spaces,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 name='model'):
        self.agent_id = agent_id
        nbatch = nenv * nsteps
        ob_shape = (nbatch, ob_space.shape[0] * nstack)
        all_ob_shape = (nbatch, sum([obs.shape[0]
                                     for obs in ob_spaces]) * nstack)
        nact = ac_space.shape[0]
        all_ac_shape = (nbatch, (sum([ac.shape[0]
                                      for ac in ac_spaces]) - nact) * nstack)
        obs_x = tf.placeholder(tf.float32, ob_shape)  # obs
        X = obs_x
        X_v = tf.placeholder(tf.float32, all_ob_shape)
        A_v = tf.placeholder(tf.float32, all_ac_shape)
        with tf.variable_scope('oppo_policy_{}'.format(name), reuse=reuse):
            h1 = fc(X, 'fc1', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            h2 = fc(h1, 'fc2', nh=64, init_scale=np.sqrt(2), act=tf.nn.tanh)
            pi = []
            for k in range(len(ob_spaces)):
                if k == agent_id:
                    continue
                pi.append(
                    fc(h2,
                       'pi%d' % k,
                       ac_spaces[k].shape[0],
                       act=lambda x: x,
                       init_scale=0.01))

        with tf.variable_scope('oppo_{}'.format(name), reuse=reuse):
            logstd = tf.get_variable('sigma',
                                     shape=[nact],
                                     dtype=tf.float32,
                                     initializer=tf.constant_initializer(0.0))
            logstd = tf.expand_dims(logstd, 0)
            std = tf.exp(logstd)
            std = tf.tile(std, [nbatch, 1])

        # sample one Gaussian action per opponent head (pi is a list of means)
        a0 = [p + tf.random_normal(tf.shape(std), 0.0, 1.0) * std for p in pi]

        self.initial_state = []  # not stateful

        def step(ob, obs, *_args, **_kwargs):
            a = sess.run(a0, {X: ob, X_v: obs})
            return a

        self.obs_x = obs_x
        self.X = X
        self.X_v = X_v
        self.A_v = A_v
        self.pi = pi
        self.std = std
        self.logstd = logstd
        self.step = step
        self.mean_std = tf.concat(pi + [std], axis=1)
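
A numpy stand-in for the reparameterised sampling that `a0` performs above (mean + std * eps with eps ~ N(0, 1)); with the constant-zero initializer the initial std is exp(0) = 1. Shapes are illustrative.

import numpy as np

rng = np.random.RandomState(0)
mean = np.zeros((4, 2), dtype=np.float32)            # (nbatch, nact)
std = np.exp(np.zeros((1, 2), dtype=np.float32))     # exp(logstd); logstd = 0 -> std = 1
a = mean + rng.randn(4, 2).astype(np.float32) * std  # one Gaussian action per row
print(a.shape)  # (4, 2)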