Esempio n. 1
0
    def _init(self, ob_space, ac_space, kind):
        """Build the convolutional policy/value graph and compile ``self._act``.

        A single conv + dense trunk, chosen by ``kind``, feeds both the
        action-distribution head (``self.pd``) and the value head
        (``self.vpred``).

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            kind: ``'small'`` (two conv layers, 256 dense units) or
                ``'large'`` (three conv layers, 512 dense units).

        Raises:
            NotImplementedError: if ``kind`` is neither ``'small'`` nor
                ``'large'``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None  # leading dimension of "ob" is left unspecified

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        # NOTE(review): presumably rescales 8-bit pixel values to [0, 1] -- confirm input range.
        x = ob / 255.0
        if kind == 'small':  # from A3C paper
            x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 256, 'lin', U.normc_initializer(1.0)))
        elif kind == 'large':  # Nature DQN
            x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 512, 'lin', U.normc_initializer(1.0)))
        else:
            raise NotImplementedError(
                "unsupported kind %r; expected 'small' or 'large'" % (kind,))

        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)
        # Keep only column 0 so the value prediction has one entry per row.
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))[:, 0]

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        # Honour the `stochastic` input: sample when it is True, otherwise
        # return the distribution's mode. Previously the flag was fed to
        # `_act` but ignored, so an action was always sampled.
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Esempio n. 2
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build separate tanh MLP towers for the value and the policy.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: number of units in every hidden layer.
            num_hid_layers: number of hidden layers in each tower.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, the
                second half of the distribution parameters comes from a
                standalone ``logstd`` variable instead of the network.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        batch_dim = None  # leading dimension of "ob" is left unspecified

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[batch_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Standardise with the running statistics, then clip to [-5, 5].
        standardized = (ob - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardized, -5.0, 5.0)

        def tanh_tower(name_fmt):
            # Stack `num_hid_layers` tanh-activated dense layers on top of obz.
            hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                pre_activation = U.dense(hidden, hid_size, name_fmt % layer_no,
                                         weight_init=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(pre_activation)
            return hidden

        value_features = tanh_tower("vffc%i")
        value_out = U.dense(value_features, 1, "vffinal", weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        policy_features = tanh_tower("polfc%i")
        use_fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
        if not use_fixed_var:
            pdparam = U.dense(policy_features, pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            mean = U.dense(policy_features, pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of `mean`.
            tiled_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, tiled_logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC: the flag is a named, shared placeholder
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        self.ac = U.switch(stochastic, sampled_action, mode_action)
        self._act = U.function([stochastic, ob], [self.ac, self.vpred])
Esempio n. 3
0
    def _init(self, ob_space, ac_space):
        """Build a CNN policy and a CNN value network that share no weights.

        The "pol" and "vf" variable scopes each hold their own two-layer
        conv stack plus a 128-unit dense layer, both reading the same
        rescaled observation.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_proba_dist_type``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_proba_dist_type(ac_space)
        sequence_length = None  # leading dimension of "ob" is left unspecified

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        # NOTE(review): presumably rescales 8-bit pixel values to [0, 1] -- confirm input range.
        obscaled = ob / 255.0

        with tf.variable_scope("pol"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            logits = U.dense(x,
                             pdtype.param_shape()[0], "logits",
                             U.normc_initializer(0.01))
            self.pd = pdtype.proba_distribution_from_flat(logits)
        with tf.variable_scope("vf"):
            x = obscaled
            x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
            x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
            x = U.flattenallbut0(x)
            x = tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))
            # The output of the 1-unit dense layer is stored without
            # dropping its trailing axis.
            self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
            self.vpredz = self.vpred

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        # Honour the `stochastic` input: sample when it is True, otherwise
        # return the distribution's mode. Previously the flag was fed to
        # `_act` but ignored, so an action was always sampled.
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Esempio n. 4
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build the MLP value and policy towers and compile ``self._act``.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: number of units in every hidden layer.
            num_hid_layers: number of hidden layers in each tower.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, the
                second half of the distribution parameters comes from a
                standalone ``logstd`` variable instead of the network.
        """
        # Abort immediately if the observation space is not a Box.
        assert isinstance(ob_space, gym.spaces.Box)

        # Original author's note: for their setup make_pdtype returned a
        # DiagGaussianPdType built from ac_space.shape[0] -- not verifiable here.
        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        ob_shape = [None] + list(ob_space.shape)
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # `ob` is still a symbolic placeholder here; this only adds graph ops.
        centered = ob - self.ob_rms.mean
        obz = tf.clip_by_value(centered / self.ob_rms.std, -5.0, 5.0)

        # Value tower: fully connected tanh layers and a 1-unit output.
        critic_hidden = obz
        for layer_name in ["vffc%i" % (i + 1) for i in range(num_hid_layers)]:
            critic_hidden = tf.nn.tanh(
                U.dense(critic_hidden,
                        hid_size,
                        layer_name,
                        weight_init=U.normc_initializer(1.0)))
        critic_out = U.dense(critic_hidden,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))
        self.vpred = critic_out[:, 0]

        # Policy tower: same layout under the "polfc" names.
        actor_hidden = obz
        for layer_name in ["polfc%i" % (i + 1) for i in range(num_hid_layers)]:
            actor_hidden = tf.nn.tanh(
                U.dense(actor_hidden,
                        hid_size,
                        layer_name,
                        weight_init=U.normc_initializer(1.0)))

        box_actions = isinstance(ac_space, gym.spaces.Box)
        if not (gaussian_fixed_var and box_actions):
            pdparam = U.dense(actor_hidden,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            print("gaussian_fixed_var is used")
            mean = U.dense(actor_hidden,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of `mean`.
            broadcast_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, broadcast_logstd], axis=1)

        # The distribution object supplies the sample() and mode() used below.
        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        # change for BC: the flag is a named, shared placeholder
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        drawn = self.pd.sample()
        greedy = self.pd.mode()
        self.ac = U.switch(stochastic, drawn, greedy)
        self._act = U.function([stochastic, ob], [self.ac, self.vpred])
Esempio n. 5
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build a conv + LSTM actor-critic over a flat, segmented observation.

        Each observation row is sliced, in order, into a minimap segment
        (5 * msize * msize values), a screen segment (10 * ssize * ssize),
        an info segment (isize) and an available-action segment
        (available_action_size). The first three are read from the
        normalised observation, the last from the raw one. A second
        placeholder, "last_action_one_hot", feeds an LSTM branch.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: unused by this method.
            num_hid_layers: unused by this method.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, the
                second half of the distribution parameters comes from a
                standalone ``logstd`` variable instead of the network.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None  # leading dimension of "ob" is left unspecified

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        # NOTE(review): the width 524 equals available_action_size below;
        # presumably they must stay in sync -- confirm.
        last_action = U.get_placeholder(shape=(None, 524), dtype=tf.float32, name="last_action_one_hot")
        self.msize = 64
        self.ssize = 64
        self.isize = 11
        self.available_action_size = 524

        # Column offsets of each segment inside a flat observation row.
        minimap_end = 5 * self.msize * self.msize
        screen_end = minimap_end + 10 * self.ssize * self.ssize
        info_end = screen_end + self.isize
        available_end = info_end + self.available_action_size

        # Taken from the raw (un-normalised) observation.
        available_action = ob[:, info_end:available_end]

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Normalised with the running statistics; no clipping is applied.
        obz = (ob - self.ob_rms.mean) / self.ob_rms.std

        minimap = obz[:, 0:minimap_end]
        screen = obz[:, minimap_end:screen_end]
        info = obz[:, screen_end:info_end]

        # NOTE: most tf.layers calls below are unnamed, so their variable
        # names depend on creation order -- do not reorder them.

        # Minimap branch: two 5x5 'same' convolutions, each followed by a
        # 2x2 stride-2 max-pool.
        mconv1 = tf.layers.conv2d(
            inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
            filters=32,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        mpool1 = tf.layers.max_pooling2d(inputs=mconv1, pool_size=[2, 2], strides=2)
        mconv2 = tf.layers.conv2d(
            inputs=mpool1,
            filters=64,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu,
            name="vffcmconv2")
        mpool2 = tf.layers.max_pooling2d(inputs=mconv2, pool_size=[2, 2], strides=2)
        # Two stride-2 pools shrink each side to msize // 4. Deriving the
        # flat size from msize (instead of a literal 16 * 16) keeps the
        # reshape from silently folding samples together if msize changes.
        m_side = self.msize // 4
        mpool2_flat = tf.reshape(mpool2, [-1, m_side * m_side * 64])

        # Screen branch: same layout with 48 and 80 filters.
        sconv1 = tf.layers.conv2d(
            inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),
            filters=48,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        spool1 = tf.layers.max_pooling2d(inputs=sconv1, pool_size=[2, 2], strides=2)
        sconv2 = tf.layers.conv2d(
            inputs=spool1,
            filters=80,
            kernel_size=[5, 5],
            padding="same",
            kernel_initializer=U.normc_initializer(0.01),
            activation=tf.nn.leaky_relu)
        spool2 = tf.layers.max_pooling2d(inputs=sconv2, pool_size=[2, 2], strides=2)
        s_side = self.ssize // 4
        spool2_flat = tf.reshape(spool2, [-1, s_side * s_side * 80])

        info_fc = tf.layers.dense(inputs=layers.flatten(info),
                                  units=8,
                                  activation=tf.tanh)

        aa_fc = tf.layers.dense(inputs=layers.flatten(available_action),
                                units=32,
                                activation=tf.tanh)

        # Last-action branch: embed to 256 values, view them as 16 steps of
        # 16 features, and run them through an LSTM.
        HIDDEN_SIZE = 128
        l1_action = tf.layers.dense(layers.flatten(last_action), 256, tf.nn.relu)
        input_to_rnn = tf.reshape(l1_action, [-1, 16, 16])
        action_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=HIDDEN_SIZE,
                                                        forget_bias=1.0,
                                                        state_is_tuple=True)
        inputs_rnn = tf.unstack(input_to_rnn, num=16, axis=1)
        _, rnn_state = tf.contrib.rnn.static_rnn(action_lstm_cell,
                                                 inputs_rnn, dtype=tf.float32)
        # rnn_state[-1] is the last element of the final state tuple
        # (presumably the hidden state `h` -- confirm against the TF docs).
        l2_action = tf.layers.dense(rnn_state[-1], 128, tf.nn.tanh)
        last_acs_ph_lstm = tf.layers.dense(l2_action, 32, tf.nn.tanh)

        # Features shared by the value head and the policy head.
        last_out = tf.concat([mpool2_flat, spool2_flat, info_fc, aa_fc, last_acs_ph_lstm],
                             axis=1)
        vf_last_out = tf.nn.tanh(U.dense(last_out, 1024, 'vf_last_out',
                                         weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(vf_last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of `mean`.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            # Intermediate layer 5x wider than the parameter vector; no
            # activation is applied between the two dense layers.
            pol_last_out = U.dense(last_out, (pdtype.param_shape()[0]) * 5, "polfinaldense", U.normc_initializer(0.01))
            pdparam = U.dense(pol_last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC: the flag is a named, shared placeholder
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        # This project's distribution takes the available-action slice in
        # sample()/mode() (presumably as a mask -- confirm in the pd class).
        ac = U.switch(stochastic, self.pd.sample(available_action), self.pd.mode(available_action))
        self.ac = ac
        self._act = U.function([stochastic, ob, last_action], [ac, self.vpred])
Esempio n. 6
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build a dense value tower and a convolutional policy network.

        The value function is a tanh MLP over the whole normalised
        observation. The policy slices the same normalised observation, in
        order, into minimap (5 * msize * msize values), screen
        (10 * ssize * ssize), info (isize) and available-action
        (available_action_size) segments and processes them separately.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space, passed to ``make_pdtype``.
            hid_size: number of units in every value-tower hidden layer.
            num_hid_layers: number of hidden layers in the value tower.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, the
                second half of the distribution parameters comes from a
                standalone ``logstd`` variable instead of the network.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None  # leading dimension of "ob" is left unspecified

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Standardise with the running statistics, then clip to [-20, 20].
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                               -20.0, 20.0)

        # Value tower: dense tanh layers over the full observation.
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.vpred = U.dense(last_out,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy: a conv net per spatial segment instead of a dense tower.
        self.msize = 64
        self.ssize = 64
        self.isize = 11
        self.available_action_size = 524

        # Column offsets of each segment inside a flat observation row.
        minimap_end = 5 * self.msize * self.msize
        screen_end = minimap_end + 10 * self.ssize * self.ssize
        info_end = screen_end + self.isize
        available_end = info_end + self.available_action_size

        minimap = obz[:, 0:minimap_end]
        screen = obz[:, minimap_end:screen_end]
        info = obz[:, screen_end:info_end]
        available_action = obz[:, info_end:available_end]

        # A 4x4 stride-4 pool followed by a 2x2 stride-2 pool shrinks each
        # side to size // 8. Deriving the flat size from msize/ssize
        # (instead of a literal 8 * 8) keeps the reshape from silently
        # folding samples together if the sizes change.
        m_side = self.msize // 8
        s_side = self.ssize // 8

        conv1_minimap = tf.layers.conv2d(
            inputs=tf.reshape(minimap, [-1, self.msize, self.msize, 5]),
            filters=10,
            kernel_size=5,
            strides=1,
            padding='same',
            activation=tf.nn.leaky_relu,
            name="polmconv1")  # -> (msize, msize, 10)
        pool1_minimap = tf.layers.max_pooling2d(
            conv1_minimap, pool_size=4, strides=4,
            name="polmpool1")  # -> (msize/4, msize/4, 10)
        conv2_minimap = tf.layers.conv2d(
            inputs=pool1_minimap,
            filters=10,
            kernel_size=5,
            strides=1,
            padding='same',
            activation=tf.nn.relu,
            name="polmconv2")  # -> (msize/4, msize/4, 10)
        pool2_minimap = tf.layers.max_pooling2d(
            conv2_minimap, pool_size=2, strides=2,
            name="polmpool2")  # -> (msize/8, msize/8, 10)
        flat_minimap = tf.reshape(pool2_minimap,
                                  [-1, m_side * m_side * 10])

        conv1_screen = tf.layers.conv2d(
            inputs=tf.reshape(screen, [-1, self.ssize, self.ssize, 10]),
            filters=20,
            kernel_size=5,
            strides=1,
            padding='same',
            activation=tf.nn.leaky_relu,
            name="polsconv1")  # -> (ssize, ssize, 20)
        pool1_screen = tf.layers.max_pooling2d(
            conv1_screen, pool_size=4, strides=4,
            name="polspool1")  # -> (ssize/4, ssize/4, 20)
        conv2_screen = tf.layers.conv2d(
            inputs=pool1_screen,
            filters=20,
            kernel_size=5,
            strides=1,
            padding='same',
            activation=tf.nn.relu,
            name="polsconv2")  # -> (ssize/4, ssize/4, 20)
        pool2_screen = tf.layers.max_pooling2d(
            conv2_screen, pool_size=2, strides=2,
            name="polspool2")  # -> (ssize/8, ssize/8, 20)
        flat_screen = tf.reshape(pool2_screen,
                                 [-1, s_side * s_side * 20])

        info_fc = tf.layers.dense(inputs=layers.flatten(info),
                                  units=4,
                                  activation=tf.tanh,
                                  name="poldense1")

        aa_fc = tf.layers.dense(inputs=layers.flatten(available_action),
                                units=16,
                                activation=tf.tanh,
                                name="poldense2")

        last_out = tf.concat([flat_minimap, flat_screen, info_fc, aa_fc],
                             axis=1,
                             name="polconcat")

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of `mean`.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # change for BC: the flag is a named, shared placeholder
        stochastic = U.get_placeholder(name="stochastic",
                                       dtype=tf.bool,
                                       shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = ac
        self._act = U.function([stochastic, ob], [ac, self.vpred])