Example #1
    def __init__(self, learning_rate, memory_size, batch_size, sess,
                 output_size):
        self.sess = sess

        #state_t
        self.encoder_input = tf.placeholder(tf.float32,
                                            shape=[None, n_features],
                                            name='encoder_input')
        self.encoder_output = mlp(inputs=self.encoder_input,
                                  n_output=output_size,
                                  scope='encoder_output',
                                  hiddens=[32, 16, 8])
        self.decoder_output = mlp(inputs=self.encoder_output,
                                  n_output=n_features,
                                  scope='decoder_output',
                                  hiddens=[8, 16, 32])
        self.encoder_output_ = tf.stop_gradient(self.decoder_output)

        # hyperparameters
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        # reconstruction loss and training op
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.encoder_input, self.decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)
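
These constructors rely on an mlp helper and a Memory replay buffer that are defined elsewhere in the repository and not shown here. Below is a minimal TF1-style sketch of what they might look like; the layer construction and the deque-based buffer are assumptions for illustration, not the original implementations:

import random
from collections import deque

import tensorflow as tf


def mlp(inputs, n_output, scope, hiddens=()):
    # Fully connected ReLU layers followed by a linear output layer,
    # built under `scope` so parameters can be collected per network.
    with tf.variable_scope(scope):
        out = inputs
        for n_hidden in hiddens:
            out = tf.layers.dense(out, n_hidden, activation=tf.nn.relu)
        return tf.layers.dense(out, n_output, activation=None)


class Memory:
    # Fixed-size experience buffer with uniform random sampling.
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)
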
Example #2
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess,
            output_size
    ):
        self.sess = sess

        #state_t
        self.encoder_input_t = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_t')

        self.encoder_output_t = mlp(inputs=self.encoder_input_t, n_output=output_size, scope='encoder_output_t',
                                    hiddens=[16, 8])
        self.encoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t')

        self.decoder_output_t = mlp(inputs=self.encoder_output_t, n_output=n_features, scope='decoder_output_t')
        self.decoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t')

        self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

        # state at t+1 (tpo = time plus one)
        self.encoder_input_tpo = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_tpo')

        self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo, n_output=output_size, scope='encoder_output_tpo',
                                      hiddens=[16, 8])
        self.encoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')

        self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo, n_output=n_features, scope='decoder_output_tpo')
        self.decoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')

        self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

        # sync: copy the t+1 encoder/decoder parameters into the time-t networks
        self.sync_encoder = [tf.assign(x, y) for x, y in zip(self.encoder_output_t_params, self.encoder_output_tpo_params)]
        self.sync_decoder = [tf.assign(x, y) for x, y in zip(self.decoder_output_t_params, self.decoder_output_tpo_params)]

        # hyperparameters
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        # reconstruction losses and training ops
        self.loss_0 = tf.reduce_mean(tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
        self.loss_1 = tf.reduce_mean(tf.squared_difference(self.encoder_input_tpo, self.decoder_output_tpo))

        self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_0)
        self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_1)
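
A hypothetical driver for the ops built above; the method name and the batch arguments are illustrative assumptions, not part of the original class:

    def learn(self, states_t, states_tpo):
        # One gradient step on each reconstruction loss.
        loss_t, _ = self.sess.run(
            [self.loss_0, self.train_0],
            feed_dict={self.encoder_input_t: states_t})
        loss_tpo, _ = self.sess.run(
            [self.loss_1, self.train_1],
            feed_dict={self.encoder_input_tpo: states_tpo})
        # Copy the t+1 encoder/decoder parameters into the time-t networks.
        self.sess.run(self.sync_encoder + self.sync_decoder)
        return loss_t, loss_tpo
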
Example #3
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess
    ):
        self.sess = sess
        self.common_encoder_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_encoder_input')
        self.common_encoder_output = mlp(inputs=self.common_encoder_input, n_output=n_features, scope='common_encoder_output',
                                         hiddens=[16, 8])
        self.common_decoder_output = mlp(inputs=self.common_encoder_output, n_output=n_features, scope='common_decoder_output')

        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.memory = Memory(self.memory_size)

        self.loss = tf.reduce_mean(tf.squared_difference(self.common_encoder_input, self.common_decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
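
A sketch of how this shared autoencoder might be trained from its replay memory; the method name and the assumption that Memory.sample returns a batch of states are illustrative, not from the original class:

    def learn(self):
        # Sample a batch of stored states and take one reconstruction step.
        states = self.memory.sample(self.batch_size)
        loss, _ = self.sess.run(
            [self.loss, self.train],
            feed_dict={self.common_encoder_input: states})
        return loss
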
Example #4
    # training process
    train_rewards_list = []
    test_rewards_list = []
    show_every_steps = 100
    # Exploration parameters
    explore_start = 0.9  # exploration probability at start
    explore_stop = 0.01  # minimum exploration probability
    decay_rate = 0.0001  # exponential decay rate for exploration probability
    # Network parameters
    hidden_size = 20  # number of units in each Q-network hidden layer
    learning_rate = 0.01  # Q-network learning rate
    # Memory parameters
    memory_size = 10000  # memory capacity
    batch_size = 32  # experience mini-batch size
    pretrain_length = batch_size  # number of experiences to pretrain the memory
    memory = Memory(max_size=memory_size)

    # Initialize the simulation
    env = gym.make('CartPole-v1')

    # TODO: specify the network parameters and names
    agent = DQNAgent(env,
                     explore_start,
                     explore_stop,
                     decay_rate,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     use_targetQ=False,
                     C=20,
                     use_dueling=False,
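
The exploration parameters above describe an exponentially decaying epsilon. A common way to compute the exploration probability for a given step count, shown here as an illustrative helper rather than code from this snippet:

import numpy as np

def exploration_probability(step, explore_start=0.9, explore_stop=0.01, decay_rate=0.0001):
    # Epsilon decays exponentially from explore_start toward explore_stop.
    return explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)
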
Example #5
    def __init__(self,
                 n_features,
                 n_actions,
                 model,
                 scope,
                 sess,
                 order,
                 hiddens,
                 beta,
                 C,
                 common_eval_input,
                 common_target_input,
                 common_eval_output,
                 common_target_output,
                 learning_rate=1e-5,
                 decay=0.99,
                 memory_size=20000000,
                 batch_size=100000,
                 epsilon_decrement=0.0005,
                 epsilon_lower=0.2):
        self.sess = sess
        self.scope = scope
        self.n_features = n_features
        self.batch_size = batch_size
        self.decay = decay
        self.model = model
        self.memory = Memory(memory_size)
        self.order = order
        self.beta = beta
        self.C = C

        self.learn_times = 0

        self.epsilon_lower = epsilon_lower
        self.epsilon_decrement = epsilon_decrement

        self.eval_input = tf.placeholder(tf.float32,
                                         shape=[None, self.n_features],
                                         name='eval_input')
        self.target_input = tf.placeholder(tf.float32,
                                           shape=[None, self.n_features],
                                           name='target_input')
        self.actions_selected = tf.placeholder(tf.int32, shape=[None], name='actions_selected')
        self.done = tf.placeholder(tf.float32, shape=[None], name='done')
        self.decays = tf.placeholder(tf.float32, shape=[None], name='decay')
        self.rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')

        # encoder and forward-model inputs
        self.state_input_t = tf.placeholder(tf.float32,
                                            shape=[None, self.n_features],
                                            name='state_input_t')
        self.state_input_tpo = tf.placeholder(tf.float32,
                                              shape=[None, self.n_features],
                                              name='state_input_tpo')
        self.action_plus_state_input = tf.placeholder(
            tf.float32,
            shape=[None, self.n_features + 1],
            name='action_plus_state_input')

        #share the first layers
        self.common_eval_input = common_eval_input
        self.common_target_input = common_target_input
        self.common_eval_output = common_eval_output
        self.common_target_output = common_target_output

        with tf.variable_scope(self.scope):
            self._epsilon = tf.get_variable(name='epsilon',
                                            dtype=tf.float32,
                                            initializer=1.0)
            self._epsilon_decrement = tf.constant(epsilon_decrement)
            self.update_epsilon = tf.assign(
                self._epsilon, self._epsilon - self._epsilon_decrement)
            self.reset_epsilon = tf.assign(self._epsilon, 1)

            # self.eval_output = model(inputs=self.eval_input, n_output=n_actions, scope='eval_net', hiddens=hiddens)
            # self.target_output = tf.stop_gradient(
            #     model(inputs=self.target_input, n_output=n_actions, scope='target_net', hiddens=hiddens))

            self.eval_output = model(inputs=self.common_eval_output,
                                     n_output=n_actions,
                                     scope='eval_net',
                                     hiddens=hiddens)
            self.target_output = tf.stop_gradient(
                model(inputs=self.common_target_output,
                      n_output=n_actions,
                      scope='target_net',
                      hiddens=hiddens))

            # encoder and forward-model networks
            self.encoder_temp_t = mlp(inputs=self.state_input_t,
                                      n_output=64,
                                      scope='encoder_temp_t',
                                      hiddens=[32, 64])
            self.encoder_temp_tpo = tf.stop_gradient(
                mlp(inputs=self.state_input_tpo,
                    n_output=64,
                    scope='encoder_temp_tpo',
                    hiddens=[32, 64]))

            self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                        n_output=self.n_features,
                                        scope='encoder_t',
                                        hiddens=[64, 32])
            self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                          n_output=self.n_features,
                                          scope='encoder_tpo',
                                          hiddens=[64, 32])
            self.predict_output = mlp(inputs=self.action_plus_state_input,
                                      n_output=64,
                                      scope='predict_output',
                                      hiddens=[64, 32])

            self.predict_mse = tf.reduce_sum(
                tf.square(self.encoder_temp_tpo -
                          self.predict_output)) * self.n_features
            self.emax = tf.get_variable(name='emax',
                                        dtype=tf.float32,
                                        initializer=1.0)
            self.update_emax = tf.assign(
                self.emax, tf.maximum(self.emax, self.predict_mse))
            self.e_normalize = tf.div(self.predict_mse, self.emax)

            self.encoder_loss = tf.reduce_sum(
                tf.square(self.state_input_t - self.encoder_output_t))
            self.train_encoder = tf.train.AdamOptimizer(
                learning_rate).minimize(self.encoder_loss)
            self.M_loss = self.predict_mse
            self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(
                self.M_loss)

        self.eval_output_selected = tf.reduce_sum(
            self.eval_output * tf.one_hot(self.actions_selected, n_actions),
            axis=1)
        self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
            self.target_output, axis=1) * (1. - self.done)

        self.loss = tf.reduce_mean(
            tf.squared_difference(self.eval_output_selected,
                                  self.eval_output_target))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=scope + '/eval_net')
        self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=scope + '/target_net')

        self.update = [
            tf.assign(x, y)
            for x, y in zip(self.target_params, self.eval_params)
        ]

        self.sess.run(tf.global_variables_initializer())
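
A sketch of how one learning step might drive the ops defined above; the transition layout, method name, and the use of a constant discount per sample are assumptions, not the original method:

    def learn(self):
        # Sample a batch of transitions: (state, action, reward, next_state, done).
        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        _, loss = self.sess.run(
            [self.train, self.loss],
            feed_dict={
                self.common_eval_input: list(states),
                self.common_target_input: list(next_states),
                self.actions_selected: list(actions),
                self.rewards: list(rewards),
                self.decays: [self.decay] * len(batch),
                self.done: [float(d) for d in dones],
            })

        # Periodically copy the eval-network weights into the target network.
        self.learn_times += 1
        if self.learn_times % self.C == 0:
            self.sess.run(self.update)

        # Anneal epsilon until it reaches its lower bound.
        if self.sess.run(self._epsilon) > self.epsilon_lower:
            self.sess.run(self.update_epsilon)

        return loss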