Example no. 1
    def __init__(self, learning_rate, memory_size, batch_size, sess,
                 output_size):
        self.sess = sess

        #state_t
        self.encoder_input = tf.placeholder(tf.float32,
                                            shape=[None, n_features],
                                            name='encoder_input')
        self.encoder_output = mlp(inputs=self.encoder_input,
                                  n_output=output_size,
                                  scope='encoder_output',
                                  hiddens=[32, 16, 8])
        self.decoder_output = mlp(inputs=self.encoder_output,
                                  n_output=n_features,
                                  scope='decoder_output',
                                  hiddens=[8, 16, 32])
        self.encoder_output_ = tf.stop_gradient(self.decoder_output)

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.encoder_input, self.decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)
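
All of these snippets rely on an mlp helper that is not included in the listing. A minimal sketch consistent with the way it is called here (inputs, n_output, scope, hiddens, with hiddens sometimes omitted), assuming TensorFlow 1.x and tf.layers.dense, could look like the following; the ReLU/linear activation choices are illustrative assumptions, not taken from the original code:

import tensorflow as tf

def mlp(inputs, n_output, scope, hiddens=()):
    # Fully connected stack: ReLU hidden layers followed by a linear output layer,
    # built under `scope` so the variables can be fetched with tf.get_collection.
    with tf.variable_scope(scope):
        out = inputs
        for n_hidden in hiddens:
            out = tf.layers.dense(out, n_hidden, activation=tf.nn.relu)
        return tf.layers.dense(out, n_output, activation=None)
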
Example no. 2
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess,
            output_size
    ):
        self.sess = sess

        #state_t
        self.encoder_input_t = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_t')

        self.encoder_output_t = mlp(inputs=self.encoder_input_t, n_output=output_size, scope='encoder_output_t',
                                    hiddens=[16, 8])
        self.encoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t')

        self.decoder_output_t = mlp(inputs=self.encoder_output_t, n_output=n_features, scope='decoder_output_t')
        self.decoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t')

        self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

        #state_t+1  tpo->time plus one
        self.encoder_input_tpo = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_tpo')

        self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo, n_output=output_size, scope='encoder_output_tpo',
                                    hiddens=[16, 8])
        self.encoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')

        self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo, n_output=n_features, scope='decoder_output_tpo')
        self.decoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')

        self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

        #sync
        self.sync_encoder = [tf.assign(x, y) for x, y in zip(self.encoder_output_t_params, self.encoder_output_tpo_params)]
        self.sync_decoder = [tf.assign(x, y) for x, y in zip(self.decoder_output_t_params, self.decoder_output_tpo_params)]

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss_0 = tf.reduce_mean(tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
        self.loss_1 = tf.reduce_mean(tf.squared_difference(self.encoder_input_tpo,self.decoder_output_tpo))

        self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_0)
        self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_1)
Example no. 3
class auto_encoder:
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess
    ):
        self.sess = sess
        self.common_encoder_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_encoder_input')
        self.common_encoder_output = mlp(inputs=self.common_encoder_input, n_output=n_features, scope='common_encoder_output',
                                    hiddens=[16, 8])
        self.common_decoder_output = mlp(inputs=self.common_encoder_output, n_output=n_features, scope='common_decoder_output')

        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.memory = Memory(self.memory_size)

        self.loss = tf.reduce_mean(tf.squared_difference(self.common_encoder_input, self.common_decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def update(self):
        data = self.memory.sample(self.batch_size)

        self.sess.run(self.train, feed_dict={
            self.common_encoder_input: data
        })

    def store(self, state):
        self.memory.store(np.array([state]))

    @property
    def output(self):
        return self.common_encoder_output

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size-1
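
Memory is the other external helper these examples depend on. A minimal ring-buffer sketch matching the interface used here (store, sample, return_index, plus the max_size keyword seen in Example no. 6) could be the following; the exact index semantics of the original are an assumption:

import random

class Memory:
    # Fixed-size replay buffer; the write index wraps around once the buffer is full.
    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []
        self.index = 0

    def store(self, experience):
        if len(self.buffer) < self.max_size:
            self.buffer.append(experience)
        else:
            self.buffer[self.index] = experience
        self.index = (self.index + 1) % self.max_size

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def return_index(self):
        return self.index
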
Example no. 4
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess
    ):
        self.sess = sess
        self.common_encoder_input = tf.placeholder(tf.float32, shape=[None, n_features], name='common_encoder_input')
        self.common_encoder_output = mlp(inputs=self.common_encoder_input, n_output=n_features, scope='common_encoder_output',
                                    hiddens=[16, 8])
        self.common_decoder_output = mlp(inputs=self.common_encoder_output, n_output=n_features, scope='common_decoder_output')

        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        self.memory = Memory(self.memory_size)

        self.loss = tf.reduce_mean(tf.squared_difference(self.common_encoder_input, self.common_decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
Example no. 5
class auto_encoder:
    def __init__(self, learning_rate, memory_size, batch_size, sess,
                 output_size):
        self.sess = sess

        #state_t
        self.encoder_input = tf.placeholder(tf.float32,
                                            shape=[None, n_features],
                                            name='encoder_input')
        self.encoder_output = mlp(inputs=self.encoder_input,
                                  n_output=output_size,
                                  scope='encoder_output',
                                  hiddens=[32, 16, 8])
        self.decoder_output = mlp(inputs=self.encoder_output,
                                  n_output=n_features,
                                  scope='decoder_output',
                                  hiddens=[8, 16, 32])
        self.encoder_output_ = tf.stop_gradient(self.decoder_output)

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.encoder_input, self.decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss)

    def learn(self):
        data = self.memory.sample(self.batch_size)

        state = []

        for i in range(self.batch_size):
            state.append(data[i][0])

        self.sess.run(self.train, feed_dict={self.encoder_input: state})

    def store(self, state):
        self.memory.store(np.array([state]))

    def output(self, state):

        return self.sess.run(self.encoder_output,
                             feed_dict={self.encoder_input: [np.array(state)]})

    def output_loss(self):
        data = self.memory.sample(self.batch_size)

        state = []

        for i in range(self.batch_size):
            state.append(data[i][0])

        temp = self.sess.run(self.loss, feed_dict={self.encoder_input: state})
        print('now loss:', temp)

        print(state[0])
        temp = self.sess.run(self.decoder_output,
                             feed_dict={self.encoder_input: state})
        print(temp[0])

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size - 1

    def process_data(self):
        # note: this method references self.decay, which auto_encoder never sets
        # (it looks carried over from the DQN class), so calling it as-is would
        # raise an AttributeError
        state, action, reward, state_next, done, decays = [], [], [], [], [], []
        temp = self.memory.sample(self.batch_size)
        for i in range(self.batch_size):
            state.append(temp[i][0])
            action.append(temp[i][1])
            reward.append(temp[i][2])
            state_next.append(temp[i][3])
            if not temp[i][4]:
                done.append(np.array(0))
            else:
                done.append(np.array(1))
            decays.append(self.decay)
        return state, action, reward, state_next, done, decays
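
A hedged usage sketch for the class above, assuming these lines live in the same module as the class (so that the global n_features is visible to __init__), that the mlp and Memory helpers are available, and that random vectors stand in for real environment states:

import numpy as np
import tensorflow as tf

n_features = 4  # illustrative value

sess = tf.Session()
encoder = auto_encoder(learning_rate=0.001, memory_size=1000,
                       batch_size=32, sess=sess, output_size=2)
sess.run(tf.global_variables_initializer())

# Fill the replay memory with dummy states, then train the autoencoder.
for _ in range(1000):
    encoder.store(np.random.rand(n_features))
for _ in range(200):
    encoder.learn()

encoder.output_loss()                              # prints reconstruction loss
code = encoder.output(np.random.rand(n_features))  # encoded representation
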
Example no. 6
    # training process
    train_rewards_list = []
    test_rewards_list = []
    show_every_steps = 100
    # Exploration parameters
    explore_start = 0.9  # exploration probability at start
    explore_stop = 0.01  # minimum exploration probability
    decay_rate = 0.0001  # exponential decay rate for exploration prob
    # Network parameters
    hidden_size = 20  # number of units in each Q-network hidden layer
    learning_rate = 0.01  # Q-network learning rate
    # Memory parameters
    memory_size = 10000  # memory capacity
    batch_size = 32  # experience mini-batch size
    pretrain_length = batch_size  # number experiences to pretrain the memory
    memory = Memory(max_size=memory_size)

    # Initialize the simulation
    env = gym.make('CartPole-v1')

    # TODO: specify the network parameters and name
    agent = DQNAgent(env,
                     explore_start,
                     explore_stop,
                     decay_rate,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     use_targetQ=False,
                     C=20,
                     use_dueling=False,
Example no. 7
class auto_encoder:
    def __init__(
            self,
            learning_rate,
            memory_size,
            batch_size,
            sess,
            output_size
    ):
        self.sess = sess

        #state_t
        self.encoder_input_t = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_t')

        self.encoder_output_t = mlp(inputs=self.encoder_input_t, n_output=output_size, scope='encoder_output_t',
                                    hiddens=[16, 8])
        self.encoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t')

        self.decoder_output_t = mlp(inputs=self.encoder_output_t, n_output=n_features, scope='decoder_output_t')
        self.decoder_output_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t')

        self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

        #state_t+1  tpo->time plus one
        self.encoder_input_tpo = tf.placeholder(tf.float32, shape=[None, n_features], name='encoder_input_tpo')

        self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo, n_output=output_size, scope='encoder_output_tpo',
                                    hiddens=[16, 8])
        self.encoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')

        self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo, n_output=n_features, scope='decoder_output_tpo')
        self.decoder_output_tpo_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')

        self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

        #sync
        self.sync_encoder = [tf.assign(x, y) for x, y in zip(self.encoder_output_t_params, self.encoder_output_tpo_params)]
        self.sync_decoder = [tf.assign(x, y) for x, y in zip(self.decoder_output_t_params, self.decoder_output_tpo_params)]

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss_0 = tf.reduce_mean(tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
        self.loss_1 = tf.reduce_mean(tf.squared_difference(self.encoder_input_tpo,self.decoder_output_tpo))

        self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_0)
        self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_1)
 
    def update(self):
        data = self.memory.sample(self.batch_size)

        # not sure if the data is legal

        self.sess.run([self.train_0, self.train_1], feed_dict={
            self.encoder_input_t: data,
            # feed the t+1 input placeholder; the original fed
            # self.decoder_output_tpo, which is a network output, not an input
            self.encoder_input_tpo: data
        })

    def store(self, state):
        self.memory.store(np.array([state]))

    @property
    def output(self):
        # left as a stub in the original; presumably meant to return the
        # encoded state, e.g. self.encoder_output_t_
        return

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size-1

    def sync(self):
        self.sess.run([self.sync_encoder, self.sync_decoder])
        return None

    def learn(self):
        # incomplete in the original: the feed_dict must supply batches for
        # self.encoder_input_t and self.encoder_input_tpo before this run can succeed
        self.sess.run([self.train_0, self.train_1], feed_dict={})
        return None
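
The sync_encoder and sync_decoder op lists in this class copy the t+1 networks' parameters onto the t networks, mirroring the target-network update used by the DQN examples below. A short hedged sketch of the intended call pattern (the sync interval is an illustrative assumption):

# assuming `ae` is an instance of the class above, its variables are initialized,
# and enough states have already been stored via ae.store(...)
for step in range(10000):
    ae.update()          # one gradient step on both reconstruction losses
    if step % 100 == 0:
        ae.sync()        # assign the *_tpo parameters onto the *_t networks
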
Example no. 8
    def __init__(self,
                 n_features,
                 n_actions,
                 model,
                 scope,
                 sess,
                 order,
                 hiddens,
                 beta,
                 C,
                 common_eval_input,
                 common_target_input,
                 common_eval_output,
                 common_target_output,
                 learning_rate=1e-5,
                 decay=0.99,
                 memory_size=20000000,
                 batch_size=100000,
                 epsilon_decrement=0.0005,
                 epsilon_lower=0.2):
        self.sess = sess
        self.scope = scope
        self.n_features = n_features
        self.batch_size = batch_size
        self.decay = decay
        self.model = model
        self.memory = Memory(memory_size)
        self.order = order
        self.beta = beta
        self.C = C

        self.learn_times = 0

        self.epsilon_lower = epsilon_lower
        self.epsilon_decrement = epsilon_decrement

        self.eval_input = tf.placeholder(tf.float32,
                                         shape=[None, self.n_features],
                                         name='eval_input')
        self.target_input = tf.placeholder(tf.float32,
                                           shape=[None, self.n_features],
                                           name='target_input')
        self.actions_selected = tf.placeholder(tf.int32,
                                               shape=[
                                                   None,
                                               ],
                                               name='actions_selected')
        self.done = tf.placeholder(tf.float32, shape=[
            None,
        ], name='done')
        self.decays = tf.placeholder(tf.float32, shape=[
            None,
        ], name='decay')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=[
                                          None,
                                      ],
                                      name='rewards')

        #about the encoder
        self.state_input_t = tf.placeholder(tf.float32,
                                            shape=[None, self.n_features],
                                            name='state_input_t')
        self.state_input_tpo = tf.placeholder(tf.float32,
                                              shape=[None, self.n_features],
                                              name='state_input_tpo')
        self.action_plus_state_input = tf.placeholder(
            tf.float32,
            shape=[None, self.n_features + 1],
            name='action_plus_state_input')

        #share the first layers
        self.common_eval_input = common_eval_input
        self.common_target_input = common_target_input
        self.common_eval_output = common_eval_output
        self.common_target_output = common_target_output

        with tf.variable_scope(self.scope):
            self._epsilon = tf.get_variable(name='epsilon',
                                            dtype=tf.float32,
                                            initializer=1.0)
            self._epsilon_decrement = tf.constant(epsilon_decrement)
            self.update_epsilon = tf.assign(
                self._epsilon, self._epsilon - self._epsilon_decrement)
            self.reset_epsilon = tf.assign(self._epsilon, 1)

            # self.eval_output = model(inputs=self.eval_input, n_output=n_actions, scope='eval_net', hiddens=hiddens)
            # self.target_output = tf.stop_gradient(
            #     model(inputs=self.target_input, n_output=n_actions, scope='target_net', hiddens=hiddens))

            self.eval_output = model(inputs=self.common_eval_output,
                                     n_output=n_actions,
                                     scope='eval_net',
                                     hiddens=hiddens)
            self.target_output = tf.stop_gradient(
                model(inputs=self.common_target_output,
                      n_output=n_actions,
                      scope='target_net',
                      hiddens=hiddens))

            #about encoder
            self.encoder_temp_t = mlp(inputs=self.state_input_t,
                                      n_output=64,
                                      scope='encoder_temp_t',
                                      hiddens=[32, 64])
            self.encoder_temp_tpo = tf.stop_gradient(
                mlp(inputs=self.state_input_tpo,
                    n_output=64,
                    scope='encoder_temp_tpo',
                    hiddens=[32, 64]))

            self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                        n_output=self.n_features,
                                        scope='encoder_t',
                                        hiddens=[64, 32])
            self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                          n_output=self.n_features,
                                          scope='encoder_tpo',
                                          hiddens=[64, 32])
            self.predict_output = mlp(inputs=self.action_plus_state_input,
                                      n_output=64,
                                      scope='predict_output',
                                      hiddens=[64, 32])

            self.predict_mse = tf.reduce_sum(
                tf.square(self.encoder_temp_tpo -
                          self.predict_output)) * self.n_features
            self.emax = tf.get_variable(name='emax',
                                        dtype=tf.float32,
                                        initializer=1.0)
            self.update_emax = tf.assign(
                self.emax, tf.maximum(self.emax, self.predict_mse))
            self.e_normalize = tf.div(self.predict_mse, self.emax)

            self.encoder_loss = tf.reduce_sum(
                tf.square(self.state_input_t - self.encoder_output_t))
            self.train_encoder = tf.train.AdamOptimizer(
                learning_rate).minimize(self.encoder_loss)
            self.M_loss = self.predict_mse
            self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(
                self.M_loss)

        self.eval_output_selected = tf.reduce_sum(
            self.eval_output * tf.one_hot(self.actions_selected, n_actions),
            axis=1)
        self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
            self.target_output, axis=1) * (1. - self.done)

        self.loss = tf.reduce_mean(
            tf.squared_difference(self.eval_output_selected,
                                  self.eval_output_target))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=scope + '/eval_net')
        self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=scope + '/target_net')

        self.update = [
            tf.assign(x, y)
            for x, y in zip(self.target_params, self.eval_params)
        ]

        self.sess.run(tf.global_variables_initializer())
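
The emax and e_normalize ops above keep a running maximum of the forward-model prediction error so the error can be normalized into [0, 1]; return_new_reward in the full class (Example no. 9) then adds (beta / C) times the normalized error to the environment reward. The same computation written as a small plain-Python sketch, with illustrative names:

def intrinsic_bonus(reward, predict_error, emax, beta, C):
    # emax tracks the largest prediction error seen so far,
    # so predict_error / emax stays within [0, 1].
    emax = max(emax, predict_error)
    e_normalized = predict_error / emax
    return reward + (beta / C) * e_normalized, emax
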
Example no. 9
class DQN:
    def __init__(self,
                 n_features,
                 n_actions,
                 model,
                 scope,
                 sess,
                 order,
                 hiddens,
                 beta,
                 C,
                 common_eval_input,
                 common_target_input,
                 common_eval_output,
                 common_target_output,
                 learning_rate=1e-5,
                 decay=0.99,
                 memory_size=20000000,
                 batch_size=100000,
                 epsilon_decrement=0.0005,
                 epsilon_lower=0.2):
        self.sess = sess
        self.scope = scope
        self.n_features = n_features
        self.batch_size = batch_size
        self.decay = decay
        self.model = model
        self.memory = Memory(memory_size)
        self.order = order
        self.beta = beta
        self.C = C

        self.learn_times = 0

        self.epsilon_lower = epsilon_lower
        self.epsilon_decrement = epsilon_decrement

        self.eval_input = tf.placeholder(tf.float32,
                                         shape=[None, self.n_features],
                                         name='eval_input')
        self.target_input = tf.placeholder(tf.float32,
                                           shape=[None, self.n_features],
                                           name='target_input')
        self.actions_selected = tf.placeholder(tf.int32,
                                               shape=[
                                                   None,
                                               ],
                                               name='actions_selected')
        self.done = tf.placeholder(tf.float32, shape=[
            None,
        ], name='done')
        self.decays = tf.placeholder(tf.float32, shape=[
            None,
        ], name='decay')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=[
                                          None,
                                      ],
                                      name='rewards')

        #about the encoder
        self.state_input_t = tf.placeholder(tf.float32,
                                            shape=[None, self.n_features],
                                            name='state_input_t')
        self.state_input_tpo = tf.placeholder(tf.float32,
                                              shape=[None, self.n_features],
                                              name='state_input_tpo')
        self.action_plus_state_input = tf.placeholder(
            tf.float32,
            shape=[None, self.n_features + 1],
            name='action_plus_state_input')

        #share the first layers
        self.common_eval_input = common_eval_input
        self.common_target_input = common_target_input
        self.common_eval_output = common_eval_output
        self.common_target_output = common_target_output

        with tf.variable_scope(self.scope):
            self._epsilon = tf.get_variable(name='epsilon',
                                            dtype=tf.float32,
                                            initializer=1.0)
            self._epsilon_decrement = tf.constant(epsilon_decrement)
            self.update_epsilon = tf.assign(
                self._epsilon, self._epsilon - self._epsilon_decrement)
            self.reset_epsilon = tf.assign(self._epsilon, 1)

            # self.eval_output = model(inputs=self.eval_input, n_output=n_actions, scope='eval_net', hiddens=hiddens)
            # self.target_output = tf.stop_gradient(
            #     model(inputs=self.target_input, n_output=n_actions, scope='target_net', hiddens=hiddens))

            self.eval_output = model(inputs=self.common_eval_output,
                                     n_output=n_actions,
                                     scope='eval_net',
                                     hiddens=hiddens)
            self.target_output = tf.stop_gradient(
                model(inputs=self.common_target_output,
                      n_output=n_actions,
                      scope='target_net',
                      hiddens=hiddens))

            #about encoder
            self.encoder_temp_t = mlp(inputs=self.state_input_t,
                                      n_output=64,
                                      scope='encoder_temp_t',
                                      hiddens=[32, 64])
            self.encoder_temp_tpo = tf.stop_gradient(
                mlp(inputs=self.state_input_tpo,
                    n_output=64,
                    scope='encoder_temp_tpo',
                    hiddens=[32, 64]))

            self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                        n_output=self.n_features,
                                        scope='encoder_t',
                                        hiddens=[64, 32])
            self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                          n_output=self.n_features,
                                          scope='encoder_tpo',
                                          hiddens=[64, 32])
            self.predict_output = mlp(inputs=self.action_plus_state_input,
                                      n_output=64,
                                      scope='predict_output',
                                      hiddens=[64, 32])

            self.predict_mse = tf.reduce_sum(
                tf.square(self.encoder_temp_tpo -
                          self.predict_output)) * self.n_features
            self.emax = tf.get_variable(name='emax',
                                        dtype=tf.float32,
                                        initializer=1.0)
            self.update_emax = tf.assign(
                self.emax, tf.maximum(self.emax, self.predict_mse))
            self.e_normalize = tf.div(self.predict_mse, self.emax)

            self.encoder_loss = tf.reduce_sum(
                tf.square(self.state_input_t - self.encoder_output_t))
            self.train_encoder = tf.train.AdamOptimizer(
                learning_rate).minimize(self.encoder_loss)
            self.M_loss = self.predict_mse
            self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(
                self.M_loss)

        self.eval_output_selected = tf.reduce_sum(
            self.eval_output * tf.one_hot(self.actions_selected, n_actions),
            axis=1)
        self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
            self.target_output, axis=1) * (1. - self.done)

        self.loss = tf.reduce_mean(
            tf.squared_difference(self.eval_output_selected,
                                  self.eval_output_target))
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope=scope + '/eval_net')
        self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               scope=scope + '/target_net')

        self.update = [
            tf.assign(x, y)
            for x, y in zip(self.target_params, self.eval_params)
        ]

        self.sess.run(tf.global_variables_initializer())

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, 1)
        else:
            copy_state = copy.deepcopy(state)
            #for debug
            #exchange
            t = copy_state[self.order]
            copy_state[self.order] = copy_state[0]
            copy_state[0] = t

            action = self.sess.run(
                self.eval_output,
                feed_dict={self.common_eval_input: np.array([copy_state])})

            return np.argmax(action, axis=1)[0].tolist()

    def check(self, state):
        copy_state = copy.deepcopy(state)

        # exchange
        t = copy_state[self.order]
        copy_state[self.order] = copy_state[0]
        copy_state[0] = t

        action = self.sess.run(
            self.eval_output,
            feed_dict={self.common_eval_input: np.array([copy_state])})

        return np.argmax(action, axis=1)[0].tolist()

    def learn(self):
        self.learn_times += 1
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train,
                      feed_dict={
                          self.common_eval_input: state,
                          self.actions_selected: action,
                          self.rewards: reward,
                          self.common_target_input: state_next,
                          self.done: done,
                          self.decays: decays
                      })

        if self.epsilon > self.epsilon_lower:
            self.sess.run(self.update_epsilon)

        if self.learn_times % 10 == 0:
            print('start update target network')
            self.sess.run(self.update)

    def store(self, state, action, reward, state_after, episode_ended):

        state_copy = copy.deepcopy(state)
        state_after_copy = copy.deepcopy(state_after)
        #exchange
        t = state_copy[self.order]
        state_copy[self.order] = state_copy[0]
        state_copy[0] = t

        t = state_after_copy[self.order]
        state_after_copy[self.order] = state_after_copy[0]
        state_after_copy[0] = t

        self.memory.store(
            np.array(
                [state_copy, action, reward, state_after_copy, episode_ended]))

    def process_data(self):
        state, action, reward, state_next, done, decays = [], [], [], [], [], []
        temp = self.memory.sample(self.batch_size)
        for i in range(self.batch_size):
            state.append(temp[i][0])
            action.append(temp[i][1])
            reward.append(temp[i][2])
            state_next.append(temp[i][3])
            if not temp[i][4]:
                done.append(np.array(0))
            else:
                done.append(np.array(1))
            decays.append(self.decay)
        return state, action, reward, state_next, done, decays

    @property
    def epsilon(self):
        return self.sess.run(self._epsilon)

    def return_new_reward(self, reward, state_t, state_tpo, episode, action):
        self.sess.run(self.update_emax,
                      feed_dict={
                          self.state_input_t:
                          np.array([state_t]),
                          self.state_input_tpo:
                          np.array([state_tpo]),
                          self.action_plus_state_input:
                          np.array([state_t + [action]])
                      })

        temp = self.sess.run(self.e_normalize,
                             feed_dict={
                                 self.state_input_t:
                                 np.array([state_t]),
                                 self.state_input_tpo:
                                 np.array([state_tpo]),
                                 self.action_plus_state_input:
                                 np.array([state_t + [action]]),
                             })

        return reward + (self.beta / self.C) * temp

    def update_M(self):
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train_M,
                      feed_dict={
                          self.state_input_tpo:
                          state_next,
                          self.action_plus_state_input:
                          np.hstack((state, np.array([action]).T))
                      })

    def update_encoder(self):
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train_encoder,
                      feed_dict={self.state_input_t: state})
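
A hedged end-to-end sketch of how this DQN might be wired up; all concrete values and the shared-encoder construction are assumptions for illustration, mlp and Memory are the external helpers sketched earlier, and CartPole-v1 provides 4 state features and 2 actions:

import gym
import tensorflow as tf

sess = tf.Session()

# Shared first layers feeding the evaluation and target branches.
common_eval_input = tf.placeholder(tf.float32, [None, 4], name='common_eval_input')
common_target_input = tf.placeholder(tf.float32, [None, 4], name='common_target_input')
common_eval_output = mlp(inputs=common_eval_input, n_output=4,
                         scope='common_eval', hiddens=[16, 8])
common_target_output = mlp(inputs=common_target_input, n_output=4,
                           scope='common_target', hiddens=[16, 8])

agent = DQN(n_features=4, n_actions=2, model=mlp, scope='agent_0', sess=sess,
            order=0, hiddens=[32, 32], beta=0.2, C=20,
            common_eval_input=common_eval_input,
            common_target_input=common_target_input,
            common_eval_output=common_eval_output,
            common_target_output=common_target_output,
            memory_size=10000, batch_size=32)

env = gym.make('CartPole-v1')
state = list(env.reset())
for step in range(5000):
    action = agent.act(state)
    state_next, reward, done, _ = env.step(action)
    state_next = list(state_next)
    agent.store(state, action, reward, state_next, done)
    state = list(env.reset()) if done else state_next
    if step > 500:  # start learning once the buffer holds enough samples
        agent.learn()
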
Example no. 10
    # training process
    train_rewards_list = []
    test_rewards_list = []
    show_every_steps = 100
    # Exploration parameters
    explore_start = 0.9  # exploration probability at start
    explore_stop = 0.01  # minimum exploration probability
    decay_rate = 0.0001  # exponential decay rate for exploration prob
    # Network parameters
    hidden_size = 20  # number of units in each Q-network hidden layer
    learning_rate = 0.01  # Q-network learning rate
    # Memory parameters
    memory_size = 10000  # memory capacity
    batch_size = 32  # experience mini-batch size
    pretrain_length = batch_size  # number experiences to pretrain the memory
    memory = Memory(max_size=memory_size)

    # Initialize the simulation
    env = gym.make('CartPole-v1')

    # TODO: specify the network parameters and model name
    agent = DQNAgent(env,
                     explore_start,
                     explore_stop,
                     decay_rate,
                     state_size=state_size,
                     action_size=action_size,
                     hidden_size=hidden_size,
                     use_targetQ=True,
                     C=20,
                     use_dueling=False,