class auto_encoder:
    def __init__(self, learning_rate, memory_size, batch_size, sess):
        self.sess = sess
        self.common_encoder_input = tf.placeholder(
            tf.float32, shape=[None, n_features], name='common_encoder_input')
        self.common_encoder_output = mlp(inputs=self.common_encoder_input,
                                         n_output=n_features,
                                         scope='common_encoder_output',
                                         hiddens=[16, 8])
        self.common_decoder_output = mlp(inputs=self.common_encoder_output,
                                         n_output=n_features,
                                         scope='common_decoder_output')
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.memory = Memory(self.memory_size)
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.common_encoder_input,
                                  self.common_decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def update(self):
        data = self.memory.sample(self.batch_size)
        self.sess.run(self.train, feed_dict={self.common_encoder_input: data})

    def store(self, state):
        self.memory.store(np.array([state]))

    @property
    def output(self):
        return self.common_encoder_output

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size - 1
class auto_encoder:
    def __init__(self, learning_rate, memory_size, batch_size, sess, output_size):
        self.sess = sess
        #state_t
        self.encoder_input = tf.placeholder(tf.float32, shape=[None, n_features],
                                            name='encoder_input')
        self.encoder_output = mlp(inputs=self.encoder_input, n_output=output_size,
                                  scope='encoder_output', hiddens=[32, 16, 8])
        self.decoder_output = mlp(inputs=self.encoder_output, n_output=n_features,
                                  scope='decoder_output', hiddens=[8, 16, 32])
        self.encoder_output_ = tf.stop_gradient(self.decoder_output)

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.encoder_input, self.decoder_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def learn(self):
        data = self.memory.sample(self.batch_size)
        state = []
        for i in range(self.batch_size):
            state.append(data[i][0])
        self.sess.run(self.train, feed_dict={self.encoder_input: state})

    def store(self, state):
        self.memory.store(np.array([state]))

    def output(self, state):
        return self.sess.run(self.encoder_output,
                             feed_dict={self.encoder_input: [np.array(state)]})

    def output_loss(self):
        data = self.memory.sample(self.batch_size)
        state = []
        for i in range(self.batch_size):
            state.append(data[i][0])
        temp = self.sess.run(self.loss, feed_dict={self.encoder_input: state})
        print('now loss:', temp)
        print(state[0])
        temp = self.sess.run(self.decoder_output,
                             feed_dict={self.encoder_input: state})
        print(temp[0])

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size - 1

    def process_data(self):
        state, action, reward, state_next, done, decays = [], [], [], [], [], []
        temp = self.memory.sample(self.batch_size)
        for i in range(self.batch_size):
            state.append(temp[i][0])
            action.append(temp[i][1])
            reward.append(temp[i][2])
            state_next.append(temp[i][3])
            if not temp[i][4]:
                done.append(np.array(0))
            else:
                done.append(np.array(1))
            decays.append(self.decay)
        return state, action, reward, state_next, done, decays
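# Illustrative usage sketch (not part of the original code). It assumes the
# module-level n_features, mlp() and Memory used above are defined, and that
# `states` is a list of environment observations with n_features entries each.
# The helper name and the hyper-parameters below are placeholders.
def pretrain_state_encoder(states, sess, output_size=4):
    ae = auto_encoder(learning_rate=1e-3, memory_size=len(states),
                      batch_size=min(64, len(states)), sess=sess,
                      output_size=output_size)
    sess.run(tf.global_variables_initializer())
    for s in states:
        ae.store(s)                      # fill the replay memory with raw states
    for _ in range(100):
        ae.learn()                       # one Adam step per sampled mini-batch
    ae.output_loss()                     # print the current reconstruction loss
    return ae                            # ae.output(state) gives the compressed code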
        print('Episode: {}'.format(ep),
              'Total reward: {}'.format(total_reward),
              'Training loss: {:.4f}'.format(loss),
              'Explore P: {:.4f}'.format(agent.explore_p))
        train_rewards_list.append((ep, total_reward))

        # Add experience to memory
        memory.add((state, action, reward, next_state))

        # Start new episode
        state = env.reset()
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state))
        state = next_state
        t += 1

    # Sample mini-batch from memory
    batch = memory.sample(batch_size)
    states = np.array([each[0] for each in batch])
    actions = np.array([each[1] for each in batch])
    rewards = np.array([each[2] for each in batch])
    next_states = np.array([each[3] for each in batch])

    # Train network
    loss = agent.learn(states, actions, rewards, gamma, next_states)

test_rewards_list.extend(
    test_agent(agent, env, test_max_steps=convergence_reward + 25))
cur_compute_len = min(100, len(test_rewards_list))
mean_reward = np.mean(test_rewards_list[len(test_rewards_list) - cur_compute_len:])
print('Episode: {}'.format(ep),
class auto_encoder:
    def __init__(self, learning_rate, memory_size, batch_size, sess, output_size):
        self.sess = sess
        #state_t
        self.encoder_input_t = tf.placeholder(tf.float32, shape=[None, n_features],
                                              name='encoder_input_t')
        self.encoder_output_t = mlp(inputs=self.encoder_input_t, n_output=output_size,
                                    scope='encoder_output_t', hiddens=[16, 8])
        self.encoder_output_t_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_t')
        self.decoder_output_t = mlp(inputs=self.encoder_output_t, n_output=n_features,
                                    scope='decoder_output_t')
        self.decoder_output_t_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_t')
        self.encoder_output_t_ = tf.stop_gradient(self.encoder_output_t)

        #state_t+1 tpo->time plus one
        self.encoder_input_tpo = tf.placeholder(tf.float32, shape=[None, n_features],
                                                name='encoder_input_tpo')
        self.encoder_output_tpo = mlp(inputs=self.encoder_input_tpo,
                                      n_output=output_size,
                                      scope='encoder_output_tpo', hiddens=[16, 8])
        self.encoder_output_tpo_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder_output_tpo')
        self.decoder_output_tpo = mlp(inputs=self.encoder_output_tpo,
                                      n_output=n_features,
                                      scope='decoder_output_tpo')
        self.decoder_output_tpo_params = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope='decoder_output_tpo')
        self.encoder_output_tpo_ = tf.stop_gradient(self.encoder_output_tpo)

        #sync: copy the t+1 network parameters into the t networks
        self.sync_encoder = [
            tf.assign(x, y) for x, y in zip(self.encoder_output_t_params,
                                            self.encoder_output_tpo_params)
        ]
        self.sync_decoder = [
            tf.assign(x, y) for x, y in zip(self.decoder_output_t_params,
                                            self.decoder_output_tpo_params)
        ]

        #some const
        self.learning_rate = learning_rate
        self.memory_size = memory_size
        self.batch_size = batch_size

        #memory
        self.memory = Memory(self.memory_size)

        #for train
        self.loss_0 = tf.reduce_mean(
            tf.squared_difference(self.encoder_input_t, self.decoder_output_t))
        self.loss_1 = tf.reduce_mean(
            tf.squared_difference(self.encoder_input_tpo, self.decoder_output_tpo))
        self.train_0 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_0)
        self.train_1 = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_1)

    def update(self):
        data = self.memory.sample(self.batch_size)
        #not sure if the data is legal
        self.sess.run([self.train_0, self.train_1],
                      feed_dict={
                          self.encoder_input_t: data,
                          self.encoder_input_tpo: data
                      })

    def store(self, state):
        self.memory.store(np.array([state]))

    @property
    def output(self):
        return

    @property
    def full(self):
        return self.memory.return_index() == self.memory_size - 1

    def sync(self):
        self.sess.run([self.sync_encoder, self.sync_decoder])
        return None

    def learn(self):
        # Same mini-batch update as update(): both input streams are fed the
        # sampled states.
        data = self.memory.sample(self.batch_size)
        self.sess.run([self.train_0, self.train_1],
                      feed_dict={
                          self.encoder_input_t: data,
                          self.encoder_input_tpo: data
                      })
        return None
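# Illustrative usage sketch (not part of the original code) for the paired
# t / t+1 variant above: run reconstruction updates on both streams, then
# periodically copy the t+1 encoder/decoder weights into the t networks with
# sync(). The update count and sync period are placeholder values.
def train_paired_encoder(ae, n_updates=1000, sync_every=50):
    for step in range(n_updates):
        if ae.full:
            ae.update()                  # Adam step on loss_0 and loss_1
            if step % sync_every == 0:
                ae.sync()                # assign t+1 parameters to the t networks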
class DQN:
    def __init__(self, n_features, n_actions, model, scope, sess, order, hiddens,
                 beta, C, common_eval_input, common_target_input,
                 common_eval_output, common_target_output, learning_rate=1e-5,
                 decay=0.99, memory_size=20000000, batch_size=100000,
                 epsilon_decrement=0.0005, epsilon_lower=0.2):
        self.sess = sess
        self.scope = scope
        self.n_features = n_features
        self.batch_size = batch_size
        self.decay = decay
        self.model = model
        self.memory = Memory(memory_size)
        self.order = order
        self.beta = beta
        self.C = C
        self.learn_times = 0
        self.epsilon_lower = epsilon_lower
        self.epsilon_decrement = epsilon_decrement

        self.eval_input = tf.placeholder(tf.float32, shape=[None, self.n_features],
                                         name='eval_input')
        self.target_input = tf.placeholder(tf.float32, shape=[None, self.n_features],
                                           name='target_input')
        self.actions_selected = tf.placeholder(tf.int32, shape=[None],
                                               name='actions_selected')
        self.done = tf.placeholder(tf.float32, shape=[None], name='done')
        self.decays = tf.placeholder(tf.float32, shape=[None], name='decay')
        self.rewards = tf.placeholder(tf.float32, shape=[None], name='rewards')

        #about the encoder
        self.state_input_t = tf.placeholder(tf.float32,
                                            shape=[None, self.n_features],
                                            name='state_input_t')
        self.state_input_tpo = tf.placeholder(tf.float32,
                                              shape=[None, self.n_features],
                                              name='state_input_tpo')
        self.action_plus_state_input = tf.placeholder(
            tf.float32, shape=[None, self.n_features + 1],
            name='action_plus_state_input')

        #share the first layers
        self.common_eval_input = common_eval_input
        self.common_target_input = common_target_input
        self.common_eval_output = common_eval_output
        self.common_target_output = common_target_output

        with tf.variable_scope(self.scope):
            self._epsilon = tf.get_variable(name='epsilon', dtype=tf.float32,
                                            initializer=1.0)
            self._epsilon_decrement = tf.constant(epsilon_decrement)
            self.update_epsilon = tf.assign(
                self._epsilon, self._epsilon - self._epsilon_decrement)
            self.reset_epsilon = tf.assign(self._epsilon, 1)

            # self.eval_output = model(inputs=self.eval_input, n_output=n_actions,
            #                          scope='eval_net', hiddens=hiddens)
            # self.target_output = tf.stop_gradient(
            #     model(inputs=self.target_input, n_output=n_actions,
            #           scope='target_net', hiddens=hiddens))
            self.eval_output = model(inputs=self.common_eval_output,
                                     n_output=n_actions, scope='eval_net',
                                     hiddens=hiddens)
            self.target_output = tf.stop_gradient(
                model(inputs=self.common_target_output, n_output=n_actions,
                      scope='target_net', hiddens=hiddens))

            #about encoder
            self.encoder_temp_t = mlp(inputs=self.state_input_t, n_output=64,
                                      scope='encoder_temp_t', hiddens=[32, 64])
            self.encoder_temp_tpo = tf.stop_gradient(
                mlp(inputs=self.state_input_tpo, n_output=64,
                    scope='encoder_temp_tpo', hiddens=[32, 64]))
            self.encoder_output_t = mlp(inputs=self.encoder_temp_t,
                                        n_output=self.n_features,
                                        scope='encoder_t', hiddens=[64, 32])
            self.encoder_output_tpo = mlp(inputs=self.encoder_temp_tpo,
                                          n_output=self.n_features,
                                          scope='encoder_tpo', hiddens=[64, 32])
            self.predict_output = mlp(inputs=self.action_plus_state_input,
                                      n_output=64, scope='predict_output',
                                      hiddens=[64, 32])

            # Forward-model prediction error, normalized by its running maximum
            # and used as the exploration bonus in return_new_reward().
            self.predict_mse = tf.reduce_sum(
                tf.square(self.encoder_temp_tpo -
                          self.predict_output)) * self.n_features
            self.emax = tf.get_variable(name='emax', dtype=tf.float32,
                                        initializer=1.0)
            self.update_emax = tf.assign(
                self.emax, tf.maximum(self.emax, self.predict_mse))
            self.e_normalize = tf.div(self.predict_mse, self.emax)

            self.encoder_loss = tf.reduce_sum(
                tf.square(self.state_input_t - self.encoder_output_t))
            self.train_encoder = tf.train.AdamOptimizer(
                learning_rate).minimize(self.encoder_loss)

            self.M_loss = self.predict_mse
            self.train_M = tf.train.AdamOptimizer(learning_rate).minimize(self.M_loss)

            self.eval_output_selected = tf.reduce_sum(
                self.eval_output * tf.one_hot(self.actions_selected, n_actions),
                axis=1)
            self.eval_output_target = self.rewards + self.decays * tf.reduce_max(
                self.target_output, axis=1) * (1. - self.done)
            self.loss = tf.reduce_mean(
                tf.squared_difference(self.eval_output_selected,
                                      self.eval_output_target))
            self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

            self.eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=scope + '/eval_net')
            self.target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                   scope=scope + '/target_net')
            self.update = [
                tf.assign(x, y)
                for x, y in zip(self.target_params, self.eval_params)
            ]

        self.sess.run(tf.global_variables_initializer())

    def act(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, 1)
        else:
            copy_state = copy.deepcopy(state)
            #for debug
            #exchange
            t = copy_state[self.order]
            copy_state[self.order] = copy_state[0]
            copy_state[0] = t
            action = self.sess.run(
                self.eval_output,
                feed_dict={self.common_eval_input: np.array([copy_state])})
            return np.argmax(action, axis=1)[0].tolist()

    def check(self, state):
        copy_state = copy.deepcopy(state)
        # exchange
        t = copy_state[self.order]
        copy_state[self.order] = copy_state[0]
        copy_state[0] = t
        action = self.sess.run(
            self.eval_output,
            feed_dict={self.common_eval_input: np.array([copy_state])})
        return np.argmax(action, axis=1)[0].tolist()

    def learn(self):
        self.learn_times += 1
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train,
                      feed_dict={
                          self.common_eval_input: state,
                          self.actions_selected: action,
                          self.rewards: reward,
                          self.common_target_input: state_next,
                          self.done: done,
                          self.decays: decays
                      })
        if self.epsilon > self.epsilon_lower:
            self.sess.run(self.update_epsilon)
        if self.learn_times % 10 == 0:
            print('start update target network')
            self.sess.run(self.update)

    def store(self, state, action, reward, state_after, episode_ended):
        state_copy = copy.deepcopy(state)
        state_after_copy = copy.deepcopy(state_after)
        #exchange
        t = state_copy[self.order]
        state_copy[self.order] = state_copy[0]
        state_copy[0] = t
        t = state_after_copy[self.order]
        state_after_copy[self.order] = state_after_copy[0]
        state_after_copy[0] = t
        self.memory.store(
            np.array([state_copy, action, reward, state_after_copy,
                      episode_ended]))

    def process_data(self):
        state, action, reward, state_next, done, decays = [], [], [], [], [], []
        temp = self.memory.sample(self.batch_size)
        for i in range(self.batch_size):
            state.append(temp[i][0])
            action.append(temp[i][1])
            reward.append(temp[i][2])
            state_next.append(temp[i][3])
            if not temp[i][4]:
                done.append(np.array(0))
            else:
                done.append(np.array(1))
            decays.append(self.decay)
        return state, action, reward, state_next, done, decays

    @property
    def epsilon(self):
        return self.sess.run(self._epsilon)

    def return_new_reward(self, reward, state_t, state_tpo, episode, action):
        self.sess.run(self.update_emax,
                      feed_dict={
                          self.state_input_t: np.array([state_t]),
                          self.state_input_tpo: np.array([state_tpo]),
                          self.action_plus_state_input: np.array([state_t + [action]])
                      })
        temp = self.sess.run(self.e_normalize,
                             feed_dict={
                                 self.state_input_t: np.array([state_t]),
                                 self.state_input_tpo: np.array([state_tpo]),
                                 self.action_plus_state_input: np.array([state_t + [action]]),
                             })
        return reward + (self.beta / self.C) * temp

    def update_M(self):
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train_M,
                      feed_dict={
                          self.state_input_tpo: state_next,
                          self.action_plus_state_input: np.hstack(
                              (state, np.array([action]).T))
                      })

    def update_encoder(self):
        state, action, reward, state_next, done, decays = self.process_data()
        self.sess.run(self.train_encoder, feed_dict={self.state_input_t: state})
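# Illustrative single-step driver (not part of the original code) showing the
# intended call order for one agent: act, reshape the reward with the
# normalized prediction error, store the transition, then train. `env` is
# assumed to be a gym-style environment with the old 4-tuple step() API, and
# the warm-up condition below is a placeholder.
def run_step(agent, env, state, episode):
    action = agent.act(list(state))
    state_next, reward, done, _ = env.step(action)
    shaped_reward = agent.return_new_reward(reward, list(state),
                                            list(state_next), episode, action)
    agent.store(list(state), action, shaped_reward, list(state_next), done)
    if agent.memory.return_index() >= agent.batch_size:
        agent.learn()                    # TD update; decays epsilon, syncs target net
        agent.update_M()                 # fit the forward model behind the bonus
        agent.update_encoder()           # fit the state auto-encoder
    return state_next, done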