class DDPG(object):
    """Deep Deterministic Policy Gradient (DDPG)."""

    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 gamma=0.99,
                 tau=0.01,
                 actor_lr=0.0005,
                 critic_lr=0.001,
                 noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=10,
                 logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.pointer = 0
        self.buffer_size = buffer_size
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=gamma,
                           lr=actor_lr, tau=tau, l2_reg=0)
        self.critic = Critic(self.n_state, self.n_action, gamma=gamma,
                             lr=critic_lr, tau=tau, l2_reg=0)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__.lower()

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def policy_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic targets."""
        q_target = np.zeros_like(rewards)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.critic.gamma * q_nexts[i]
        return q_target

    def update_model(self, states, actions, q_values):
        # Train the critic on the Bellman targets.
        loss_names, loss_values = self.critic.train_on_batch(
            states, actions, q_values)
        # Train the actor along dQ/da from the critic (deterministic policy gradient).
        grad_ys = self.critic.gradients(
            states, self.actor.predict(states))  # (batch, n_action)
        actor_output = self.actor.train(states, self.actor.predict(states),
                                        grad_ys)
        # Soft-update the target networks.
        self.actor.copy_weights()
        self.critic.copy_weights()
        return loss_names, loss_values, grad_ys, actor_output

    def save_weights(self, path):
        self.actor.save(path)
        self.critic.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic.model.save(
            os.path.join(path, self.prefix + '_critic_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4f}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            critic = os.path.join(
                path, self.prefix + '_critic_' + to_delete[1] + '.h5')
            os.remove(actor)
            os.remove(critic)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # None becomes array(None)
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True,
                      unit=" epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern,
                                chunksize=args.batch_size)
            assert dataset.is_buffer_available, \
                'neither train_data nor csv buffer is available'
        else:
            dataset = Dataset(train_data, args.batch_size, shuffle=True)

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            # Exploration: Gaussian noise on the deterministic action, clipped
            # to the action bounds (an Ornstein-Uhlenbeck process could be
            # substituted here).
            a = self.policy_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])

            # Reward: clipped log-odds of the predicted score, positive when
            # the prediction agrees with the binary label.
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5)
            r = np.where(labels == 1, llr.ravel(), -llr.ravel())  # (batch,)

            # Every transition is terminal, so the critic target is the reward
            # itself (no bootstrapping from a target network).
            q_ = self.bellman_q_value(rewards=r, q_nexts=0,
                                      dones=[True] * r.shape[0])  # (batch,)
            loss_names, loss_values, grad_ys, actor_output = self.update_model(
                states, a, q_.reshape(-1, 1))
            score = r.mean()

            # Decay the exploration noise every noise_decay_steps batches.
            if e % self.noise_decay_steps == 0:
                self.noise_std *= self.noise_decay
            self.logger.log_tabular('noise', self.noise_std)

            if e % self.assess_interval == 0 or e == args.batchs - 1:
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(),
                                                         val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = max(max_val_rate, val_rate)

                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y,
                                                          test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            summary_writer.add_summary(tf_summary(['mean-reward'], [score]),
                                       global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]),
                                       global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic.model.input[0]: states,
                    self.critic.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            # grad_ys has shape (batch, n_action).
            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward', '%.4f+%.4f' % (score, r.std()))
            self.logger.dump_tabular()

            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4f}'.format(self.noise_std),
                               max_val_rate='{:.4f}'.format(max_val_rate),
                               val_rate='{:.4f}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
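
# Hedged sketch: a minimal, self-contained illustration of the reward and the
# one-step Bellman target that DDPG.train uses above. The `scores` and `labels`
# arrays below are made-up demonstration data, not part of the training
# pipeline, and this helper is never called by the trainer itself.
def _ddpg_reward_target_example():
    import numpy as np
    scores = np.array([[0.9], [0.2], [0.6]])  # hypothetical actor outputs in (0, 1)
    labels = np.array([1, 0, 1])              # hypothetical binary labels
    # Clipped log-odds reward: positive when a confident score agrees with the label.
    llr = np.clip(np.log(scores / (1 - scores) + 1e-6), -5, 5)
    rewards = np.where(labels == 1, llr.ravel(), -llr.ravel())
    # Every transition is terminal (all dones True), so the Bellman target
    # reduces to the reward and the critic regresses Q(s, a) onto it directly.
    q_target = rewards.reshape(-1, 1)
    return rewards, q_target
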
class TD3(object):
    """Twin Delayed Deep Deterministic Policy Gradient (TD3)."""

    def __init__(self,
                 n_state,
                 n_action,
                 a_bound,
                 discount=0.99,
                 tau=0.05,
                 actor_lr=0.001,
                 critic_lr=0.001,
                 policy_freq=2,
                 exp_noise_std=0.1,
                 noise_decay=0.9995,
                 noise_decay_steps=1000,
                 smooth_noise_std=0.1,
                 clip=0.2,
                 buffer_size=20000,
                 save_interval=5000,
                 assess_interval=20,
                 logger=None,
                 checkpoint_queen=None):
        self.logger = logger
        self.logger.save_config(locals())
        self.n_action = n_action
        self.n_state = n_state
        self.a_bound = a_bound
        self.noise_std = exp_noise_std
        self.noise_decay = noise_decay
        self.noise_decay_steps = noise_decay_steps
        self.policy_freq = policy_freq
        self.smooth_noise_std = smooth_noise_std
        self.clip = clip
        self.discount = discount
        self.pointer = 0
        self.buffer = MemoryBuffer(buffer_size, with_per=True)
        self.save_interval = save_interval
        self.assess_interval = assess_interval
        self.actor = Actor(self.n_state, self.n_action, gamma=discount,
                           lr=actor_lr, tau=tau)
        self.critic1 = Critic(self.n_state, self.n_action, gamma=discount,
                              lr=critic_lr, tau=tau)
        self.critic2 = Critic(self.n_state, self.n_action, gamma=discount,
                              lr=critic_lr, tau=tau)
        self.merge = self._merge_summary()
        self.ckpt_queen = checkpoint_queen
        self.prefix = self.__class__.__name__

    def _merge_summary(self):
        tf.summary.histogram('critic_output', self.critic1.model.output)
        tf.summary.histogram('actor_output', self.actor.model.output)
        tf.summary.histogram('critic_dense1',
                             self.critic1.model.get_layer('l1').weights[0])
        tf.summary.histogram('actor_dense1',
                             self.actor.model.get_layer('l1').weights[0])
        tf.summary.histogram('critic_dense2',
                             self.critic1.model.get_layer('l2').weights[0])
        tf.summary.histogram('actor_dense2',
                             self.actor.model.get_layer('l2').weights[0])
        return tf.summary.merge_all()

    def select_action(self, state):
        return self.actor.predict(state)

    def bellman_q_value(self, rewards, q_nexts, dones):
        """Use the Bellman equation to compute the critic targets."""
        q_target = np.zeros_like(rewards)
        for i in range(rewards.shape[0]):
            if dones[i]:
                q_target[i] = rewards[i]
            else:
                q_target[i] = rewards[i] + self.discount * q_nexts[i]
        return q_target

    def memorize(self, state, action, reward, done, new_state):
        """Store an experience in the memory buffer."""
        if self.buffer.with_per:
            # Prioritized replay: rank the sample by the TD error of critic1.
            q_val = reward
            q_val_t = self.critic1.target_predict(state, action)
            td_error = abs(q_val_t - q_val)[0]
        else:
            td_error = 0
        state = state.reshape(-1)
        action = action.reshape(-1)
        self.buffer.memorize(state, action, reward, done, new_state, td_error)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_actor(self, states):
        actions = self.actor.predict(states)
        grad_ys = self.critic1.gradients(states, actions)
        actor_output = self.actor.train(states, actions, grad_ys)
        self.actor.copy_weights()
        self.critic1.copy_weights()
        self.critic2.copy_weights()
        return grad_ys, actor_output

    def update_critic(self, states, actions, q_values):
        loss_names, loss_values = self.critic1.train_on_batch(
            states, actions, q_values)
        self.critic2.train_on_batch(states, actions, q_values)
        return loss_names, loss_values

    def save_weights(self, path):
        self.actor.save(path)
        self.critic1.save(path)
        self.critic2.save(path)

    def save_model(self, path, file):
        self.actor.model.save(
            os.path.join(path, self.prefix + '_actor_' + file + '.h5'))
        self.critic1.model.save(
            os.path.join(path, self.prefix + '_critic1_' + file + '.h5'))
        self.critic2.model.save(
            os.path.join(path, self.prefix + '_critic2_' + file + '.h5'))

    def checkpoint(self, path, step, metric_value):
        signature = str(step) + '_' + '{:.4}'.format(metric_value)
        to_delete, need_save = self.ckpt_queen.add((metric_value, signature))
        if to_delete:
            delete_actor = os.path.join(
                path, self.prefix + '_actor_' + to_delete[1] + '.h5')
            delete_critic1 = os.path.join(
                path, self.prefix + '_critic1_' + to_delete[1] + '.h5')
            delete_critic2 = os.path.join(
                path, self.prefix + '_critic2_' + to_delete[1] + '.h5')
            os.remove(delete_actor)
            os.remove(delete_critic1)
            os.remove(delete_critic2)
        if need_save:
            self.save_model(path, signature)

    def train(self, args, summary_writer, train_data=None, val_data=None,
              test_data=None):
        results = []
        max_val_rate = 0
        val_data = np.asarray(val_data)  # None becomes array(None)
        tqdm_e = tqdm(range(args.batchs), desc='score', leave=True,
                      unit="epoch")
        if train_data is None:
            dataset = CsvBuffer(args.file_dir, args.reg_pattern,
                                chunksize=args.batch_size)
            assert dataset.is_buffer_available, \
                'neither train_data nor csv buffer is available'
        else:
            dataset = Dataset(train_data, 1, shuffle=True)
        warm_up = 20 * args.batch_size

        for e in tqdm_e:
            batch_data = next(dataset)
            states, labels = batch_data[:, :-1], batch_data[:, -1].astype(int)

            # Exploration: Gaussian noise on the deterministic action, clipped
            # to the action bounds.
            a = self.select_action(states)  # (batch, n_action)
            a = np.clip(a + np.random.normal(0, self.noise_std, size=a.shape),
                        self.a_bound[0], self.a_bound[1])

            # Asymmetric log-odds reward: predictions on the wrong side of the
            # label are penalized twice as heavily as correct ones are rewarded.
            llr = np.clip(np.log(a / (1 - a) + 1e-6), -5, 5).ravel()
            rewards = np.where(labels == 1,
                               np.where(llr > 0, llr, 2 * llr),
                               np.where(llr < 0, -llr, -2 * llr))  # (batch,)

            # Multi-step targets (target policy smoothing + clipped double-Q,
            # i.e. min of the two target critics on the smoothed target action)
            # are not used here because every transition is terminal.
            self.memorize(states, a, rewards, True, None)
            if e < warm_up:
                continue

            states, a, rewards, _, _, _ = self.sample_batch(args.batch_size)
            q_ = self.bellman_q_value(rewards=rewards, q_nexts=0,
                                      dones=[True] * rewards.shape[0])  # (batch,)
            loss_names, loss_values = self.update_critic(
                states, a, q_.reshape(-1, 1))

            # Delayed policy updates: train the actor every policy_freq steps.
            if e % self.policy_freq == 0 or e == warm_up:
                grad_ys, actor_output = self.update_actor(states)

            # Decay the exploration noise every noise_decay_steps batches.
            if e % self.noise_decay_steps == 0 or e == warm_up:
                self.noise_std *= self.noise_decay
            self.logger.log_tabular('noise', self.noise_std)

            if (e % self.assess_interval == 0 or e == args.batchs - 1
                    or e == warm_up):
                if val_data is not None:
                    val_pred = self.actor.predict(val_data[:, :-1])
                    val_y = val_data[:, -1]
                    val_rate, top_k = top_ratio_hit_rate(val_y.ravel(),
                                                         val_pred.ravel())
                    self.logger.log_tabular('val_rate', val_rate)
                    self.logger.log_tabular('val_k', int(top_k))
                    self.checkpoint(args.model_path, e, val_rate)
                    max_val_rate = max(max_val_rate, val_rate)

                if test_data is not None:
                    test_pred = self.actor.predict(test_data[:, :-1])
                    test_y = test_data[:, -1]
                    test_rate, top_k = top_ratio_hit_rate(test_y,
                                                          test_pred.ravel())
                    self.logger.log_tabular('test_rate', test_rate)
                    self.logger.log_tabular('test_k', int(top_k))

            score = rewards.mean()
            summary_writer.add_summary(tf_summary(['mean-reward'], [score]),
                                       global_step=e)
            summary_writer.add_summary(tf_summary(loss_names, [loss_values]),
                                       global_step=e)
            merge = keras.backend.get_session().run(
                self.merge,
                feed_dict={
                    self.critic1.model.input[0]: states,
                    self.critic1.model.input[1]: a,
                    self.actor.model.input: states
                })
            summary_writer.add_summary(merge, global_step=e)

            for name, val in zip(loss_names, [loss_values]):
                self.logger.log_tabular(name, val)
            # grad_ys has shape (batch, n_action).
            self.logger.log_tabular(
                'dQ/da', '%.4f+%.4f' % (grad_ys.mean(), grad_ys.std()))
            self.logger.log_tabular(
                'aout',
                '%.4f+%.4f' % (actor_output[0].mean(), actor_output[0].std()))
            self.logger.log_tabular('aloss', '%.4f' % (actor_output[1]))
            self.logger.log_tabular('reward',
                                    '%.4f+%.4f' % (score, rewards.std()))
            self.logger.dump_tabular()

            tqdm_e.set_description("score: " + '{:.4f}'.format(score))
            tqdm_e.set_postfix(noise_std='{:.4}'.format(self.noise_std),
                               max_val_rate='{:.4}'.format(max_val_rate),
                               val_rate='{:.4}'.format(val_rate),
                               top_k=top_k)
            tqdm_e.refresh()
        return results
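
# Hedged sketch: the clipped double-Q target with target policy smoothing that
# TD3 prescribes for multi-step episodes, corresponding to the target
# computation described in the comments of TD3.train above (the trainer itself
# treats every transition as terminal and never calls this). The `actor`,
# `critic1`, `critic2`, `next_states`, `rewards`, and `dones` arguments are
# assumptions standing in for this class's attributes and replay-batch arrays;
# the smoothing noise is clipped symmetrically to [-clip, clip], as in the TD3
# paper.
def _td3_target_example(actor, critic1, critic2, next_states, rewards, dones,
                        discount=0.99, smooth_noise_std=0.1, clip=0.2):
    import numpy as np
    # Target policy smoothing: perturb the target actor's action with clipped noise.
    a_next = actor.target_predict(next_states)
    noise = np.clip(np.random.normal(0, smooth_noise_std, size=a_next.shape),
                    -clip, clip)
    a_next = a_next + noise
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    q1 = critic1.target_predict(next_states, a_next)
    q2 = critic2.target_predict(next_states, a_next)
    q_next = np.minimum(q1, q2).ravel()
    # One-step Bellman target; terminal transitions do not bootstrap.
    dones = np.asarray(dones, dtype=float)
    return np.asarray(rewards).ravel() + discount * (1.0 - dones) * q_next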