class DDQN: """ Deep Q-Learning Main Algorithm """ def __init__(self, action_dim, state_dim, args): """ Initialization """ # Environment and DDQN parameters self.with_per = args.with_per self.action_dim = action_dim self.state_dim = (args.consecutive_frames,) + state_dim # self.lr = 2.5e-4 self.gamma = 0.95 self.epsilon = 0.8 self.epsilon_decay = 0.99 self.buffer_size = 20000 # if(len(state_dim) < 3): self.tau = 1e-2 else: self.tau = 1.0 # Create actor and critic networks self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling) # Memory Buffer for Experience Replay self.buffer = MemoryBuffer(self.buffer_size, args.with_per) def policy_action(self, s): """ Apply an espilon-greedy policy to pick next action """ if random() <= self.epsilon: return randrange(self.action_dim) else: return np.argmax(self.agent.predict(s)[0]) def train_agent(self, batch_size): """ Train Q-network on batch sampled from the buffer """ # Sample experience from memory buffer (optionally with PER) s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size) # Apply Bellman Equation on batch samples to train our DDQN q = self.agent.predict(s) next_q = self.agent.predict(new_s) q_targ = self.agent.target_predict(new_s) for i in range(s.shape[0]): old_q = q[i, a[i]] if d[i]: q[i, a[i]] = r[i] else: next_best_action = np.argmax(next_q[0,:]) q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action] if(self.with_per): # Update PER Sum Tree self.buffer.update(idx[i], abs(old_q - q[i, a[i]])) # Train on batch self.agent.fit(s, q) # Decay epsilon self.epsilon *= self.epsilon_decay def train(self, env, args, summary_writer): """ Main DDQN Training Algorithm """ results = [] tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: # Reset episode time, cumul_reward, done = 0, 0, False old_state = env.reset() while not done: if args.render: env.render() # Actor picks an action (following the policy) a = self.policy_action(old_state) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, _ = env.step(a) # Memorize for experience replay self.memorize(old_state, a, r, done, new_state) # Update current state old_state = new_state cumul_reward += r time += 1 # Train DDQN and transfer weights to target network if(self.buffer.size() > args.batch_size): self.train_agent(args.batch_size) self.agent.transfer_weights() # Gather stats every episode for plotting if(args.gather_stats): mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard score = tfSummary('score', cumul_reward) summary_writer.add_summary(score, global_step=e) summary_writer.flush() # Display score tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() return results def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer """ if(self.with_per): q_val = self.agent.predict(new_state) q_val_t = self.agent.target_predict(new_state) next_best_action = np.argmax(q_val) new_val = reward + self.gamma * q_val_t[0, next_best_action] td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(state, action, reward, done, new_state, td_error)
class DDQN: """ Deep Q-Learning Main Algorithm 深度Q学习主要算法 """ def __init__(self, action_dim, state_dim, args): """ Initialization 初始化 """ # Environment and DDQN parameters 环境和DDQN参数 self.with_per = args.with_per self.action_dim = action_dim self.state_dim = (args.consecutive_frames, ) + state_dim # self.lr = 2.5e-4 self.gamma = 0.95 self.epsilon = 0.8 self.epsilon_decay = 0.99 self.buffer_size = 20000 # if (len(state_dim) < 3): self.tau = 1e-2 else: self.tau = 1.0 # Create actor and critic networks 建立演员和评论家网络 self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling) # Memory Buffer for Experience Replay 用于经验重播的内存缓冲区 self.buffer = MemoryBuffer(self.buffer_size, args.with_per) def policy_action(self, s): """ Apply an espilon-greedy policy to pick next action 应用epsilon-greedy策略选择下一步操作 """ if random() <= self.epsilon: return randrange(self.action_dim) else: return np.argmax(self.agent.predict(s)[0]) def train_agent(self, batch_size): """ Train Q-network on batch sampled from the buffer 从缓冲区采样的批次训练Q网络 """ # Sample experience from memory buffer (optionally with PER) 来自内存缓冲区的示例体验(可选配PER) s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size) # Apply Bellman Equation on batch samples to train our DDQN 在批次样本里应用Bellman方程来训练我们的DDQN q = self.agent.predict(s) next_q = self.agent.predict(new_s) q_targ = self.agent.target_predict(new_s) for i in range(s.shape[0]): old_q = q[i, a[i]] if d[i]: q[i, a[i]] = r[i] else: next_best_action = np.argmax(next_q[i, :]) q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action] if (self.with_per): # Update PER Sum Tree 更新PER Sum树 self.buffer.update(idx[i], abs(old_q - q[i, a[i]])) # Train on batch 批量训练 self.agent.fit(s, q) # Decay epsilon 衰变epsilon self.epsilon *= self.epsilon_decay def train(self, env, args, summary_writer): """ Main DDQN Training Algorithm DDQN主要训练算法 """ results = [] tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: # Reset episode 重设episode time, cumul_reward, done = 0, 0, False old_state = env.reset() while not done: if args.render: env.render() # Actor picks an action (following the policy) 演员选择动作(遵循政策) a = self.policy_action(old_state) # Retrieve new state, reward, and whether the state is terminal 检索新状态,奖励以及该状态是否为终端 new_state, r, done, _ = env.step(a) # Memorize for experience replay 保存经验重播 self.memorize(old_state, a, r, done, new_state) # Update current state 更新当前状态 old_state = new_state cumul_reward += r time += 1 # Train DDQN and transfer weights to target network 训练DDQN并将权重转移到目标网络 if (self.buffer.size() > args.batch_size): self.train_agent(args.batch_size) self.agent.transfer_weights() # Gather stats every episode for plotting 收集每个情节的统计数据以进行绘图 if (args.gather_stats): mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard 为Tensorboard导出结果 score = tfSummary('score', cumul_reward) summary_writer.add_summary(score, global_step=e) summary_writer.flush() # Display score 显示分数 tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() return results def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer 将经验存储在内存缓冲区中 """ if (self.with_per): q_val = self.agent.predict(state) q_val_t = self.agent.target_predict(new_state) next_best_action = np.argmax(self.agent.predict(new_state)) new_val = reward + self.gamma * q_val_t[0, next_best_action] td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(state, action, reward, done, new_state, td_error) def 
save_weights(self, path): path += '_LR_{}'.format(self.lr) if (self.with_per): path += '_PER' self.agent.save(path) def load_weights(self, path): self.agent.load_weights(path)
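# The DDQN classes above assume a MemoryBuffer that supports both uniform replay
# and prioritized experience replay (PER): memorize(s, a, r, done, s', td_error),
# sample_batch(), update(idx, td_error) and size(). That class is not included
# here. The following is only a minimal sketch of such a buffer, using
# proportional priorities stored in a flat structure rather than the sum tree a
# production implementation (and some variants below) would use; all internals
# are assumptions, not the original code.
import numpy as np
from collections import deque


class MemoryBuffer:
    """Minimal sketch of the replay buffer assumed by the agents above."""

    def __init__(self, buffer_size, with_per=False, alpha=0.6, eps=1e-6):
        self.with_per = with_per
        self.alpha = alpha          # priority exponent
        self.eps = eps              # keeps priorities strictly positive
        self.buffer = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)

    def memorize(self, state, action, reward, done, new_state, td_error=0):
        # New samples get at least the current max priority so they are replayed once
        p = (abs(td_error) + self.eps) ** self.alpha if self.with_per else 1.0
        if self.with_per and len(self.priorities) > 0:
            p = max(p, max(self.priorities))
        self.buffer.append((state, action, reward, done, new_state))
        self.priorities.append(p)

    def sample_batch(self, batch_size):
        probs = None
        if self.with_per:
            probs = np.array(self.priorities, dtype=np.float64)
            probs = probs / probs.sum()
        idx = np.random.choice(len(self.buffer), batch_size, p=probs)
        s, a, r, d, ns = zip(*[self.buffer[i] for i in idx])
        return (np.array(s), np.array(a), np.array(r),
                np.array(d), np.array(ns), idx)

    def update(self, idx, td_error):
        # Called from train_agent() to refresh the priority of one sample
        self.priorities[idx] = (abs(td_error) + self.eps) ** self.alpha

    def size(self):
        return len(self.buffer)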
class DDPG: """ Deep Deterministic Policy Gradient (DDPG) Helper Class """ def __init__(self, act_dim, env_dim, act_range, k, buffer_size = 20000, gamma = 0.99, lr = 0.00005, tau = 0.001): """ Initialization """ # Environment and A2C parameters self.act_dim = act_dim self.act_range = act_range self.env_dim = (k,) + env_dim self.gamma = gamma self.lr = lr # Create actor and critic networks self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau) self.critic = Critic(self.env_dim, act_dim, lr, tau) self.buffer = MemoryBuffer(buffer_size) def policy_action(self, s): """ Use the actor to predict value """ return self.actor.predict(s)[0] def bellman(self, rewards, q_values, dones): """ Use the Bellman Equation to compute the critic target """ critic_target = np.asarray(q_values) for i in range(q_values.shape[0]): if dones[i]: critic_target[i] = rewards[i] else: critic_target[i] = rewards[i] + self.gamma * q_values[i] return critic_target def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer """ self.buffer.memorize(state, action, reward, done, new_state) def sample_batch(self, batch_size): return self.buffer.sample_batch(batch_size) def update_models(self, states, actions, critic_target): """ Update actor and critic networks from sampled experience """ # Train critic self.critic.train_on_batch(states, actions, critic_target) # Q-Value Gradients under Current Policy actions = self.actor.model.predict(states) grads = self.critic.gradients(states, actions) # Train actor self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim))) # Transfer weights to target networks at rate Tau self.actor.transfer_weights() self.critic.transfer_weights() def train(self, env, args, summary_writer): results = [] # First, gather experience tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: # Reset episode time, cumul_reward, done = 0, 0, False old_state = env.reset() actions, states, rewards = [], [], [] noise = OrnsteinUhlenbeckProcess(size=self.act_dim) while not done: if args.render: env.render() # Actor picks an action (following the deterministic policy) a = self.policy_action(old_state) # Clip continuous values to be valid w.r.t. 
environment a = np.clip(a+noise.generate(time), -self.act_range, self.act_range) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, _ = env.step(a) # Add outputs to memory buffer self.memorize(old_state, a, r, done, new_state) # Sample experience from buffer states, actions, rewards, dones, new_states, _ = self.sample_batch(args.batch_size) # Predict target q-values using target networks q_values = self.critic.target_predict([new_states, self.actor.target_predict(new_states)]) # Compute critic target critic_target = self.bellman(rewards, q_values, dones) # Train both networks on sampled batch, update target networks self.update_models(states, actions, critic_target) # Update current state old_state = new_state cumul_reward += r time += 1 # Gather stats every episode for plotting if(args.gather_stats): mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard score = tfSummary('score', cumul_reward) summary_writer.add_summary(score, global_step=e) summary_writer.flush() # Display score tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() return results def save_weights(self, path): path += '_LR_{}'.format(self.lr) self.actor.save(path) self.critic.save(path) def load_weights(self, path_actor, path_critic): self.critic.load_weights(path_critic) self.actor.load_weights(path_actor)
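# The DDPG training loop above perturbs the deterministic policy with an
# OrnsteinUhlenbeckProcess whose generate(t) method is called every step.
# The class itself is not part of this listing; below is a minimal sketch of a
# time-correlated OU process matching that call site. The parameter values
# (theta, mu, sigma, dt) are assumptions, and variants further below use a
# slightly different interface (x0=..., apply_ou(t)) that this sketch does not cover.
import numpy as np


class OrnsteinUhlenbeckProcess:
    """Sketch of OU exploration noise: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2, dt=1e-2, x0=None):
        self.size = size
        self.theta = theta
        self.mu = mu
        self.sigma = sigma
        self.dt = dt
        self.x = x0 if x0 is not None else np.zeros(size)

    def generate(self, t):
        # t is accepted for compatibility with the loop above; the internal
        # state x is what carries the temporal correlation between calls.
        dx = self.theta * (self.mu - self.x) * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x = self.x + dx
        return self.x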
class DDQN: """ Deep Q-Learning Main Algorithm """ def __init__(self, action_dim, state_dim, args): """ Initialization """ # Environment and DDQN parameters self.with_per = args.with_per self.action_dim = action_dim self.state_dim = state_dim # self.lr = 2.5e-4 self.gamma = 0.95 self.epsilon = 0.8 self.epsilon_decay = 0.99 self.buffer_size = 20000 # self.tau = 1e-2 # Create actor and critic networks self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling, args.hidden_dim) # Memory Buffer for Experience Replay self.buffer = MemoryBuffer(self.buffer_size, args.with_per) def policy_action(self, s): """ Apply an espilon-greedy policy to pick next action """ if random() <= self.epsilon: return randrange(self.action_dim) else: return np.argmax(self.agent.predict(s)[0]) def train_agent(self, batch_size): """ Train Q-network on batch sampled from the buffer """ # Sample experience from memory buffer (optionally with PER) s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size) # Apply Bellman Equation on batch samples to train our DDQN q = self.agent.predict(s) next_q = self.agent.predict(new_s) q_targ = self.agent.target_predict(new_s) for i in range(s.shape[0]): old_q = q[i, a[i]] if d[i]: q[i, a[i]] = r[i] else: next_best_action = np.argmax(next_q[i, :]) q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action] if (self.with_per): # Update PER Sum Tree self.buffer.update(idx[i], abs(old_q - q[i, a[i]])) # Train on batch self.agent.fit(s, q) # Decay epsilon self.epsilon *= self.epsilon_decay def train(self, env, args, summary_writer, envtest=None): """ Main DDQN Training Algorithm """ results = [] tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes") epoch = 0 gross_profit = 0 WritetoCsvFile("logFile_1.csv", [ "stage", "file", "history_win", "stop", "usevol", "dueling", "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2", "madifference", "hidema", "candlenum", "hidden_dim", "maxProfit", "maxLOSS", "avgProfit", "avgLOSS", "countprofit", "countloss", "maxdrop", "Total profit", "total_reward", "TRADES", "epoch" ]) WritetoCsvFile("logFileDetail.csv", [ "stage", "file", "history_win", "stop", "usevol", "dueling", "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2", "madifference", "hidema", "candlenum", "hidden_dim", 'maxProfit', 'maxLOSS', 'avgProfit', 'avgLOSS', 'maxdrop', 'Total profit', 'gross profit', "total_reward", 'TRADES', 'epoch' ]) for e in tqdm_e: # Reset episode time, cumul_reward, done = 0, 0, False old_state = env.reset() ########################################## total_reward = 0 total_profit = 0 total_loss = 0 total_profitMax = 0 total_profitMin = 0 max_drop = 0 profitLst = [] lossLst = [] trades = 0 step = 0 #####################################3#### while not done: #if args.render: env.render() # Actor picks an action (following the policy) a = self.policy_action(old_state) # Retrieve new state, reward, and whether the state is terminal #new_state, r, done, _ = env.step(a) ####################################################### new_state, r, done, buy, sell, profit = env.step(a) total_reward += r if profit != 0: trades += 1 total_profit += profit if total_profit > total_profitMax: total_profitMax = total_profit total_profitMin = total_profit if total_profit < total_profitMin: total_profitMin = total_profit try: if total_profitMax != 0 and max_drop < ( total_profitMax - total_profitMin) / total_profitMax: max_drop = (total_profitMax - total_profitMin) / total_profitMax except: max_drop = 0 if 
profit > 0: profitLst.append(profit) elif profit < 0: lossLst.append(profit) step += 1 if step % 1500 == 0: print( 'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {} ' .format(np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), max_drop, total_profit, gross_profit, trades)) WritetoCsvFile("logFileDetail.csv", [ "train", args.trainf, args.history_win, args.stop, args.usevol, args.dueling, args.traineval, args.allprices, args.allprices2, args.allprices3, args.ma1, args.ma2, args.madifference, args.hidema, args.candlenum, args.hidden_dim, np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), max_drop, total_profit, gross_profit, total_reward, trades, epoch ]) #done = True if step == len(env.data) - 3 else False ###################################################### # Memorize for experience replay self.memorize(old_state, a, r, done, new_state) # Update current state old_state = new_state cumul_reward += r time += 1 # Train DDQN and transfer weights to target network if (self.buffer.size() > args.batch_size): self.train_agent(args.batch_size) self.agent.transfer_weights() gross_profit += total_profit # Gather stats every episode for plotting if (args.gather_stats): mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard score = tfSummary('score', cumul_reward) l_profit = tfSummary('profit', total_profit) l_aprofit = tfSummary('average profit', np.mean(profitLst)) l_aloss = tfSummary('l_aloss', -np.mean(lossLst)) l_trades = tfSummary('l_trades', trades) np.mean(profitLst), -np.mean(lossLst) summary_writer.add_summary(score, global_step=e) summary_writer.add_summary(l_profit, global_step=e) summary_writer.add_summary(l_aprofit, global_step=e) summary_writer.add_summary(l_aloss, global_step=e) summary_writer.add_summary(l_trades, global_step=e) summary_writer.flush() # Display score tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() self.agent.saveModel("./models/model_ep", "") results = [ np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), len(profitLst), len(lossLst), max_drop, total_profit, total_reward, trades ] WritetoCsvFile("logFile_1.csv", [ "train", args.trainf, args.history_win, args.stop, args.usevol, args.dueling, args.traineval, args.allprices, args.allprices2, args.allprices3, args.ma1, args.ma2, args.madifference, args.hidema, args.candlenum, args.hidden_dim ] + results + [epoch]) if envtest: # Если задано окружение для тестирования то тестируем каждую эпоху newargs = args newargs.traineval = False self.evaluate(envtest, newargs, summary_writer, model=None, epoch=epoch) epoch += 1 return results def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer """ if (self.with_per): q_val = self.agent.predict(state) q_val_t = self.agent.target_predict(new_state) next_best_action = np.argmax(q_val) new_val = reward + self.gamma * q_val_t[0, next_best_action] td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(state, action, reward, done, new_state, td_error) def evaluate(self, env, args, summary_writer, model, epoch=0): """ Evaluate """ results = [] if model: self.agent.loadModel_versoin(model, "") done = False old_state = env.reset() ########################################## total_reward = 0 total_profit = 0 total_loss = 0 total_profitMax = 0 total_profitMin 
= 0 max_drop = 0 profitLst = [] lossLst = [] step = 0 trades = 0 #####################################3#### while not done: # if args.render: env.render() # Actor picks an action (following the policy) a = self.policy_action(old_state) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, buy, sell, profit = env.step(a) ####################################################### total_reward += r if profit != 0: trades += 1 total_profit += profit if total_profit > total_profitMax: total_profitMax = total_profit total_profitMin = total_profit if total_profit < total_profitMin: total_profitMin = total_profit try: if total_profitMax != 0 and max_drop < ( total_profitMax - total_profitMin) / total_profitMax: max_drop = (total_profitMax - total_profitMin) / total_profitMax except: max_drop = 0 if profit > 0: profitLst.append(profit) elif profit < 0: lossLst.append(profit) step += 1 if step % 1500 == 0: print( 'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {} Total reward: {} TRADES: {} ' .format(np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), max_drop, total_profit, total_reward, trades)) WritetoCsvFile("logFileDetail.csv", [ "eval", args.trainf, args.history_win, args.stop, args.usevol, args.dueling, args.traineval, args.allprices, args.allprices2, args.allprices3, args.ma1, args.ma2, args.madifference, args.hidema, args.candlenum, args.hidden_dim, np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), max_drop, total_profit, total_profit, total_reward, trades, epoch ]) #done = True if step == len(env.data) - 2 else False ###################################################### # Memorize for experience replay if args.traineval: self.memorize(old_state, a, r, done, new_state) # Train DDQN and transfer weights to target network if (self.buffer.size() > args.batch_size): self.train_agent(args.batch_size) self.agent.transfer_weights() # Update current state old_state = new_state print( 'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {} Total reward: {} TRADES: {} ' .format(np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), max_drop, total_profit, total_reward, trades)) results = [ np.max(profitLst + [0]), -np.min(lossLst + [0]), np.mean(profitLst + [0]), -np.mean(lossLst + [0]), len(profitLst), len(lossLst), max_drop, total_profit, total_reward, trades ] WritetoCsvFile("logFile_1.csv", [ "eval", args.trainf, args.history_win, args.stop, args.usevol, args.dueling, args.traineval, args.allprices, args.allprices2, args.allprices3, args.ma1, args.ma2, args.madifference, args.hidema, args.candlenum, args.hidden_dim ] + results + [epoch]) return results
class ddpgAgent(): """Deep Deterministic Policy Gradient(DDPG) Agent """ def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True): # gym environments self.env = env_ self.discrete = is_discrete self.obs_dim = env_.observation_space.shape[0] self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[ 0] self.action_bound = (env_.action_space.high - env_.action_space.low ) / 2 if not is_discrete else 1. self.action_shift = (env_.action_space.high + env_.action_space.low ) / 2 if not is_discrete else 0. # initialize actor & critic and its targets self.discount_factor = 0.99 self.actor = ActorNet(self.obs_dim, self.act_dim, self.action_bound, lr_=1e-4, tau_=1e-3) self.critic = CriticNet(self.obs_dim, self.act_dim, lr_=1e-3, tau_=1e-3, discount_factor=self.discount_factor) # Experience Buffer self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per) self.with_per = w_per self.batch_size = batch_size # OU-Noise-Process self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim) ################################################### # Network Related ################################################### def make_action(self, obs, t, noise=True): """ predict next action from Actor's Policy """ action_ = self.actor.predict(obs)[0] a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound) return a def update_networks(self, obs, acts, critic_target): """ Train actor & critic from sampled experience """ # update critic self.critic.train(obs, acts, critic_target) # get next action and Q-value Gradient n_actions = self.actor.network.predict(obs) q_grads = self.critic.Qgradient(obs, n_actions) # update actor self.actor.train(obs, self.critic.network, q_grads) # update target networks self.actor.target_update() self.critic.target_update() def replay(self, replay_num_): if self.with_per and (self.buffer.size() <= self.batch_size): return for _ in range(replay_num_): # sample from buffer states, actions, rewards, dones, new_states, idx = self.sample_batch( self.batch_size) # get target q-value using target network q_vals = self.critic.target_predict( [new_states, self.actor.target_predict(new_states)]) # bellman iteration for target critic value critic_target = np.asarray(q_vals) for i in range(q_vals.shape[0]): if dones[i]: critic_target[i] = rewards[i] else: critic_target[ i] = self.discount_factor * q_vals[i] + rewards[i] if self.with_per: self.buffer.update(idx[i], abs(q_vals[i] - critic_target[i])) # train(or update) the actor & critic and target networks self.update_networks(states, actions, critic_target) #################################################### # Buffer Related #################################################### def memorize(self, obs, act, reward, done, new_obs): """store experience in the buffer """ if self.with_per: q_val = self.critic.network( [np.expand_dims(obs, axis=0), self.actor.predict(obs)]) next_action = self.actor.target_network.predict( np.expand_dims(new_obs, axis=0)) q_val_t = self.critic.target_predict( [np.expand_dims(new_obs, axis=0), next_action]) new_val = reward + self.discount_factor * q_val_t td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(obs, act, reward, done, new_obs, td_error) def sample_batch(self, batch_size): """ Sampling from the batch """ return self.buffer.sample_batch(batch_size) ################################################### # Save & Load Networks ################################################### def save_weights(self, path): """ Agent's Weights 
Saver """ self.actor.save_network(path) self.critic.save_network(path) def load_weights(self, pretrained): """ Agent's Weights Loader """ self.actor.load_network(pretrained) self.critic.load_network(pretrained)
class DDPG: """ Deep Deterministic Policy Gradient (DDPG) Helper Class """ def __init__(self, act_dim, env_dim, act_range, k, buffer_size=20000, gamma=0.99, lr=0.00005, tau=0.001): """ Initialization """ # Environment and A2C parameters self.act_dim = act_dim self.act_range = act_range self.env_dim = (40, ) self.gamma = gamma self.lr = lr # Create actor and critic networks self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau) self.critic = Critic(self.env_dim, act_dim, lr, tau) self.buffer = MemoryBuffer(buffer_size) def policy_action(self, s): """ Use the actor to predict value """ return self.actor.predict(s)[0] def bellman(self, rewards, q_values, dones): """ Use the Bellman Equation to compute the critic target """ critic_target = np.asarray(q_values) for i in range(q_values.shape[0]): if dones[i]: critic_target[i] = rewards[i] else: critic_target[i] = rewards[i] + self.gamma * q_values[i] return critic_target def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer """ self.buffer.memorize(state, action, reward, done, new_state) def sample_batch(self, batch_size): return self.buffer.sample_batch(batch_size) def update_models(self, states, actions, critic_target): """ Update actor and critic networks from sampled experience """ # Train critic self.critic.train_on_batch(states, actions, critic_target) # Q-Value Gradients under Current Policy actions = self.actor.model.predict(states) grads = self.critic.gradients(states, actions) # Train actor self.actor.train(states, actions, np.array(grads).reshape((-1, self.act_dim))) # Transfer weights to target networks at rate Tau self.actor.transfer_weights() self.critic.transfer_weights() def train(self, summary_writer): env = CarEnv() results = [] i = 0 # First, gather experience tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: # Reset episode time, cumul_reward, done = 0, 0, False old_state = env.reset() old_state = np.array(old_state).reshape(40, ) actions, states, rewards = [], [], [] noise = OrnsteinUhlenbeckProcess(size=self.act_dim) while not done: # if args.render: env.render() # Actor picks an action (following the deterministic policy) a = self.policy_action(old_state) # Clip continuous values to be valid w.r.t. 
environment a = np.clip(a + noise.generate(time), -self.act_range, self.act_range) a = float(a[0]) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, _ = env.step(a, time) print("Now r is {}".format(r)) # Add outputs to memory buffer temp_next = old_state.copy() temp_next[:4] = temp_next[4:8] temp_next[4:8] = temp_next[8:12] temp_next[8:12] = temp_next[12:16] temp_next[12:16] = temp_next[16:20] temp_next[16:20] = temp_next[20:24] temp_next[20:24] = temp_next[24:28] temp_next[24:28] = temp_next[28:32] temp_next[28:32] = temp_next[32:36] temp_next[32:36] = temp_next[36:40] temp_next[36:40] = new_state temp_next = np.array(temp_next).reshape(40, ) self.memorize(old_state, a, r, done, temp_next) old_state = temp_next.copy() cumul_reward += r time += 1 # since episode is over destroying actors in the scenario for actor in env.actor_list: actor.destroy() # Sample experience from buffer for i in range(50): states, actions, rewards, dones, new_states, _ = self.sample_batch( 64) # Predict target q-values using target networks q_values = self.critic.target_predict( [new_states, self.actor.target_predict(new_states)]) # Compute critic target critic_target = self.bellman(rewards, q_values, dones) # Train both networks on sampled batch, update target networks self.update_models(states, actions, critic_target) print("learning happened") # mean, stdev, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data = gather_stats(self, env, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data) mean, stdev = gather_stats(self, env) results.append([e, mean, stdev]) # Export results for Tensorboard print(cumul_reward) score = tfSummary('score', cumul_reward) summary_writer.add_summary(score, global_step=e) summary_writer.flush() # Display score tqdm_e.set_description("Score: " + str(cumul_reward)) tqdm_e.refresh() i += 1 if i % 10 == 0: df = pd.DataFrame(np.array(results)) df.to_csv("DDPG" + "/logs.csv", header=['Episode', 'Mean', 'Stddev'], float_format='%10.5f') return results def save_weights(self, path): path += '_LR_{}'.format(self.lr) self.actor.save(path) self.critic.save(path) def load_weights(self, path_actor, path_critic): self.critic.load_weights(path_critic) self.actor.load_weights(path_actor)
class Agent: """ Stock Trading Bot """ def __init__(self, buffer_size, state_size, action_size=3, learning_rate=0.001): # agent config self.buffer = MemoryBuffer(buffer_size, True) self.state_size = state_size self.action_size = action_size self.inventory = [] # model config self.gamma = 0.95 # affinity for long term reward self.loss = huber_loss self.optimizer = Adam(lr=learning_rate) # target network self.model = self._model() self.target_model = clone_model(self.model) self.target_model.set_weights(self.model.get_weights()) def _model(self): inputs = Input(shape=self.state_size) x = Dense(64, activation='relu')(inputs) x = Dense(128, activation='relu')(x) value = Dense(self.action_size, activation='linear')(x) a = Dense(self.action_size, activation='linear')(x) meam = Lambda(lambda x: K.mean(x, axis=1, keepdims=True))(a) advantage = Subtract()([a, meam]) q = Add()([value, advantage]) model = Model(inputs=inputs, outputs=q) model.compile(loss=self.loss, optimizer=self.optimizer) return model def remember(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) def act(self, state, epsilon, is_eval=False): # take random action in order to diversify experience at the beginning if not is_eval and random.random() <= epsilon: return random.randrange(self.action_size) state = state.reshape((-1, ) + self.state_size) action_probs = self.model.predict(state) return np.argmax(action_probs[0]) def epsilon_decay(self, epsilon, epsilon_min, epsilon_decay): if epsilon > epsilon_min: epsilon *= epsilon_decay return epsilon def remember_sumtree( self, state, action, reward, new_state, done, ): state = state.reshape((-1, ) + self.state_size) new_state = new_state.reshape((-1, ) + self.state_size) q_val = self.model.predict(state) q_val_t = self.target_model.predict(new_state) next_best_action = np.argmax(q_val) new_val = reward + self.gamma * q_val_t[0, next_best_action] td_error = abs(new_val - q_val + 1e-8)[0] self.buffer.memorize(state, action, reward, done, new_state, td_error) def target_model_update(self, done, tau=0.1, type='reset', reset_every=5000): if type == 'reset': if self.n_iter % reset_every == 0: print('update target model') # reset target model weights self.target_model.set_weights(self.model.get_weights()) if type == 'transfer': if done: W = self.model.get_weights() tgt_W = self.target_model.get_weights() for i in range(len(W)): tgt_W[i] = tau * W[i] + (1 - tau) * tgt_W[i] self.target_model.set_weights(tgt_W) def train_experience_replay_sumtree( self, batch_size, ): state, action, reward, done, new_state, idx = self.buffer.sample_batch( batch_size) state = state.reshape((-1, ) + self.state_size) new_state = new_state.reshape((-1, ) + self.state_size) q = self.model.predict(state) next_q = self.model.predict(new_state) q_targ = self.target_model.predict(new_state) for i in range(state.shape[0]): old_q = q[i, action[i]] if done[i]: q[i, action[i]] = reward[i] else: next_best_action = np.argmax(next_q[i, :]) q[i, action[i]] = reward[i] + self.gamma * q_targ[ i, next_best_action] self.buffer.update(idx[i], abs(old_q - q[i, action[i]])) loss = self.model.fit((state), q, epochs=1, verbose=0).history["loss"][0] return loss def save(self, name): if not os.path.exists('save/' + name): os.makedirs('save/' + name) np.save('save/' + name + '/data.npy', self.buffer.buffer.data) np.save('save/' + name + '/tree.npy', self.buffer.buffer.tree) self.model.save('save/' + name + '/model.h5') self.target_model.save('save/' + name + '/target_model.h5') 
else: print('already exist, please check.') def load(self, name): if not os.path.exists('save/' + name): print('not exist, please check.') else: self.buffer.buffer.data = np.load('save/' + name + '/data.npy', allow_pickle=True) self.buffer.buffer.tree = np.load('save/' + name + '/tree.npy', allow_pickle=True) self.model = load_model('save/' + name + '/model.h5') self.target_model = load_model('save/' + name + '/target_model.h5')
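# Sketch of how the stock-trading Agent above could be driven. The fake price
# series, the hold/buy/sell action mapping, the reward construction, and the
# assumption that the PER MemoryBuffer exposes size() are all placeholders for
# illustration; only the Agent method calls themselves come from the class as written.
import numpy as np

window = 10
agent = Agent(buffer_size=10000, state_size=(window,))
prices = np.cumsum(np.random.randn(1000)) + 100.0   # synthetic price series (placeholder)
epsilon, eps_min, eps_decay = 1.0, 0.01, 0.995

for t in range(window, len(prices) - 1):
    state = prices[t - window:t]
    action = agent.act(state, epsilon)               # 0: hold, 1: buy, 2: sell (assumed mapping)
    reward = 0.0
    if action == 1:
        agent.inventory.append(prices[t])
    elif action == 2 and agent.inventory:
        reward = prices[t] - agent.inventory.pop(0)  # realized profit as reward
    next_state = prices[t - window + 1:t + 1]
    done = t == len(prices) - 2
    agent.remember_sumtree(state, action, reward, next_state, done)
    if agent.buffer.size() > 64:                     # assumes the buffer exposes size()
        agent.train_experience_replay_sumtree(64)
        # use the soft-update path; the 'reset' path relies on an n_iter counter
        agent.target_model_update(done, tau=0.1, type='transfer')
    epsilon = agent.epsilon_decay(epsilon, eps_min, eps_decay)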
class DDPG(object):
    """ Deep Deterministic Policy Gradient (DDPG) Helper Class
    """

    def __init__(self, action_dim, state_dim, batch_size, step, buffer_size,
                 train_indicator, episode, gamma, lra, lrc, tau, load_weight=True):
        """ Initialization
        """
        # Environment and A2C parameters
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = batch_size
        self.step = step
        self.gamma = gamma
        self.lra = lra
        self.lrc = lrc
        self.tau = tau
        self.episode = episode
        self.train_indicator = train_indicator

        # Create actor and critic networks
        self.actor = Actor(state_dim, action_dim, batch_size, lra, tau)
        self.critic = Critic(state_dim, action_dim, batch_size, lrc, tau)
        self.buffer = MemoryBuffer(buffer_size)

        # !: the weights folder needs to be specified & ensure only one set of A&C weights is in this folder
        self.weights_dir_path = os.getcwd() + r"\saved_model\*.h5"

        if load_weight:
            try:
                weights_actor_path = ""
                weights_critic_path = ""
                weights_file_path = glob.glob(self.weights_dir_path)

                for file_path in weights_file_path:
                    if file_path.find("actor") < 0:
                        weights_critic_path = file_path
                    if file_path.find("critic") < 0:
                        weights_actor_path = file_path

                self.load_weights(weights_actor_path, weights_critic_path)

                print("")
                print("Actor-Critic Models are loaded with weights...")
                print("")
            except:
                print("")
                print("Weights failed to load, please check the weights loading path...")
                print("")

    def policy_action(self, s):
        """ Use the actor to predict value
        """
        return self.actor.predict(s)[0]

    def bellman(self, rewards, q_values, dones):
        """ Use the Bellman Equation to compute the critic target (one action only)
        """
        critic_target = np.asarray(q_values)
        for i in range(q_values.shape[0]):
            if dones[i]:
                critic_target[i] = rewards[i]
            else:
                critic_target[i] = rewards[i] + self.gamma * q_values[i]
        return critic_target

    def memorize(self, state_old, action, reward, done, state_new):
        """ Store experience in memory buffer
        """
        self.buffer.memorize(state_old, action, reward, done, state_new)

    def sample_batch(self, batch_size):
        return self.buffer.sample_batch(batch_size)

    def update_models(self, states, actions, critic_target):
        """ Update actor and critic networks from sampled experience
        """
        # Train critic
        self.critic.train_on_batch(states, actions, critic_target)
        # Q-Value Gradients under Current Policy
        actions = self.actor.model.predict(states)
        grads = self.critic.gradients(states, actions)
        # Train actor
        self.actor.train(states, actions, np.array(grads).reshape((-1, self.action_dim)))
        # Transfer weights to target networks at rate Tau
        self.actor.transfer_weights()
        self.critic.transfer_weights()

    def run(self, env):
        # First, gather experience
        for e in range(self.episode):
            # Reset episode: set initial state
            loss, cumul_reward, cumul_loss = 0, 0, 0
            done = False
            state_old = env.get_vissim_state(1, 180 * 5, [45, 55, 60, 65, 70, 75, 80])  # TODO: make sure states are received correctly
            actions, states, rewards = [], [], []
            print("Episode: ", e, " ========================:")

            for t in range(self.step):
                action_original = self.policy_action(state_old)
                # TODO: OU function params?
                noise = OrnsteinUhlenbeckProcess(x0=action_original, size=self.action_dim)
                # action = action_orig + noise
                action = noise.apply_ou(t)

                # adjust too-low or too-high action
                adj_action = np.zeros(len(action))
                for index, value in enumerate(action):
                    adj_action[index] = clip(value, -1, 1)

                # action_mapping function
                transformed_action = Transformation.convert_actions(adj_action)

                reward, state_new = env.get_vissim_reward(180 * 5, transformed_action)

                # TODO: if we know the optimal discharging rate, then we set that as done
                if t == self.step - 1:
                    # we consider the manually set last step as done
                    done = True

                # ============================================================ training section
                if self.train_indicator:
                    # Add outputs to memory buffer
                    self.memorize(state_old, adj_action, reward, done, state_new)
                    # Sample experience from buffer
                    states_old, actions, rewards, dones, states_new = self.sample_batch(self.batch_size)
                    # Predict target q-values using target networks
                    q_values = self.critic.target_predict([states_new, self.actor.target_predict(states_new)])
                    # Compute critic target
                    critic_target = self.bellman(rewards, q_values, dones)
                    # Train both networks on sampled batch, update target networks
                    self.update_models(states_old, actions, critic_target)
                    # calculate loss
                    loss = self.critic.train_on_batch(states_old, actions, critic_target)

                state_old = state_new
                cumul_reward += reward
                cumul_loss += loss

                # ============================================================ report
                print("|---> Step: ", t, " | Action: ", transformed_action, " | Reward: ", reward, " | Loss: ", loss)

            # ============================================================ save model
            if np.mod(e, 10) == 0:
                print("====================> Saving model...")
                self.save_weights("./saved_model/")
                """
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
                """

            print("")
            print("*-------------------------------------------------*")
            print("Average Accumulated Reward: " + str(cumul_reward / self.step))
            print("Average Accumulated Loss: " + str(cumul_loss / self.step))
            print("*-------------------------------------------------*")
            print("")

            # garbage recycling
            gc.collect()

    def save_weights(self, path):
        t = datetime.datetime.now()
        time = "_" + str(t.date()) + "_" + str(t.hour) + "h-" + str(t.minute) + "m"
        path_actor = path + '_LR_{}'.format(self.lra) + time
        path_critic = path + '_LR_{}'.format(self.lrc) + time
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load(path_actor)
        self.critic.load(path_critic)
class DDQN: """ Deep Q-Learning Main Algorithm """ def __init__(self, action_dim, state_dim, args, input_size, hp, export_path, env): """ Initialization """ self.export_path = export_path # Environment and DDQN parameters self.with_per = args.with_per self.action_dim = action_dim self.state_dim = (args.consecutive_frames, ) + state_dim # self.lr = hp["lr"] self.gamma = 0.99 # Exploration parameters for epsilon greedy strategy self.explore_start = self.epsilon = 1.0 # exploration probability at start self.explore_stop = 0.1 # minimum exploration probability self.decay_rate = 0.000001 # exponential decay rate for exploration prob self.buffer_size = 20000 self.input_size = input_size self.video_dir = args.video_dir # Create actor and critic networks self.agent = Agent(self.state_dim, action_dim, self.lr, args.dueling, input_size, args.load) # Memory Buffer for Experience Replay self.buffer = MemoryBuffer(self.buffer_size, args.with_per) try: # Init buffer threads = 16 p = Pool(processes=threads) while self.buffer.size() < self.buffer_size: # Set up threaded frame accumulation buffers = p.map_async(init_buffer, [env] * threads) datas = buffers.get() # Record in global memory for data in datas: for entry in data: self.memorize(*entry) # Mitigate memory leak del buffers del datas print("Buffer size: {}".format(self.buffer.size())) except KeyboardInterrupt: p.close() p.join() p.close() p.join() # Train on pure randomness for a while tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes") for e in tqdm_e: record = False if e % 100 == 0: record = True self.train_agent(args.batch_size, record) if e % 1000 == 0: self.agent.transfer_weights() # Display score tqdm_e.refresh() def policy_action(self, s): """ Apply an espilon-greedy policy to pick next action """ if np.random.random() <= self.epsilon: return np.random.randint(self.action_dim) else: a_vect = self.agent.predict(s)[0] return np.argmax(a_vect) def train_agent(self, batch_size, record=False): """ Train Q-network on batch sampled from the buffer """ # Sample experience from memory buffer (optionally with PER) s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size) # Apply Bellman Equation on batch samples to train our DDQN q = self.agent.predict(s) next_q = self.agent.predict(new_s) q_targ = self.agent.target_predict(new_s) for i in range(s.shape[0]): old_q = q[i, a[i]] if d[i]: q[i, a[i]] = r[i] else: next_best_action = np.argmax(next_q[i, :]) q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action] # Train on batch self.agent.fit(s, q, record=record) def train(self, env, args): """ Main DDQN Training Algorithm """ results = [] tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes") decay_step = 0 self.t = 0 for e in tqdm_e: # Reset episode time, cumul_reward, cumul_r_r, done = 0, 0, 0, False position = deque(maxlen=50) position.append(0) old_state = env.reset() while not done: decay_step += 1 env.render() # Actor picks an action (following the policy) a = self.policy_action(old_state) # Retrieve new state, reward, and whether the state is terminal new_state, r, done, _ = env.step(a) # Memorize for experience replay if r == 0: r_r = 0 elif r > 0: r_r = 1 else: r_r = -1 # Reward for not staying in place if a == 2: position.append(position[-1] + 1) if a == 3: position.append(position[-1] - 1) r_w = abs(max(position) - min(position)) / 10000 r_r += r_w self.memorize(old_state, a, r_r, done, new_state) # Update current state old_state = new_state cumul_reward += r cumul_r_r += r_r time += 1 
self.epsilon = self.explore_stop + ( self.explore_start - self.explore_stop) * np.exp( -self.decay_rate * decay_step) # Train DDQN if (self.buffer.size() > args.batch_size) and self.t % 2000 == 0: self.train_agent(args.batch_size) self.t += 1 if self.t % 10000 == 0: self.agent.transfer_weights() if e % 50 == 0: self.agent.save("./model.h5") wandb.save("./model.h5") if e % 100 == 0: # wandb logging evaluate(cumul_reward, self.epsilon) self.train_agent(args.batch_size, record=True) # Display score text = "Score: {}, Fake Score: {:.2f}".format( str(cumul_reward), cumul_r_r) tqdm_e.set_description(text) tqdm_e.refresh() # render gameplay video if (e % 50 == 0): mp4list = glob.glob('video/' + self.video_dir + '/*.mp4') if len(mp4list) > 0: mp4 = mp4list[-1] video = io.open(mp4, 'r+b').read() encoded = base64.b64encode(video) # log gameplay video in wandb wandb.log( {"gameplays": wandb.Video(mp4, fps=4, format="gif")}) return results def memorize(self, state, action, reward, done, new_state): """ Store experience in memory buffer """ if (self.with_per): q_val = self.agent.predict(state) q_val_t = self.agent.target_predict(new_state) next_best_action = np.argmax(q_val) new_val = reward + self.gamma * q_val_t[0, next_best_action] td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(state, action, reward, done, new_state, td_error) def save(self, path): self.agent.save(path) def load_weights(self, path): self.agent.load_weights(path)
class td3Agent(): """Twin Delayed Deep Deterministic Policy Gradient(TD3) Agent """ def __init__(self, env_, is_discrete=False, batch_size=100, w_per=True, update_delay=2): # gym environments self.env = env_ self.discrete = is_discrete self.obs_dim = env_.observation_space.shape[0] self.act_dim = env_.action_space.n if is_discrete else env_.action_space.shape[ 0] self.action_bound = (env_.action_space.high - env_.action_space.low ) / 2 if not is_discrete else 1. self.action_shift = (env_.action_space.high + env_.action_space.low ) / 2 if not is_discrete else 0. # initialize actor & critic and its targets self.discount_factor = 0.99 self.actor = ActorNet(self.obs_dim, self.act_dim, self.action_bound, lr_=3e-4, tau_=5e-3) self.critic = CriticNet(self.obs_dim, self.act_dim, lr_=3e-4, tau_=5e-3, discount_factor=self.discount_factor) # Experience Buffer self.buffer = MemoryBuffer(BUFFER_SIZE, with_per=w_per) self.with_per = w_per self.batch_size = batch_size # OU-Noise-Process self.noise = OrnsteinUhlenbeckProcess(size=self.act_dim) # for Delayed Policy Update self._update_step = 0 self._target_update_interval = update_delay ################################################### # Network Related ################################################### def make_action(self, obs, t, noise=True): """ predict next action from Actor's Policy """ action_ = self.actor.predict(obs)[0] sigma = 0.1 # std of gaussian a = np.clip( action_ + np.random.normal(0, self.action_bound * sigma) if noise else 0, -self.action_bound, self.action_bound) #a = np.clip(action_ + self.noise.generate(t) if noise else 0, -self.action_bound, self.action_bound) return a def make_target_action(self, obs, noise=True): """ predict next action from Actor's Target Policy """ action_ = self.actor.target_predict(obs) sigma = 0.2 #return action_ cliped_noise = np.clip(np.random.normal(0, self.action_bound * sigma), -self.action_bound * 0.5, self.action_bound * 0.5) a = np.clip(action_ + cliped_noise if noise else 0, -self.action_bound, self.action_bound) return a def update_networks(self, obs, acts, critic_target): """ Train actor & critic from sampled experience """ # update critic self.critic.train(obs, acts, critic_target) if self._update_step % self._target_update_interval == 0: # update actor self.actor.train(obs, self.critic.network_1) # update target networks self.actor.target_update() self.critic.target_update() self._update_step = self._update_step + 1 def train(self): if self.with_per and (self.buffer.size() <= self.batch_size): return # sample from buffer states, actions, rewards, dones, new_states, idx = self.sample_batch( self.batch_size) # get target q-value using target network new_actions = self.make_target_action(new_states) q1_vals = self.critic.target_network_1.predict( [new_states, new_actions]) q2_vals = self.critic.target_network_2.predict( [new_states, new_actions]) # bellman iteration for target critic value q_vals = np.min(np.vstack([q1_vals.transpose(), q2_vals.transpose()]), axis=0) critic_target = np.asarray(q_vals) # print(np.vstack([q1_vals.transpose(),q2_vals.transpose()])) # print(q_vals) for i in range(q1_vals.shape[0]): if dones[i]: critic_target[i] = rewards[i] else: critic_target[ i] = self.discount_factor * q_vals[i] + rewards[i] if self.with_per: self.buffer.update(idx[i], abs(q_vals[i] - critic_target[i])) # train(or update) the actor & critic and target networks self.update_networks(states, actions, critic_target) #################################################### # Buffer Related 
#################################################### def memorize(self, obs, act, reward, done, new_obs): """store experience in the buffer """ if self.with_per: # not implemented for td3, yet. q_val = self.critic.network( [np.expand_dims(obs, axis=0), self.actor.predict(obs)]) next_action = self.actor.target_network.predict( np.expand_dims(new_obs, axis=0)) q_val_t = self.critic.target_network.predict( [np.expand_dims(new_obs, axis=0), next_action]) new_val = reward + self.discount_factor * q_val_t td_error = abs(new_val - q_val)[0] else: td_error = 0 self.buffer.memorize(obs, act, reward, done, new_obs, td_error) def sample_batch(self, batch_size): """ Sampling from the batch """ return self.buffer.sample_batch(batch_size) ################################################### # Save & Load Networks ################################################### def save_weights(self, path): """ Agent's Weights Saver """ self.actor.save_network(path) self.critic.save_network(path) def load_weights(self, pretrained): """ Agent's Weights Loader """ self.actor.load_network(pretrained) self.critic.load_network(pretrained)
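# A driver loop for td3Agent might look like the sketch below. It uses
# w_per=False because the PER branch of memorize() above is marked as not
# implemented for TD3; the environment name, loop bounds, and the pre-0.26 gym
# step API are placeholders rather than part of the original code.
import gym

env = gym.make('Pendulum-v1')   # placeholder environment
agent = td3Agent(env, is_discrete=False, batch_size=100, w_per=False)

for episode in range(200):
    obs, done, t = env.reset(), False, 0
    while not done:
        act = agent.make_action(obs, t)              # actor output + Gaussian exploration noise
        new_obs, reward, done, info = env.step(act)  # pre-0.26 gym API assumed
        agent.memorize(obs, act, reward, done, new_obs)
        if agent.buffer.size() > agent.batch_size:
            agent.train()                            # clipped double-Q target + delayed actor update
        obs, t = new_obs, t + 1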