def train_dqn(env, args):
    agent = DQN(env, args)
    agent.train()
    total_episodes = args.episodes
    max_steps = 10
    for episode in range(total_episodes):
        print(episode, agent.epsilon, end='\r')
        state = env.reset()
        done = False
        for step in range(max_steps):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.push(state, action, reward, next_state, done)
            agent.learn(episode)
            state = next_state
            if done:
                break
        # Lengthen the episode cap every 5 episodes.
        if episode % 5 == 0:
            max_steps += 10
    return agent
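# Minimal driver sketch for train_dqn above, assuming a Gym environment and that
# DQN(env, args) only needs the `episodes` hyperparameter shown here; the DQN class
# itself is not included, so any further attributes it reads from `args` are assumptions.
import argparse
import gym

args = argparse.Namespace(episodes=200)   # hypothetical hyperparameters
env = gym.make('CartPole-v0')             # assumed environment
trained_agent = train_dqn(env, args)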
def main(argv):
    env_name = FLAGS.env_name
    env = gym.make(env_name)
    agent = DQN(env, load_path=f'train/{env_name}/')
    for episode in range(FLAGS.num_episodes):
        done = False
        obs = env.reset()
        episode_reward = 0
        while not done:
            env.render()
            action = agent.act(np.expand_dims(obs, axis=0))
            obs, rew, done, info = env.step(action)
            episode_reward += rew
        print(f'Episode Reward: {episode_reward}')
if game_params[curr_task]['pygame']:
    env.reset_game()
    state = env.getGameState()
else:
    state = env.reset()
state = np.reshape(state, [1, sim_params['num_inputs']])
state = normalise_state(state,
                        game_params[curr_task]['state_means'],
                        game_params[curr_task]['state_stds'],
                        task_id=arch_params['task_id'],
                        num_tasks=sim_params['num_tasks'],
                        curr_task=curr_task)
total_r = 0
done = False
for t in range(sim_params['episode_length']):
    # Choose action
    action, maxQ = agent.act(state, curr_task, test=test)
    totalq += maxQ
    if game_params[curr_task]['pygame']:
        reward = env.act(game_params[curr_task]['actions'][action])
        if reward < 0:
            reward = -1
        next_state = env.getGameState()
        done = env.game_over()
        if t > (sim_params['episode_length'] - 2):
            done = True
        if train_params['catcherscale']:
            reward = reward * train_params['r_scale']
    else:
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, sim_params['num_inputs']])
        if not train_params['catcherscale']:
memory = ReplayMemory(env_config['memory_size'])

# Initialize the optimizer used for training the DQN. We use Adam rather than RMSProp.
optimizer = torch.optim.Adam(dqn.parameters(), lr=env_config['lr'])

# Keep track of the best evaluation mean return achieved so far.
best_mean_return = -float("Inf")

for episode in range(env_config['n_episodes']):
    done = False
    obs = preprocess(env.reset(), envID=args.env, env=env).unsqueeze(0)
    obs_stack = torch.cat(env_config['obs_stack_size'] * [obs]).unsqueeze(0).to(device)
    count = 0
    while not done:
        # Get action from the DQN.
        action = dqn.act(obs_stack)

        # Act in the true environment.
        obs, reward, done, info = env.step(action.item() + ENV_CONFIGS[args.env]['offset'])

        # Preprocess the incoming observation and slide it into the frame stack.
        if not done:
            obs = preprocess(obs, envID=args.env, env=env).unsqueeze(0)
            next_obs_stack = torch.cat((obs_stack[:, 1:, ...], obs.unsqueeze(1)), dim=1).to(device)
        else:
            next_obs_stack = None
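# Self-contained illustration of the frame-stack update pattern used in the loop above:
# repeat the first frame to build the initial stack, then drop the oldest frame and
# append the newest one. The 84x84 shapes here are made up for the demo.
import torch

stack_size, frame_shape = 4, (84, 84)
frame = torch.zeros(1, *frame_shape)                       # one preprocessed frame, leading batch dim
obs_stack = torch.cat(stack_size * [frame]).unsqueeze(0)   # (1, 4, 84, 84): initial stack repeats the first frame
new_frame = torch.ones(1, *frame_shape)
next_obs_stack = torch.cat((obs_stack[:, 1:, ...], new_frame.unsqueeze(1)), dim=1)
print(next_obs_stack.shape)                                # torch.Size([1, 4, 84, 84])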
class RlBidAgent():

    def _load_config(self):
        """
        Parse the config.cfg file
        """
        cfg = configparser.ConfigParser(allow_no_value=True)
        env_dir = os.path.dirname(__file__)
        cfg.read(env_dir + '/config.cfg')
        self.exp_type = str(cfg['experiment_type']['type'])
        self.T = int(cfg[self.exp_type]['T'])  # Number of timesteps in each episode

    def __init__(self):
        self._load_config()
        # Beta parameter adjusting the lambda parameter, which regulates the agent's bid amount
        self.BETA = [-0.08, -0.03, -0.01, 0, 0.01, 0.03, 0.08]
        # Starting value of epsilon in the adaptive eps-greedy policy
        self.eps = 0.9
        # Parameter controlling the annealing speed of epsilon
        self.anneal = 2e-5
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            # DQN network to learn the Q function
            self.dqn_agent = DQN(state_size=6, action_size=7)
            # Reward network to learn the reward function
            self.reward_net = RewardNet(state_action_size=7, reward_size=1)
        else:
            self.dqn_agent = DQN(state_size=7, action_size=7)
            self.reward_net = RewardNet(state_action_size=8, reward_size=1)
        # Number of timesteps in each episode (4 x 15-min intervals x 24 hours = 96)
        # self.T = 672
        # Initialize the DQN action for t=0 (index 3 - no adjustment of lambda, 0 in self.BETA)
        self.dqn_action = 3
        self.ctl_lambda = None
        # Arrays saving the training history
        self.step_memory = []
        self.episode_memory = []
        # Params for tracking the progress
        self.global_T = 0  # Tracking the global time step
        self.episode_budgets = None
        self.budget = None
        self.total_wins = 0
        self.total_rewards = 0
        self.rewards_prev_t = 0
        self.rewards_prev_t_ratio = 0
        self.rnet_r = 0
        self.wins_e = 0
        self.rewards_e = 0
        self.ROL = self.T
        self.ROL_ratio = 1

    def _get_state(self):
        """
        Returns the state that will be used as input in the DQN
        """
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            return np.asarray([
                self.rem_budget_ratio,      # 2. Ratio of the remaining budget to the total budget at time-step t
                self.ROL_ratio,             # 3. Ratio of the number of lambda regulation opportunities left
                self.BCR,                   # 4. Budget consumption rate
                self.CPI,                   # 5. Cost per impression between t-1 and t, relative to the highest cost in the training set (300)
                self.WR,                    # 6. Auction win rate at state t
                self.rewards_prev_t_ratio   # 7. Ratio of acquired/total clicks at timestep t-1
            ])
        else:
            return np.asarray([
                self.t_step,          # 1. Current time step
                self.rem_budget,      # 2. Remaining budget at time-step t
                self.ROL,             # 3. Number of lambda regulation opportunities left
                self.BCR,             # 4. Budget consumption rate
                self.CPM,             # 5. Cost per mille of impressions between t-1 and t
                self.WR,              # 6. Auction win rate at state t
                self.rewards_prev_t   # 7. Clicks acquired at timestep t-1
            ])

    def _reset_episode(self):
        """
        Reset the state when the episode changes
        """
        # Reset the count of time steps
        self.t_step = 0
        # Lambda regulation parameter - set according to the greedy approximation algorithm, as suggested by the paper
        if self.exp_type == 'vanilla_drlb':
            self.ctl_lambda = 0.01 if self.budget is None else self.calc_greedy(self.greedy_memory, self.budget)
            # Clean up the array used to save the information needed to solve the knapsack problem with the greedy algo
            self.greedy_memory = []
        elif self.exp_type == 'episode_lambda':
            self.ctl_lambda = 0.01
        else:
            pass
        # Next episode -> next step
        self._reset_step()
        # Set the budget for the episode
        self.budget = self.episode_budgets.pop(0)
        self.rem_budget = self.budget
        self.rem_budget_ratio = 1
        self.budget_spent_t = 0
        self.budget_spent_e = 0
        if self.exp_type not in ('free_lambda', 'free_lambda_eval', 'improved_drlb', 'improved_drlb_eval'):
            self.ROL = self.T  # Number of lambda regulation opportunities left
            self.ROL_ratio = 1
        self.cur_day = 0
        self.cur_min = 0
        self.total_wins += self.wins_e
        self.total_rewards += self.rewards_e
        # Impressions won in each episode
        self.wins_e = 0
        # Clicks won in each episode
        self.rewards_e = 0
        # Dict and value necessary for learning the RewardNet
        self.reward_net.V = 0
        self.reward_net.S = []

    def _update_step(self):
        """
        Called before transitioning into step t+1 (updates state t)
        """
        self.global_T += 1
        self.t_step += 1
        self.prev_budget = self.rem_budget
        self.rem_budget = self.prev_budget - self.budget_spent_t
        self.budget_spent_e += self.budget_spent_t
        self.rewards_prev_t = self.reward_t
        self.ROL -= 1
        self.BCR = 0 if self.prev_budget == 0 else -((self.rem_budget - self.prev_budget) / self.prev_budget)
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            self.CPI = 0 if self.wins_t == 0 else (self.cost_t / self.wins_t) / 300
            self.rewards_prev_t_ratio = 1 if self.possible_clicks_t == 0 else self.reward_t / self.possible_clicks_t
            self.ROL_ratio = self.ROL / self.T
            self.rem_budget_ratio = self.rem_budget / self.budget
        else:
            self.CPM = 0 if self.wins_t == 0 else ((self.cost_t / self.wins_t) * 1000)
        self.WR = self.wins_t / self.imp_opps_t
        # Adaptive eps-greedy policy
        self.eps = max(0.95 - self.anneal * self.global_T, 0.05)

    def _reset_step(self):
        """
        Called every time a new time step is entered.
        """
        self.possible_clicks_t = 0
        self.total_rewards_t = 0
        self.reward_t = 0
        self.cost_t = 0
        self.wins_t = 0
        self.imp_opps_t = 0
        self.BCR = 0
        if self.exp_type in ('improved_drlb', 'improved_drlb_eval'):
            self.CPI = 0
        else:
            self.CPM = 0
        self.WR = 0
        self.budget_spent_t = 0

    def _update_reward_cost(self, bid, reward, potential_reward, cost, win):
        """
        Update reward and cost within the given step, to compute the cumulative
        reward and cost for that step.
        """
        self.possible_clicks_t += potential_reward
        if win:
            self.budget_spent_t += cost
            self.wins_t += 1
            self.wins_e += 1
            self.total_wins += 1
            self.reward_t += reward
            self.rewards_e += reward
            self.total_rewards += reward
            self.cost_t += cost

    def _model_upd(self, eval_mode):
        if not eval_mode:
            self.reward_net.step()  # update the reward net
        next_state = self._get_state()  # observe state s_t+1 (state at the beginning of t+1)
        # Get action a_t+1 (adjusting lambda_t to lambda_t+1) from the adaptive greedy policy
        a_beta = self.dqn_agent.act(next_state, eps=self.eps, eval_mode=eval_mode)
        self.ctl_lambda *= (1 + self.BETA[a_beta])
        if not eval_mode:
            # Updates for the RewardNet
            sa = np.append(self.cur_state, self.BETA[self.dqn_action])  # state-action pair for t
            self.rnet_r = float(self.reward_net.act(sa))  # get reward r_t from the RewardNet
            self.reward_net.V += self.reward_t
            self.reward_net.S.append((self.cur_state, self.BETA[self.dqn_action]))
            # Store in D1, sample a mini-batch, and perform a gradient-descent step
            self.dqn_agent.step(self.cur_state, self.dqn_action, self.rnet_r, next_state)
        self.cur_state = next_state  # set state t+1 as state t
        self.dqn_action = a_beta     # analogously with the action t+1

    def act(self, obs, eval_mode):
        """
        Called with every bid request. Uses the weekday and minute fields to
        progress between steps and episodes during training. Returns the bid
        decision based on the bid price scaled with the DQN agent's output.
        """
        # Within the time step
        if obs['min'] == self.cur_min and obs['weekday'] == self.cur_day:
            pass
        # Within the episode, changing the time step
        elif obs['min'] != self.cur_min and obs['weekday'] == self.cur_day:
            self._update_step()
            self._model_upd(eval_mode)
            self.cur_min = obs['min']
            # Save history
            self.step_memory.append([
                self.global_T, int(self.rem_budget), self.ctl_lambda, self.eps,
                self.dqn_action, self.dqn_agent.loss, self.rnet_r, self.reward_net.loss
            ])
            self._reset_step()
        # Transition to the next episode
        elif obs['weekday'] != self.cur_day:
            self._update_step()
            self._model_upd(eval_mode)
            self.step_memory.append([
                self.global_T, int(self.rem_budget), self.ctl_lambda, self.eps,
                self.dqn_action, self.dqn_agent.loss, self.rnet_r, self.reward_net.loss
            ])
            # Updates for the RewardNet at the end of each episode (only when training)
            if not eval_mode:
                for (s, a) in self.reward_net.S:
                    sa = tuple(np.append(s, a))
                    max_r = max(self.reward_net.get_from_M(sa), self.reward_net.V)
                    self.reward_net.add_to_M(sa, max_r)
                    self.reward_net.add(sa, max_r)
            print("Episode Result with Step={} Budget={} Spend={} impressions={} clicks={}".format(
                self.global_T, int(self.budget), int(self.budget_spent_e), self.wins_e, self.rewards_e))
            # Save history
            self.episode_memory.append([
                self.budget, int(self.budget_spent_e), self.wins_e, self.rewards_e
            ])
            self._reset_episode()
            self.cur_day = obs['weekday']
            self.cur_min = obs['min']

        self.imp_opps_t += 1
        bid = self.calc_bid(obs['pCTR'])
        if self.exp_type == 'vanilla_drlb':
            self.greedy_memory.append([
                obs['pCTR'], obs['payprice'], obs['pCTR'] / max(obs['payprice'], 1)
            ])
        return bid

    def calc_bid(self, imp_value):
        # Calculate the theoretically optimal bid, capped by the remaining budget
        bid_amt = round(imp_value / self.ctl_lambda, 2)
        curr_budget_left = self.rem_budget - self.budget_spent_t
        if bid_amt > curr_budget_left:
            bid_amt = curr_budget_left
        return bid_amt

    def calc_greedy(self, items, budget_limit):
        # Borrowed from: https://bitbucket.org/trebsirk/algorithms/src/master/knapsack.py
        # Greedy approximation algorithm (Dantzig, 1957)
        bids = []
        spending = 0
        ctr = 0
        items_sorted = sorted(items, key=itemgetter(2), reverse=True)
        while len(items_sorted) > 0:
            item = items_sorted.pop()
            if item[1] + spending <= budget_limit:  # item[1] is the pay price; keep cumulative spend within the budget
                bids.append(item)
                spending += bids[-1][1]
                ctr += bids[-1][0]
            else:
                break
        ctrs = np.array(bids)[:, 0]
        costs = np.array(bids)[:, 1]
        # Take the max lambda to be more conservative at the beginning of a time step
        opt_lambda = np.max(np.divide(ctrs, costs))
        return opt_lambda
class Generator(nn.Module):
    def __init__(self, args, data, g, level2_parients):
        super(Generator, self).__init__()
        self.args = args
        self.data = data
        self.g = g
        self.level2_parients = level2_parients
        self.cnn = VanillaConv(args, vocab_size=data.size())
        self.pathEncoder = PathEncoder(args, self.g)
        self.pathDecoder = PathDecoder(args)
        self.DQN = DQN(args)
        self.pathHist = []  # Stores the paths selected so far (only the last ICD code of each path is kept)
        self.attn = nn.Linear(args.node_embedding_size * 4, args.node_embedding_size * 3)
        self.attn_combine = nn.Linear(args.node_embedding_size * 2, args.node_embedding_size)
        # self.atten = selfAttention(hidden_dim=args.node_embedding_size * 4)
        # Attentional affine transformation
        # self.r_x = nn.Linear(args.node_embedding_size, args.node_embedding_size * 3)
        # nn.init.normal_(self.r_x.weight, mean=0, std=1)
        # self.b_x = nn.Linear(args.node_embedding_size, args.node_embedding_size * 3)
        # nn.init.normal_(self.b_x.weight, mean=0, std=1)
        self.r_x = nn.Parameter(
            torch.FloatTensor(args.node_embedding_size, args.node_embedding_size * 3))
        # Initialize the weights with xavier_uniform_
        nn.init.xavier_uniform_(self.r_x.data)  # (2, 8285, 16)
        self.b_x = nn.Parameter(
            torch.FloatTensor(args.node_embedding_size, args.node_embedding_size * 3))
        # nn.init.xavier_normal_(self.fc2.weight)
        nn.init.xavier_uniform_(self.b_x.data)  # (95, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=args.lr)

    # The input is one batch of data.
    # K is the number of most likely actions selected at each hop.
    def forward(self, ehrs, hier_labels):
        batchPaths = []
        # Sample separately for each example (each EHR) in the batch
        for i in range(len(ehrs)):
            example_states = []
            example_rewards = []
            example_done = []
            example_actionIndexs = []
            # First initialize the hidden state
            hidden = torch.Tensor(np.zeros((1, self.args.path_hidden_size))).to(self.args.device)
            # 1. Get the EHR representation
            ehrRrep = self.cnn(ehrs[i])
            # Placed here so the environment is reset for every example
            self.sequences = [[[self.args.node2id.get('ROOT')], 1.0]]  # root node
            # One attempt per label of the example, i.e. one per path. Different paths
            # should arguably also influence each other: an already selected path should
            # affect the choice of the next path's starting node - how to model that?
            for hop in range(self.args.hops):
                # Feed the EHR in and get the path generated by the generator network
                if hop != 0:
                    hidden = hidden.sum(dim=1)
                # 2. Get the attentive EHR representation
                # atten_weights = F.softmax(self.attn(torch.cat((hidden, ehrRrep), 1)))  # [1, 300]
                # attn_ehrRrep = torch.mul(atten_weights, ehrRrep)  # [1, 300]
                # attn_ehrRrep = self.atten(torch.cat((hidden, ehrRrep), 1))
                state = F.relu(self.attn(torch.cat((hidden, ehrRrep), 1)))
                # 3. Get the representation fusing EHR and path information, i.e. the state:
                # state = self.r_x(hidden) * attn_ehrRrep + self.b_x(hidden)  # [32, 300], state: [1, 300]
                # state = torch.mm(hidden, self.r_x) + attn_ehrRrep
                # 4. Get the action space for the current hop, then let the DQN pick an action from it based on the state
                if hop == 0:
                    children = torch.Tensor(self.level2_parients).long().to(self.args.device)
                    children_len = torch.Tensor([13]).long().to(self.args.device)
                else:
                    # Use selected_action as the parent nodes and fetch their children
                    children, children_len = action_space(selected_action, self.args)  # action: [32]
                # Before selecting an action, also check whether children_len contains 0,
                # which means a leaf node has been reached
                action_values, actions, actionIndexs = self.DQN.act(state, children, children_len)
                print('hop:', hop)
                selected_action, actionList = self.beam_search(action_values, actions)
                # 5. Feed the selected node and the previously selected nodes (actionList) into the path encoder
                path = self.pathEncoder(selected_action, actionList)
                # 6. Feed the path representation into the path decoder to update the hidden state
                path = path.unsqueeze(0)
                output = self.pathDecoder(path)
                hidden = self.pathDecoder.hidden
                # Execute the selected actions and update the environment
                reward, done = self.step(actionList, hier_labels[i], hop)
                example_rewards.append(reward)
                example_states.append(state)
                example_done.append(done)
                example_actionIndexs.append(actionIndexs)
            # State obtained after the last hop (needed during training)
            hidden = hidden.sum(dim=1)
            state = F.relu(self.attn(torch.cat((hidden, ehrRrep), 1)))
            example_states.append(state)
            # Convert these data into (state, action, reward, next_state, done) tuples and store them in memory
            for k in range(len(example_states)):
                example_states[k] = example_states[k].data.cpu().numpy()
            for k in range(len(example_rewards)):
                for j in range(len(example_actionIndexs[k])):
                    self.DQN.buffer.push(example_states[k],
                                         example_actionIndexs[k][j][0],
                                         example_rewards[k],
                                         example_states[k + 1],
                                         example_done[k])
            batchPaths.append(actionList)
        return batchPaths

    def beam_search(self, data, actions_store):
        # If the selected action is a leaf node, the path should be terminated
        all_candidates = list()
        for sequence, row, actions in zip(self.sequences, data, actions_store):
            seq, score = sequence
            for j in range(len(row)):
                candidate = [seq + [actions[j].item()], score + row[j].item()]
                all_candidates.append(candidate)
        # Order all candidates by score
        ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
        # Select the k best
        self.sequences = ordered[:self.args.k]
        selected_action = [row[0][-1] for row in self.sequences]
        print('selected_action:', selected_action)
        selected_path = [row[0] for row in self.sequences]
        return selected_action, selected_path

    def step(self, actionList, hier_label, hop):
        # Compare the predicted path with the ground-truth path and assign the corresponding reward
        hop_tures = []
        for row in hier_label:
            hop_tures.extend(row)
        for row in actionList:
            if row[hop + 1] in hop_tures:
                reward = 1
            else:
                reward = -1
        if hop == 3:
            done = True
        else:
            done = False
        return reward, done
LOG_DIR = '{}/{}_DQN_{}'.format(args.dir, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env)
writer = SummaryWriter(logdir=LOG_DIR)

total_numsteps = 0
for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    while not done:
        if total_numsteps < args.start_steps:
            action = env.action_space.sample()  # Sample random action
        else:
            action = agent.act(state)  # Sample action from policy

        next_state, reward, done, _ = env.step(action)  # Step
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # Ignore the "done" signal if it comes from hitting the time horizon.
        # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
        mask = 1 if episode_steps == env._max_episode_steps else float(not done)

        agent.step(state, action, reward, next_state, mask)

        if total_numsteps >= args.start_steps and total_numsteps % args.update_freq == 0:
            q_loss = agent.update()
            writer.add_scalar('loss/q', q_loss, total_numsteps)

        state = next_state
target.eval()  # freeze the target net; we don't want to train it
mse = nn.MSELoss()
optimizer = optim.RMSprop(policy.parameters())
replay_buffer = ReplayBuffer(BUFFER_SIZE)

# Training phase
total_game_step = 0
for current_episode in range(EPISODE):
    state = env.reset()  # get the initial observation
    game_step = 0
    total_reward = 0
    state = torch.tensor([state]).float().to(DEVICE)
    while True:
        game_step += 1
        total_game_step += 1
        action = policy.act(state, total_game_step, isTrain=True).to(DEVICE)  # sample an action
        next_state, reward, done, _ = env.step(action.item())  # take the action in the environment
        total_reward += reward
        reward = torch.tensor([reward]).float().to(DEVICE)
        if done:  # this episode has terminated (game end)
            next_state = None
        else:
            next_state = torch.tensor([next_state]).float().to(DEVICE)
        replay_buffer.store(state, action, reward, next_state)
        state = next_state
        # Optimize the model with a batch_size sample from the buffer,
        # but only once the replay buffer holds enough data
        if replay_buffer.lenth() > BATCH_SIZE:
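# The ReplayBuffer class used above is not shown. Below is a minimal sketch consistent
# with the store(...) and lenth() calls in the training loop; the sample() method name
# is an assumption, since the optimization step is not included in the snippet.
import random
from collections import deque

class ReplayBuffer:
    """Fixed-size FIFO buffer of (state, action, reward, next_state) transitions."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def store(self, state, action, reward, next_state):
        # Transitions with next_state=None mark terminal steps, as in the loop above.
        self.buffer.append((state, action, reward, next_state))

    def lenth(self):  # keeping the method name used by the training loop above
        return len(self.buffer)

    def sample(self, batch_size):  # assumed name for the mini-batch draw
        return random.sample(self.buffer, batch_size)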
import gym
from gym.wrappers import Monitor

from dqn import DQN

env = gym.make('LunarLander-v2')
# env = Monitor(env, 'videos', video_callable=lambda episode_id: True)
dqn = DQN(env)
dqn.load()

for x in range(20):
    state = env.reset()
    done = False
    while not done:
        env.render()
        action = dqn.act(state, False)
        next_state, reward, done, info = env.step(action)
        state = next_state

env.close()
# Create a DQN with a replay buffer capacity of up to 150000 experiences
agent = DQN(state_size, action_size, 150000)

# Initialize the episode counter
e = 0
while True:
    # Start a new episode
    game.new_episode()
    episode_rewards = []

    # Get the current environment state and add it to the previously stacked frames
    state = game.get_state()
    state, stacked_frames = preprocessor.stack_frames(stacked_frames, state, True)

    for time in range(max_steps):
        # Get the next action from the DQN
        action = agent.act(state)

        # Perform that action and receive its reward
        reward = game.make_action(possible_actions[action])
        episode_rewards.append(reward)

        # Check whether the episode is finished (terminal state or step limit reached)
        done = game.is_episode_finished() or time == max_steps - 1
        if done:
            # Episode finished
            agent.update_target_model()
            print("Episode: {}, score: {}, e: {:.2}".format(e, np.sum(episode_rewards), agent.epsilon))
            # Exit the episode loop
            break
        else:
            # Get the next environment state and stack it onto the previously stacked frames
policy = DQN(POLICY_ARGS).to(DEVICE)
policy.load_state_dict(torch.load(PATH))
policy.eval()
env = gym.make('SpaceInvaders-ram-v0').unwrapped

print('play 10 episodes')
for episode in range(10):
    state = env.reset()
    game_step = 0
    total_reward = 0
    state = torch.FloatTensor([state]).to(DEVICE)
    while True:
        env.render()
        time.sleep(0.05)
        game_step += 1
        action = policy.act(state, 1, isTrain=False).to(DEVICE)
        # print(action)
        # print(state.squeeze()[:20])
        # print(action.item())
        next_state, reward, done, _ = env.step(action.item())  # take the action in the environment
        total_reward += reward
        reward = torch.FloatTensor([reward]).to(DEVICE)
        if done:
            print('--------------------')
            print('episode: {episode}, game_step: {game_step}, total_reward: {total_reward}'
                  .format(episode=episode, game_step=game_step, total_reward=total_reward))
            # wandb.log({'total_reward': total_reward})
            break
        else:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--episodes', type=int, default=1000, help='number of episodes')
    parser.add_argument('--episode_len', type=int, default=500, help='length of an episode')
    parser.add_argument('--openai_env', type=str, required=True, help='env such as MountainCar-v0, CartPole-v0, etc.')
    parser.add_argument('--epsilon', type=float, default=1, help='exploration parameter')
    parser.add_argument('--epsilon_decay', type=float, default=0.995, help='epsilon decay rate')
    parser.add_argument('--gamma', type=float, default=0.95, help='discount factor')
    parser.add_argument('--learning_rate', type=float, default=0.01, help='learning rate')
    parser.add_argument('--batch_size', type=int, default=64, help='batch size')
    args = parser.parse_args()

    parameters = {}
    for key, value in vars(args).items():
        parameters[key] = value

    env = gym.make(args.openai_env)
    model = DQN(env, parameters)
    model.build_model()
    saver = tf.train.Saver(max_to_keep=1)
    total_reward = 0

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(args.episodes):
            curr_state = env.reset().reshape(-1, env.observation_space.shape[0])
            j = 0
            done = False
            if model.epsilon > 0.15:
                model.epsilon *= model.epsilon_decay
            print(model.epsilon)
            # for j in range(args.episode_len):
            while not done:
                print("episode:{} trial:{}".format(i, j))
                env.render()
                _, action = model.act(sess, curr_state)
                next_state, reward, done, _ = env.step(action)
                total_reward += reward
                print("action:{} next_state:{} ".format(action, next_state))
                next_state = next_state.reshape(-1, env.observation_space.shape[0])
                model.add_to_memory(curr_state, action, reward, next_state, done)
                model.replay(sess)
                curr_state = next_state
                j += 1
            if j < 199:
                print("Completed in {} episodes".format(i))
                saver.save(sess, "checkpoint/ckpt-" + str(i), write_meta_graph=False)
                break
            else:
                saver.save(sess, "checkpoint/ckpt-" + str(i), write_meta_graph=False)
best_mean_return = -float("Inf")

# Used for eps annealing
step_number = 0
# Used for the plot
all_means = []

for episode in range(env_config['n_episodes']):
    done = False
    obs = preprocess(env.reset(), env=args.env)

    while not done:
        # Get action from the DQN.
        action = dqn.act(obs, step_number)

        # Act in the true environment.
        next_obs, reward, done, _ = env.step(action.item())
        reward = preprocess(reward, env=args.env)
        step_number += 1

        # Preprocess the incoming observation.
        if done:
            next_obs = None
        else:
            next_obs = preprocess(next_obs, env=args.env)

        # Add the transition to the replay memory
target = 270
sample_size = 50

env = gym.make('LunarLander-v2')
dqn = DQN(env)

e = 1
scores_list = []
while True:
    state = env.reset()
    done = False
    score = 0
    while not done:
        env.render()
        action = dqn.act(state)
        next_state, reward, done, info = env.step(action)
        dqn.remember(state, action, reward, next_state, done)
        dqn.replay()
        state = next_state
        score += reward
    scores_list.append(score)
    last_rewards_mean = np.mean(scores_list[sample_size * -1:])
    print(str(e) + ": \t\t" + str(round(float(score))) + "\t\t" +
          str(round(float(last_rewards_mean))) + "\t\t" + str(round(dqn.epsilon, 3)))
    dqn.adjust_epsilon()
    e += 1
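# The DQN class above is not shown; dqn.adjust_epsilon() is presumably an epsilon-decay
# step. A minimal sketch of such a rule, with assumed decay and floor values:
EPSILON_DECAY = 0.995   # assumed value
EPSILON_MIN = 0.01      # assumed floor

def adjust_epsilon(epsilon, decay=EPSILON_DECAY, floor=EPSILON_MIN):
    """Multiplicative epsilon decay with a lower bound."""
    return max(epsilon * decay, floor)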
def main():
    # Our environment
    env = gym.make("MountainCar-v0")
    trials = 200
    trial_len = 500
    updateTargetNetwork = 1000

    # Initialize our DQN agent
    dqn_agent = DQN(env=env, tau=1, file_name=file_name)
    steps = []
    max_show = -50

    # Re-run the environment [trials] times
    for trial in range(trials):
        print("Trial {}".format(trial))
        # Start the car fresh at the beginning of every trial
        cur_state = env.reset().reshape(1, 2)

        # Local variables
        local_max = -50
        max_position = -0.4
        for step in range(trial_len):
            # Predict the action using our DQN act function
            action, temp_max = dqn_agent.act(cur_state)
            max_show = max(temp_max, max_show)
            local_max = max(temp_max, local_max)
            env.render()

            # Make a move in the env using the predicted action
            new_state, reward, done, _ = env.step(action)
            new_state = new_state.reshape(1, 2)

            # Adjust the reward - i.e. give more reward if a new max position is reached!
            if cur_state[0][0] > max_position:
                max_position = cur_state[0][0]
            normalized_max = max_position + 0.6  # shift the position into a 0-to-1-ish range
            reward = reward + 11 * (normalized_max ** 3)  # incentivize getting closer to the flag (cubic shaping term)
            # if done:
            #     reward = 20
            # elif step == 199:
            #     reward = reward - 1
            # print("Reward: {}".format(reward))

            # Now remember, train, and reorient goals
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            if done:
                # Remember twice because this transition is important
                dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.replay()
            if step % 20 == 0:
                dqn_agent.target_train(False)
                # print("Retraining")
            cur_state = new_state
            if done:
                break
        if step >= 199:
            print("Failed to complete trial, best q: {}, max-Pos: {}".format(local_max, max_position))
        else:
            print("Completed in {} trials, best q: {}, max-Pos: {}".format(trial, local_max, max_position))
            print("_______________!!!!!!!!!!!!!!!!!_______________!!!!!!!!!!!!!!!!!")
            # Save the model so it can be reused and improve over time
            dqn_agent.save_model(file_name)
def main():
    original_size = (782, 600)
    env = ENV(actions, (original_size[0] / 6, original_size[1] / 6))
    gamma = 0.9
    epsilon = .95

    model_ph = 'models'
    if not os.path.exists(model_ph):
        os.mkdir(model_ph)

    trials = 500
    trial_len = 1000

    dqn_agent = DQN(env=env)
    success_num = 0
    rewards = []
    q_values = []
    Q = []
    for trial in range(1, trials):
        t_reward = []
        t_qvalue = []
        cur_state = env.reset()
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, success = env.step(action)
            t_reward.append(reward)
            # reward = reward if not done else -20
            dqn_agent.remember(cur_state, action, reward, new_state, done)

            q_value = dqn_agent.replay()  # internally iterates the default (prediction) model
            if q_value:
                t_qvalue.append(q_value)
                Q.append(q_value)
            else:
                t_qvalue.append(0.0)
                Q.append(0.0)
            dqn_agent.target_train()  # iterates the target model

            cur_state = new_state
            dqn_agent.log_result()
            save_q(Q)
            if success:
                success_num += 1
                dqn_agent.step = 100
                print("Completed in {} trials".format(trial))
                dqn_agent.save_model(os.path.join(model_ph, "success-model.h5"))
                break
            if done:
                print("Failed to complete in trial {}, step {}".format(trial, step))
                dqn_agent.save_model(os.path.join(model_ph, "trial-{}-model.h5".format(trial)))
                break
        rewards.append(np.sum(t_reward) if t_reward else 0.0)
        q_values.append(np.mean(t_qvalue) if t_qvalue else 0.0)
        with open('reward_and_Q/reward.txt', 'wb') as f:
            pickle.dump(rewards, f)
        with open('reward_and_Q/qvalue.txt', 'wb') as f:
            pickle.dump(q_values, f)
        print('trial: {}, success acc: {}'.format(trial, success_num / float(trial)))
def train(model_dict):

    def update_current_state(current_state, state, channels):
        # current_state: [processes, channels*stack, height, width]
        state = torch.from_numpy(state).float()  # (processes, channels, height, width)
        # if num_stack > 1:
        # Slide the stack forward: the first stack*channels - channels frames become the last ones
        current_state[:, :-channels] = current_state[:, channels:]
        current_state[:, -channels:] = state  # the last frame is now the new one
        return current_state

    def update_rewards(reward, done, final_rewards, episode_rewards, current_state):
        # reward, done: [P], [P]
        # final_rewards, episode_rewards: [P,1], [P,1]
        # current_state: [P,C*S,H,W]
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()  # [P,1]
        episode_rewards += reward  # keeps track of the current episode's cumulative reward
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])  # [P,1]
        final_rewards *= masks  # erase the ones that are done
        final_rewards += (1 - masks) * episode_rewards  # set them to the cumulative episode reward
        episode_rewards *= masks  # erase the done ones
        masks = masks.type(dtype)  # cuda
        if current_state.dim() == 4:  # if the state is a frame/image
            current_state *= masks.unsqueeze(2).unsqueeze(2)  # [P,1,1,1]
        else:
            current_state *= masks  # restart the done ones by setting the state to zero
        return reward, masks, final_rewards, episode_rewards, current_state

    num_frames = model_dict['num_frames']
    cuda = model_dict['cuda']
    which_gpu = model_dict['which_gpu']
    num_steps = model_dict['num_steps']
    num_processes = model_dict['num_processes']
    seed = model_dict['seed']
    env_name = model_dict['env']
    save_dir = model_dict['save_to']
    num_stack = model_dict['num_stack']
    algo = model_dict['algo']
    save_interval = model_dict['save_interval']
    log_interval = model_dict['log_interval']
    save_params = model_dict['save_params']
    vid_ = model_dict['vid_']
    gif_ = model_dict['gif_']
    ls_ = model_dict['ls_']
    vae_ = model_dict['vae_']
    grad_var_ = model_dict['grad_var_']

    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(which_gpu)

    if cuda:
        torch.cuda.manual_seed(seed)
        dtype = torch.cuda.FloatTensor
        model_dict['dtype'] = dtype
    else:
        torch.manual_seed(seed)
        dtype = torch.FloatTensor
        model_dict['dtype'] = dtype

    # Create environments
    print(num_processes, 'processes')
    monitor_rewards_dir = os.path.join(save_dir, 'monitor_rewards')
    if not os.path.exists(monitor_rewards_dir):
        os.makedirs(monitor_rewards_dir)
        print('Made dir', monitor_rewards_dir)
    envs = SubprocVecEnv([make_env(env_name, seed, i, monitor_rewards_dir) for i in range(num_processes)])

    if vid_:
        print('env for video')
        envs_video = make_env_monitor(env_name, save_dir)

    if gif_:
        print('env for gif')
        envs_gif = make_env_basic(env_name)

    # if ls_:
    #     print('env for ls')
    #     envs_ls = make_env_basic(env_name)
    # if vae_:
    #     print('env for vae')
    #     envs_vae = make_env_basic(env_name)
    # if grad_var_:
    #     print('env for grad_var_')
    #     envs_grad_var = make_env_basic(env_name)

    obs_shape = envs.observation_space.shape  # (channels, height, width)
    obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])  # (channels*stack, height, width)
    shape_dim0 = envs.observation_space.shape[0]  # channels

    model_dict['obs_shape'] = obs_shape
    model_dict['shape_dim0'] = shape_dim0
    model_dict['action_size'] = envs.action_space.n
    print(envs.action_space.n, 'actions')

    # Create the agent
    if algo == 'a2c':
        agent = a2c(envs, model_dict)
        print('init a2c agent')
    elif algo == 'dqn':
        agent = DQN(envs, model_dict)
        print('init DQN agent')
        print(agent.q_net)

    # Init state
    state = envs.reset()  # (processes, channels, height, width)
    current_state = torch.zeros(num_processes, *obs_shape)  # (processes, channels*stack, height, width)
    current_state = update_current_state(current_state, state, shape_dim0).type(dtype)  # add the new frame, remove the oldest, since it's a stack
    # agent.insert_first_state(current_state)  # storage has states: (num_steps + 1, num_processes, *obs_shape); set the first step

    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes, 1])  # keeps track of the current episode's cumulative reward
    final_rewards = torch.zeros([num_processes, 1])

    num_updates = int(num_frames) // num_steps // num_processes
    save_interval_num_updates = int(save_interval / num_processes / num_steps)

    # dqn_epsilon = .1  # lower means less likely to act randomly
    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 50000
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    # Begin training
    start = time.time()
    start2 = time.time()
    for j in range(num_updates):
        dqn_epsilon = epsilon_by_frame(j)

        # Num steps till agent update
        # for step in range(num_steps):

        # Act, [P,1], [P,1], [P,1], [P]
        # state_pytorch = Variable(agent.rollouts.states[step])
        state_pytorch = Variable(current_state)
        # value, action, action_log_probs, dist_entropy = agent.act(state_pytorch, epsilon=dqn_epsilon)
        action = agent.act(state_pytorch, epsilon=dqn_epsilon)

        # Apply to the environment, S: [P,C,H,W], R: [P], D: [P]
        # cpu_actions = action.data.squeeze(1).cpu().numpy()  # [P]
        frame, reward, done, info = envs.step(action)

        # Record rewards and update the state
        reward, masks, final_rewards, episode_rewards, current_state = update_rewards(
            reward, done, final_rewards, episode_rewards, current_state)
        new_current_state = update_current_state(current_state, frame, shape_dim0)

        agent.replay_buffer.push(current_state, action, reward, new_current_state, done.astype(int))
        current_state = new_current_state

        if len(agent.replay_buffer) > 100:
            agent.update()

        total_num_steps = (j + 1) * num_processes * num_steps

        # if total_num_steps % save_interval == 0 and save_dir != "":
        if j % save_interval_num_updates == 0 and save_dir != "" and j != 0:
            # Save the model
            if save_params:
                do_params(save_dir, agent, total_num_steps, model_dict)
            # Make a video
            if vid_:
                do_vid(envs_video, update_current_state, shape_dim0, dtype, agent, model_dict, total_num_steps)
            # Make a gif
            if gif_:
                do_gifs(envs_gif, agent, model_dict, update_current_state, update_rewards, total_num_steps)
            # Make a VAE probability gif
            if vae_:
                do_prob_state(envs_vae, agent, model_dict, vae, update_current_state, total_num_steps)
            # if grad_var_:
            #     do_grad_var(envs_grad_var, agent, model_dict, update_current_state, total_num_steps)

        # Print updates
        if j % log_interval == 0:
            end = time.time()
            to_print_info_string = "{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}, {:.2f}, {:.2f}, {:.5f}".format(
                j, total_num_steps,
                final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(),
                int(total_num_steps / (end - start)),
                end - start, end - start2, dqn_epsilon,
                agent.loss.data.cpu().numpy()[0])
            print(to_print_info_string)
            to_print_legend_string = "Upts, n_timesteps, min/med/mean/max, FPS, total_T, step_T, eps, loss"
            start2 = time.time()

            if j % (log_interval * 30) == 0:
                if ls_:
                    do_ls(envs_ls, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # if grad_var_ and j % (log_interval*300) == 0:
                if grad_var_ and j % (log_interval * 30) == 0:
                    # writes to file
                    do_grad_var(envs_grad_var, agent, model_dict, total_num_steps, update_current_state, update_rewards)

                # Update plots
                try:
                    if ls_:
                        update_ls_plot(model_dict)
                    if grad_var_ and j % (log_interval * 30) == 0:
                        update_grad_plot(model_dict)
                        to_print_legend_string += ' grad_var_plot updated '
                    make_plots(model_dict)
                    print(to_print_legend_string + " Plot updated")
                except:
                    raise  # pass
                    # print(to_print_legend_string + " problem with plot")

    try:
        make_plots(model_dict)
    except:
        print()
def train_dqn(episode, rand_obs=0, rand_act=0, noise_obs_level=0.01, noise_act_level=0.1):
    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    all_actions = []
    all_rand_acts = []
    all_rewards = []
    for e in range(episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []
        state = env.reset()
        state = np.reshape(state, (1, 8))
        score = 0
        max_steps = 5000
        for i in range(max_steps):
            if rand_obs == 1:
                state = get_observation(state, option=0, noise_obs_level=noise_obs_level)
            action = agent.act(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0
            curr_acts.append(action)
            curr_rand_acts.append(is_rand)
            # env.render()
            next_state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)
            score += reward
            next_state = np.reshape(next_state, (1, 8))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        loss.append(score)
        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))

        # Average score of the last 100 episodes
        is_solved = np.mean(loss[-100:])
        # if is_solved > 50:
        #     print('\n Task Completed! \n')
        #     break
        print("Average over last 100 episodes: {0:.2f} \n".format(is_solved))

    np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) +
             "_noise_act_lvl_" + str(noise_act_level) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss))
    return loss
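# Possible driver for train_dqn above. The (1, 8) state reshape suggests an 8-dimensional
# observation such as LunarLander-v2; the environment choice here is an assumption, and
# `env` must be the module-level variable that the function reads.
import gym
import numpy as np

env = gym.make('LunarLander-v2')
scores = train_dqn(episode=400, rand_act=1, noise_act_level=0.1)
print("mean score over the last 100 episodes:", np.mean(scores[-100:]))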