def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    while True:
        if args.render:
            env.render()

        # Act greedily (epsilon = 0) during evaluation.
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), 0.)

        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            break

    print("Test Result - Reward {} Length {}".format(episode_reward,
                                                     episode_length))
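# Every script in this section calls DQN.act(state, epsilon), which is not
# shown here. A minimal sketch of the usual epsilon-greedy implementation,
# assuming the DQN's forward pass returns one Q-value per action and exposes
# a `num_actions` attribute (both assumptions, not taken from the source):
def act(self, state, epsilon):
    """Select an action epsilon-greedily from the Q-network."""
    if random.random() > epsilon:
        with torch.no_grad():
            q_values = self.forward(state.unsqueeze(0))
        action = q_values.max(1)[1].item()  # greedy action
    else:
        action = random.randrange(self.num_actions)  # random exploration
    return action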
def test(env, args):
    from time import sleep

    p1_current_model = DQN(env, args).to(args.device)
    p2_current_model = DQN(env, args).to(args.device)
    p1_current_model.eval()
    p2_current_model.eval()
    load_model(p1_current_model, args, 1)
    load_model(p2_current_model, args, 2)

    p1_reward_list = []
    p2_reward_list = []
    length_list = []

    for _ in range(30):
        (p1_state, p2_state) = env.reset()
        p1_episode_reward = 0
        p2_episode_reward = 0
        episode_length = 0
        while True:
            if args.render:
                env.render()
                sleep(0.2)

            # Both players act greedily (epsilon = 0) during evaluation.
            p1_action = p1_current_model.act(
                torch.FloatTensor(p1_state).to(args.device), 0.0)
            p2_action = p2_current_model.act(
                torch.FloatTensor(p2_state).to(args.device), 0.0)

            actions = {"1": p1_action, "2": p2_action}

            (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)
            (p1_state, p2_state) = (p1_next_state, p2_next_state)

            p1_episode_reward += reward[0]
            p2_episode_reward += reward[1]
            episode_length += 1

            if done:
                p1_reward_list.append(p1_episode_reward)
                p2_reward_list.append(p2_episode_reward)
                length_list.append(episode_length)
                break

    print("Test Result - p1/Reward {} p2/Reward {} Length {}".format(
        np.mean(p1_reward_list), np.mean(p2_reward_list),
        np.mean(length_list)))
class DQNTrainer():
    def __init__(self, env, args):
        super().__init__()
        self.model = DQN(env, args, Nash=False).to(args.device)
        self.target = DQN(env, args, Nash=False).to(args.device)
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
        self.args = args

    def push(self, s, a, r, s_, d):
        self.replay_buffer.push(s, a, r, s_, np.float32(d))

    def update(self):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            self.args.batch_size)

        state = torch.FloatTensor(np.float32(state)).to(self.args.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(
            self.args.device)
        action = torch.LongTensor(action).to(self.args.device)
        reward = torch.FloatTensor(reward).to(self.args.device)
        done = torch.FloatTensor(done).to(self.args.device)

        # Q-Learning with target network
        q_values = self.model(state)
        target_next_q_values = self.target(next_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = target_next_q_values.max(1)[0]
        expected_q_value = reward + (
            self.args.gamma**self.args.multi_step) * next_q_value * (1 - done)

        # Huber Loss
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def act(self, s, args):
        return self.model.act(s, args)

    def save_model(self, model_path):
        torch.save(self.model.state_dict(), model_path + 'dqn')
        torch.save(self.target.state_dict(), model_path + 'dqn_target')
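# A minimal sketch of how DQNTrainer might be driven, assuming a Gym-style
# env and an `args` namespace with the fields referenced above. The loop
# shape and checkpoint path below are illustrative, not from the source:
trainer = DQNTrainer(env, args)
state = env.reset()
for frame_idx in range(1, args.max_frames + 1):
    # DQNTrainer.act forwards its second argument to DQN.act
    action = trainer.act(torch.FloatTensor(state).to(args.device), args)
    next_state, reward, done, _ = env.step(action)
    trainer.push(state, action, reward, next_state, done)
    state = env.reset() if done else next_state
    if len(trainer.replay_buffer) > args.batch_size:
        loss = trainer.update()  # one gradient step on a sampled minibatch
trainer.save_model('./checkpoints/')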
def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = actions_deque = rewards_deque = None
    state, state_buffer = get_initial_state(env, state_buffer,
                                            args.action_repeat)
    while True:
        # Act greedily (epsilon = 0) during evaluation.
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), 0.)

        next_state, reward, done, end = env.step(action,
                                                 save_screenshots=True)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)
        state = next_state

        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            break

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = \
                    del_record(r_index, state_buffer, states_deque,
                               actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state

    PanicEnv.display(True)
    print("Test Result - Reward {} Length {}".format(episode_reward,
                                                     episode_length))
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size,
                                                   args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size,
                                                   args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Deques for building multi-step returns
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(
            torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(
            torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)
        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()

            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)

            writer.add_scalar("data/p1_episode_reward", p1_episode_reward,
                              frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward,
                              frame_idx)
            writer.add_scalar("data/episode_length", episode_length,
                              frame_idx)

            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and \
                frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model,
                                   p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model,
                                   p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list,
                      length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list,
                      length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear()
            length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
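# update_target(...) is called throughout these scripts but never defined in
# this section. A minimal sketch of the usual hard target-network sync,
# assuming the online and target models share an architecture:
def update_target(current_model, target_model):
    """Copy the online network's weights into the target network."""
    target_model.load_state_dict(current_model.state_dict())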
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    # One multi-step deque per agent
    states_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    rewards_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    actions_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer,
                                            args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action,
                                                 save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append(state[agent_index])
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]) == args.multi_step \
                    or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index],
                                             args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = \
                    del_record(r_index, state_buffer, states_deque,
                               actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer,
                                                    args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward,
                              frame_idx)
            writer.add_scalar("data/episode_length", episode_length,
                              frame_idx)
            episode_reward, episode_length = 0, 0

            # Re-create the per-agent deques for the new episode
            states_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            rewards_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            actions_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            episode += 1

        if len(replay_buffer) > args.learning_start and \
                frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and \
                frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
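# epsilon_scheduler, beta_scheduler, and multi_step_reward are used by every
# train() in this section but defined elsewhere. Minimal sketches, assuming
# exponential epsilon decay and linear beta annealing (the common choices;
# the exact schedules are an assumption about the full source):
import math

def epsilon_scheduler(eps_start, eps_final, eps_decay):
    def epsilon_by_frame(frame_idx):
        # Exponential decay from eps_start toward eps_final
        return eps_final + (eps_start - eps_final) * math.exp(
            -1. * frame_idx / eps_decay)
    return epsilon_by_frame

def beta_scheduler(beta_start, beta_frames):
    def beta_by_frame(frame_idx):
        # Linearly anneal the importance-sampling exponent toward 1
        return min(1.0,
                   beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
    return beta_by_frame

def multi_step_reward(rewards, gamma):
    # Discounted sum of the buffered n-step rewards
    ret = 0.
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma**idx)
    return ret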
target_network_update_freq = 200
train_freq = 1
checkpoint_freq = 3000
num_episodes = 0
model_file = os.path.join(os.getcwd(), "turtlebot_model_test")

state = env.reset()
ep_no = 0  # episode number counter
episode_reward = 0  # assumed initial value; the original fragment starts mid-script
all_rewards = []    # assumed initial value; the original fragment starts mid-script
teleop = False  # teleop=True

if teleop == False:  # RL learning happens, no teleop mode
    try:
        for frame_idx in range(1, num_timesteps + 1):
            epsilon = epsilon_by_frame(frame_idx)
            action = dqn.act(state, epsilon, device=device)
            next_state, reward, done, crashed = env.step(action)
            if crashed:
                print('CRASHED')
            # print([next_state, reward, done])
            replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward

            if done:
                state = env.reset()
                ep_no = ep_no + 1
                print('Episode {} reward was {} and resulted in {} and epsilon {}'.format(
                    ep_no, episode_reward, (reward == 10),
                    epsilon_by_frame(frame_idx)))
                all_rewards.append(episode_reward)
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={"p1": p1_current_model, "p2": p2_current_model},
                   policies={"p1": p1_policy, "p2": p2_policy},
                   args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to separate learning rates for RL/SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # Mix Best Response and Average Strategy: with probability eta play
        # the (epsilon-greedy) best response, otherwise the average strategy.
        if random.random() > args.eta:
            p1_action = p1_policy.act(
                torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(
                torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(
                torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(
                torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions)              # {'1': 3, '2': 2}
        # print(p1_next_state)        # [[[127 127 .....
        # print(reward, done, info)   # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)
        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer
        # for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1
        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward,
                              frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward,
                              frame_idx)
            writer.add_scalar("data/tag_interval_length",
                              tag_interval_length, frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start
                and len(p1_reservoir_buffer) > args.sl_start
                and frame_idx % args.train_freq == 0):
            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model,
                                   p1_replay_buffer, p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model,
                                   p2_replay_buffer, p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer,
                                   p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer,
                                   p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear()
            length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={
                "p1": p1_current_model,
                "p2": p2_current_model
            },
                       policies={"p1": p1_policy, "p2": p2_policy},
                       args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

    save_model(models={"p1": p1_current_model, "p2": p2_current_model},
               policies={"p1": p1_policy, "p2": p2_policy},
               args=args)
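# ReservoirBuffer backs the average-strategy (SL) data above but is not
# defined in this section. A minimal sketch using standard reservoir
# sampling, so the buffer holds a uniform random sample of all
# (state, action) pairs seen so far; the exact implementation in the full
# source may differ:
class ReservoirBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.num_seen = 0

    def push(self, state, action):
        self.num_seen += 1
        if len(self.buffer) < self.capacity:
            self.buffer.append((state, action))
        else:
            # Replace a random slot with probability capacity / num_seen
            idx = random.randrange(self.num_seen)
            if idx < self.capacity:
                self.buffer[idx] = (state, action)

    def sample(self, batch_size):
        state, action = zip(*random.sample(self.buffer, batch_size))
        return np.stack(state), np.array(action)

    def __len__(self):
        return len(self.buffer)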
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)
    # The target network is never trained directly
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
        # target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
    print_args(args)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               eps=args.adam_eps,
                               betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(),
                                  lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        # Force very long episodes to terminate by acting randomly
        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward,
                              frame_idx)
            writer.add_scalar("data/episode_length", episode_length,
                              frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and \
                frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                # No episode finished in this interval; postpone the log
                evaluation_interval += args.evaluation_interval

        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model, args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model, args,
                           name="{}{:.2e}_{}".format(args.optim,
                                                     args.adam_eps,
                                                     frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()
    save_model(current_model, args)
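# compute_td_loss(...) is referenced by every train() above but not defined
# in this section. A minimal sketch of a compatible implementation, modeled
# on DQNTrainer.update() earlier in this section; the prioritized-replay
# branch (beta, importance weights, priority updates) is omitted and is an
# assumption about the full version:
def compute_td_loss(current_model, target_model, replay_buffer, optimizer,
                    args, beta=None):
    state, action, reward, next_state, done = replay_buffer.sample(
        args.batch_size)

    state = torch.FloatTensor(np.float32(state)).to(args.device)
    next_state = torch.FloatTensor(np.float32(next_state)).to(args.device)
    action = torch.LongTensor(action).to(args.device)
    reward = torch.FloatTensor(reward).to(args.device)
    done = torch.FloatTensor(done).to(args.device)

    # Q-learning target with a frozen target network and n-step returns
    q_value = current_model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value = target_model(next_state).max(1)[0]
    expected_q_value = reward + \
        (args.gamma**args.multi_step) * next_q_value * (1 - done)

    # Huber loss between predicted and bootstrapped Q-values
    loss = F.smooth_l1_loss(q_value, expected_q_value.detach())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss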