def multiple_layer_perceptron_regression(X_train, y_train, X_test, y_test,
                                         show_infor=True, save_model=False):
    neural_number = X_train.shape[1]
    input_size = X_train.shape[1]
    hidden_layer_size = 3
    MLP = models.get_mlp_model(input_size, hidden_layer_size, neural_number,
                               ker_init.he_normal())
    history = MLP.fit(X_train, y_train, batch_size=32, epochs=600)
    y_pred = MLP.predict(X_test)
    y_pred = y_pred.flatten()
    if save_model:
        utl.save_model(MLP, cf.cf_model_mlp['path'])
    if show_infor:
        show_actual_and_predict(y_test, y_pred)
        show_residual_actual_and_predict(y_test, y_pred)
        # utl.show_history(history)
        show_residual_and_frequency(y_test, y_pred)
        print_test_infor(y_test, y_pred)
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        s = self.env.reset()
        policy_loss, critic_loss = 0, 0
        while True:
            self.env.render()
            a = self.select_action(s)
            s_, r, done, _ = self.env.step(a)
            self.memory.push(s, a, r, s_, done)
            if len(self.memory) > self.batch_size:
                policy_loss, critic_loss = self.learn()
            s = s_
            if done:
                break
        self.writer.add_scalar('loss/actor_loss', policy_loss, epoch)
        self.writer.add_scalar('loss/critic_loss', critic_loss, epoch)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, epoch)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))
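# The save_model helper called throughout these training loops is not defined
# in this file. A minimal sketch, assuming it simply writes a state_dict to the
# given path and creates the directory if needed -- the real helper may differ.
import os
import torch


def save_model(module, path):
    # Ensure the target directory (e.g. 'model/<env>_model/') exists.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    # Persist only the parameters; the network is rebuilt at load time.
    torch.save(module.state_dict(), path)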
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        s = self.env.reset()
        s = self.state_normalize(s)
        policy_loss, critic_loss, alpha_loss = 0, 0, 0
        while True:
            self.env.render()
            a, _ = self.select_action(s)
            s_, r, done, _ = self.env.step(a)
            s_ = self.state_normalize(s_)
            self.memory.push(s, a, r, s_, done)
            self.total_step += 1
            if len(self.memory) > self.batch_size and self.total_step > self.warmup_step:
                policy_loss, critic_loss, alpha_loss = self.learn()
            s = s_
            if done:
                break
        if (epoch + 1) % self.save_log_frequency == 0:
            self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
            self.writer.add_scalar('loss/policy_loss', policy_loss, self.total_step)
            self.writer.add_scalar('alpha', self.log_alpha.exp().item(), self.total_step)
            self.writer.add_scalar('loss/alpha_loss', alpha_loss, self.total_step)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))
            ZFilter.save(self.state_normalize, 'model/{}_model/rs_{}'.format(self.env_name, epoch))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, self.total_step)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))
                ZFilter.save(self.state_normalize, 'model/{}_model/best_rs'.format(self.env_name))
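# ZFilter.save is used above to persist the running state-normalization
# statistics next to the checkpoints. Only the persistence part is sketched
# here, and the pickle-based implementation is an assumption; the real ZFilter
# also holds the running mean/std used by state_normalize.
import os
import pickle


class ZFilter:
    @staticmethod
    def save(zfilter, path):
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(zfilter, f)

    @staticmethod
    def load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)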
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        num_sample = 0
        self.trace.clear()
        s = self.env.reset()
        s = self.state_normalize(s)
        while num_sample < self.sample_size:
            self.env.render()
            a, log_prob = self.select_action(s)
            torch_s = torch.tensor(s, dtype=torch.float).unsqueeze(0).to(self.device)
            v = self.critic(torch_s)
            s_, r, done, _ = self.env.step(a.cpu().detach().numpy()[0])
            s_ = self.state_normalize(s_)
            self.trace.push(s, a, log_prob, r, s_, not done, v)
            # How should this be written so that learn() does not need a reshape?
            num_sample += 1
            self.total_step += 1
            s = s_
            if done:
                s = self.env.reset()
                s = self.state_normalize(s)
        policy_loss, critic_loss = self.learn()
        self.writer.add_scalar('loss/actor_loss', policy_loss, self.total_step)
        self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic, 'model_epoch_update/{}_model/critic_{}'.format(self.env_name, self.total_step))
            save_model(self.actor, 'model_epoch_update/{}_model/actor_{}'.format(self.env_name, self.total_step))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, epoch)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic, 'model_epoch_update/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor, 'model_epoch_update/{}_model/best_actor'.format(self.env_name))
def main(args):
    args.exp_dir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(log_dir=args.exp_dir / 'summary')

    if args.resume:
        checkpoint, model, optimizer = load_model(args.checkpoint)
        args = checkpoint['args']
        best_dev_loss = checkpoint['best_dev_loss']
        start_epoch = checkpoint['epoch']
        del checkpoint
    else:
        model = build_model(args)
        if args.data_parallel:
            model = torch.nn.DataParallel(model)
        optimizer = build_optim(args, model.parameters())
        best_dev_loss = 1e9
        start_epoch = 0
    logging.info(args)
    logging.info(model)

    train_loader, dev_loader, display_loader = create_data_loaders(args)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_step_size, args.lr_gamma)

    for epoch in range(start_epoch, args.num_epochs):
        scheduler.step(epoch)
        train_loss, train_time = train_epoch(args, epoch, model, train_loader, optimizer, writer)
        dev_loss, dev_time = evaluate(args, epoch, model, dev_loader, writer)
        visualize(args, epoch, model, display_loader, writer)

        is_new_best = dev_loss < best_dev_loss
        best_dev_loss = min(best_dev_loss, dev_loss)
        save_model(args, args.exp_dir, epoch, model, optimizer, best_dev_loss, is_new_best)
        logging.info(
            f'Epoch = [{epoch:4d}/{args.num_epochs:4d}] TrainLoss = {train_loss:.4g} '
            f'DevLoss = {dev_loss:.4g} TrainTime = {train_time:.4f}s DevTime = {dev_time:.4f}s',
        )
    writer.close()
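# main() above calls a checkpoint-style save_model with a different signature
# from the RL helpers. The signature is taken from the call site; the body
# below is a sketch under the assumption that it writes one full training
# checkpoint per epoch and copies it aside when the dev loss improves.
import shutil
import torch


def save_model(args, exp_dir, epoch, model, optimizer, best_dev_loss, is_new_best):
    torch.save(
        {
            'epoch': epoch,
            'args': args,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_dev_loss': best_dev_loss,
            'exp_dir': exp_dir,
        },
        f=exp_dir / 'model.pt',
    )
    if is_new_best:
        # Keep a separate copy of the best checkpoint so later epochs cannot overwrite it.
        shutil.copyfile(exp_dir / 'model.pt', exp_dir / 'best_model.pt')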
def random_forest_regressions(X_train, y_train, X_test, y_test,
                              show_infor=True, save_model=False):
    RF = models.get_random_model(random_state)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    var_score = metrics.explained_variance_score(y_test, y_pred)
    if save_model:
        utl.save_model(RF, cf.cf_model_randf['path'])
    if show_infor:
        show_actual_and_predict(y_test, y_pred)
        show_residual_actual_and_predict(y_test, y_pred)
        show_residual_and_frequency(y_test, y_pred)
        print_test_infor(y_test, y_pred)
    return var_score
def knn_regressions(X_train, y_train, X_test, y_test,
                    show_infor=True, save_model=False):
    # KNN = KNeighborsRegressor(n_neighbors=5, weights='distance')
    KNN = models.get_knn_model(5)
    KNN.fit(X_train, y_train)
    y_pred = KNN.predict(X_test)
    var_score = metrics.explained_variance_score(y_test, y_pred)
    if save_model:
        utl.save_model(KNN, cf.cf_model_knn['path'])
    if show_infor:
        show_actual_and_predict(y_test, y_pred)
        show_residual_actual_and_predict(y_test, y_pred)
        show_residual_and_frequency(y_test, y_pred)
        print_test_infor(y_test, y_pred)
    return var_score
def linear_regressions(X_train, y_train, X_test, y_test,
                       show_infor=True, save_model=False):
    ML = models.get_linear_model()
    ML.fit(X_train, y_train)
    y_pred = ML.predict(X_test)
    var_score = metrics.explained_variance_score(y_test, y_pred)
    if save_model:
        utl.save_model(ML, cf.cf_model_mlinear['path'])
    if show_infor:
        show_actual_and_predict(y_test, y_pred)
        show_residual_actual_and_predict(y_test, y_pred)
        # utl.show_history(history)
        show_residual_and_frequency(y_test, y_pred)
        print_test_infor(y_test, y_pred)
    return var_score
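# Hypothetical usage of the regression wrappers above: fit each model on the
# same split and compare explained-variance scores. The synthetic data below is
# only for illustration; the real pipeline presumably loads its own features.
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=8, noise=10.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scores = {
    'linear': linear_regressions(X_train, y_train, X_test, y_test, show_infor=False),
    'knn': knn_regressions(X_train, y_train, X_test, y_test, show_infor=False),
    'random_forest': random_forest_regressions(X_train, y_train, X_test, y_test, show_infor=False),
}
for name, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print('{:>13}: explained variance = {:.4f}'.format(name, score))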
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        s = self.env.reset()
        while True:
            self.env.render()
            a, log_prob = self.select_action(s)
            s_, r, done, _ = self.env.step(a)
            r = self.reward_shaping_func((s_, r, done, _))
            policy_loss, critic_loss = self.learn(s, a, log_prob, r, s_, done)
            s = s_
            if done:
                break
        self.writer.add_scalar('loss/actor_loss', policy_loss, epoch)
        self.writer.add_scalar('loss/critic_loss', critic_loss, epoch)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic, 'model_step_update/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor, 'model_step_update/{}_model/actor_{}'.format(self.env_name, epoch))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, epoch)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic, 'model_step_update/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor, 'model_step_update/{}_model/best_actor'.format(self.env_name))
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)
        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action, save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append(state[agent_index])
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index], args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # Delete the agents that have reached the goal.
        r_index = 0
        for r in range(len(done)):
            if done[r]:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    r_index, state_buffer, states_deque, actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            episode += 1

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            losses = 0
            for _ in range(1):
                loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
                losses += loss.item()
            loss_list.append(losses)
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
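# The epsilon_scheduler / beta_scheduler factories used above are not shown.
# A sketch of the usual forms (assumptions, not the verified implementation):
# epsilon decays exponentially toward eps_final, and beta anneals linearly to
# 1.0 for prioritized-replay importance-sampling weights.
import math


def epsilon_scheduler(eps_start, eps_final, eps_decay):
    def epsilon_by_frame(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
    return epsilon_by_frame


def beta_scheduler(beta_start, beta_frames):
    def beta_by_frame(frame_idx):
        return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
    return beta_by_frame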
def test(env, args, current_model, best_iou, writer, episode, datetime):
    total_reward = 0
    total_iou = 0
    lowest_reward = 1e4
    highest_iou = -1.0
    lowest_iou = 1.0
    plan = env.plan
    episode_reward = 0
    count_brick_save = None
    count_step_save = None

    if args.env in ["3DStatic", "3DDynamic"]:
        fig = plt.figure(figsize=[10, 5])
        ax1 = fig.add_subplot(1, 2, 1, projection='3d')
        ax2 = fig.add_subplot(1, 2, 2)
    else:
        fig = plt.figure(figsize=(5, 5))
        ax = fig.add_subplot(1, 1, 1)

    for i in range(args.evaluation_episodes):
        state = env.reset()
        count_brick = 0
        count_step = 0
        while True:
            count_step += 1
            if args.noisy:
                current_model.sample_noise()
            epsilon = 0.0
            # Both branches currently select the action the same way.
            if args.env in ['2DDynamic']:
                action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
            else:
                action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
            if action == 2:
                count_brick += 1
            next_state, reward, done = env.step(action)
            state = next_state
            episode_reward += reward
            if done:
                total_reward += episode_reward
                lowest_reward = min(lowest_reward, episode_reward)
                environment_memory = env.environment_memory[0, args.half_window_size:34 - args.half_window_size]
                iou = env._iou()
                total_iou += iou
                if iou > highest_iou:
                    highest_iou = iou
                    best_env_memory = environment_memory
                    count_brick_save = count_brick
                    count_step_save = count_step
                lowest_iou = min(iou, lowest_iou)
                episode_reward = 0
                break

    avg_reward = total_reward / args.evaluation_episodes
    avg_iou = total_iou / args.evaluation_episodes
    writer.add_scalar("Average Reward/Test", avg_reward, episode)
    writer.add_scalar("Average IOU/Test", avg_iou, episode)
    print("\tTest Result - Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
        avg_reward, lowest_reward, avg_iou))

    if avg_iou > best_iou:
        best_iou = avg_iou
        print("\n\nNEW TOP RESULT.\n\n")
        if args.env in ["3DStatic", "3DDynamic"]:
            ax1.clear()
            ax2.clear()
            env.render(ax1, ax2, args, best_env_memory, plan, highest_iou,
                       count_step_save, count_brick_save, datetime,
                       iou_min=lowest_iou, iou_average=avg_iou)
        else:
            ax.clear()
            env.render(ax, args, best_env_memory, plan, highest_iou,
                       count_step_save, count_brick_save, datetime,
                       iou_min=lowest_iou, iou_average=avg_iou)
        save_model(current_model, args, datetime)
    plt.close(fig)
    return best_iou
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={"p1": p1_current_model, "p2": p2_current_model},
                   policies={"p1": p1_policy, "p2": p2_policy},
                   args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to separate learning rates for RL/SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # TODO(Aiden):
        # Action should be decided by a combination of Best Response and Average Strategy
        if random.random() > args.eta:
            p1_action = p1_policy.act(torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions)             # {'1': 3, '2': 2}
        # print(p1_next_state)       # [[[127 127 .....
        # print(reward, done, info)  # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)
        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1
        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/tag_interval_length", tag_interval_length, frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start
                and len(p1_reservoir_buffer) > args.sl_start
                and frame_idx % args.train_freq == 0):
            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer, p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer, p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={"p1": p1_current_model, "p2": p2_current_model},
                       policies={"p1": p1_policy, "p2": p2_policy},
                       args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

    save_model(models={"p1": p1_current_model, "p2": p2_current_model},
               policies={"p1": p1_policy, "p2": p2_policy},
               args=args)
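# The ReservoirBuffer above stores (state, action) pairs for training the
# average-strategy network. A minimal sketch using standard reservoir sampling,
# so the buffer keeps a uniform random sample of everything pushed so far; the
# real class in this codebase may differ.
import random


class ReservoirBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.num_seen = 0

    def push(self, state, action):
        self.num_seen += 1
        if len(self.buffer) < self.capacity:
            self.buffer.append((state, action))
        else:
            # Replace an existing entry with probability capacity / num_seen.
            idx = random.randrange(self.num_seen)
            if idx < self.capacity:
                self.buffer[idx] = (state, action)

    def __len__(self):
        return len(self.buffer)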
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
        # target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
    print_args(args)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(), lr=args.lr,
                               eps=args.adam_eps, betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(), lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval

        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model, args, name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model, args, name="{}{:.2e}_{}".format(args.optim, args.adam_eps, frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()
    save_model(current_model, args)
def train(self, epochs):
    best_eval = -1e6
    for epoch in range(epochs):
        num_sample = 0
        self.trace.clear()
        s = self.env.reset()
        s = self.state_normalize(s)
        while True:
            # self.env.render()
            a, log_prob = self.select_action(s)
            log_prob = torch.sum(log_prob, dim=1, keepdim=True)
            v = self.critic(torch.tensor(s, dtype=torch.float).unsqueeze(0).to(self.device))
            s_, r, done, _ = self.env.step(a)
            s_ = self.state_normalize(s_)
            self.trace.push(s, a, log_prob.cpu().detach().numpy()[0], r, s_, not done, v)
            num_sample += 1
            self.total_step += 1
            s = s_
            if done and num_sample >= self.sample_size:
                break
            if done:
                s = self.env.reset()
                s = self.state_normalize(s)
        policy_loss, critic_loss = self.learn()
        if (epoch + 1) % self.save_log_frequency == 0:
            self.writer.add_scalar('loss/critic_loss', critic_loss, self.total_step)
            self.writer.add_scalar('loss/policy_loss', policy_loss, self.total_step)
        if (epoch + 1) % self.save_model_frequency == 0:
            save_model(self.critic, 'model/{}_model/critic_{}'.format(self.env_name, epoch))
            save_model(self.actor, 'model/{}_model/actor_{}'.format(self.env_name, epoch))
            ZFilter.save(self.state_normalize, 'model/{}_model/rs_{}'.format(self.env_name, epoch))
        if (epoch + 1) % self.eval_frequency == 0:
            eval_r = self.evaluate()
            print('epoch', epoch, 'evaluate reward', eval_r)
            self.writer.add_scalar('reward', eval_r, self.total_step)
            if eval_r > best_eval:
                best_eval = eval_r
                save_model(self.critic, 'model/{}_model/best_critic'.format(self.env_name))
                save_model(self.actor, 'model/{}_model/best_actor'.format(self.env_name))
                ZFilter.save(self.state_normalize, 'model/{}_model/best_rs'.format(self.env_name))