def __init__(self, config: Dict, agent: Agent, transitions_queue: Queue,
             global_episode: Counter, global_update_step: Counter,
             epsilone: Union[ExponentialEpsilon, SinusoidalEpsilone],
             logger: Logger) -> None:
    """
    Thread responsible for updating the weights of the agent's neural networks.

    :param config: Configuration dictionary for the experiment
    :param agent: Agent to optimize
    :param transitions_queue: Queue through which the Player threads send
        their transitions to the Trainer thread
    :param global_episode: Counter of the number of episodes played (shared
        between threads)
    :param global_update_step: Counter of the number of updates performed
        (shared between threads)
    :param epsilone: Epsilon process used for the noise added to the agent's
        actions
    :param logger: Logger used during the experiment
    """
    super().__init__()
    self._config = config
    self._agent = agent
    self._episode_queue = transitions_queue
    self._global_episode = global_episode
    self._global_update_step = global_update_step
    self._logger = logger
    self._epsilone = epsilone
    self._replay_buffer = PrioritizedReplayBuffer(
        size=self._config["trainer_config"]["buffer_size"])
    # TODO: allow switching between ReplayBuffer and PrioritizedReplayBuffer via the config
    # ReplayBuffer(size=self._config["trainer_config"]["buffer_size"])
    self._best_test_reward = float('-inf')
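# The TODO above asks for a config-driven switch between the two buffer types.
# A minimal sketch of one way to do it, assuming a hypothetical boolean key
# "use_per" in trainer_config and that both classes accept the same `size`
# keyword; the project's real config schema may differ.
def _build_replay_buffer(trainer_config):
    if trainer_config.get("use_per", True):
        return PrioritizedReplayBuffer(size=trainer_config["buffer_size"])
    return ReplayBuffer(size=trainer_config["buffer_size"])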
class PriorDQN(Trainer):
    def __init__(self, parameters):
        super(PriorDQN, self).__init__(parameters)
        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffersize, parameters["alpha"])
        self.beta_start = parameters["beta_start"]
        self.beta_frames = parameters["beta_frames"]

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def beta_by_frame(self, frame_idx):
        # Anneal beta linearly from beta_start to 1.0 over beta_frames frames.
        beta = self.beta_start + frame_idx * \
            (1.0 - self.beta_start) / self.beta_frames
        return min(1.0, beta)

    def compute_td_loss(self, batch_size, frame_idx):
        beta = self.beta_by_frame(frame_idx)
        if len(self.replay_buffer) < batch_size:
            return None
        state, action, reward, next_state, done, indices, weights = \
            self.replay_buffer.sample(batch_size, beta)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))
        weights = Variable(torch.FloatTensor(weights))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN target: select the next action with the online network,
        # evaluate it with the target network.
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Weighted squared TD error, clipped at 1; clipped values also serve
        # as the new priorities (plus a small epsilon to keep them positive).
        loss = (q_value - Variable(expected_q_value.data)).pow(2) * weights
        loss[loss.gt(1)] = 1
        prios = loss + 1e-5
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
        return loss
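# Several snippets in this file rely on a PrioritizedReplayBuffer whose
# implementation is not shown. Below is a self-contained sketch of
# proportional prioritized replay (Schaul et al., 2015) with O(n) sampling
# instead of a sum tree; the class name and the normalization of the
# importance weights by the batch maximum are illustrative choices, not the
# API of any particular library above.
import numpy as np


class SimplePrioritizedBuffer:
    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        self.alpha = alpha
        self.storage = []
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0

    def push(self, transition):
        # New samples get the current maximum priority so they are replayed soon.
        max_prio = self.priorities.max() if self.storage else 1.0
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.pos] = transition
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        prios = self.priorities[:len(self.storage)]
        probs = prios ** self.alpha
        probs /= probs.sum()
        indices = np.random.choice(len(self.storage), batch_size, p=probs)
        samples = [self.storage[i] for i in indices]
        # Importance-sampling weights w_i = (N * P(i)) ** -beta, normalized.
        weights = (len(self.storage) * probs[indices]) ** (-beta)
        weights /= weights.max()
        return samples, indices, weights

    def update_priorities(self, indices, priorities):
        for idx, prio in zip(indices, priorities):
            self.priorities[idx] = prio

    def __len__(self):
        return len(self.storage)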
def test_random_sampling(self):
    prb = PrioritizedReplayBuffer(8)
    for exp in Transitions:
        prb.add(exp)
    indexes, samples, weights = prb.sample(1)
    assert samples[0] in Transitions
    assert indexes[0] in [0, 1, 2, 3, 4, 5, 6, 7]
    assert weights[0] == 1
def test_update_priorities1(self):
    prb = PrioritizedReplayBuffer(8)
    for exp in Transitions:
        prb.add(exp)
    prb.update(idx=[0, 1, 3, 4, 5, 6, 7],
               priorities=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
    indexes, samples, weights = prb.sample(100, beta=0.4)
    n = 0
    sum_p = sum([
        0.1**0.6, 0.1**0.6, 1.0**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6,
        0.1**0.6
    ])
    assert (prb._st_sum._storage[-8:] == np.array([
        0.1**0.6, 0.1**0.6, 1.0**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6,
        0.1**0.6
    ])).all()
    assert isclose(prb._st_sum._storage[0], sum_p)
    expected_weight_other = 1.0
    expected_weight_2 = (100**(-0.4)) / ((100 * (0.1**0.6))**(-0.4))
    for idx, sample, weight in zip(indexes, samples, weights):
        if idx == 2:
            n += 1
            assert isclose(weight, expected_weight_2)
        else:
            assert isclose(weight, expected_weight_other)
    assert n > 10
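# The expected weights in the test above follow from
# w_i = (N * P(i)) ** (-beta), normalized by the largest weight in the buffer
# (attained at the smallest P(i)); the factor 100 used by the test cancels in
# the ratio, as this quick check confirms:
from math import isclose

beta, alpha = 0.4, 0.6
p_hi, p_lo = 1.0 ** alpha, 0.1 ** alpha  # unnormalized priorities ** alpha
assert isclose((p_hi / p_lo) ** (-beta),
               ((100 * p_hi) ** (-beta)) / ((100 * p_lo) ** (-beta)))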
def test_circular_buffer(self):
    prb = PrioritizedReplayBuffer(4)
    prb.add(Transitions[0])
    prb.add(Transitions[1])
    prb.add(Transitions[2])
    prb.add(Transitions[3])
    # With capacity 4, the next two adds wrap around and overwrite slots 0 and 1.
    prb.add(Transitions[4])
    prb.add(Transitions[5])
    assert (prb._storage == [
        Transitions[4], Transitions[5], Transitions[2], Transitions[3]
    ]).all()
def test_len(self):
    prb = PrioritizedReplayBuffer(4)
    prb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
    assert len(prb) == 3
    for i in range(8):
        prb.add(Transitions[i])
    # Length is capped at the buffer capacity.
    assert len(prb) == 4
        return action

    def reset_noise(self):
        self.noisy1.reset_noise()
        self.noisy2.reset_noise()


current_model = NoisyDQN(env.observation_space.shape[0], env.action_space.n)
target_model = NoisyDQN(env.observation_space.shape[0], env.action_space.n)

optimizer = optim.Adam(current_model.parameters(), lr=0.0001)

beta_start = 0.4
beta_iterations = 50000
beta_by_iteration = lambda iteration: min(
    1.0, beta_start + iteration * (1.0 - beta_start) / beta_iterations)

replay_buffer = PrioritizedReplayBuffer(25000, alpha=0.6)


def update_target(current_model, target_model):
    target_model.load_state_dict(current_model.state_dict())


update_target(current_model, target_model)


def compute_td_loss(batch_size, beta):
    # Note: this buffer returns weights before indices, unlike some of the
    # other PrioritizedReplayBuffer APIs in this file.
    state, action, reward, next_state, done, weights, indices = \
        replay_buffer.sample(batch_size, beta)

    state = autograd.Variable(torch.FloatTensor(np.float32(state)))
    next_state = autograd.Variable(torch.FloatTensor(np.float32(next_state)))
    action = autograd.Variable(torch.LongTensor(action))
    reward = autograd.Variable(torch.FloatTensor(reward))
    done = autograd.Variable(torch.FloatTensor(np.float32(done)))
    weights = autograd.Variable(torch.FloatTensor(weights))
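# The NoisyDQN above assumes noisy1/noisy2 layers exposing reset_noise(). A
# plausible self-contained sketch of such a layer, following the factorised
# Gaussian parametrization of Fortunato et al. (2017); the original project's
# layer may differ in initialization details.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class NoisyLinear(nn.Module):
    def __init__(self, in_features, out_features, sigma0=0.5):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('weight_eps', torch.zeros(out_features, in_features))
        self.register_buffer('bias_eps', torch.zeros(out_features))
        bound = 1 / math.sqrt(in_features)
        self.weight_mu.data.uniform_(-bound, bound)
        self.bias_mu.data.uniform_(-bound, bound)
        self.weight_sigma.data.fill_(sigma0 / math.sqrt(in_features))
        self.bias_sigma.data.fill_(sigma0 / math.sqrt(in_features))
        self.reset_noise()

    @staticmethod
    def _f(x):
        # Factorised noise shaping: f(x) = sign(x) * sqrt(|x|)
        return x.sign() * x.abs().sqrt()

    def reset_noise(self):
        eps_in = self._f(torch.randn(self.in_features))
        eps_out = self._f(torch.randn(self.out_features))
        self.weight_eps.copy_(torch.outer(eps_out, eps_in))
        self.bias_eps.copy_(eps_out)

    def forward(self, x):
        if self.training:
            return F.linear(x,
                            self.weight_mu + self.weight_sigma * self.weight_eps,
                            self.bias_mu + self.bias_sigma * self.bias_eps)
        return F.linear(x, self.weight_mu, self.bias_mu)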
def learn(env,
          seed=None,
          num_agents=2,
          lr=0.00008,
          total_timesteps=100000,
          buffer_size=2000,
          exploration_fraction=0.2,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=16,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=2000,
          gamma=0.99,
          target_network_update_freq=1000,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    seed: int or None
        prng seed. Runs with the same seed "should" give the same results.
        If None, no seeding is used.
    lr: float
        learning rate for the adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of the entire training period over which the exploration
        rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress.
        Set to None to disable printing.
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of training. If you do not wish to restore the
        best version at the end of training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for the prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for the prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta is annealed from its initial
        value to 1.0. If None, defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon added to the TD errors when updating priorities
    param_noise: bool
        whether or not to use parameter space noise
        (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.
    load_path: str
        path to load the model from (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder

    Returns
    -------
    act: ActWrapper
        Wrapper over the act function. Adds the ability to save and load it.
        See the header of baselines/deepq/categorical.py for details on the
        act function.
""" # Create all the functions necessary to train the model set_global_seeds(seed) double_q = True grad_norm_clipping = True shared_weights = True play_test = 1000 nsteps = 16 agent_ids = env.agent_ids() # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) print(f'agent_ids {agent_ids}') num_actions = env.action_space.n print(f'num_actions {num_actions}') dqn_agent = MAgent(env, agent_ids, nsteps, lr, replay_buffer, shared_weights, double_q, num_actions, gamma, grad_norm_clipping, param_noise) if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=dqn_agent.q_network) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) ckpt.restore(manager.latest_checkpoint) print("Restoring from {}".format(manager.latest_checkpoint)) dqn_agent.update_target() episode_rewards = [0.0 for i in range(101)] saved_mean_reward = None obs_all = env.reset() obs_shape = obs_all reset = True done = False # Start total timer tstart = time.time() for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break kwargs = {} if not param_noise: update_eps = tf.constant(exploration.value(t)) update_param_noise_threshold = 0. else: update_eps = tf.constant(0.) # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log( 1. 
                - exploration.value(t) +
                exploration.value(t) / float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True

        if t % print_freq == 0:
            time_1000_step = time.time()
            nseconds = time_1000_step - tstart
            tstart = time_1000_step
            print(f'time spent to perform {t - print_freq} to {t} steps is {nseconds} ')
            print('eps update', exploration.value(t))

        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        epinfos = []
        for _ in range(nsteps):
            # Given observations, take action and value (V(s))
            obs_ = tf.constant(obs_all)
            actions_list, fps_ = dqn_agent.choose_action(obs_,
                                                         update_eps=update_eps,
                                                         **kwargs)
            # For each agent, keep the fingerprints of all the other agents.
            fps = [[] for _ in agent_ids]
            for a in agent_ids:
                fps[a] = np.delete(fps_, a, axis=0)

            # Append the experiences
            mb_obs.append(obs_all.copy())
            mb_actions.append(actions_list)
            mb_values.append(fps)
            mb_dones.append([float(done) for _ in range(num_agents)])

            # Take actions in env and look at the results
            obs1_all, rews, done, info = env.step(actions_list)
            # Cooperative setting: every agent receives the same (max) reward.
            rews = [np.max(rews) for _ in range(len(rews))]
            mb_rewards.append(rews)
            obs_all = obs1_all
            maybeepinfo = info[0].get('episode')
            if maybeepinfo:
                epinfos.append(maybeepinfo)
            episode_rewards[-1] += np.max(rews)
            if done:
                episode_rewards.append(0.0)
                obs_all = env.reset()
                reset = True

        mb_dones.append([float(done) for _ in range(num_agents)])

        mb_obs = np.asarray(mb_obs, dtype=obs_all[0].dtype)
        mb_actions = np.asarray(mb_actions, dtype=actions_list[0].dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_dones = np.asarray(mb_dones, dtype=bool)
        mb_masks = mb_dones[:-1]
        mb_dones = mb_dones[1:]

        if gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = dqn_agent.value(tf.constant(obs_all))
            if mb_dones[-1][0] == 0:
                mb_rewards = discount_with_dones(
                    np.concatenate((mb_rewards, [last_values])),
                    np.concatenate((mb_dones,
                                    [[float(False) for _ in range(num_agents)]])),
                    gamma)[:-1]
            else:
                mb_rewards = discount_with_dones(mb_rewards, mb_dones, gamma)

        if replay_buffer is not None:
            replay_buffer.add(mb_obs, mb_actions, mb_rewards, obs1_all,
                              mb_masks[:, 0], mb_values,
                              np.tile([exploration.value(t), t],
                                      (nsteps, num_agents, 1)))

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled
            # from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones, fps, extra_datas = \
                    replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            # NOTE: the prioritized branch does not unpack fps/extra_datas,
            # which are needed below; only the non-prioritized path works as
            # written.
            obses_t, obses_tp1 = tf.constant(obses_t), None
            actions, rewards, dones = tf.constant(actions), \
                tf.constant(rewards, dtype=tf.float32), tf.constant(dones)
            weights, fps, extra_datas = tf.constant(weights), \
                tf.constant(fps), tf.constant(extra_datas)

            # Flatten the (batch, agent) leading dimensions for training.
            s = obses_t.shape
            obses_t = tf.reshape(obses_t, (s[0] * s[1], *s[2:]))
            s = actions.shape
            actions = tf.reshape(actions, (s[0] * s[1], *s[2:]))
            s = rewards.shape
            rewards = tf.reshape(rewards, (s[0] * s[1], *s[2:]))
            s = weights.shape
            weights = tf.reshape(weights, (s[0] * s[1], *s[2:]))
            s = fps.shape
            fps = tf.reshape(fps, (s[0] * s[1], *s[2:]))
            s = extra_datas.shape
            extra_datas = tf.reshape(extra_datas, (s[0] * s[1], *s[2:]))
            s = dones.shape
            dones = tf.reshape(dones, (s[0], s[1], *s[2:]))

            td_errors = dqn_agent.nstep_train(obses_t, actions, rewards,
                                              obses_tp1, dones, weights, fps,
                                              extra_datas)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn_agent.update_target()

        if t % play_test == 0 and t != 0:
            play_test_games(dqn_agent)

        mean_100ep_reward = np.mean(episode_rewards[-101:-1])
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print(f'last 100 episode mean reward {mean_100ep_reward} in {num_episodes} playing')
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()
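# LinearSchedule is used above for both exploration and the PER beta without
# being shown. A minimal sketch consistent with how it is called (value(t),
# initial_p, final_p, schedule_timesteps): linear interpolation, then held
# constant at final_p.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)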
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action, save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append(state[agent_index])
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index],
                                             args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # Delete the agents that have reached the goal.
        r_index = 0
        for r in range(len(done)):
            if done[r]:
                state_buffer, states_deque, actions_deque, rewards_deque = \
                    del_record(r_index, state_buffer, states_deque,
                               actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer,
                                                    args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            episode += 1

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            losses = 0
            for _ in range(1):
                loss = compute_td_loss(current_model, target_model,
                                       replay_buffer, optimizer, args, beta)
                losses += loss.item()
            loss_list.append(losses)
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
def train(env, args, writer, datetime):
    best_iou = -1.0
    if args.env in ['1DStatic', '1DDynamic']:
        current_model = DQN_1D(env, args).to(args.device)
        target_model = DQN_1D(env, args).to(args.device)
    elif args.env in ['2DStatic', '2DDynamic']:
        current_model = DQN_2D(env, args).to(args.device)
        target_model = DQN_2D(env, args).to(args.device)
    elif args.env in ['3DStatic', '3DDynamic']:
        current_model = DQN_3D(env, args).to(args.device)
        target_model = DQN_3D(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        # Action selection is identical for all env types.
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            episode += 1
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("Episode_reward/train", episode_reward, episode)
            writer.add_scalar("Episode_length/train", episode_length, episode)
            episode_reward = 0
            episode_length = 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("Loss/train", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list, args)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            best_iou = test(env, args, current_model, best_iou, writer,
                            episode, datetime)
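# multi_step_reward is called throughout these training loops but never
# defined here. A plausible sketch consistent with its call sites: the
# discounted sum of the n buffered rewards, oldest first.
def multi_step_reward(rewards, gamma):
    ret = 0.0
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret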
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
        # target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
    print_args(args)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(), lr=args.lr,
                               eps=args.adam_eps, betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(), lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        if episode_length >= 9950:
            # Force very long episodes to terminate with random actions.
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval

        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model, args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model, args,
                           name="{}{:.2e}_{}".format(args.optim,
                                                     args.adam_eps, frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()
    save_model(current_model, args)
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if not os.path.exists(prefix):
        os.makedirs(prefix)

    train_log = os.path.join(prefix, 'train.log')
    logger = Logger(open(train_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)

    env_params = training_config['env_params']
    env_id = config['env_id']
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)

    seed = training_config['seed']
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))
    if "NoFrameskip" in env_id:
        logger.log('action meaning:',
                   env.unwrapped.get_action_meanings()[:env.action_space.n])

    robust = training_config.get('robust', False)
    adv_train = training_config.get('adv_train', False)
    bound_solver = training_config.get('bound_solver', 'cov')
    attack_config = {}
    if adv_train or bound_solver == 'pgd':
        test_config = config['test_config']
        attack_config = training_config["attack_config"]
        adv_ratio = training_config.get('adv_ratio', 1)
        if adv_train:
            logger.log('using adversarial examples for training, adv ratio:',
                       adv_ratio)
        else:
            logger.log('using pgd regularization training')
    if robust or adv_train:
        schedule_start = training_config['schedule_start']
        schedule_length = training_config['schedule_length']
        starting_epsilon = training_config['start_epsilon']
        end_epsilon = training_config['epsilon']
        epsilon_scheduler = EpsilonScheduler(
            training_config.get("schedule_type", "linear"), schedule_start,
            schedule_start + schedule_length - 1, starting_epsilon,
            end_epsilon, 1)
        max_eps = end_epsilon

    model_width = training_config['model_width']
    robust_model = robust and bound_solver != 'pgd'
    dueling = training_config.get('dueling', True)

    current_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                                dueling, model_width)
    target_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                               dueling, model_width)

    load_path = training_config["load_model_path"]
    if load_path != "" and os.path.exists(load_path):
        load_frame = int(re.findall('^.*frame_([0-9]+).pth$', load_path)[0])
        logger.log('\ntrain from model {}, current frame index is {}\n'.format(
            load_path, load_frame))
        current_model.features.load_state_dict(torch.load(load_path))
        target_model.features.load_state_dict(torch.load(load_path))
    else:
        logger.log('\ntrain from scratch')
        load_frame = 1

    lr = training_config['lr']
    grad_clip = training_config['grad_clip']
    natural_loss_fn = training_config['natural_loss_fn']

    optimizer = optim.Adam(current_model.parameters(), lr=lr,
                           eps=training_config['adam_eps'])

    # Do not evaluate gradient for target model.
    for param in target_model.features.parameters():
        param.requires_grad = False

    buffer_config = training_config['buffer_params']
    replay_initial = buffer_config['replay_initial']
    buffer_capacity = buffer_config['buffer_capacity']
    use_cpp_buffer = training_config["cpprb"]
    use_async_rb = training_config['use_async_rb']
    num_frames = training_config['num_frames']
    batch_size = training_config['batch_size']
    gamma = training_config['gamma']

    if use_cpp_buffer:
        logger.log('using cpp replay buffer')
        if use_async_rb:
            replay_buffer_ctor = AsyncReplayBuffer(initial_state=state,
                                                   batch_size=batch_size)
        else:
            replay_buffer_ctor = cpprb.PrioritizedReplayBuffer
    else:
        logger.log('using python replay buffer')

    per = training_config['per']

    if per:
        logger.log('using prioritized experience replay.')
        alpha = buffer_config['alpha']
        buffer_beta_start = buffer_config['buffer_beta_start']
        buffer_beta_frames = buffer_config.get('buffer_beta_frames', -1)
        if buffer_beta_frames < replay_initial:
            buffer_beta_frames = num_frames - replay_initial
            logger.log('buffer_beta_frames reset to ', buffer_beta_frames)
        buffer_beta_scheduler = BufferBetaScheduler(buffer_beta_start,
                                                    buffer_beta_frames,
                                                    start_frame=replay_initial)
        if use_cpp_buffer:
            replay_buffer = replay_buffer_ctor(
                size=buffer_capacity,
                env_dict={
                    "obs": {"shape": state.shape, "dtype": dtype},
                    "act": {"shape": 1, "dtype": np.uint8},
                    "rew": {},
                    "next_obs": {"shape": state.shape, "dtype": dtype},
                    "done": {}
                },
                alpha=alpha,
                eps=0.0)  # We add eps manually in the training loop
        else:
            replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                    alpha=alpha)
    else:
        logger.log('using regular replay.')
        if use_cpp_buffer:
            replay_buffer = cpprb.ReplayBuffer(
                buffer_capacity,
                {
                    "obs": {"shape": state.shape, "dtype": dtype},
                    "act": {"shape": 1, "dtype": np.uint8},
                    "rew": {},
                    "next_obs": {"shape": state.shape, "dtype": dtype},
                    "done": {}
                })
        else:
            replay_buffer = ReplayBuffer(buffer_capacity)

    update_target(current_model, target_model)

    act_epsilon_start = training_config['act_epsilon_start']
    act_epsilon_final = training_config['act_epsilon_final']
    act_epsilon_decay = training_config['act_epsilon_decay']
    act_epsilon_method = training_config['act_epsilon_method']
    if training_config.get('act_epsilon_decay_zero', True):
        decay_zero = num_frames
    else:
        decay_zero = None
    act_epsilon_scheduler = ActEpsilonScheduler(act_epsilon_start,
                                                act_epsilon_final,
                                                act_epsilon_decay,
                                                method=act_epsilon_method,
                                                start_frame=replay_initial,
                                                decay_zero=decay_zero)

    # Use optimized cuda memory management
    memory_mgr = CudaTensorManager(state.shape, batch_size, per, USE_CUDA,
                                   dtype=dtype)

    losses = []
    td_losses = []
    batch_cur_q = []
    batch_exp_q = []

    sa = None
    kappa = None
    hinge = False
    if robust:
        logger.log('using convex relaxation certified classification loss '
                   'as a regularization!')
        kappa = training_config['kappa']
        reg_losses = []
        # sa[i] lists, for action i, the indices of all the other actions.
        sa = np.zeros(
            (current_model.num_actions, current_model.num_actions - 1),
            dtype=np.int32)
        for i in range(sa.shape[0]):
            for j in range(sa.shape[1]):
                if j < i:
                    sa[i][j] = j
                else:
                    sa[i][j] = j + 1
        sa = torch.LongTensor(sa)
        hinge = training_config.get('hinge', False)
        logger.log('using hinge loss (default is cross entropy): ', hinge)

    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id,
                             result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             env_params=env_params,
                             seed=seed)

    # initialize parameters in logging
    all_rewards = []
    episode_reward = 0
    act_epsilon = np.nan
    grad_norm = np.nan
    weights_norm = np.nan
    best_test_reward = -float('inf')
    buffer_stored_size = 0
    if adv_train:
        attack_count = 0
        suc_count = 0
    if robust and bound_solver == 'pgd':
        ori_margin, adv_margin = np.nan, np.nan

    start_time = time.time()
    period_start_time = time.time()

    # Main Loop
    for frame_idx in range(load_frame, num_frames + 1):
        # Step 1: get current action
        frame_start = time.time()
        t = time.time()

        eps = 0
        if adv_train or robust:
            eps = epsilon_scheduler.get_eps(frame_idx, 0)

        act_epsilon = act_epsilon_scheduler.get(frame_idx)
        if adv_train and eps != np.nan and eps >= np.finfo(np.float32).tiny:
            ori_state_tensor = torch.from_numpy(
                np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                    torch.float32)
            if dtype in UINTS:
                ori_state_tensor /= 255
            attack_config['params']['epsilon'] = eps
            if random.random() < adv_ratio:
                attack_count += 1
                state_tensor = attack(current_model, ori_state_tensor,
                                      attack_config)
                if current_model.act(state_tensor)[0] != current_model.act(
                        ori_state_tensor)[0]:
                    suc_count += 1
            else:
                state_tensor = ori_state_tensor
            action = current_model.act(state_tensor, act_epsilon)[0]
        else:
            with torch.no_grad():
                state_tensor = torch.from_numpy(
                    np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                        torch.float32)
                if dtype in UINTS:
                    state_tensor /= 255
                ori_state_tensor = torch.clone(state_tensor)
                action = current_model.act(state_tensor, act_epsilon)[0]

        # torch.cuda.synchronize()
        log_time('act_time', time.time() - t)

        # Step 2: run environment
        t = time.time()
        if training_config['use_async_env']:
            async_env.async_step(action)
        else:
            next_state, reward, done, _ = env.step(action)
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer
        # For asynchronous env, defer saving
        if not training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        if use_cpp_buffer:
            buffer_stored_size = replay_buffer.get_stored_size()
        else:
            buffer_stored_size = len(replay_buffer)

        beta = np.nan
        buffer_beta = np.nan
        t = time.time()

        if buffer_stored_size > replay_initial:
            if training_config['per']:
                buffer_beta = buffer_beta_scheduler.get(frame_idx)
            if robust:
                convex_final_beta = training_config['convex_final_beta']
                convex_start_beta = training_config['convex_start_beta']
                beta = (max_eps - eps * (1.0 - convex_final_beta)) \
                    / max_eps * convex_start_beta

            res = compute_td_loss(current_model,
                                  target_model,
                                  batch_size,
                                  replay_buffer,
                                  per,
                                  use_cpp_buffer,
                                  use_async_rb,
                                  optimizer,
                                  gamma,
                                  memory_mgr,
                                  robust,
                                  buffer_beta=buffer_beta,
                                  grad_clip=grad_clip,
                                  natural_loss_fn=natural_loss_fn,
                                  eps=eps,
                                  beta=beta,
                                  sa=sa,
                                  kappa=kappa,
                                  dtype=dtype,
                                  hinge=hinge,
                                  hinge_c=training_config.get('hinge_c', 1),
                                  env_id=env_id,
                                  bound_solver=bound_solver,
                                  attack_config=attack_config)
            loss, grad_norm, weights_norm, td_loss, batch_cur_q_value, \
                batch_exp_q_value = res[0], res[1], res[2], res[3], res[4], res[5]
            if robust:
                reg_loss = res[-1]
                reg_losses.append(reg_loss.data.item())
                if bound_solver == 'pgd':
                    ori_margin, adv_margin = res[-3].data.item(), \
                        res[-2].data.item()

            losses.append(loss.data.item())
            td_losses.append(td_loss.data.item())
            batch_cur_q.append(batch_cur_q_value.data.item())
            batch_exp_q.append(batch_exp_q_value.data.item())

        log_time('loss_time', time.time() - t)

        # Step 2: run environment (async)
        t = time.time()
        if training_config['use_async_env']:
            next_state, reward, done, _ = async_env.wait_step()
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer (async)
        if training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        # Update states and reward
        t = time.time()
        state = next_state
        episode_reward += reward
        if done:
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
        log_time('env_time', time.time() - t)

        # All kinds of result logging
        if frame_idx % training_config['print_frame'] == 0 \
                or frame_idx == num_frames \
                or (robust and abs(frame_idx - schedule_start) < 5) \
                or abs(buffer_stored_size - replay_initial) < 5:
            logger.log(
                '\nframe {}/{}, learning rate: {:.6g}, buffer beta: {:.6g}, '
                'action epsilon: {:.6g}'.format(frame_idx, num_frames, lr,
                                                buffer_beta, act_epsilon))
            logger.log(
                'total time: {:.2f}, epoch time: {:.4f}, speed: {:.2f} frames/sec, '
                'last total loss: {:.6g}, avg total loss: {:.6g}, grad norm: {:.6g}, '
                'weights_norm: {:.6g}, latest episode reward: {:.6g}, '
                'avg 10 episode reward: {:.6g}'.format(
                    time.time() - start_time,
                    time.time() - period_start_time,
                    training_config['print_frame'] /
                    (time.time() - period_start_time),
                    losses[-1] if losses else np.nan,
                    np.average(losses[:-training_config['print_frame'] - 1:-1])
                    if losses else np.nan, grad_norm, weights_norm,
                    all_rewards[-1] if all_rewards else np.nan,
                    np.average(all_rewards[:-11:-1]) if all_rewards else np.nan))
            logger.log('last td loss: {:.6g}, avg td loss: {:.6g}'.format(
                td_losses[-1] if td_losses else np.nan,
                np.average(td_losses[:-training_config['print_frame'] - 1:-1])
                if td_losses else np.nan))
            logger.log(
                'last batch cur q: {:.6g}, avg batch cur q: {:.6g}'.format(
                    batch_cur_q[-1] if batch_cur_q else np.nan,
                    np.average(batch_cur_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_cur_q else np.nan))
            logger.log(
                'last batch exp q: {:.6g}, avg batch exp q: {:.6g}'.format(
                    batch_exp_q[-1] if batch_exp_q else np.nan,
                    np.average(batch_exp_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_exp_q else np.nan))
            if robust:
                logger.log('current input epsilon: {:.6g}'.format(eps))
                if bound_solver == 'pgd':
                    logger.log(
                        'last logit margin: ori: {:.6g}, adv: {:.6g}'.format(
                            ori_margin, adv_margin))
                else:
                    logger.log('current bound beta: {:.6g}'.format(beta))
                logger.log(
                    'last cert reg loss: {:.6g}, avg cert reg loss: {:.6g}'.format(
                        reg_losses[-1] if reg_losses else np.nan,
                        np.average(
                            reg_losses[:-training_config['print_frame'] - 1:-1])
                        if reg_losses else np.nan))
                logger.log('current kappa: {:.6g}'.format(kappa))
            if adv_train:
                logger.log(
                    'current attack epsilon (same as input epsilon): '
                    '{:.6g}'.format(eps))
                diff = ori_state_tensor - state_tensor
                diff = np.abs(diff.data.cpu().numpy())
                logger.log('current Linf distortion: {:.6g}'.format(
                    np.max(diff)))
                logger.log(
                    'this batch attacked: {}, success: {}, '
                    'attack success rate: {:.6g}'.format(
                        attack_count, suc_count,
                        suc_count * 1.0 / attack_count
                        if attack_count > 0 else np.nan))
                attack_count = 0
                suc_count = 0
                logger.log('attack stats reset.')

            period_start_time = time.time()
            log_time.print()
            log_time.clear()

        if frame_idx % training_config['save_frame'] == 0 or frame_idx == num_frames:
            plot(frame_idx, all_rewards, losses, prefix)
            torch.save(current_model.features.state_dict(),
                       '{}/frame_{}.pth'.format(prefix, frame_idx))

        if frame_idx % training_config['update_target_frame'] == 0:
            update_target(current_model, target_model)

        if frame_idx % training_config.get('mini_test', 100000) == 0 and (
                (robust and beta == 0) or
                (not robust and frame_idx * 1.0 / num_frames >= 0.8)):
            test_reward = mini_test(current_model, config, logger, dtype)
            logger.log('this test avg reward: {:6g}'.format(test_reward))
            if test_reward >= best_test_reward:
                best_test_reward = test_reward
                logger.log('new best reward {:6g} achieved, update '
                           'checkpoint'.format(test_reward))
                torch.save(current_model.features.state_dict(),
                           '{}/best_frame_{}.pth'.format(prefix, frame_idx))

        log_time.log_time('total', time.time() - frame_start)
def test_initial_priorities(self):
    prb = PrioritizedReplayBuffer(8)
    for exp in Transitions:
        prb.add(exp)
    # Fresh entries all start with the default priority of 1.
    assert (prb._st_sum._storage[-8:] == np.array([1.] * 8)).all()
def learn(logger, device, env, number_timesteps, network, optimizer,
          save_path, save_interval, ob_scale, gamma, grad_norm, double_q,
          param_noise, exploration_fraction, exploration_final_eps,
          batch_size, train_freq, learning_starts,
          target_network_update_freq, buffer_size, prioritized_replay,
          prioritized_replay_alpha, prioritized_replay_beta0, atom_num,
          min_value, max_value):
    """
    Papers:
    Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
    reinforcement learning[J]. Nature, 2015, 518(7540): 529.
    Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
    in Deep Reinforcement Learning[J]. 2017.

    Parameters:
    ----------
    double_q (bool): if True double DQN will be used
    param_noise (bool): whether or not to use parameter space noise
    exploration_fraction (float): fraction of the entire training period over
                                  which the exploration rate is annealed
    exploration_final_eps (float): final value of random action probability
    batch_size (int): size of a batch sampled from the replay buffer for
                      training
    train_freq (int): update the model every `train_freq` steps
    learning_starts (int): how many steps of the model to collect transitions
                           for before learning starts
    target_network_update_freq (int): update the target network every
                                      `target_network_update_freq` steps
    buffer_size (int): size of the replay buffer
    prioritized_replay (bool): if True prioritized replay buffer will be used
    prioritized_replay_alpha (float): alpha parameter for prioritized replay
    prioritized_replay_beta0 (float): beta parameter for prioritized replay
    atom_num (int): atom number in distributional RL for atom_num > 1
    min_value (float): min value in distributional RL
    max_value (float): max value in distributional RL
    """
    qnet = network.to(device)
    qtar = deepcopy(qnet)
    if prioritized_replay:
        buffer = PrioritizedReplayBuffer(buffer_size, device,
                                         prioritized_replay_alpha,
                                         prioritized_replay_beta0)
    else:
        buffer = ReplayBuffer(buffer_size, device)
    generator = _generate(device, env, qnet, ob_scale, number_timesteps,
                          param_noise, exploration_fraction,
                          exploration_final_eps, atom_num, min_value,
                          max_value)
    if atom_num > 1:
        delta_z = float(max_value - min_value) / (atom_num - 1)
        z_i = torch.linspace(min_value, max_value, atom_num).to(device)

    infos = {'eplenmean': deque(maxlen=100), 'eprewmean': deque(maxlen=100)}
    start_ts = time.time()
    for n_iter in range(1, number_timesteps + 1):
        if prioritized_replay:
            # Anneal beta linearly towards 1 over the whole run.
            buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps
        *data, info = generator.__next__()
        buffer.add(*data)
        for k, v in info.items():
            infos[k].append(v)

        # update qnet
        if n_iter > learning_starts and n_iter % train_freq == 0:
            b_o, b_a, b_r, b_o_, b_d, *extra = buffer.sample(batch_size)
            b_o.mul_(ob_scale)
            b_o_.mul_(ob_scale)

            if atom_num == 1:
                with torch.no_grad():
                    if double_q:
                        b_a_ = qnet(b_o_).argmax(1).unsqueeze(1)
                        b_q_ = (1 - b_d) * qtar(b_o_).gather(1, b_a_)
                    else:
                        b_q_ = (1 - b_d) * qtar(b_o_).max(1, keepdim=True)[0]
                b_q = qnet(b_o).gather(1, b_a)
                abs_td_error = (b_q - (b_r + gamma * b_q_)).abs()
                priorities = abs_td_error.detach().cpu().clamp(1e-6).numpy()
                if extra:
                    loss = (extra[0] * huber_loss(abs_td_error)).mean()
                else:
                    loss = huber_loss(abs_td_error).mean()
            else:
                with torch.no_grad():
                    b_dist_ = qtar(b_o_).exp()
                    b_a_ = (b_dist_ * z_i).sum(-1).argmax(1)
                    b_tzj = (gamma * (1 - b_d) * z_i[None, :] + b_r).clamp(
                        min_value, max_value)
                    b_i = (b_tzj - min_value) / delta_z
                    b_l = b_i.floor()
                    b_u = b_i.ceil()
                    b_m = torch.zeros(batch_size, atom_num).to(device)
                    temp = b_dist_[torch.arange(batch_size), b_a_, :]
                    # Project the shifted support onto the fixed atoms.
                    b_m.scatter_add_(1, b_l.long(), temp * (b_u - b_i))
                    b_m.scatter_add_(1, b_u.long(), temp * (b_i - b_l))
                b_q = qnet(b_o)[torch.arange(batch_size), b_a.squeeze(1), :]
                kl_error = -(b_q * b_m).sum(1)
                # use kl error as priorities as proposed by Rainbow
                priorities = kl_error.detach().cpu().clamp(1e-6).numpy()
                loss = kl_error.mean()

            optimizer.zero_grad()
            loss.backward()
            if grad_norm is not None:
                nn.utils.clip_grad_norm_(qnet.parameters(), grad_norm)
            optimizer.step()
            if prioritized_replay:
                buffer.update_priorities(extra[1], priorities)

        # update target net and log
        if n_iter % target_network_update_freq == 0:
            qtar.load_state_dict(qnet.state_dict())
            logger.info('{} Iter {} {}'.format('=' * 10, n_iter, '=' * 10))
            fps = int(n_iter / (time.time() - start_ts))
            logger.info('Total timesteps {} FPS {}'.format(n_iter, fps))
            for k, v in infos.items():
                v = (sum(v) / len(v)) if v else float('nan')
                logger.info('{}: {:.6f}'.format(k, v))
            if n_iter > learning_starts and n_iter % train_freq == 0:
                logger.info('vloss: {:.6f}'.format(loss.item()))

        if save_interval and n_iter % save_interval == 0:
            torch.save([qnet.state_dict(), optimizer.state_dict()],
                       os.path.join(save_path, '{}.checkpoint'.format(n_iter)))
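# A tiny worked instance of the categorical (C51) projection above, assuming
# 3 atoms on [0, 2] (delta_z = 1), gamma = 1, and a single transition with
# reward 0.5; the values are illustrative only.
import torch

z_i = torch.tensor([0., 1., 2.])
b_r, b_d = torch.tensor([[0.5]]), torch.tensor([[0.]])
b_tzj = (1.0 * (1 - b_d) * z_i[None, :] + b_r).clamp(0., 2.)  # [[0.5, 1.5, 2.0]]
b_i = (b_tzj - 0.) / 1.0
b_l, b_u = b_i.floor(), b_i.ceil()
# The atom at b_i = 0.5 splits its mass evenly between atoms 0 and 1, exactly
# as the two scatter_add_ calls do. Note the well-known edge case: when b_i
# lands exactly on an atom (b_l == b_u, here at 2.0) both weights are zero and
# that probability mass is dropped; some implementations patch this with
# explicit index adjustments such as b_l[(b_u > 0) & (b_l == b_u)] -= 1.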
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(
            torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(
            torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        # With args.negative, a constant -1 step penalty is applied.
        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)
        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward,
                              frame_idx)
writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx) writer.add_scalar("data/episode_length", episode_length, frame_idx) p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0 p1_state_deque.clear() p2_state_deque.clear() p1_reward_deque.clear() p2_reward_deque.clear() p1_action_deque.clear() p2_action_deque.clear() if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0: beta = beta_by_frame(frame_idx) loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta) p1_loss_list.append(loss.item()) writer.add_scalar("data/p1_loss", loss.item(), frame_idx) loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta) p2_loss_list.append(loss.item()) writer.add_scalar("data/p2_loss", loss.item(), frame_idx) if frame_idx % args.update_target == 0: update_target(p1_current_model, p1_target_model) update_target(p2_current_model, p2_target_model) if frame_idx % args.evaluation_interval == 0: print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list) print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list) p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear() p1_loss_list.clear(), p2_loss_list.clear() prev_frame = frame_idx prev_time = time.time() save_model(p1_current_model, args, 1) save_model(p2_current_model, args, 2) save_model(p1_current_model, args, 1) save_model(p2_current_model, args, 2)
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
class DQN:
    def __init__(self, config):
        self.writer = SummaryWriter()
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.dqn_type = config["dqn-type"]
        self.run_title = config["run-title"]
        self.env = gym.make(config["environment"])

        self.num_states = np.prod(self.env.observation_space.shape)
        self.num_actions = self.env.action_space.n

        layers = [self.num_states, *config["architecture"], self.num_actions]

        self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        capacity = config["max-experiences"]
        self.p_replay_eps = config["p-eps"]
        self.prioritized_replay = config["prioritized-replay"]
        self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) \
            if self.prioritized_replay else ReplayBuffer(capacity)

        self.beta_scheduler = LinearSchedule(config["episodes"],
                                             initial_p=config["p-beta-init"],
                                             final_p=1.0)
        self.epsilon_decay = lambda e: max(config["epsilon-min"],
                                           e * config["epsilon-decay"])

        self.train_freq = config["train-freq"]
        self.use_soft_update = config["use-soft-update"]
        self.target_update = config["target-update"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch-size"]
        self.time_step = 0

        self.optim = T.optim.AdamW(self.policy_net.parameters(),
                                   lr=config["lr-init"],
                                   weight_decay=config["weight-decay"])
        self.lr_scheduler = T.optim.lr_scheduler.StepLR(
            self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
        self.criterion = nn.SmoothL1Loss(reduction="none")  # Huber Loss
        self.min_experiences = max(config["min-experiences"],
                                   config["batch-size"])
        self.save_path = config["save-path"]

    def act(self, state, epsilon=0):
        """ Act on environment using epsilon-greedy policy """
        if np.random.sample() < epsilon:
            return int(np.random.choice(np.arange(self.num_actions)))
        else:
            self.policy_net.eval()
            return self.policy_net(
                T.tensor(state, device=self.device).float().unsqueeze(0)
            ).argmax().item()

    def _soft_update(self, tau):
        """ Polyak averaging: soft update model parameters.
θ_target = τ*θ_current + (1 - τ)*θ_target """ for target_param, current_param in zip(self.target_net.parameters(), self.policy_net.parameters()): target_param.data.copy_(tau*target_param.data + (1.0-tau)*current_param.data) def update_target(self, tau): if self.use_soft_update: self._soft_update(tau) elif self.time_step % self.target_update == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) def optimize(self, beta=None): if len(self.replay_buffer) < self.min_experiences: return None, None self.policy_net.train() if self.prioritized_replay: transitions, (is_weights, t_idxes) = self.replay_buffer.sample(self.batch_size, beta) else: transitions = self.replay_buffer.sample(self.batch_size) is_weights, t_idxes = np.ones(self.batch_size), None # transpose the batch --> transition of batch-arrays batch = Transition(*zip(*transitions)) # compute a mask of non-final states and concatenate the batch elements non_final_mask = T.tensor(tuple(map(lambda state: state is not None, batch.next_state)), device=self.device, dtype=T.bool) non_final_next_states = T.cat([T.tensor([state]).float() for state in batch.next_state if state is not None]).to(self.device) state_batch = T.tensor(batch.state, device=self.device).float() action_batch = T.tensor(batch.action, device=self.device).long() reward_batch = T.tensor(batch.reward, device=self.device).float() state_action_values = self.policy_net(state_batch).gather(1, action_batch.unsqueeze(1)) next_state_values = T.zeros(self.batch_size, device=self.device) if self.dqn_type == "vanilla": next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach() else: self.policy_net.eval() action_next_state = self.policy_net(non_final_next_states).max(1)[1] self.policy_net.train() next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, action_next_state.unsqueeze(1)).squeeze().detach() # compute the expected Q values (RHS of the Bellman equation) expected_state_action_values = (next_state_values * self.gamma) + reward_batch # compute temporal difference error td_error = T.abs(state_action_values.squeeze() - expected_state_action_values).detach().cpu().numpy() # compute Huber loss loss = self.criterion(state_action_values, expected_state_action_values.unsqueeze(1)) loss = T.mean(loss * T.tensor(is_weights, device=self.device)) # optimize the model self.optim.zero_grad() loss.backward() for param in self.policy_net.parameters(): param.grad.data.clamp_(-1, 1) self.optim.step() return td_error, t_idxes def run_episode(self, epsilon, beta): total_reward, done = 0, False state = self.env.reset() while not done: # use epsilon-greedy to get an action action = self.act(state, epsilon) # caching the information of current state prev_state = state # take action state, reward, done, _ = self.env.step(action) # accumulate reward total_reward += reward # store the transition in buffer if done: state = None self.replay_buffer.push(prev_state, action, state, reward) # optimize model if self.time_step % self.train_freq == 0: td_error, t_idxes = self.optimize(beta=beta) # update priorities if self.prioritized_replay and td_error is not None: self.replay_buffer.update_priorities(t_idxes, td_error + self.p_replay_eps) # update target network self.update_target(self.tau) # increment time-step self.time_step += 1 return total_reward def train(self, episodes, epsilon, solved_reward): total_rewards = np.zeros(episodes) for episode in range(episodes): # compute beta using linear scheduler beta = 
self.beta_scheduler.value(episode) # run episode and get rewards reward = self.run_episode(epsilon, beta) # exponentially decay epsilon epsilon = self.epsilon_decay(epsilon) # reduce learning rate by self.lr_scheduler.step() total_rewards[episode] = reward avg_reward = total_rewards[max(0, episode-100):(episode+1)].mean() last_lr = self.lr_scheduler.get_last_lr()[0] # log into tensorboard self.writer.add_scalar(f'dqn-{self.dqn_type}/reward', reward, episode) self.writer.add_scalar(f'dqn-{self.dqn_type}/reward_100', avg_reward, episode) self.writer.add_scalar(f'dqn-{self.dqn_type}/lr', last_lr, episode) self.writer.add_scalar(f'dqn-{self.dqn_type}/epsilon', epsilon, episode) print(f"Episode: {episode} | Last 100 Average Reward: {avg_reward:.5f} | Learning Rate: {last_lr:.5E} | Epsilon: {epsilon:.5E}", end='\r') if avg_reward > solved_reward: break self.writer.close() print(f"Environment solved in {episode} episodes") T.save(self.policy_net.state_dict(), os.path.join(self.save_path, f"{self.run_title}.pt")) def visualize(self, load_path=None): done = False state = self.env.reset() if load_path is not None: self.policy_net.load_state_dict(T.load(load_path, map_location=self.device)) self.policy_net.eval() while not done: self.env.render() action = self.act(state) state, _, done, _ = self.env.step(int(action)) sleep(0.01)
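# LinearSchedule is used for beta annealing above but is not defined in this
# snippet. A minimal sketch consistent with the calls made here and with the
# OpenAI-baselines class of the same name (linear interpolation from initial_p
# to final_p over schedule_timesteps, then held constant):

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Fraction of the schedule elapsed so far, capped at 1.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)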
class Trainer(threading.Thread):
    def __init__(self, config: Dict, agent: Agent, transitions_queue: Queue, global_episode: Counter,
                 global_update_step: Counter, epsilone: Union[ExponentialEpsilon, SinusoidalEpsilone],
                 logger: Logger) -> None:
        """ Thread responsible for updating the weights of the agent's neural networks

        :param config: Configuration dictionary for the experiment
        :param agent: Agent to optimize
        :param transitions_queue: Queue through which the Player threads send their transitions to the Trainer thread
        :param global_episode: Counter of the number of episodes played (shared between threads)
        :param global_update_step: Counter of the number of updates performed (shared between threads)
        :param epsilone: Epsilon process used for the noise added to the agent's actions
        :param logger: Logger used throughout the experiment
        """
        super().__init__()
        self._config = config
        self._agent = agent
        self._episode_queue = transitions_queue
        self._global_episode = global_episode
        self._global_update_step = global_update_step
        self._logger = logger
        self._epsilone = epsilone
        self._replay_buffer = PrioritizedReplayBuffer(size=self._config["trainer_config"]["buffer_size"])
        # TODO: allow switching between ReplayBuffer and PrioritizedReplayBuffer via the config
        # ReplayBuffer(size=self._config["trainer_config"]["buffer_size"])
        self._best_test_reward = float('-inf')

    def test(self) -> None:
        """ Evaluates the agent on one full episode without noise """
        start_test_time = time()
        env = make_env(self._config)
        obs, done = env.reset(), False
        rews = []
        nb_step = 0
        while not done:
            act = self._agent(obs=obs)
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
            nb_step += 1
        rew_mean = np.mean(rews)

        # logging
        self._logger.add_scalar(label="test/reward", value=sum(rews), step=self._global_update_step.val())
        self._logger.add_scalar(label="test/nb_step", value=nb_step, step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_mean", value=float(rew_mean), step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_var", value=float(np.var(rews)), step=self._global_update_step.val())

        if self._best_test_reward < int(rew_mean):
            # remember the best score so far (the original never updated it,
            # so the agent was saved on every test)
            self._best_test_reward = int(rew_mean)
            self._agent.save(episode=self._global_episode.val(),
                             update_step=self._global_update_step.val(),
                             test_reward=int(sum(rews)))

        self._logger.add_scalar(label="test/test_speed", value=time() - start_test_time,
                                step=self._global_update_step.val())

    def _should_stop(self) -> bool:
        """ Checks whether the thread should terminate

        :return: True if the thread should terminate, False otherwise
        """
        if self._config["trainer_config"]["max_update_step"] and \
                self._global_update_step.val() > self._config["trainer_config"]["max_update_step"]:
            return True
        if self._config["trainer_config"]["max_episode"] and \
                self._global_episode.val() > self._config["trainer_config"]["max_episode"]:
            return True
        if self._config["trainer_config"]["max_time"] and \
                time() - self._start_training_time > self._config["trainer_config"]["max_time"]:
            return True
        return False

    def run(self) -> None:
        """ Runs the thread """
        self._start_training_time = time()

        # Fill the replay buffer with a random policy until it contains min_replay_size transitions
        p_bar = tqdm(total=self._config["agent_config"]["min_replay_size"])
        env = make_env(self._config)
        while len(self._replay_buffer) < self._config["agent_config"]["min_replay_size"]:
            obs, done = env.reset(), False
            while not done:
                act = env.action_space.sample()
                if len(act.shape) < 1:
                    act = [act]
                next_obs, rew, done, _ = env.step(act)
                transition = Transition(observation=obs, action=act, new_observation=next_obs,
                                        reward=rew, done=done)
                obs = next_obs
                self._replay_buffer.add(transition)
                p_bar.update(1)
        print("buffer initialization done")

        while True:
            start_get_replays_time = time()
            # Fetch one transition from the queue and place it in the replay buffer
            while True:
                if self._should_stop():
                    break
                try:
                    transition = self._episode_queue.get_nowait()
                    self._replay_buffer.add(transition)
                    break
                except Empty:
                    sleep(0.01)
            if self._should_stop():
                break
            end_get_replays_time = time()

            start_update_time = time()
            indexes_b, transition_b, weights_b = self._replay_buffer.sample(
                self._config["agent_config"]["batch_size"])
            if self._global_update_step.val() % self._config["trainer_config"]["log_freq"] == 0:
                update_step = self._global_update_step.val()
            else:
                update_step = None
            td_error = self._agent.update(transition_b, weights_b, update_step)

            # Update the replay-buffer priorities with the critic's error.
            # Careful: priorities must always be strictly positive.
            new_priorities = np.abs(td_error) + 1e-16
            self._replay_buffer.update(indexes_b, new_priorities)

            # logging
            if update_step:
                self._logger.add_scalar(label="multithreading/queue_size", value=self._episode_queue.qsize(), step=update_step)
                self._logger.add_scalar(label="trainer/buffer_size", value=len(self._replay_buffer), step=update_step)
                self._logger.add_scalar(label="trainer/priority_mean", value=weights_b.mean(), step=update_step)
                self._logger.add_scalar(label="trainer/priority_var", value=weights_b.var(), step=update_step)
                self._logger.add_scalar(label="trainer/td_error_mean", value=np.abs(td_error).mean(), step=update_step)
                self._logger.add_scalar(label="trainer/update_step", value=self._global_episode.val(), step=update_step)
                self._logger.add_scalar(label="trainer/update_time", value=time() - start_update_time, step=update_step)
                self._logger.add_scalar(label="trainer/idle_time", value=end_get_replays_time - start_get_replays_time, step=update_step)

            self._global_update_step.inc()
            self._epsilone.step()

            if self._global_update_step.val() % self._config["trainer_config"]["test_freq"] == 0:
                self.test()

        self.test()
        print("trainer end")
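# The Counter objects shared between the Player and Trainer threads are not
# defined in this snippet. A minimal thread-safe sketch consistent with the
# val()/inc() calls used above (the real implementation may differ):

import threading

class Counter:
    def __init__(self, initial: int = 0) -> None:
        self._value = initial
        self._lock = threading.Lock()

    def val(self) -> int:
        # Read the current value under the lock.
        with self._lock:
            return self._value

    def inc(self) -> None:
        # Atomically increment the shared value.
        with self._lock:
            self._value += 1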
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)

    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True

    cuda = torch.cuda.is_available()  # `cuda` was undefined in the original snippet
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)

    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # (custom process for DefeatZerglingsAndBanelings)
        obs, screen, player = common.select_marine(env, obs)
        # action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None

        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count
        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception:
            pass  # ignore invalid actions and keep the previous observation

        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative
        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST

        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]

        if len(player) == 2:
            # re-center the screen on the selected marine
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0], new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)

        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen
        episode_rewards[-1] += rew
        reward = episode_rewards[-1]

        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from the replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)
            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update the target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log("Saving model due to mean reward increase: {} -> {}".format(
                        saved_mean_reward, mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
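# Note that the importance-sampling weights sampled above are never passed to
# dqn.learn, so the prioritized-replay bias correction is effectively dropped.
# A hedged sketch of how the weights could be folded into the TD update, using
# a hypothetical helper (weighted_td_loss is not part of the original code):

import torch

def weighted_td_loss(q_values: torch.Tensor, target_q_values: torch.Tensor,
                     weights: torch.Tensor):
    # Per-sample squared TD error, scaled by the IS weights before averaging.
    td_errors = q_values - target_q_values
    loss = (weights * td_errors.pow(2)).mean()
    # Return the absolute errors as well, for the priority update.
    return loss, td_errors.detach().abs()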
def learn(device,
          env, seed,
          number_timesteps,
          network, optimizer,
          save_path, save_interval, ob_scale,
          gamma, grad_norm,
          double_q, param_noise,
          exploration_fraction, exploration_final_eps,
          batch_size, train_freq, learning_starts, target_network_update_freq,
          buffer_size, prioritized_replay, prioritized_replay_alpha, prioritized_replay_beta0):
    """
    Papers:
    Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
    reinforcement learning[J]. Nature, 2015, 518(7540): 529.
    Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
    in Deep Reinforcement Learning[J]. 2017.

    Parameters:
    ----------
    double_q (bool): if True double DQN will be used
    param_noise (bool): whether or not to use parameter space noise
    exploration_fraction (float): fraction of entire training period over which
                                  the exploration rate is annealed
    exploration_final_eps (float): final value of random action probability
    batch_size (int): size of a batch sampled from the replay buffer for training
    train_freq (int): update the model every `train_freq` steps
    learning_starts (int): how many steps of the model to collect transitions
                           for before learning starts
    target_network_update_freq (int): update the target network every
                                      `target_network_update_freq` steps
    buffer_size (int): size of the replay buffer
    prioritized_replay (bool): if True a prioritized replay buffer will be used
    prioritized_replay_alpha (float): alpha parameter for prioritized replay
    prioritized_replay_beta0 (float): beta parameter for prioritized replay
    """
    name = '{}_{}'.format(os.path.split(__file__)[-1][:-3], seed)
    logger = get_logger(name)
    logger.info('Note that the Rainbow features supported in the current version are '
                'consistent with openai/baselines, which means `Multi-step` and '
                '`Distributional` are missing. Welcome any contributions!')

    qnet = network.to(device)
    qtar = deepcopy(qnet)
    if prioritized_replay:
        buffer = PrioritizedReplayBuffer(buffer_size, device,
                                         prioritized_replay_alpha, prioritized_replay_beta0)
    else:
        buffer = ReplayBuffer(buffer_size, device)
    generator = _generate(device, env, qnet, ob_scale,
                          number_timesteps, param_noise,
                          exploration_fraction, exploration_final_eps)
    infos = {'eplenmean': deque(maxlen=100), 'eprewmean': deque(maxlen=100)}
    start_ts = time.time()
    for n_iter in range(1, number_timesteps + 1):
        if prioritized_replay:
            # linearly anneal beta from beta0 toward 1 over training
            buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps
        *data, info = next(generator)
        buffer.add(*data)
        for k, v in info.items():
            infos[k].append(v)

        # update qnet
        if n_iter > learning_starts and n_iter % train_freq == 0:
            b_o, b_a, b_r, b_o_, b_d, *extra = buffer.sample(batch_size)
            b_o.mul_(ob_scale)
            b_o_.mul_(ob_scale)

            b_q = qnet(b_o).gather(1, b_a)
            with torch.no_grad():
                if double_q:
                    b_a_ = qnet(b_o_).argmax(1).unsqueeze(1)
                    b_q_ = (1 - b_d) * qtar(b_o_).gather(1, b_a_)
                else:
                    b_q_ = (1 - b_d) * qtar(b_o_).max(1, keepdim=True)[0]

            abs_td_error = (b_q - (b_r + gamma * b_q_)).abs()
            if extra:
                loss = (extra[0] * huber_loss(abs_td_error)).mean()  # weighted by IS weights
            else:
                loss = huber_loss(abs_td_error).mean()
            optimizer.zero_grad()
            loss.backward()
            if grad_norm is not None:
                nn.utils.clip_grad_norm_(qnet.parameters(), grad_norm)
            optimizer.step()
            if prioritized_replay:
                priorities = abs_td_error.detach().cpu().clamp(1e-6).numpy()
                buffer.update_priorities(extra[1], priorities)

        # update target net and log
        if n_iter % target_network_update_freq == 0:
            qtar.load_state_dict(qnet.state_dict())
            logger.info('{} Iter {} {}'.format('=' * 10, n_iter, '=' * 10))
            fps = int(n_iter / (time.time() - start_ts))
            logger.info('Total timesteps {} FPS {}'.format(n_iter, fps))
            for k, v in infos.items():
                v = (sum(v) / len(v)) if v else float('nan')
                logger.info('{}: {:.6f}'.format(k, v))
            if n_iter > learning_starts and n_iter % train_freq == 0:
                logger.info('vloss: {:.6f}'.format(loss.item()))

        if save_interval and n_iter % save_interval == 0:
            torch.save([qnet.state_dict(), optimizer.state_dict()],
                       os.path.join(save_path, '{}.{}'.format(name, n_iter)))
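# huber_loss is called above but not defined in this snippet. A minimal
# element-wise sketch consistent with how it is used (it must return
# per-sample losses so the importance-sampling weights can multiply them
# before the mean is taken); the threshold kappa = 1 is an assumption:

import torch

def huber_loss(abs_td_error: torch.Tensor, kappa: float = 1.0) -> torch.Tensor:
    # Quadratic for |x| <= kappa, linear beyond it; input is already |TD error|.
    quadratic = abs_td_error.clamp(max=kappa)
    linear = abs_td_error - quadratic
    return 0.5 * quadratic.pow(2) + kappa * linear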