Example #1
    def __init__(self, config: Dict, agent: Agent, transitions_queue: Queue,
                 global_episode: Counter, global_update_step: Counter,
                 epsilone: Union[ExponentialEpsilon,
                                 SinusoidalEpsilone], logger: Logger) -> None:
        """
        Thread responsable de l'update des poids des réseaux de neurones de l'agent

        :param config: Dictionnaire de configuration de l'expérience
        :param agent: Agent à optimiser
        :param transitions_queue: Queue à partir de laquelle les threads Player envoient leurs transitions au thread Trainer
        :param global_episode: Compteur du nombre d'époside joués (partagé entre les threads)
        :param global_update_step: Compteur du nombre d'update effectués (partagé entre les threads)
        :param epsilone: Epsilone processus utilisé pour le bruit ajouté aux actions de l'agent
        :param logger: Le logger utilisé au cours de l'expérience
        """
        super().__init__()

        self._config = config
        self._agent = agent
        self._episode_queue = transitions_queue
        self._global_episode = global_episode
        self._global_update_step = global_update_step
        self._logger = logger
        self._epsilone = epsilone

        self._replay_buffer = PrioritizedReplayBuffer(
            size=self._config["trainer_config"]["buffer_size"])
        # TODO: allow switching between ReplayBuffer and PrioritizedReplayBuffer via the config
        # ReplayBuffer(size=self._config["trainer_config"]["buffer_size"])

        self._best_test_reward = float('-inf')
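
The TODO above could be resolved with a single flag in the experiment config. A minimal sketch, assuming a hypothetical "use_prioritized_replay" key in trainer_config and that the plain ReplayBuffer referenced in the commented-out line is importable:

        # Hypothetical config-driven buffer selection (not part of the original code).
        trainer_config = self._config["trainer_config"]
        if trainer_config.get("use_prioritized_replay", True):
            self._replay_buffer = PrioritizedReplayBuffer(size=trainer_config["buffer_size"])
        else:
            self._replay_buffer = ReplayBuffer(size=trainer_config["buffer_size"])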
Example #2
    def __init__(self, parameters):
        super(PriorDQN, self).__init__(parameters)

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffersize, parameters["alpha"])

        self.beta_start = parameters["beta_start"]
        self.beta_frames = parameters["beta_frames"]
Example #3
class PriorDQN(Trainer):

    def __init__(self, parameters):
        super(PriorDQN, self).__init__(parameters)

        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffersize, parameters["alpha"])

        self.beta_start = parameters["beta_start"]
        self.beta_frames = parameters["beta_frames"]

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def beta_by_frame(self, frame_idx):
        beta = self.beta_start + frame_idx * \
            (1.0 - self.beta_start) / self.beta_frames
        return min(1.0, beta)

    def compute_td_loss(self, batch_size, frame_idx):

        beta = self.beta_by_frame(frame_idx)

        if len(self.replay_buffer) < batch_size:
            return None

        state, action, reward, next_state, done, indices, weights = self.replay_buffer.sample(
            batch_size, beta)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))
        weights = Variable(torch.FloatTensor(weights))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN target: pick the argmax action with the online network,
        # evaluate it with the target network.
        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Importance-weighted squared TD error, clipped at 1; the clipped errors
        # (plus a small constant) become the new priorities.
        loss = (q_value - Variable(expected_q_value.data)).pow(2) * weights
        loss[loss.gt(1)] = 1
        prios = loss + 1e-5
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
        
        return loss
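
As a quick sanity check of the annealing implemented by beta_by_frame above: the schedule rises linearly from beta_start to 1.0 over beta_frames frames and is then capped. A small standalone sketch with illustrative values (the real ones come from the parameters dict):

beta_start, beta_frames = 0.4, 50000
beta_by_frame = lambda f: min(1.0, beta_start + f * (1.0 - beta_start) / beta_frames)
assert abs(beta_by_frame(0) - 0.4) < 1e-9        # start: strong prioritization bias, weak correction
assert abs(beta_by_frame(25000) - 0.7) < 1e-9    # halfway through the annealing window
assert beta_by_frame(100000) == 1.0              # past beta_frames: fully corrected importance weights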
Example #4
    def test_random_sampling(self):
        prb = PrioritizedReplayBuffer(8)
        for exp in Transitions:
            prb.add(exp)

        indexes, samples, weights = prb.sample(1)
        assert samples[0] in Transitions
        assert indexes[0] in [0, 1, 2, 3, 4, 5, 6, 7]
        assert weights[0] == 1
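
The asserted weight of 1 follows from the standard PER importance-sampling correction: with every stored transition still at its default priority, each sampling probability is P(i) = 1/N, so w_i = (N * P(i))**(-beta) = 1 for any beta. A tiny standalone check (independent of the buffer implementation under test):

N, beta = 8, 0.4
P_i = 1.0 / N                  # uniform sampling probability
w_i = (N * P_i) ** (-beta)     # (8 * 1/8)^(-0.4) == 1.0
assert w_i == 1.0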
Example #5
    def __init__(self, config):
        self.writer = SummaryWriter() 
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.dqn_type = config["dqn-type"]
        self.run_title = config["run-title"]
        self.env = gym.make(config["environment"])

        self.num_states  = np.prod(self.env.observation_space.shape)
        self.num_actions = self.env.action_space.n

        layers = [
            self.num_states, 
            *config["architecture"], 
            self.num_actions
        ]

        self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        capacity = config["max-experiences"]
        self.p_replay_eps = config["p-eps"]
        self.prioritized_replay = config["prioritized-replay"]
        self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) if self.prioritized_replay \
                        else ReplayBuffer(capacity)

        self.beta_scheduler = LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0)
        self.epsilon_decay = lambda e: max(config["epsilon-min"], e * config["epsilon-decay"])

        self.train_freq = config["train-freq"]
        self.use_soft_update = config["use-soft-update"]
        self.target_update = config["target-update"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch-size"]
        self.time_step = 0

        self.optim = T.optim.AdamW(self.policy_net.parameters(), lr=config["lr-init"], weight_decay=config["weight-decay"])
        self.lr_scheduler = T.optim.lr_scheduler.StepLR(self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
        self.criterion = nn.SmoothL1Loss(reduction="none") # Huber Loss
        self.min_experiences = max(config["min-experiences"], config["batch-size"])

        self.save_path = config["save-path"]
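
For context, the pieces configured above (prioritized buffer, beta schedule, per-sample Huber loss, p_replay_eps) are typically wired together in a training step roughly as below. This is a hedged sketch: _learn_step is a hypothetical method name, and the buffer's sample/update_priorities signatures are assumed to follow the baselines-style API, which may differ from this repository's implementation.

    def _learn_step(self, episode):
        beta = self.beta_scheduler.value(episode)  # assumes a baselines-style LinearSchedule.value()
        state, action, reward, next_state, done, weights, indices = \
            self.replay_buffer.sample(self.batch_size, beta)

        state      = T.FloatTensor(state).to(self.device)
        action     = T.LongTensor(action).to(self.device)
        reward     = T.FloatTensor(reward).to(self.device)
        next_state = T.FloatTensor(next_state).to(self.device)
        done       = T.FloatTensor(done).to(self.device)
        weights    = T.FloatTensor(weights).to(self.device)

        q = self.policy_net(state).gather(1, action.unsqueeze(1)).squeeze(1)
        with T.no_grad():
            q_next = self.target_net(next_state).max(1)[0]
            target = reward + self.gamma * q_next * (1 - done)

        # Per-sample Huber loss, scaled by the importance-sampling weights.
        loss = (self.criterion(q, target) * weights).mean()
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        # Refresh priorities with |TD error| plus the configured epsilon.
        new_prios = (q - target).abs().detach().cpu().numpy() + self.p_replay_eps
        self.replay_buffer.update_priorities(indices, new_prios)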
Example #6
    def test_update_priorities1(self):
        prb = PrioritizedReplayBuffer(8)
        for exp in Transitions:
            prb.add(exp)

        prb.update(idx=[0, 1, 3, 4, 5, 6, 7],
                   priorities=[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
        indexes, samples, weights = prb.sample(100, beta=0.4)

        n = 0
        sum_p = sum([
            0.1**0.6, 0.1**0.6, 1.0**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6,
            0.1**0.6, 0.1**0.6
        ])

        assert (prb._st_sum._storage[-8:] == np.array([
            0.1**0.6, 0.1**0.6, 1.0**0.6, 0.1**0.6, 0.1**0.6, 0.1**0.6,
            0.1**0.6, 0.1**0.6
        ])).all()
        assert isclose(prb._st_sum._storage[0], sum_p)

        expected_weight_other = 1.0
        expected_weight_2 = (100**(-0.4)) / ((100 * (0.1**0.6))**(-0.4))
        for idx, sample, weight in zip(indexes, samples, weights):
            if idx == 2:
                n += 1
                assert isclose(weight, expected_weight_2)
            else:
                assert isclose(weight, expected_weight_other)

        assert n > 10
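
The expected weights in this test are the usual PER importance-sampling weights normalized by the largest weight in the buffer: entries holding the minimum priority (0.1) get weight 1, while the entry that kept its default priority of 1.0 gets (p_min / p_2) ** (alpha * beta) = 0.1 ** (0.6 * 0.4) ~= 0.575, which is what the (100 ** -0.4) / ((100 * 0.1 ** 0.6) ** -0.4) expression reduces to once the common factor cancels. A standalone check of that equivalence:

expected_weight_2 = (100 ** (-0.4)) / ((100 * (0.1 ** 0.6)) ** (-0.4))  # as written in the test
simplified        = 0.1 ** (0.6 * 0.4)                                  # (p_min / p_2)^(alpha * beta)
assert abs(expected_weight_2 - simplified) < 1e-12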
Example #7
    def test_circular_buffer(self):
        prb = PrioritizedReplayBuffer(4)
        prb.add(Transitions[0])
        prb.add(Transitions[1])
        prb.add(Transitions[2])
        prb.add(Transitions[3])
        prb.add(Transitions[4])
        prb.add(Transitions[5])

        assert (prb._storage == [
            Transitions[4], Transitions[5], Transitions[2], Transitions[3]
        ]).all()
Example #8
    def test_len(self):
        prb = PrioritizedReplayBuffer(4)
        prb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
        assert len(prb) == 3
        for i in range(8):
            prb.add(Transitions[i])
        assert len(prb) == 4
Example #9

        return action
    
    def reset_noise(self):
        self.noisy1.reset_noise()
        self.noisy2.reset_noise()

current_model = NoisyDQN(env.observation_space.shape[0], env.action_space.n)
target_model  = NoisyDQN(env.observation_space.shape[0], env.action_space.n)
    
optimizer = optim.Adam(current_model.parameters(), lr=0.0001)

beta_start = 0.4
beta_iterations = 50000 
beta_by_iteration = lambda iteration: min(1.0, beta_start + iteration * (1.0 - beta_start) / beta_iterations)

replay_buffer = PrioritizedReplayBuffer(25000, alpha=0.6)

def update_target(current_model, target_model):
    target_model.load_state_dict(current_model.state_dict())
    
update_target(current_model, target_model)

def compute_td_loss(batch_size, beta):
    state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta) 

    state      = autograd.Variable(torch.FloatTensor(np.float32(state)))
    next_state = autograd.Variable(torch.FloatTensor(np.float32(next_state)))
    action     = autograd.Variable(torch.LongTensor(action))
    reward     = autograd.Variable(torch.FloatTensor(reward))
    done       = autograd.Variable(torch.FloatTensor(np.float32(done)))
    weights    = autograd.Variable(torch.FloatTensor(weights))
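
The snippet is cut off after the tensor conversions. A hedged reconstruction of how compute_td_loss typically continues in this NoisyDQN + prioritized-replay pattern (the 0.99 discount factor is an assumption, not taken from the original code):

    # Hedged completion sketch, not the original author's code.
    q_values      = current_model(state)
    next_q_values = target_model(next_state)

    q_value      = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    next_q_value = next_q_values.max(1)[0]
    expected_q_value = reward + 0.99 * next_q_value * (1 - done)  # assumed gamma = 0.99

    loss  = (q_value - expected_q_value.detach()).pow(2) * weights
    prios = loss + 1e-5               # new priorities from the weighted squared TD errors
    loss  = loss.mean()

    optimizer.zero_grad()
    loss.backward()
    replay_buffer.update_priorities(indices, prios.data.cpu().numpy())
    optimizer.step()

    current_model.reset_noise()
    target_model.reset_noise()

    return loss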
Example #10
def learn(env,
          seed=None,
          num_agents = 2,
          lr=0.00008,
          total_timesteps=100000,
          buffer_size=2000,
          exploration_fraction=0.2,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=16,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=2000,
          gamma=0.99,
          target_network_update_freq=1000,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs
          ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, this equals total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns True, training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model
    set_global_seeds(seed)
    double_q = True
    grad_norm_clipping = True
    shared_weights = True
    play_test = 1000
    nsteps = 16
    agent_ids = env.agent_ids()

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    print(f'agent_ids {agent_ids}')
    num_actions = env.action_space.n
    print(f'num_actions {num_actions}')

    dqn_agent = MAgent(env, agent_ids, nsteps, lr, replay_buffer, shared_weights, double_q, num_actions,
                           gamma, grad_norm_clipping, param_noise)


    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=dqn_agent.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    dqn_agent.update_target()

    episode_rewards = [0.0 for i in range(101)]
    saved_mean_reward = None
    obs_all = env.reset()
    obs_shape = obs_all
    reset = True
    done = False

    # Start total timer
    tstart = time.time()
    for t in range(total_timesteps):
        if callback is not None:
            if callback(locals(), globals()):
                break
        kwargs = {}
        if not param_noise:
            update_eps = tf.constant(exploration.value(t))
            update_param_noise_threshold = 0.
        else:
            update_eps = tf.constant(0.)
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(
                1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
            kwargs['reset'] = reset
            kwargs['update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True

        if t % print_freq == 0:
            time_1000_step = time.time()
            nseconds = time_1000_step - tstart
            tstart = time_1000_step
            print(f'time spent performing steps {t-print_freq} to {t}: {nseconds} seconds')
            print('eps update', exploration.value(t))

        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        # mb_states = states
        epinfos = []
        for _ in range(nsteps):
            # Given observations, take action and value (V(s))
            obs_ = tf.constant(obs_all)
            # print(f'obs_.shape is {obs_.shape}')
            # obs_ = tf.expand_dims(obs_, axis=1)
            # print(f'obs_.shape is {obs_.shape}')
            actions_list, fps_ = dqn_agent.choose_action(obs_, update_eps=update_eps, **kwargs)
            fps = [[] for _ in agent_ids]
            # print(f'fps_.shape is {np.asarray(fps_).shape}')
            for a in agent_ids:
                fps[a] = np.delete(fps_, a, axis=0)

            # print(fps)
            # print(f'actions_list is {actions_list}')
            # print(f'values_list is {values_list}')

            # Append the experiences
            mb_obs.append(obs_all.copy())
            mb_actions.append(actions_list)
            mb_values.append(fps)
            mb_dones.append([float(done) for _ in range(num_agents)])

            # Take actions in env and look the results
            obs1_all, rews, done, info = env.step(actions_list)
            rews = [np.max(rews) for _ in range(len(rews))]  # cooperative setting: every agent receives the same (max) reward
            # print(rews)
            mb_rewards.append(rews)
            obs_all = obs1_all
            # print(rewards, done, info)
            maybeepinfo = info[0].get('episode')
            if maybeepinfo: epinfos.append(maybeepinfo)

            episode_rewards[-1] += np.max(rews)
            if done:
                episode_rewards.append(0.0)
                obs_all = env.reset()
                reset = True

        mb_dones.append([float(done) for _ in range(num_agents)])

        # print(f'mb_actions is {mb_actions}')
        # print(f'mb_rewards is {mb_rewards}')
        # print(f'mb_values is {mb_values}')
        # print(f'mb_dones is {mb_dones}')

        mb_obs = np.asarray(mb_obs, dtype=obs_all[0].dtype)
        mb_actions = np.asarray(mb_actions, dtype=actions_list[0].dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_values = np.asarray(mb_values, dtype=np.float32)
        # print(f'mb_values.shape is {mb_values.shape}')
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool is removed in recent NumPy; use the builtin bool
        mb_masks = mb_dones[:-1]
        mb_dones = mb_dones[1:]

        # print(f'mb_actions is {mb_actions}')
        # print(f'mb_rewards is {mb_rewards}')
        # print(f'mb_values is {mb_values}')
        # print(f'mb_dones is {mb_dones}')
        # print(f'mb_masks is {mb_masks}')
        # print(f'mb_masks.shape is {mb_masks.shape}')

        if gamma > 0.0:
            # Discount/bootstrap off value fn
            last_values = dqn_agent.value(tf.constant(obs_all))
            # print(f'last_values is {last_values}')
            if mb_dones[-1][0] == 0:
                # print('================ hey ================ mb_dones[-1][0] == 0')
                mb_rewards = discount_with_dones(np.concatenate((mb_rewards, [last_values])),
                                                 np.concatenate((mb_dones, [[float(False) for _ in range(num_agents)]]))
                                                 , gamma)[:-1]
            else:
                mb_rewards = discount_with_dones(mb_rewards, mb_dones, gamma)

        # print(f'after discount mb_rewards is {mb_rewards}')

        if replay_buffer is not None:
            replay_buffer.add(mb_obs, mb_actions, mb_rewards, obs1_all, mb_masks[:,0],
                              mb_values, np.tile([exploration.value(t), t], (nsteps, num_agents, 1)))

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones, fps, extra_datas = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            obses_t, obses_tp1 = tf.constant(obses_t), None
            actions, rewards, dones = tf.constant(actions), tf.constant(rewards, dtype=tf.float32), tf.constant(dones)
            weights, fps, extra_datas = tf.constant(weights), tf.constant(fps), tf.constant(extra_datas)

            s = obses_t.shape
            # print(f'obses_t.shape is {s}')
            obses_t = tf.reshape(obses_t, (s[0] * s[1], *s[2:]))
            s = actions.shape
            # print(f'actions.shape is {s}')
            actions = tf.reshape(actions, (s[0] * s[1], *s[2:]))
            s = rewards.shape
            # print(f'rewards.shape is {s}')
            rewards = tf.reshape(rewards, (s[0] * s[1], *s[2:]))
            s = weights.shape
            # print(f'weights.shape is {s}')
            weights = tf.reshape(weights, (s[0] * s[1], *s[2:]))
            s = fps.shape
            # print(f'fps.shape is {s}')
            fps = tf.reshape(fps, (s[0] * s[1], *s[2:]))
            # print(f'fps.shape is {fps.shape}')
            s = extra_datas.shape
            # print(f'extra_datas.shape is {s}')
            extra_datas = tf.reshape(extra_datas, (s[0] * s[1], *s[2:]))
            s = dones.shape
            # print(f'dones.shape is {s}')
            dones = tf.reshape(dones, (s[0], s[1], *s[2:]))
            # print(f'dones.shape is {s}')

            td_errors = dqn_agent.nstep_train(obses_t, actions, rewards, obses_tp1, dones, weights, fps, extra_datas)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn_agent.update_target()

        if t % play_test == 0 and t != 0:
            play_test_games(dqn_agent)

        mean_100ep_reward = np.mean(episode_rewards[-101:-1])
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
            print(f'last 100 episode mean reward {mean_100ep_reward} over {num_episodes} episodes')
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
            logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
            logger.dump_tabular()
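
A hedged usage sketch of the learn() function defined above with prioritized replay enabled; make_multiagent_env is a placeholder for whatever constructor produces an env exposing the interface learn() relies on (agent_ids(), reset(), step(), action_space.n):

env = make_multiagent_env()  # hypothetical constructor, not part of the original code
learn(env,
      seed=0,
      total_timesteps=100000,
      prioritized_replay=True,
      prioritized_replay_alpha=0.6,
      prioritized_replay_beta0=0.4)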
Example #11
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    rewards_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    actions_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer,
                                            args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action,
                                                 save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append((state[agent_index]))
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]
                   ) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index],
                                             args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    r_index, state_buffer, states_deque, actions_deque,
                    rewards_deque)
                r_index -= 1
            r_index += 1
        next_state = recent_state(state_buffer)

        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer,
                                                    args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            states_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            rewards_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            actions_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            episode += 1

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            losses = 0
            for _ in range(1):
                loss = compute_td_loss(current_model, target_model,
                                       replay_buffer, optimizer, args, beta)
                losses += loss.item()
            loss_list.append(losses)
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
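
multi_step_reward is not shown in this example; in n-step DQN variants like this one it is normally just the discounted sum of the buffered rewards. A minimal sketch consistent with how it is called above:

def multi_step_reward(rewards, gamma):
    # r_0 + gamma * r_1 + gamma^2 * r_2 + ... over the deque of recent rewards
    ret = 0.0
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret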
Example #12
def train(env, args, writer, datetime):
    best_iou = -1.0
    if args.env in ['1DStatic', '1DDynamic']:
        current_model = DQN_1D(env, args).to(args.device)
        target_model = DQN_1D(env, args).to(args.device)
    elif args.env in ['2DStatic', '2DDynamic']:
        current_model = DQN_2D(env, args).to(args.device)
        target_model = DQN_2D(env, args).to(args.device)
    elif args.env in ['3DStatic', '3DDynamic']:
        current_model = DQN_3D(env, args).to(args.device)
        target_model = DQN_3D(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        if args.env in ['2DDynamic']:
            action = current_model.act(
                torch.FloatTensor(state).to(args.device), epsilon)
        else:
            action = current_model.act(
                torch.FloatTensor(state).to(args.device), epsilon)
        next_state, reward, done = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            episode += 1

            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("Episode_reward/train", episode_reward, episode)
            writer.add_scalar("Episode_length/train", episode_length, episode)
            episode_reward = 0
            episode_length = 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("Loss/train", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list, args)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()

            best_iou = test(env, args, current_model, best_iou, writer,
                            episode, datetime)
Example #13
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
    #target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    update_target(current_model, target_model)
    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    print_args(args)
    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               eps=args.adam_eps,
                               betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(),
                                  lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval
        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model,
                           args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model,
                           args,
                           name="{}{:.2e}_{}".format(args.optim, args.adam_eps,
                                                     frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()

    save_model(current_model, args)
Example #14
def main(args):
    config = load_config(args)
    prefix = config['env_id']
    training_config = config['training_config']
    if config['name_suffix']:
        prefix += config['name_suffix']
    if config['path_prefix']:
        prefix = os.path.join(config['path_prefix'], prefix)
    if not os.path.exists(prefix):
        os.makedirs(prefix)

    train_log = os.path.join(prefix, 'train.log')
    logger = Logger(open(train_log, "w"))
    logger.log('Command line:', " ".join(sys.argv[:]))
    logger.log(args)
    logger.log(config)

    env_params = training_config['env_params']
    env_id = config['env_id']
    if "NoFrameskip" not in env_id:
        env = make_atari_cart(env_id)
    else:
        env = make_atari(env_id)
        env = wrap_deepmind(env, **env_params)
        env = wrap_pytorch(env)

    seed = training_config['seed']
    env.seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)

    state = env.reset()
    dtype = state.dtype
    logger.log("env_shape: {}, num of actions: {}".format(
        env.observation_space.shape, env.action_space.n))
    if "NoFrameskip" in env_id:
        logger.log('action meaning:',
                   env.unwrapped.get_action_meanings()[:env.action_space.n])

    robust = training_config.get('robust', False)
    adv_train = training_config.get('adv_train', False)
    bound_solver = training_config.get('bound_solver', 'cov')
    attack_config = {}
    if adv_train or bound_solver == 'pgd':
        test_config = config['test_config']
        attack_config = training_config["attack_config"]
        adv_ratio = training_config.get('adv_ratio', 1)
        if adv_train:
            logger.log('using adversarial examples for training, adv ratio:',
                       adv_ratio)
        else:
            logger.log('using pgd regularization training')
    if robust or adv_train:
        schedule_start = training_config['schedule_start']
        schedule_length = training_config['schedule_length']
        starting_epsilon = training_config['start_epsilon']
        end_epsilon = training_config['epsilon']
        epsilon_scheduler = EpsilonScheduler(
            training_config.get("schedule_type", "linear"), schedule_start,
            schedule_start + schedule_length - 1, starting_epsilon,
            end_epsilon, 1)
        max_eps = end_epsilon

    model_width = training_config['model_width']
    robust_model = robust and bound_solver != 'pgd'
    dueling = training_config.get('dueling', True)

    current_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                                dueling, model_width)
    target_model = model_setup(env_id, env, robust_model, logger, USE_CUDA,
                               dueling, model_width)

    load_path = training_config["load_model_path"]
    if load_path != "" and os.path.exists(load_path):
        load_frame = int(re.findall('^.*frame_([0-9]+).pth$', load_path)[0])
        logger.log('\ntrain from model {}, current frame index is {}\n'.format(
            load_path, load_frame))
        current_model.features.load_state_dict(torch.load(load_path))
        target_model.features.load_state_dict(torch.load(load_path))
    else:
        logger.log('\ntrain from scratch')
        load_frame = 1

    lr = training_config['lr']
    grad_clip = training_config['grad_clip']
    natural_loss_fn = training_config['natural_loss_fn']
    optimizer = optim.Adam(current_model.parameters(),
                           lr=lr,
                           eps=training_config['adam_eps'])
    # Do not evaluate gradient for target model.
    for param in target_model.features.parameters():
        param.requires_grad = False

    buffer_config = training_config['buffer_params']
    replay_initial = buffer_config['replay_initial']
    buffer_capacity = buffer_config['buffer_capacity']
    use_cpp_buffer = training_config["cpprb"]
    use_async_rb = training_config['use_async_rb']
    num_frames = training_config['num_frames']
    batch_size = training_config['batch_size']
    gamma = training_config['gamma']

    if use_cpp_buffer:
        logger.log('using cpp replay buffer')
        if use_async_rb:
            replay_buffer_ctor = AsyncReplayBuffer(initial_state=state,
                                                   batch_size=batch_size)
        else:
            replay_buffer_ctor = cpprb.PrioritizedReplayBuffer
    else:
        logger.log('using python replay buffer')
    per = training_config['per']

    if per:
        logger.log('using prioritized experience replay.')
        alpha = buffer_config['alpha']
        buffer_beta_start = buffer_config['buffer_beta_start']
        buffer_beta_frames = buffer_config.get('buffer_beta_frames', -1)
        if buffer_beta_frames < replay_initial:
            buffer_beta_frames = num_frames - replay_initial
            logger.log('buffer_beta_frames reset to ', buffer_beta_frames)
        buffer_beta_scheduler = BufferBetaScheduler(buffer_beta_start,
                                                    buffer_beta_frames,
                                                    start_frame=replay_initial)
        if use_cpp_buffer:
            replay_buffer = replay_buffer_ctor(
                size=buffer_capacity,
                # env_dict={"obs": {"shape": state.shape, "dtype": np.uint8},
                env_dict={
                    "obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "act": {
                        "shape": 1,
                        "dtype": np.uint8
                    },
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "done": {}
                },
                alpha=alpha,
                eps=0.0)  # We add eps manually in training loop
        else:
            replay_buffer = PrioritizedReplayBuffer(buffer_capacity,
                                                    alpha=alpha)

    else:
        logger.log('using regular replay.')
        if use_cpp_buffer:
            replay_buffer = cpprb.ReplayBuffer(
                buffer_capacity,
                # {"obs": {"shape": state.shape, "dtype": np.uint8},
                {
                    "obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "act": {
                        "shape": 1,
                        "dtype": np.uint8
                    },
                    "rew": {},
                    # "next_obs": {"shape": state.shape, "dtype": np.uint8},
                    "next_obs": {
                        "shape": state.shape,
                        "dtype": dtype
                    },
                    "done": {}
                })
        else:
            replay_buffer = ReplayBuffer(buffer_capacity)

    update_target(current_model, target_model)

    act_epsilon_start = training_config['act_epsilon_start']
    act_epsilon_final = training_config['act_epsilon_final']
    act_epsilon_decay = training_config['act_epsilon_decay']
    act_epsilon_method = training_config['act_epsilon_method']
    if training_config.get('act_epsilon_decay_zero', True):
        decay_zero = num_frames
    else:
        decay_zero = None
    act_epsilon_scheduler = ActEpsilonScheduler(act_epsilon_start,
                                                act_epsilon_final,
                                                act_epsilon_decay,
                                                method=act_epsilon_method,
                                                start_frame=replay_initial,
                                                decay_zero=decay_zero)

    # Use optimized cuda memory management
    memory_mgr = CudaTensorManager(state.shape,
                                   batch_size,
                                   per,
                                   USE_CUDA,
                                   dtype=dtype)

    losses = []
    td_losses = []
    batch_cur_q = []
    batch_exp_q = []

    sa = None
    kappa = None
    hinge = False
    if robust:
        logger.log(
            'using convex relaxation certified classification loss as a regularization!'
        )
        kappa = training_config['kappa']
        reg_losses = []
        sa = np.zeros(
            (current_model.num_actions, current_model.num_actions - 1),
            dtype=np.int32)
        for i in range(sa.shape[0]):
            for j in range(sa.shape[1]):
                if j < i:
                    sa[i][j] = j
                else:
                    sa[i][j] = j + 1
        sa = torch.LongTensor(sa)
        hinge = training_config.get('hinge', False)
        logger.log('using hinge loss (default is cross entropy): ', hinge)

    if training_config['use_async_env']:
        # Create an environment in a separate process, run asynchronously
        async_env = AsyncEnv(env_id,
                             result_path=prefix,
                             draw=training_config['show_game'],
                             record=training_config['record_game'],
                             env_params=env_params,
                             seed=seed)

    # initialize parameters in logging
    all_rewards = []
    episode_reward = 0
    act_epsilon = np.nan
    grad_norm = np.nan
    weights_norm = np.nan
    best_test_reward = -float('inf')
    buffer_stored_size = 0
    if adv_train:
        attack_count = 0
        suc_count = 0
    if robust and bound_solver == 'pgd':
        ori_margin, adv_margin = np.nan, np.nan

    start_time = time.time()
    period_start_time = time.time()

    # Main Loop
    for frame_idx in range(load_frame, num_frames + 1):
        # Step 1: get current action
        frame_start = time.time()
        t = time.time()

        eps = 0
        if adv_train or robust:
            eps = epsilon_scheduler.get_eps(frame_idx, 0)

        act_epsilon = act_epsilon_scheduler.get(frame_idx)
        if adv_train and eps != np.nan and eps >= np.finfo(np.float32).tiny:
            ori_state_tensor = torch.from_numpy(
                np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                    torch.float32)
            if dtype in UINTS:
                ori_state_tensor /= 255
            attack_config['params']['epsilon'] = eps
            if random.random() < adv_ratio:
                attack_count += 1
                state_tensor = attack(current_model, ori_state_tensor,
                                      attack_config)
                if current_model.act(state_tensor)[0] != current_model.act(
                        ori_state_tensor)[0]:
                    suc_count += 1
            else:
                state_tensor = ori_state_tensor
            action = current_model.act(state_tensor, act_epsilon)[0]
        else:
            with torch.no_grad():
                state_tensor = torch.from_numpy(
                    np.ascontiguousarray(state)).unsqueeze(0).cuda().to(
                        torch.float32)
                if dtype in UINTS:
                    state_tensor /= 255
                ori_state_tensor = torch.clone(state_tensor)
                action = current_model.act(state_tensor, act_epsilon)[0]

        # torch.cuda.synchronize()
        log_time('act_time', time.time() - t)

        # Step 2: run environment
        t = time.time()
        if training_config['use_async_env']:
            async_env.async_step(action)
        else:
            next_state, reward, done, _ = env.step(action)
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer
        # For asynchronous env, defer saving
        if not training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        if use_cpp_buffer:
            buffer_stored_size = replay_buffer.get_stored_size()
        else:
            buffer_stored_size = len(replay_buffer)

        beta = np.nan
        buffer_beta = np.nan
        t = time.time()

        if buffer_stored_size > replay_initial:
            if training_config['per']:
                buffer_beta = buffer_beta_scheduler.get(frame_idx)
            if robust:
                convex_final_beta = training_config['convex_final_beta']
                convex_start_beta = training_config['convex_start_beta']
                beta = (
                    max_eps - eps *
                    (1.0 - convex_final_beta)) / max_eps * convex_start_beta

            res = compute_td_loss(current_model,
                                  target_model,
                                  batch_size,
                                  replay_buffer,
                                  per,
                                  use_cpp_buffer,
                                  use_async_rb,
                                  optimizer,
                                  gamma,
                                  memory_mgr,
                                  robust,
                                  buffer_beta=buffer_beta,
                                  grad_clip=grad_clip,
                                  natural_loss_fn=natural_loss_fn,
                                  eps=eps,
                                  beta=beta,
                                  sa=sa,
                                  kappa=kappa,
                                  dtype=dtype,
                                  hinge=hinge,
                                  hinge_c=training_config.get('hinge_c', 1),
                                  env_id=env_id,
                                  bound_solver=bound_solver,
                                  attack_config=attack_config)
            loss, grad_norm, weights_norm, td_loss, batch_cur_q_value, batch_exp_q_value = res[
                0], res[1], res[2], res[3], res[4], res[5]
            if robust:
                reg_loss = res[-1]
                reg_losses.append(reg_loss.data.item())
                if bound_solver == 'pgd':
                    ori_margin, adv_margin = res[-3].data.item(
                    ), res[-2].data.item()

            losses.append(loss.data.item())
            td_losses.append(td_loss.data.item())
            batch_cur_q.append(batch_cur_q_value.data.item())
            batch_exp_q.append(batch_exp_q_value.data.item())

        log_time('loss_time', time.time() - t)

        # Step 2: run environment (async)
        t = time.time()
        if training_config['use_async_env']:
            next_state, reward, done, _ = async_env.wait_step()
        log_time('env_time', time.time() - t)

        # Step 3: save to buffer (async)
        if training_config['use_async_env']:
            t = time.time()
            if use_cpp_buffer:
                replay_buffer.add(obs=state,
                                  act=action,
                                  rew=reward,
                                  next_obs=next_state,
                                  done=done)
            else:
                replay_buffer.push(state, action, reward, next_state, done)
            log_time('save_time', time.time() - t)

        # Update states and reward
        t = time.time()
        state = next_state
        episode_reward += reward
        if done:
            if training_config['use_async_env']:
                state = async_env.reset()
            else:
                state = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
        log_time('env_time', time.time() - t)

        # All kinds of result logging
        if frame_idx % training_config[
                'print_frame'] == 0 or frame_idx == num_frames or (
                    robust and abs(frame_idx - schedule_start) < 5
                ) or abs(buffer_stored_size - replay_initial) < 5:
            logger.log(
                '\nframe {}/{}, learning rate: {:.6g}, buffer beta: {:.6g}, action epsilon: {:.6g}'
                .format(frame_idx, num_frames, lr, buffer_beta, act_epsilon))
            logger.log(
                'total time: {:.2f}, epoch time: {:.4f}, speed: {:.2f} frames/sec, last total loss: {:.6g}, avg total loss: {:.6g}, grad norm: {:.6g}, weights_norm: {:.6g}, latest episode reward: {:.6g}, avg 10 episode reward: {:.6g}'
                .format(
                    time.time() - start_time,
                    time.time() - period_start_time,
                    training_config['print_frame'] /
                    (time.time() - period_start_time),
                    losses[-1] if losses else np.nan,
                    np.average(losses[:-training_config['print_frame'] -
                                      1:-1]) if losses else np.nan, grad_norm,
                    weights_norm, all_rewards[-1] if all_rewards else np.nan,
                    np.average(all_rewards[:-11:-1])
                    if all_rewards else np.nan))
            logger.log('last td loss: {:.6g}, avg td loss: {:.6g}'.format(
                td_losses[-1] if td_losses else np.nan,
                np.average(td_losses[:-training_config['print_frame'] -
                                     1:-1]) if td_losses else np.nan))
            logger.log(
                'last batch cur q: {:.6g}, avg batch cur q: {:.6g}'.format(
                    batch_cur_q[-1] if batch_cur_q else np.nan,
                    np.average(batch_cur_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_cur_q else np.nan))
            logger.log(
                'last batch exp q: {:.6g}, avg batch exp q: {:.6g}'.format(
                    batch_exp_q[-1] if batch_exp_q else np.nan,
                    np.average(batch_exp_q[:-training_config['print_frame'] -
                                           1:-1]) if batch_exp_q else np.nan))
            if robust:
                logger.log('current input epsilon: {:.6g}'.format(eps))
                if bound_solver == 'pgd':
                    logger.log(
                        'last logit margin: ori: {:.6g}, adv: {:.6g}'.format(
                            ori_margin, adv_margin))
                else:
                    logger.log('current bound beta: {:.6g}'.format(beta))
                logger.log(
                    'last cert reg loss: {:.6g}, avg cert reg loss: {:.6g}'.
                    format(
                        reg_losses[-1] if reg_losses else np.nan,
                        np.average(
                            reg_losses[:-training_config['print_frame'] -
                                       1:-1]) if reg_losses else np.nan))
                logger.log('current kappa: {:.6g}'.format(kappa))
            if adv_train:
                logger.log(
                    'current attack epsilon (same as input epsilon): {:.6g}'.
                    format(eps))
                diff = ori_state_tensor - state_tensor
                diff = np.abs(diff.data.cpu().numpy())
                logger.log('current Linf distortion: {:.6g}'.format(
                    np.max(diff)))
                logger.log(
                    'this batch attacked: {}, success: {}, attack success rate: {:.6g}'
                    .format(
                        attack_count, suc_count, suc_count * 1.0 /
                        attack_count if attack_count > 0 else np.nan))
                attack_count = 0
                suc_count = 0
                logger.log('attack stats reset.')

            period_start_time = time.time()
            log_time.print()
            log_time.clear()

        if frame_idx % training_config[
                'save_frame'] == 0 or frame_idx == num_frames:
            plot(frame_idx, all_rewards, losses, prefix)
            torch.save(current_model.features.state_dict(),
                       '{}/frame_{}.pth'.format(prefix, frame_idx))

        if frame_idx % training_config['update_target_frame'] == 0:
            update_target(current_model, target_model)

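        # Run a quick evaluation only when the bound beta has reached 0 (robust
        # training) or after 80% of the frames (standard training).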
        if frame_idx % training_config.get('mini_test', 100000) == 0 and (
            (robust and beta == 0) or
            (not robust and frame_idx * 1.0 / num_frames >= 0.8)):
            test_reward = mini_test(current_model, config, logger, dtype)
            logger.log('this test avg reward: {:.6g}'.format(test_reward))
            if test_reward >= best_test_reward:
                best_test_reward = test_reward
                logger.log(
                    'new best reward {:.6g} achieved, update checkpoint'.format(
                        test_reward))
                torch.save(current_model.features.state_dict(),
                           '{}/best_frame_{}.pth'.format(prefix, frame_idx))

        log_time.log_time('total', time.time() - frame_start)
예제 #15
0
    def test_initial_priorities(self):
        prb = PrioritizedReplayBuffer(8)
        for exp in Transitions:
            prb.add(exp)

        assert (prb._st_sum._storage[-8:] == np.array([1.] * 8)).all()
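The assertion above relies on new experiences entering the sum tree with a default priority of 1.0 (the running maximum before any priority update). A minimal sketch of that convention, using hypothetical names rather than the buffer actually under test, could look like:

import numpy as np

class SumTree:
    """Flat-array sum tree: leaves hold priorities, internal nodes hold partial sums."""

    def __init__(self, capacity):
        self.capacity = capacity
        # heap layout: internal nodes at indices [1, capacity), leaves at [capacity, 2*capacity)
        self._storage = np.zeros(2 * capacity)

    def update(self, leaf_idx, priority):
        i = leaf_idx + self.capacity
        self._storage[i] = priority
        i //= 2
        while i >= 1:
            self._storage[i] = self._storage[2 * i] + self._storage[2 * i + 1]
            i //= 2

class TinyPrioritizedBuffer:
    """New experiences are stored with the current max priority (1.0 until updated)."""

    def __init__(self, capacity):
        self._st_sum = SumTree(capacity)
        self._data = [None] * capacity
        self._next_idx = 0
        self._max_priority = 1.0

    def add(self, experience):
        idx = self._next_idx % self._st_sum.capacity
        self._data[idx] = experience
        self._st_sum.update(idx, self._max_priority)
        self._next_idx += 1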
예제 #16
0
def learn(logger, device, env, number_timesteps, network, optimizer, save_path,
          save_interval, ob_scale, gamma, grad_norm, double_q, param_noise,
          exploration_fraction, exploration_final_eps, batch_size, train_freq,
          learning_starts, target_network_update_freq, buffer_size,
          prioritized_replay, prioritized_replay_alpha,
          prioritized_replay_beta0, atom_num, min_value, max_value):
    """
    Papers:
    Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
    reinforcement learning[J]. Nature, 2015, 518(7540): 529.
    Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
    in Deep Reinforcement Learning[J]. 2017.

    Parameters:
    ----------
    double_q (bool): if True double DQN will be used
    param_noise (bool): whether or not to use parameter space noise
    dueling (bool): if True dueling value estimation will be used
    exploration_fraction (float): fraction of entire training period over which
                                  the exploration rate is annealed
    exploration_final_eps (float): final value of random action probability
    batch_size (int): size of a batch sampled from the replay buffer for training
    train_freq (int): update the model every `train_freq` steps
    learning_starts (int): how many steps of the model to collect transitions
                           for before learning starts
    target_network_update_freq (int): update the target network every
                                      `target_network_update_freq` steps
    buffer_size (int): size of the replay buffer
    prioritized_replay (bool): if True prioritized replay buffer will be used.
    prioritized_replay_alpha (float): alpha parameter for prioritized replay
    prioritized_replay_beta0 (float): beta parameter for prioritized replay
    atom_num (int): number of atoms in distributional RL (enabled when atom_num > 1)
    min_value (float): min value in distributional RL
    max_value (float): max value in distributional RL

    """

    qnet = network.to(device)
    qtar = deepcopy(qnet)
    if prioritized_replay:
        buffer = PrioritizedReplayBuffer(buffer_size, device,
                                         prioritized_replay_alpha,
                                         prioritized_replay_beta0)
    else:
        buffer = ReplayBuffer(buffer_size, device)
    generator = _generate(device, env, qnet, ob_scale, number_timesteps,
                          param_noise, exploration_fraction,
                          exploration_final_eps, atom_num, min_value,
                          max_value)
    if atom_num > 1:
        delta_z = float(max_value - min_value) / (atom_num - 1)
        z_i = torch.linspace(min_value, max_value, atom_num).to(device)

    infos = {'eplenmean': deque(maxlen=100), 'eprewmean': deque(maxlen=100)}
    start_ts = time.time()
    for n_iter in range(1, number_timesteps + 1):
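        # Anneal the importance-sampling exponent beta linearly from
        # prioritized_replay_beta0 towards 1.0 over the whole run.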
        if prioritized_replay:
            buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps
        *data, info = generator.__next__()
        buffer.add(*data)
        for k, v in info.items():
            infos[k].append(v)

        # update qnet
        if n_iter > learning_starts and n_iter % train_freq == 0:
            b_o, b_a, b_r, b_o_, b_d, *extra = buffer.sample(batch_size)
            b_o.mul_(ob_scale)
            b_o_.mul_(ob_scale)

            if atom_num == 1:
                with torch.no_grad():
                    if double_q:
                        b_a_ = qnet(b_o_).argmax(1).unsqueeze(1)
                        b_q_ = (1 - b_d) * qtar(b_o_).gather(1, b_a_)
                    else:
                        b_q_ = (1 - b_d) * qtar(b_o_).max(1, keepdim=True)[0]
                b_q = qnet(b_o).gather(1, b_a)
                abs_td_error = (b_q - (b_r + gamma * b_q_)).abs()
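                # clamp so every priority stays strictly positive before the buffer update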
                priorities = abs_td_error.detach().cpu().clamp(1e-6).numpy()
                if extra:
                    loss = (extra[0] * huber_loss(abs_td_error)).mean()
                else:
                    loss = huber_loss(abs_td_error).mean()
            else:
                with torch.no_grad():
                    b_dist_ = qtar(b_o_).exp()
                    b_a_ = (b_dist_ * z_i).sum(-1).argmax(1)
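                    # Categorical (C51) projection: Tz = r + gamma * (1 - done) * z is
                    # clamped to [min_value, max_value] and its probability mass is split
                    # between the neighbouring atoms floor(b_i) and ceil(b_i).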
                    b_tzj = (gamma * (1 - b_d) * z_i[None, :] + b_r).clamp(
                        min_value, max_value)
                    b_i = (b_tzj - min_value) / delta_z
                    b_l = b_i.floor()
                    b_u = b_i.ceil()
                    b_m = torch.zeros(batch_size, atom_num).to(device)
                    temp = b_dist_[torch.arange(batch_size), b_a_, :]
                    b_m.scatter_add_(1, b_l.long(), temp * (b_u - b_i))
                    b_m.scatter_add_(1, b_u.long(), temp * (b_i - b_l))
                b_q = qnet(b_o)[torch.arange(batch_size), b_a.squeeze(1), :]
                kl_error = -(b_q * b_m).sum(1)
                # use kl error as priorities as proposed by Rainbow
                priorities = kl_error.detach().cpu().clamp(1e-6).numpy()
                loss = kl_error.mean()

            optimizer.zero_grad()
            loss.backward()
            if grad_norm is not None:
                nn.utils.clip_grad_norm_(qnet.parameters(), grad_norm)
            optimizer.step()
            if prioritized_replay:
                buffer.update_priorities(extra[1], priorities)

        # update target net and log
        if n_iter % target_network_update_freq == 0:
            qtar.load_state_dict(qnet.state_dict())
            logger.info('{} Iter {} {}'.format('=' * 10, n_iter, '=' * 10))
            fps = int(n_iter / (time.time() - start_ts))
            logger.info('Total timesteps {} FPS {}'.format(n_iter, fps))
            for k, v in infos.items():
                v = (sum(v) / len(v)) if v else float('nan')
                logger.info('{}: {:.6f}'.format(k, v))
            if n_iter > learning_starts and n_iter % train_freq == 0:
                logger.info('vloss: {:.6f}'.format(loss.item()))

        if save_interval and n_iter % save_interval == 0:
            torch.save(
                [qnet.state_dict(), optimizer.state_dict()],
                os.path.join(save_path, '{}.checkpoint'.format(n_iter)))
예제 #17
0
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)
    
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)


        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)
        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += (reward[0])
        p2_episode_reward += (reward[1])
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
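multi_step_reward above is assumed to come from elsewhere in the project; a typical n-step discounted return helper, sketched under that assumption rather than copied from the example, is:

def multi_step_reward(rewards, gamma):
    # discounted sum of the buffered rewards: r_0 + gamma * r_1 + gamma^2 * r_2 + ...
    ret = 0.0
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret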
예제 #18
0
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
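epsilon_scheduler and beta_scheduler are likewise assumed to be defined elsewhere; a common pair of implementations (a sketch, not necessarily this example's actual code) is:

import math

def epsilon_scheduler(eps_start, eps_final, eps_decay):
    # exponential decay of the exploration rate from eps_start towards eps_final
    def epsilon_by_frame(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)
    return epsilon_by_frame

def beta_scheduler(beta_start, beta_frames):
    # linear annealing of the importance-sampling exponent towards 1.0
    def beta_by_frame(frame_idx):
        return min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames)
    return beta_by_frame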
예제 #19
0
class DQN:
    def __init__(self, config):
        self.writer = SummaryWriter() 
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.dqn_type = config["dqn-type"]
        self.run_title = config["run-title"]
        self.env = gym.make(config["environment"])

        self.num_states  = np.prod(self.env.observation_space.shape)
        self.num_actions = self.env.action_space.n

        layers = [
            self.num_states, 
            *config["architecture"], 
            self.num_actions
        ]

        self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        capacity = config["max-experiences"]
        self.p_replay_eps = config["p-eps"]
        self.prioritized_replay = config["prioritized-replay"]
        self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) if self.prioritized_replay \
                        else ReplayBuffer(capacity)

        self.beta_scheduler = LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0)
        self.epsilon_decay = lambda e: max(config["epsilon-min"], e * config["epsilon-decay"])

        self.train_freq = config["train-freq"]
        self.use_soft_update = config["use-soft-update"]
        self.target_update = config["target-update"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch-size"]
        self.time_step = 0

        self.optim = T.optim.AdamW(self.policy_net.parameters(), lr=config["lr-init"], weight_decay=config["weight-decay"])
        self.lr_scheduler = T.optim.lr_scheduler.StepLR(self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
        self.criterion = nn.SmoothL1Loss(reduction="none") # Huber Loss
        self.min_experiences = max(config["min-experiences"], config["batch-size"])

        self.save_path = config["save-path"]

    def act(self, state, epsilon=0):
        """
            Act on environment using epsilon-greedy policy
        """
        if np.random.sample() < epsilon:
            return int(np.random.choice(np.arange(self.num_actions)))
        else:
            self.policy_net.eval()
            return self.policy_net(T.tensor(state, device=self.device).float().unsqueeze(0)).argmax().item()

    def _soft_update(self, tau):
        """
            Polyak averaging: soft update of the target network parameters.
            θ_target = τ*θ_target + (1 - τ)*θ_current
        """
        for target_param, current_param in zip(self.target_net.parameters(), self.policy_net.parameters()):
            target_param.data.copy_(tau*target_param.data + (1.0-tau)*current_param.data)

    def update_target(self, tau):
        if self.use_soft_update:
            self._soft_update(tau)
        elif self.time_step % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def optimize(self, beta=None):
        if len(self.replay_buffer) < self.min_experiences:
            return None, None 

        self.policy_net.train()

        if self.prioritized_replay:
            transitions, (is_weights, t_idxes) = self.replay_buffer.sample(self.batch_size, beta)
        else:
            transitions = self.replay_buffer.sample(self.batch_size)
            is_weights, t_idxes = np.ones(self.batch_size), None

        # transpose the batch --> transition of batch-arrays
        batch = Transition(*zip(*transitions))
        # compute a mask of non-final states and concatenate the batch elements
        non_final_mask = T.tensor(tuple(map(lambda state: state is not None, batch.next_state)), 
                                                                device=self.device, dtype=T.bool)  
        non_final_next_states = T.cat([T.tensor([state]).float() for state in batch.next_state if state is not None]).to(self.device)

        state_batch  = T.tensor(batch.state,  device=self.device).float()
        action_batch = T.tensor(batch.action, device=self.device).long()
        reward_batch = T.tensor(batch.reward, device=self.device).float()

        state_action_values = self.policy_net(state_batch).gather(1, action_batch.unsqueeze(1))
    
        next_state_values = T.zeros(self.batch_size, device=self.device)
        if self.dqn_type == "vanilla":
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        else:
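            # Double DQN target: pick the greedy action with the online network,
            # evaluate it with the target network.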
            self.policy_net.eval()
            action_next_state = self.policy_net(non_final_next_states).max(1)[1]
            self.policy_net.train()
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).gather(1, action_next_state.unsqueeze(1)).squeeze().detach()

        # compute the expected Q values (RHS of the Bellman equation)
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        
        # compute temporal difference error
        td_error = T.abs(state_action_values.squeeze() - expected_state_action_values).detach().cpu().numpy()

        # compute Huber loss
        loss = self.criterion(state_action_values, expected_state_action_values.unsqueeze(1))
        loss = T.mean(loss * T.tensor(is_weights, device=self.device))
      
        # optimize the model
        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return td_error, t_idxes

    def run_episode(self, epsilon, beta):
        total_reward, done = 0, False
        state = self.env.reset()
        while not done:
            # use epsilon-greedy to get an action
            action = self.act(state, epsilon)
            # cache the current state before stepping the environment
            prev_state = state
            # take action
            state, reward, done, _ = self.env.step(action)
            # accumulate reward
            total_reward += reward
            # store the transition in buffer
            if done: state = None 
            self.replay_buffer.push(prev_state, action, state, reward)
            # optimize model
            if self.time_step % self.train_freq == 0:
                td_error, t_idxes = self.optimize(beta=beta)
                # update priorities 
                if self.prioritized_replay and td_error is not None:
                    self.replay_buffer.update_priorities(t_idxes, td_error + self.p_replay_eps)
            # update target network
            self.update_target(self.tau)
            # increment time-step
            self.time_step += 1

        return total_reward

    def train(self, episodes, epsilon, solved_reward):
        total_rewards = np.zeros(episodes)
        for episode in range(episodes):
            
            # compute beta using linear scheduler
            beta = self.beta_scheduler.value(episode)
            # run episode and get rewards
            reward = self.run_episode(epsilon, beta)
            # exponentially decay epsilon
            epsilon = self.epsilon_decay(epsilon)
            # step the learning-rate scheduler
            self.lr_scheduler.step()

            total_rewards[episode] = reward
            avg_reward = total_rewards[max(0, episode-100):(episode+1)].mean()
            last_lr = self.lr_scheduler.get_last_lr()[0]

            # log into tensorboard
            self.writer.add_scalar(f'dqn-{self.dqn_type}/reward', reward, episode)
            self.writer.add_scalar(f'dqn-{self.dqn_type}/reward_100', avg_reward, episode)
            self.writer.add_scalar(f'dqn-{self.dqn_type}/lr', last_lr, episode)
            self.writer.add_scalar(f'dqn-{self.dqn_type}/epsilon', epsilon, episode)

            print(f"Episode: {episode} | Last 100 Average Reward: {avg_reward:.5f} | Learning Rate: {last_lr:.5E} | Epsilon: {epsilon:.5E}", end='\r')

            if avg_reward > solved_reward:
                break
        
        self.writer.close()

        print(f"Environment solved in {episode} episodes")
        T.save(self.policy_net.state_dict(), os.path.join(self.save_path, f"{self.run_title}.pt"))

    def visualize(self, load_path=None):
        done = False
        state = self.env.reset()

        if load_path is not None:
            self.policy_net.load_state_dict(T.load(load_path, map_location=self.device))
        self.policy_net.eval()
        
        while not done:
            self.env.render()
            action = self.act(state)
            state, _, done, _ = self.env.step(int(action))
            sleep(0.01) 
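The configuration keys this class reads in __init__ could be supplied as in the illustrative snippet below (values and the environment name are assumptions, not taken from the example):

config = {
    "dqn-type": "double",          # anything other than "vanilla" uses Double DQN targets
    "run-title": "cartpole-run",
    "environment": "CartPole-v1",
    "architecture": [128, 128],
    "max-experiences": 50000,
    "min-experiences": 1000,
    "prioritized-replay": True,
    "p-alpha": 0.6,
    "p-beta-init": 0.4,
    "p-eps": 1e-6,
    "episodes": 500,
    "epsilon-min": 0.01,
    "epsilon-decay": 0.995,
    "train-freq": 1,
    "use-soft-update": True,
    "target-update": 1000,
    "tau": 0.995,
    "gamma": 0.99,
    "batch-size": 64,
    "lr-init": 1e-3,
    "lr-step": 100,
    "lr-gamma": 0.9,
    "weight-decay": 1e-5,
    "save-path": "./checkpoints",
}

agent = DQN(config)
agent.train(episodes=config["episodes"], epsilon=1.0, solved_reward=195)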
예제 #20
0
class Trainer(threading.Thread):
    def __init__(self, config: Dict, agent: Agent, transitions_queue: Queue,
                 global_episode: Counter, global_update_step: Counter,
                 epsilone: Union[ExponentialEpsilon,
                                 SinusoidalEpsilone], logger: Logger) -> None:
        """
        Thread responsible for updating the weights of the agent's neural networks

        :param config: Configuration dictionary of the experiment
        :param agent: Agent to optimize
        :param transitions_queue: Queue through which the Player threads send their transitions to the Trainer thread
        :param global_episode: Counter of the number of episodes played (shared between threads)
        :param global_update_step: Counter of the number of updates performed (shared between threads)
        :param epsilone: Epsilon process used for the noise added to the agent's actions
        :param logger: Logger used throughout the experiment
        """
        super().__init__()

        self._config = config
        self._agent = agent
        self._episode_queue = transitions_queue
        self._global_episode = global_episode
        self._global_update_step = global_update_step
        self._logger = logger
        self._epsilone = epsilone

        self._replay_buffer = PrioritizedReplayBuffer(
            size=self._config["trainer_config"]["buffer_size"])
        # TODO: allow switching between ReplayBuffer and PrioritizedReplayBuffer via the config
        # ReplayBuffer(size=self._config["trainer_config"]["buffer_size"])

        self._best_test_reward = float('-inf')

    def test(self) -> None:
        """
        Evaluate the agent on one full episode, without exploration noise
        """
        start_test_time = time()

        env = make_env(self._config)
        obs, done = env.reset(), False
        rews = []
        nb_step = 0
        while not done:
            act = self._agent(obs=obs)
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
            nb_step += 1

        rew_mean = np.mean(rews)

        # logging
        self._logger.add_scalar(label="test/reward",
                                value=sum(rews),
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/nb_step",
                                value=nb_step,
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_mean",
                                value=float(rew_mean),
                                step=self._global_update_step.val())
        self._logger.add_scalar(label="test/reward_var",
                                value=float(np.var(rews)),
                                step=self._global_update_step.val())

        if self._best_test_reward < rew_mean:
            self._best_test_reward = rew_mean
            self._agent.save(episode=self._global_episode.val(),
                             update_step=self._global_update_step.val(),
                             test_reward=int(sum(rews)))

        self._logger.add_scalar(label="test/test_speed",
                                value=time() - start_test_time,
                                step=self._global_update_step.val())

    def _should_stop(self) -> bool:
        """
        Check whether the thread should terminate
        :return: True if the thread should terminate, False otherwise
        """
        if self._config["trainer_config"]["max_update_step"] and \
                self._global_update_step.val() > self._config["trainer_config"]["max_update_step"]:
            return True
        if self._config["trainer_config"]["max_episode"] and \
                self._global_episode.val() > self._config["trainer_config"]["max_episode"]:
            return True
        if self._config["trainer_config"]["max_time"] and \
                time() - self._start_training_time > self._config["trainer_config"]["max_time"]:
            return True

        return False

    def run(self) -> None:
        """
        Run the thread's main training loop
        """
        self._start_training_time = time()

        # Fill the replay buffer with a random policy until it contains min_replay_size transitions
        p_bar = tqdm(total=self._config["agent_config"]["min_replay_size"])
        env = make_env(self._config)
        while len(self._replay_buffer
                  ) < self._config["agent_config"]["min_replay_size"]:
            obs, done = env.reset(), False

            while not done:
                act = env.action_space.sample()
                if len(act.shape) < 1:
                    act = [act]
                next_obs, rew, done, _ = env.step(act)
                transition = Transition(observation=obs,
                                        action=act,
                                        new_observation=next_obs,
                                        reward=rew,
                                        done=done)
                obs = next_obs
                self._replay_buffer.add(transition)
                p_bar.update(1)

        print("buffer initialization done")

        while True:
            start_get_replays_time = time()

            # Fetch one transition from the queue and place it into the replay buffer
            while True:
                if self._should_stop():
                    break

                try:
                    transition = self._episode_queue.get_nowait()
                    self._replay_buffer.add(transition)
                    break
                except Empty:
                    sleep(0.01)

            if self._should_stop():
                break

            end_get_replays_time = time()

            start_update_time = time()

            indexes_b, transition_b, weights_b = self._replay_buffer.sample(
                self._config["agent_config"]["batch_size"])

            if self._global_update_step.val(
            ) % self._config["trainer_config"]["log_freq"] == 0:
                update_step = self._global_update_step.val()
            else:
                update_step = None

            td_error = self._agent.update(transition_b, weights_b, update_step)

            # Update the replay buffer priorities with the critic's TD error
            new_priorities = np.abs(
                td_error
            ) + 1e-16  # note: priorities must always be strictly positive
            self._replay_buffer.update(indexes_b, new_priorities)

            # logging
            if update_step:
                self._logger.add_scalar(label="multithreading/queue_size",
                                        value=self._episode_queue.qsize(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/buffer_size",
                                        value=len(self._replay_buffer),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/priority_mean",
                                        value=weights_b.mean(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/priority_var",
                                        value=weights_b.var(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/td_error_mean",
                                        value=np.abs(td_error).mean(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/update_step",
                                        value=self._global_episode.val(),
                                        step=update_step)
                self._logger.add_scalar(label="trainer/update_time",
                                        value=time() - start_update_time,
                                        step=update_step)
                self._logger.add_scalar(label="trainer/idle_time",
                                        value=end_get_replays_time -
                                        start_get_replays_time,
                                        step=update_step)

            self._global_update_step.inc()
            self._epsilone.step()

            if self._global_update_step.val(
            ) % self._config["trainer_config"]["test_freq"] == 0:
                self.test()

        self.test()
        print("trainer end")
예제 #21
0
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)
    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # custom process for DefeatZerglingsAndBanelings
        obs, screen, player = common.select_marine(env, obs)
        # action = act(
        #     np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None
        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count
        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception as e:
            # print(e)
            pass  # ignore failed environment steps
        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative
        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST
        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]
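        # Re-centre the view so the selected marine ends up at screen coordinate (32, 32)
        # before the transition is stored.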
        if len(player) == 2:
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0],
                                          new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)
        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen
        episode_rewards[-1] += rew
        reward = episode_rewards[-1]
        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][
                _PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(
                    batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes,
                                                new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward,
                            mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
예제 #22
0
def learn(device,
          env, seed,
          number_timesteps,
          network, optimizer,
          save_path, save_interval, ob_scale,
          gamma, grad_norm,
          double_q, param_noise,
          exploration_fraction, exploration_final_eps,
          batch_size, train_freq, learning_starts, target_network_update_freq,
          buffer_size, prioritized_replay, prioritized_replay_alpha,
          prioritized_replay_beta0):
    """
    Papers:
    Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
    reinforcement learning[J]. Nature, 2015, 518(7540): 529.
    Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
    in Deep Reinforcement Learning[J]. 2017.

    Parameters:
    ----------
    double_q (bool): if True double DQN will be used
    param_noise (bool): whether or not to use parameter space noise
    dueling (bool): if True dueling value estimation will be used
    exploration_fraction (float): fraction of entire training period over which
                                  the exploration rate is annealed
    exploration_final_eps (float): final value of random action probability
    batch_size (int): size of a batch sampled from the replay buffer for training
    train_freq (int): update the model every `train_freq` steps
    learning_starts (int): how many steps of the model to collect transitions
                           for before learning starts
    target_network_update_freq (int): update the target network every
                                      `target_network_update_freq` steps
    buffer_size (int): size of the replay buffer
    prioritized_replay (bool): if True prioritized replay buffer will be used.
    prioritized_replay_alpha (float): alpha parameter for prioritized replay
    prioritized_replay_beta0 (float): beta parameter for prioritized replay

    """
    name = '{}_{}'.format(os.path.split(__file__)[-1][:-3], seed)
    logger = get_logger(name)
    logger.info('Note that the Rainbow features supported in the current version '
                'are consistent with openai/baselines, which means `Multi-step` '
                'and `Distributional` are missing. Contributions are welcome!')

    qnet = network.to(device)
    qtar = deepcopy(qnet)
    if prioritized_replay:
        buffer = PrioritizedReplayBuffer(buffer_size, device,
                                         prioritized_replay_alpha,
                                         prioritized_replay_beta0)
    else:
        buffer = ReplayBuffer(buffer_size, device)
    generator = _generate(device, env, qnet, ob_scale,
                          number_timesteps, param_noise,
                          exploration_fraction, exploration_final_eps)

    infos = {'eplenmean': deque(maxlen=100), 'eprewmean': deque(maxlen=100)}
    start_ts = time.time()
    for n_iter in range(1, number_timesteps + 1):
        if prioritized_replay:
            buffer.beta += (1 - prioritized_replay_beta0) / number_timesteps
        *data, info = generator.__next__()
        buffer.add(*data)
        for k, v in info.items():
            infos[k].append(v)

        # update qnet
        if n_iter > learning_starts and n_iter % train_freq == 0:
            b_o, b_a, b_r, b_o_, b_d, *extra = buffer.sample(batch_size)
            b_o.mul_(ob_scale)
            b_o_.mul_(ob_scale)
            b_q = qnet(b_o).gather(1, b_a)
            with torch.no_grad():
                if double_q:
                    b_a_ = qnet(b_o_).argmax(1).unsqueeze(1)
                    b_q_ = (1 - b_d) * qtar(b_o_).gather(1, b_a_)
                else:
                    b_q_ = (1 - b_d) * qtar(b_o_).max(1, keepdim=True)[0]
            abs_td_error = (b_q - (b_r + gamma * b_q_)).abs()
            if extra:
                loss = (extra[0] * huber_loss(abs_td_error)).mean()  # weighted
            else:
                loss = huber_loss(abs_td_error).mean()
            optimizer.zero_grad()
            loss.backward()
            if grad_norm is not None:
                nn.utils.clip_grad_norm_(qnet.parameters(), grad_norm)
            optimizer.step()
            if prioritized_replay:
                priorities = abs_td_error.detach().cpu().clamp(1e-6).numpy()
                buffer.update_priorities(extra[1], priorities)

        # update target net and log
        if n_iter % target_network_update_freq == 0:
            qtar.load_state_dict(qnet.state_dict())
            logger.info('{} Iter {} {}'.format('=' * 10, n_iter, '=' * 10))
            fps = int(n_iter / (time.time() - start_ts))
            logger.info('Total timesteps {} FPS {}'.format(n_iter, fps))
            for k, v in infos.items():
                v = (sum(v) / len(v)) if v else float('nan')
                logger.info('{}: {:.6f}'.format(k, v))
            if n_iter > learning_starts and n_iter % train_freq == 0:
                logger.info('vloss: {:.6f}'.format(loss.item()))

        if save_interval and n_iter % save_interval == 0:
            torch.save([qnet.state_dict(), optimizer.state_dict()],
                       os.path.join(save_path, '{}.{}'.format(name, n_iter)))