Example #1
    def __init__(
            self,
            main_model: QNetwork,
            target_network: QNetwork,
            lr=1e-3,
            gamma=0.9,
            update_every_steps=10,
            memory: RootReplayBuffer = None,
            seed=0,
            device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
        """
        Initialization.
        :param main_model: local (online) Q-Network that is trained.
        :param target_network: target Q-Network; must match main_model's state and action sizes.
        :param lr: learning rate for the Adam optimizer.
        :param gamma: discount factor.
        :param update_every_steps: how often (in steps) updates are performed.
        :param memory: replay buffer; a default ReplayBuffer is created if None is given.
        :param seed: to reproduce results.
        :param device: GPU if available, otherwise CPU.
        """
        assert main_model.state_size == target_network.state_size, \
            "Main model accepts a state size %d, but target accepts a state size %d" % (main_model.state_size, target_network.state_size)
        assert main_model.action_size == target_network.action_size, \
            "Main model generates %d possible actions, but target generates %d" % (main_model.action_size, target_network.action_size)
        self.version = "v_debug: use of 'target' when calculating targets"
        self.state_size = main_model.state_size
        self.action_size = main_model.action_size
        self.seed = random.seed(seed)
        self.device = device
        self.update_every_steps = update_every_steps
        self.qnetwork_local = main_model.to(self.device)
        self.qnetwork_target = target_network.to(self.device)
        self.optimizer = optim.Adam(params=self.qnetwork_local.parameters(),
                                    lr=lr)
        # Avoid a mutable default argument: build the replay buffer per instance if none was given.
        self.memory = memory if memory is not None else ReplayBuffer(
            buffer_size=int(1e5), batch_size=64)
        # let's keep track of the steps so that we can run the algorithms properly
        self.t_step = 0
        self.gamma = gamma
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed)
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed)
        self.qnetwork_target = self.qnetwork_target.to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss

        self.optimizer.zero_grad()

        output_local = self.qnetwork_local(states)
        # detach() keeps gradients from flowing into the target network, whose
        # parameters are only updated through the soft_update below.
        output_target = self.qnetwork_target(next_states).detach()
        q_of_states_actions = output_local.gather(1, actions)
        max_q_of_next_states, _ = torch.max(output_target, dim=1)
        max_q_of_next_states = max_q_of_next_states.unsqueeze(1)

        loss = torch.mean(0.5 * (rewards + gamma * max_q_of_next_states *
                                 (1 - dones) - q_of_states_actions)**2)

        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
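For orientation, here is a minimal training-loop sketch showing how this Agent's act()/step() pair is typically driven. It assumes a Gym-style environment whose reset() returns a state array and whose step() returns (next_state, reward, done, info); the epsilon schedule values are illustrative defaults, not taken from the snippet above.

# Minimal training-loop sketch (assumes a Gym-style env; values are illustrative).
def train(agent, env, n_episodes=500, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores = []
    eps = eps_start
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store experience + learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
    return scores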
Example #4
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, filepath):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            filepath (str): path of a saved checkpoint to load (loaded only if non-empty)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.avarage_score = 0
        self.start_epoch = 0
        self.seed = random.randint(0, seed)
        random.seed(seed)
        print("seed ", seed, "  self.seed ", self.seed)
        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        self.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if filepath:
            self.load_model(filepath)

        # Replay memory
        print("buffer size ", BUFFER_SIZE)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   self.seed)
        print("memory ", self.memory)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            #print("experiences ",experiences)
            self.learn_DDQN(experiences, GAMMA)
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                self.update_network(self.qnetwork_local, self.qnetwork_target)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # Double DQN: select the greedy next actions with the local model,
        # then evaluate those actions with the target model.
        Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze(
            0).detach().max(1)[1].unsqueeze(1)
        #Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        #Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)
        Q_targets_next = self.qnetwork_target(next_states).squeeze(0).gather(
            1, Q_targets_next_argmax)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_model(self, filepath, epoch, score, last=False):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers':
            [each.in_features for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'avarage_score': score
        }
        checkpoint['hidden_layers'].append(
            self.qnetwork_local.hidden_layers[-1].out_features)
        torch.save(checkpoint, filepath)
        if last:
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))
        #print("checkpoint['hidden_layers'] ",checkpoint['hidden_layers'])

    def load_model(self, filepath):
        print("seed ", self.seed)
        if os.path.isfile(filepath):
            print("=> loading checkpoint '{}'".format(filepath))
            checkpoint = torch.load(filepath)
            print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])
            self.qnetwork_local = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_local.to(device)
            self.qnetwork_target = QNetwork(
                checkpoint['input_size'], checkpoint['output_size'], self.seed,
                checkpoint['hidden_layers']).to(device)
            self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_target.to(device)
            if 'optimizer_state_dict' in checkpoint:
                self.optimizer.load_state_dict(
                    checkpoint['optimizer_state_dict'])
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(device)
                print(self.optimizer)
            if 'epoch' in checkpoint:
                self.start_epoch = checkpoint['epoch']
            if 'avarage_score' in checkpoint:
                self.avarage_score = checkpoint['avarage_score']

            print(self.qnetwork_target)
            print(self.optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(filepath))

    def update_network(self, local_model, target_model):
        for target, local in zip(target_model.parameters(),
                                 local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
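The only difference between learn() and learn_DDQN() above is how the next-state Q-value is obtained. The self-contained sketch below (randomly initialized linear networks, not code from the original project) contrasts the two target computations.

# Contrast of vanilla DQN vs. Double DQN next-state value estimation.
import torch
import torch.nn as nn

torch.manual_seed(0)
state_size, action_size, batch = 8, 4, 5
local_net = nn.Linear(state_size, action_size)    # stand-in for qnetwork_local
target_net = nn.Linear(state_size, action_size)   # stand-in for qnetwork_target
next_states = torch.randn(batch, state_size)

# Vanilla DQN: the target network both selects and evaluates the next action.
q_next_dqn = target_net(next_states).detach().max(1)[0].unsqueeze(1)

# Double DQN: the local network selects the action, the target network evaluates it.
best_actions = local_net(next_states).detach().argmax(1, keepdim=True)
q_next_ddqn = target_net(next_states).detach().gather(1, best_actions)

print(q_next_dqn.shape, q_next_ddqn.shape)  # both are (batch, 1)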
Example #5
def make_movie(env_name, checkpoint='*.tar', num_frames=20, first_frame=0, resolution=75, \
               save_dir='./movies/', density=5, radius=5, prefix='default', overfit_mode=False):
    # set the name of the log dir
    load_dir = env_name.lower()
    # metadata
    meta = {}
    # if env_name == "Pong-v0":
    meta['critic_ff'] = 600
    meta['actor_ff'] = 500
    # create the environment
    # env = gym.make(env_name)
    env = make_pytorch_env(env_name)

    # load the actor-critic network
    n_state = env.observation_space.shape[0]
    n_act = env.action_space.n
    model = QNetwork(n_state, n_act)
    model.to(device)

    model.load()

    # build the movie file name (default-100-PongNoFrameSkip)
    movie_title = "{}-{}-{}.mp4".format(prefix, num_frames, env_name.lower())
    print('\tmaking movie "{}" using checkpoint at {}{}'.format(
        movie_title, load_dir, checkpoint))
    max_ep_len = first_frame + num_frames + 1
    torch.manual_seed(0)

    # play and collect the log (logits are the actor's outputs)
    history = rollout(model, env, max_ep_len=max_ep_len)
    print()

    # set up for saving
    start = time.time()

    total_frames = len(history['ins'])

    FFMpegWriter = manimation.writers['ffmpeg']
    metadata = dict(title=movie_title,
                    artist='greydanus',
                    comment='atari-saliency-video')
    writer = FFMpegWriter(fps=8, metadata=metadata)
    f = plt.figure(figsize=[6, 6 * 1.3], dpi=resolution)

    # frames (images)
    seq_image = np.array(history['ins'][first_frame:first_frame + num_frames])

    # with writer.saving(f, save_dir + movie_title, resolution):
    #     for i in range(num_frames):
    #         print('i: ', i)
    #         frame = seq_image[i].copy()
    #         actor_saliency = score_frame(model, seq_image[i].copy(), density, num_frames, mode='actor')
    #         frame = saliency_on_atari_frame(actor_saliency, frame, num_frames, fudge_factor=meta['actor_ff'])
    #
    #         # draw the frame
    #         plt.imshow(frame)
    #         plt.gray()
    #         plt.title(env_name.lower(), fontsize=15)
    #         plt.show()
    #         writer.grab_frame()
    #         f.clear()
    #
    #         tstr = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start))
    #         print('\ttime: {} | progress: {:.1f}%'.format(tstr, 100 * i / min(num_frames, total_frames)), end='\r')

    print('\nfinished.')
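The frame-writing loop in make_movie() is commented out above. As a reference for what it would do, here is a minimal, standalone matplotlib FFMpegWriter sketch that writes random frames to an mp4 (it requires ffmpeg to be installed; the file name and frame data are placeholders, not values from the original project).

# Standalone sketch of the FFMpegWriter pattern used in the commented-out block.
import matplotlib
matplotlib.use("Agg")                      # render off-screen
import matplotlib.pyplot as plt
import matplotlib.animation as manimation
import numpy as np

FFMpegWriter = manimation.writers['ffmpeg']
writer = FFMpegWriter(fps=8, metadata=dict(title='demo', artist='demo'))
fig = plt.figure(figsize=[6, 6 * 1.3], dpi=75)

frames = np.random.rand(20, 84, 84)        # stand-in for history['ins']
with writer.saving(fig, 'demo.mp4', dpi=75):
    for frame in frames:
        plt.imshow(frame)
        plt.gray()
        writer.grab_frame()                # append the current figure as a movie frame
        fig.clear()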
class DDQNPERAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 tor_dstate,
                 srpt_pens,
                 lrn_rate,
                 hsize1,
                 hsize2,
                 seed=0):
        """Initialize a DDQN Agent object with PER (Prioritized Experience Replay) support.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            tor_dstate (float): tolerance for deciding whether two states are the same
            srpt_pens (array_like): penalty (negative reward) values for undesirable actions
            lrn_rate (float): learning rate for Q-Network training
            hsize1 (int): size of the first hidden layer of the Q-Network
            hsize2 (int): size of the second hidden layer of the Q-Network 
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.tor_dstate = tor_dstate
        self.srpt_pens = srpt_pens
        self.lrn_rate = lrn_rate

        self.hsize1 = hsize1
        self.hsize2 = hsize2

        self.seed = seed
        if seed is not None: random.seed(seed)

        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        self.aug_state_size = state_size + len(srpt_pens) * action_size

        # Set up Q-Networks.
        self.qnetwork_local = QNetwork(self.aug_state_size, action_size,
                                       hsize1, hsize2, seed).to(device)
        self.qnetwork_local.initialize_weights()  # initialize network with random weights
        self.qnetwork_target = QNetwork(self.aug_state_size,
                                        action_size,
                                        hsize1,
                                        hsize2,
                                        seed=None).to(device)
        self.qnetwork_target.update_weights(
            self.qnetwork_local)  # copy network weights to target network
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=lrn_rate)

        # Store trained Q-model when the environment is solved.
        self.qnetwork_solved = None

        # Set up experience replay memory.
        self.ebuffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize interval steps.
        self.l_step = 0  # for learning every LEARN_EVERY time steps
        self.t_step = 0  # for updating target network every UPDATE_EVERY learnings

    def reset_episode(self, state, srpt_det=0):
        """Re-initialize buffers after environment reset for a new episode.
        
        Params
        ======
            state (array_like): initial state after environment reset
            srpt_det (int): number of repeated state types to be checked for post-processing
        """
        self.srpt_det = 0
        if len(self.srpt_pens) == 0:
            # State repeat detection for post-processing is active only when state repeat penalty option is off.
            self.srpt_det = srpt_det
        else:
            # This is used to signal self.step() hasn't been run yet.
            self.next_aug_state = None

        if len(self.srpt_pens) > 0 or self.srpt_det > 0:
            self.state_buffer = deque(maxlen=2)
            buffer_size = 2 * (max(len(self.srpt_pens), self.srpt_det) - 1)
            self.smsta_buffer = deque(maxlen=max(2, buffer_size))

            # The initial state will be pushed to the buffer again and be compared to this state in the process of
            # selecting the first action. So add 1 to the initial state here to ensure the states are different
            # enough for the first comparison.
            self.state_buffer.append(np.array(state) + 1)

            # Any position and orientation can be the initial simulated state here. It is like putting in a
            # coordinate system (origin and x-direction) for a 2-D plane and all the other simulated states
            # in the episode will be specified based on this reference coordinate system.
            self.smsta_buffer.append((np.array([0, 0]), 0))

    def step(self, state, action, reward, next_state, done):
        """Update replay memory and parameters of Q-Network by training.
        
        Params
        ======
            state (array_like): starting state of the step
            action (int): action performed in the step
            reward (float): reward from the action
            next_state (array_like): resulting state of the action in the step
            done (bool): indicator for whether next_state is terminal (i.e., end of episode) or not
        """
        if len(self.srpt_pens) > 0:
            # Augment state vector and modify reward using state repeat penalty values.
            self.state_buffer.append(np.array(next_state))
            self.next_aug_state = self.augment_state(next_state)
            state = self.aug_state
            next_state = self.next_aug_state
            reward = self.modify_reward(reward, state, action)

        # Save experience in replay memory.
        self.ebuffer.add(state, action, reward, next_state, done)

        # Learn every LEARN_EVERY steps after memory reaches batch_size.
        if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
            self.l_step += 1
            self.l_step %= LEARN_EVERY
            if self.l_step == 0:
                experiences, weights = self.ebuffer.sample()
                self.learn(experiences, weights, GAMMA)

    def augment_state(self, state):
        """Augment state vector to penalize undesirable actions.
        
        Params
        ======
            state (array_like): original state vector to be augmented
        Returns
        ======
            aug_state (numpy.ndarray): augmented state vector
        """
        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        aug_state = np.concatenate(
            (state, np.zeros((len(self.srpt_pens) * self.action_size, ))))

        # Detect the situation where the two preceding observed states (not augmented) are essentially the
        # same, which indicates the agent is either stuck at a wall or in some kind of undesirable
        # blind spot. The next action to avoid (i.e., to be penalized) is the one that would keep the
        # agent stuck or in the blind spot.
        avoid_action = self.get_avoid_action()
        if avoid_action != ACT_INVALID:
            aug_state[self.state_size + avoid_action] = 1
        if avoid_action != ACT_INVALID or len(self.srpt_pens) == 1:
            return aug_state

        # If agent is not stuck or in blind spot and there are more penalty values, continue to check
        # state repeats separated by more than two actions. Assuming NUM_ORIS is even, states separated
        # by odd number of actions won't repeat. So only even number of actions needs to be checked.
        for action in range(self.action_size):
            nxt_sta = self.sim_step(action)
            for act_cnt in range(2, 2 * len(self.srpt_pens), 2):
                if self.is_state_repeated(act_cnt, nxt_sta):
                    aug_state[self.state_size +
                              (act_cnt // 2) * self.action_size +
                              action] = 1  # signal undesirable action
                    break

        return aug_state

    def modify_reward(self, reward, aug_state, action):
        """Modify reward to penalized undesirable action.
        
        Params
        ======
            reward (float): original reward
            aug_state (numpy.ndarray): augmented state vector
            action (int): action performed
        Returns
        ======
            reward (float): modified reward
        """
        # Penalize undesirable action when it doesn't earn a reward or cause a penalty. If it earns a positive
        # reward or causes a more negative reward, leave the reward unchanged.
        if reward <= 0:
            for i, penalty in enumerate(self.srpt_pens):
                if aug_state[self.state_size + i * self.action_size +
                             action] > 0:  # action is undesirable
                    reward = min(reward, penalty)
                    break
        return reward

    def sim_step(self, action):
        """Advance simulated state (position and orientation) for one step by the action.
        
        Params
        ======
            action (int): action to advance the simulated state
        Returns
            pos, ori (numpy.ndarray, int): resulting simulated state
        """
        # An action can either be a move or turn (but not both) with the type of actions (including non-actions)
        # identified by the action code.
        pos, ori = self.smsta_buffer[-1]
        act_code = ACT_CODES[action]
        pos = pos + act_code[0] * ORIVEC_TABLE[ori]
        ori = (ori + act_code[1]) % NUM_ORIS
        return pos, ori

    def is_state_repeated(self, act_cnt, nxt_sta):
        """Check whether the next state repeats the past state separated by the specified number of actions.
        
        Params
        ======
            act_cnt (int): number of actions separating the past state to be checked and the next state
            nxt_sta (numpy.ndarray, int): next state resulting from an action
        Returns
        ======
            repeated (bool): indicator for repeated state
        """
        repeated = False
        if act_cnt <= len(self.smsta_buffer):
            chk_sta = self.smsta_buffer[-act_cnt]  # past state to be checked
            if chk_sta[1] == nxt_sta[1]:
                if np.linalg.norm(nxt_sta[0] - chk_sta[0]) <= self.tor_dstate:
                    repeated = True
        return repeated

    def act(self, state, eps=0.0):
        """Select action for given state as per epsilon-greedy current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for adjusting epsilon-greedy action selection
        Returns
        ======
            action (int): the chosen action
        """
        # If the agent is in testing mode, self.step() won't be invoked and some of the operations done there
        # need to be done here.
        if (len(self.srpt_pens) > 0
                and self.next_aug_state is None) or self.srpt_det > 0:
            # Push the current state into the state buffer for comparison with the previous state, if it was
            # not already pushed by self.step() during agent training.
            self.state_buffer.append(np.array(state))

        if len(self.srpt_pens) > 0:
            if self.next_aug_state is None:
                self.aug_state = self.augment_state(state)
            else:
                self.aug_state = self.next_aug_state
            state = self.aug_state

        if self.srpt_det == 0:  # no checking for repeated states (observed or simulated)
            # Randomly select action.
            action = random.choice(np.arange(self.action_size))

            # Epsilon-greedy action selection.
            if random.random() >= eps:
                state = torch.from_numpy(state).float().to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action = self.qnetwork_local(
                        state).squeeze().argmax().cpu().item()

            if len(self.srpt_pens) > 0:
                # Update simulated state buffer with result of chosen action.
                nxt_sta = self.sim_step(action)
                self.smsta_buffer.append(nxt_sta)

            return action

        # This is the post-processing of the epsilon-greedy policy to avoid repeated states within a short
        # series of actions. The option is set in self.reset_episode() for each episode and is only active
        # when the option of penalizing undesirable actions, which is set for the class object, is disabled
        # (i.e., when len(self.srpt_pens) == 0). To accommodate the post-processing of the selected actions,
        # the random policy is modified to randomly assign rankings to all the available actions.

        # Randomly assign rankings to action candidates.
        ranked_actions = np.random.permutation(self.action_size)

        # Epsilon-greedy action selection.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                neg_act_qvals = -self.qnetwork_local(state).squeeze()
            ranked_actions = neg_act_qvals.argsort().cpu().numpy().astype(int)

        # Post-process ranked action candidates to remove undesirable action.
        avoid_action = self.get_avoid_action()
        action = self.select_nosrpt_action(avoid_action, ranked_actions)

        return action

    def get_avoid_action(self):
        """Avoid action that will keep the agent stucked or in a blind spot. 
        
        Returns
            avoid_action (int): next action to avoid
        """
        avoid_action = ACT_INVALID  # used to signal the agent is not stuck or in a blind spot
        if np.linalg.norm(self.state_buffer[1] -
                          self.state_buffer[0]) <= self.tor_dstate:
            sim_sta0 = self.smsta_buffer[-2]
            sim_sta1 = self.smsta_buffer[-1]
            if sim_sta0[1] == sim_sta1[1]:  # action is not a turn, must be a move
                # Agent is stuck at a wall.
                dpos = sim_sta1[0] - sim_sta0[0]
                # dot(mcode*(cos, sin), (cos, sin)) = mcode
                mcode = np.around(np.dot(dpos, ORIVEC_TABLE[sim_sta0[1]])).astype(int)
                avoid_action = AVOID_MOVE_TABLE[mcode + 1]
                # It is reasonable to backtrack to get unstuck, except for the last state, which the
                # agent is stuck in (keep it as the new reference; it can be any state).
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
            else:  # action is a turn
                # Agent is in a blind spot (turned, but observed same state).
                tcode = sim_sta1[1] - sim_sta0[1]
                avoid_action = AVOID_TURN_TABLE[(tcode + 1) % NUM_ORIS]
                # It is reasonable to backtrack to get out of the blind spot, except for the last two
                # states, which represent the blind spot itself.
                self.smsta_buffer.clear()
                self.smsta_buffer.append(sim_sta0)
                self.smsta_buffer.append(sim_sta1)
        return avoid_action

    def select_nosrpt_action(self, avoid_action, ranked_actions):
        """Select action that avoids repeated state (i.e., loops) by a short series of actions.
        
        Params
        ======
            avoid_action (int): action to avoid if agent is stuck or in blind spot
            ranked_actions (array like): action candidates ranked by decreasing Q-values
        Returns
        ======
            action (int): the selected action
        """
        action = ranked_actions[0]
        if action == avoid_action: action = ranked_actions[1]
        nxt_sta = self.sim_step(action)

        # If a repeated observed state caused by one action is detected (signaled by avoid_action != ACT_INVALID),
        # the action selected to avoid that repeat will be used, since it is more important to free an agent that
        # is stuck or in a blind spot than to go back further to check for repeated simulated states. So the check
        # for simulated states repeated by 2 or more actions only occurs when avoid_action == ACT_INVALID.
        if avoid_action == ACT_INVALID and self.srpt_det > 1:
            act_heapq = []
            for action in ranked_actions:
                nxt_sta = self.sim_step(action)
                # Assuming NUM_ORIS is even, only even numbers of actions need to be checked.
                for act_cnt in range(2, 2 * self.srpt_det, 2):
                    if self.is_state_repeated(act_cnt, nxt_sta):
                        # Simulated state repeated, go checking next action.
                        heapq.heappush(act_heapq, [-act_cnt, action, nxt_sta])
                        break
                else:
                    # No repeated state detected, action is found.
                    break
            else:
                # No action can satisfy all the no repeated state conditions, select the action that repeats the
                # state separated by most actions (i.e., long loop is more acceptable than short loop).
                action, nxt_sta = heapq.heappop(act_heapq)[1:]

        # Update the simulated state buffer with the result of the chosen action.
        self.smsta_buffer.append(nxt_sta)
        return action

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple (s, a, r, s', done) of batched experience data
            is_weights (torch.Tensor): importance sampling weights for the batched experiences
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN method for obtaining target Q-values.
        self.qnetwork_local.eval()
        with torch.no_grad():
            maxq_actions = self.qnetwork_local(next_states).max(
                1)[1].unsqueeze(1)
            qouts_next_states = self.qnetwork_target(next_states).gather(
                1, maxq_actions).squeeze()
        qouts_target = rewards + gamma * qouts_next_states * (1 - dones)

        # Obtain current Q-values and its difference from the target Q-values.
        self.qnetwork_local.train()
        qouts_states = self.qnetwork_local(states).gather(1, actions).squeeze()
        delta_qouts = qouts_states - qouts_target

        # Calculate the weighted sum of squared losses.
        wsqr_loss = is_weights * delta_qouts**2  # weighted squared loss
        loss_sum = wsqr_loss.sum()

        # Update model parameters by minimizing the loss sum.
        self.optimizer.zero_grad()
        loss_sum.backward()
        self.optimizer.step()

        # Update priorities of the replay memory.
        neg_prios = -torch.abs(delta_qouts.detach())
        self.ebuffer.update_priorities(neg_prios.cpu().numpy())

        # Update target network.
        self.t_step += 1
        self.t_step %= UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.update_weights(self.qnetwork_local, TAU)

    def update_beta(self, beta):
        """Update importance sampling weights for memory buffer with new Beta.

        Params
        ======
            beta (float): new Beta value
        """
        if beta != self.ebuffer.beta:
            self.ebuffer.beta = beta
            if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
                self.ebuffer.update_is_weights()

    def copy_solved_qnet(self):
        """Copy current local Q-Network to solved Q-Network while local Q-Network will continue the training."""
        if self.qnetwork_solved is None:
            self.qnetwork_solved = QNetwork(self.aug_state_size,
                                            self.action_size,
                                            self.hsize1,
                                            self.hsize2,
                                            seed=None).to(device)
        # Copy local network weights to the solved network.
        self.qnetwork_solved.update_weights(self.qnetwork_local)

    def save_qnet(self, model_name):
        """Save Q-Network parameters into file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Save CPU version since it can be used with or without GPU.
        if self.qnetwork_solved is not None:
            torch.save(self.qnetwork_solved.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_solved = self.qnetwork_solved.to(device)
        else:
            torch.save(self.qnetwork_local.cpu().state_dict(),
                       model_name + '.pth')
            self.qnetwork_local = self.qnetwork_local.to(device)

    def load_qnet(self, model_name):
        """Load Q-Network parameters from file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Saved QNetwork is always the CPU version.
        qnetwork_loaded = QNetwork(self.aug_state_size,
                                   self.action_size,
                                   self.hsize1,
                                   self.hsize2,
                                   seed=None)
        qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
        # Copy the loaded network weights to the local network.
        self.qnetwork_local.update_weights(qnetwork_loaded.to(device))
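The ReplayBuffer used by this agent is not shown, so the sketch below is only an illustration of the standard prioritized-experience-replay quantities (Schaul et al., 2015) that its sample() and update_priorities() are presumably built on: sampling probabilities derived from |TD error| priorities, and the importance-sampling weights that learn() multiplies into the squared TD errors.

# Illustration of standard PER priorities and importance-sampling weights (assumed scheme).
import numpy as np

np.random.seed(0)
td_errors = np.random.rand(6)                # |delta| for experiences in memory
alpha, beta, eps = 0.6, 0.4, 1e-5            # illustrative hyperparameters

priorities = (np.abs(td_errors) + eps) ** alpha
probs = priorities / priorities.sum()        # sampling probability P(i)

n = len(priorities)
is_weights = (n * probs) ** (-beta)          # importance-sampling correction
is_weights /= is_weights.max()               # normalize so the largest weight is 1

print(probs.round(3), is_weights.round(3))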
Example #7
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        criterion = torch.nn.MSELoss()

        ## Move input and label tensors to correct device
        self.qnetwork_local.to(device)
        self.qnetwork_target.to(device)
        inputs = next_states.to(device)

        ## Select max predicted Q value for next state using the target model
        with torch.no_grad():
            next_target = self.qnetwork_target(inputs)
            next_q_target = next_target.max(1)[0].unsqueeze(1)
        ## Calculate q targets
        target_q = rewards + (gamma * next_q_target * (1 - dones))

        ## Use local model to get the expected Q value
        expected_q = self.qnetwork_local(states).gather(1, actions)

        ## Compute and minimize the loss
        loss = criterion(expected_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example #8
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, action_size, seed, state_size, visual):
        """Initialize an Agent object.
        
        Params
        ======
            action_size (int): dimension of each action
            seed (int): random seed
            state_size (int): dimension of each state. Note this can be None if visual is true
            visual (bool): whether to train the agent on visual pixels or vector observations
        """
        if not visual:
            self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_target = self.qnetwork_target.to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.beta_start = 0.4

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, GAMMA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.batch_no = 0
        self.beta_batch_nos = 50_000
    
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                beta = min(1.0, self.beta_start + (self.batch_no / self.beta_batch_nos) * (1 - self.beta_start))
                self.batch_no += 1
                experiences = self.memory.sample(beta)
                self._learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def _learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, sample_indices, weight_update_weights = experiences

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0]
        # Compute Q targets for current states 
        q_targets = rewards.squeeze(1) + (gamma * q_targets_next * (1 - dones.squeeze(1)))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions).squeeze(1)
        # Compute loss
        loss = (q_expected - q_targets.detach()).pow(2) * weight_update_weights
        prios = loss + 1e-5
        loss = loss.mean()
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()

        self.memory.update_priorities(prios.data.cpu().numpy(), sample_indices)

        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
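As a sanity check on the beta schedule computed in step() above, the snippet below evaluates the same formula at a few batch counts: beta ramps linearly from beta_start toward 1.0 over beta_batch_nos sampled batches and is clamped at 1.0 afterwards.

# Quick check of the linear beta annealing used for the importance-sampling exponent.
beta_start, beta_batch_nos = 0.4, 50_000
for batch_no in (0, 12_500, 25_000, 50_000, 75_000):
    beta = min(1.0, beta_start + (batch_no / beta_batch_nos) * (1 - beta_start))
    print(batch_no, round(beta, 3))   # 0.4, 0.55, 0.7, 1.0, 1.0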