class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""
    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = Memory(capacity=buffer_size,
                             replay_beta=REPLAY_BETA,
                             replay_alpha=REPLAY_ALPHA,
                             replay_beta_increment=REPLAY_BETA_INCREMENT)
        self.batch_size = batch_size
        self.seed = random.seed(seed)
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        # New experiences have no TD error yet, so give them the buffer's current max
        # priority (or a random bootstrap value while the buffer is still warming up)
        # to make sure they get sampled at least once.
        if len(self.memory) <= self.batch_size:
            error = random.random()
        else:
            error = self.memory.max_prio
        e = self.experience(state, action, reward, next_state, done)
        self.memory.add(error, e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences, idxs, ws = self.memory.sample(n=self.batch_size)

        states = torch.from_numpy(
            np.vstack([e.state for e in experiences
                       if e is not None])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences
                       if e is not None])).long().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences
                       if e is not None])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences
                       if e is not None])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences
                       if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones), idxs, ws

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
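
# All of the examples on this page call into a SumTree-backed `Memory` class that is
# not shown here.  The sketch below is a minimal, assumed implementation of the common
# rlcode-style interface most of them use -- Memory(capacity), add(error, sample),
# sample(n) -> (batch, idxs, is_weights), update(idx, error) and tree.n_entries -- and
# the hyperparameters e/alpha/beta are illustrative defaults, not the exact class or
# settings of any one of these repositories (Example #1, for instance, passes its own
# alpha/beta values to the constructor).
import numpy as np


class SumTree:
    """Binary tree whose parents store the sum of their children's priorities."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.write = 0
        self.n_entries = 0

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2
        self.tree[parent] += change
        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left, right = 2 * idx + 1, 2 * idx + 2
        if left >= len(self.tree):
            return idx
        if s <= self.tree[left]:
            return self._retrieve(left, s)
        return self._retrieve(right, s - self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, priority, data):
        idx = self.write + self.capacity - 1
        self.data[self.write] = data
        self.update(idx, priority)
        self.write = (self.write + 1) % self.capacity
        self.n_entries = min(self.n_entries + 1, self.capacity)

    def update(self, idx, priority):
        change = priority - self.tree[idx]
        self.tree[idx] = priority
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        return idx, self.tree[idx], self.data[idx - self.capacity + 1]


class Memory:
    """Prioritized replay: p_i = (|TD error| + e)^alpha, sampled proportionally."""
    e = 0.01
    alpha = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _get_priority(self, error):
        return (np.abs(error) + self.e) ** self.alpha

    def add(self, error, sample):
        self.tree.add(self._get_priority(error), sample)

    def sample(self, n):
        batch, idxs, priorities = [], [], []
        segment = self.tree.total() / n
        self.beta = np.min([1.0, self.beta + self.beta_increment_per_sampling])
        for i in range(n):
            # one sample per equal-mass segment of the cumulative priority range
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p, data = self.tree.get(s)
            batch.append(data)
            idxs.append(idx)
            priorities.append(p)
        probs = np.array(priorities) / self.tree.total()
        is_weights = np.power(self.tree.n_entries * probs, -self.beta)
        is_weights /= is_weights.max()  # normalise so the largest weight is 1
        return batch, idxs, is_weights

    def update(self, idx, error):
        self.tree.update(idx, self._get_priority(error))

    def __len__(self):
        return self.tree.n_entries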
Example #2
class Agent(object):
    def __init__(self):
        self.network, self.target_network = AtariNet(ACTIONS_SIZE), AtariNet(
            ACTIONS_SIZE)
        self.memory = Memory(MEMORY_SIZE)
        self.learning_count = 0
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def action(self, state, israndom):
        if israndom and random.random() < EPSILON:
            return np.random.randint(0, ACTIONS_SIZE)
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        actions_value = self.network.forward(state)
        return torch.max(actions_value, 1)[1].data.numpy()[0]

    def learn(self, state, action, reward, next_state, done):
        # TD error of the incoming transition: Q(s, a) vs. a target bootstrapped
        # from the *next* state via the target network
        old_val = self.network.forward(torch.FloatTensor([state])).gather(
            1, torch.LongTensor([[action]]))[0]
        target_val = self.target_network.forward(torch.FloatTensor([next_state]))
        if done:
            done = 0  # stored flag doubles as a "not terminal" mask (0 = terminal)
            target = reward
        else:
            done = 1
            target = reward + GAMMA * torch.max(target_val)
        error = abs(old_val[0] - target)
        self.memory.add(error.data, (state, action, reward, next_state, done))
        if self.memory.tree.n_entries < MEMORY_THRESHOLD:
            return

        if self.learning_count % UPDATE_TIME == 0:
            self.target_network.load_state_dict(self.network.state_dict())
        self.learning_count += 1

        batch, idxs, is_weights = self.memory.sample(BATCH_SIZE)
        state = torch.FloatTensor([x[0] for x in batch])
        action = torch.LongTensor([[x[1]] for x in batch])
        reward = torch.FloatTensor([[x[2]] for x in batch])
        next_state = torch.FloatTensor([x[3] for x in batch])
        done = torch.FloatTensor([[x[4]] for x in batch])

        eval_q = self.network.forward(state).gather(1, action)
        next_q = self.target_network(next_state).detach()
        target_q = reward + GAMMA * next_q.max(1)[0].view(BATCH_SIZE, 1) * done
        errors = torch.abs(eval_q - target_q).data.numpy().flatten()
        loss = self.loss_func(eval_q, target_q)

        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
Example #3
class PERAgent(OffPolicyAgent):

    # construct agent's model separately, so it can be sized according to problem
    def __init__(self, n_replay, env, target_policy, behavior_policy, lr, discount, type = 'BC'):
        super().__init__(n_replay, env, target_policy, behavior_policy, lr, discount, type)

    # reseed numpy, reset weights of network
    # Reset must be performed before every episode
    def reset(self,seed):
        # Reset time
        self.t=0

        # Set seed value
        np.random.seed(seed)

        # Reset replay buffer
        self.replay_buffer = Memory(self.n_replay)

        # Rebuild model
        self.build_model(self.n_features,self.env.nA)
        
    def generate_action(self, s, target_policy_sel = True):
        pval = self.target_policy[s] if target_policy_sel else self.behavior_policy[s]
        return np.random.choice(a=self.actions, p=pval)

    def generate_all_actions(self,target_policy_sel = True):
        return np.array([self.generate_action(item, target_policy_sel) for item in range(self.target_policy.shape[0])])

    # Generate steps of experience
    def generate_experience(self, k=16):

        # Initialize environment
        s = self.env.reset()
        done = False
        steps = 0

        # For each step
        while steps < k:

            # choose action according to behavior policy
            a = self.generate_action(s,False)

            # Take a step in environment based on chosen action
            (s2,r,done,_) = self.env.step(a)

            # Compute importance ratios
            ratio = self.target_policy[s,a] / self.behavior_policy[s,a]

            # states and target action for Computing TD Error
            current_state = self.construct_features([s])
            next_state = self.construct_features([s2])
            target_policy_action = self.generate_action(s,True)

            # Get bootstrap estimate of next state action values
            value_s = self.model.predict([current_state,np.zeros(current_state.shape[0])])
            value_next_s = self.model.predict([next_state,np.zeros(next_state.shape[0])])
            updated_val = r if done else (r + self.discount*value_next_s[0][target_policy_action])

            # Compute TD error
            td_error = np.abs(updated_val - value_s[0][a])

            # Stop execution if weights blow up - not converged
            if td_error > 10**5:
                return 1

            # Add experience to IR replay buffer
            self.replay_buffer.add_per(td_error, (s,a,r,s2))

            # Set for next step
            s=s2
            self.t += 1
            steps += 1

            # If episode ends, reset environment
            if done:
                done = False
                s = self.env.reset()
        return 0

    # do batch of training using replay buffer
    def train_batch(self, n_samples, batch_size):

        # Sample a minibatch from replay buffer
        data_samples, idxs, ratios, buffer_total = self.replay_buffer.sample(n_samples)

        # Extract rewards, states, next states, actions from samples
        rewards = extract_transition_components(data_samples, TransitionComponent.reward)
        next_states = extract_transition_components(data_samples, TransitionComponent.next_state)
        next_state_features = self.construct_features(next_states)
        states = extract_transition_components(data_samples, TransitionComponent.state)
        state_features = self.construct_features(states)
        actions = extract_transition_components(data_samples, TransitionComponent.action)

        # Calculate Target policy actions
        target_policy_actions = np.array([self.generate_action(state, True) for state in states])

        # Calculate state values for TD error
        next_values_sa = self.model.predict([next_state_features, np.zeros(next_state_features.shape[0])])
        next_values = np.choose(target_policy_actions,next_values_sa.T)

        # v(s') is zero for terminal state, so need to fix model prediction
        for i in range(n_samples):
            # if experience ends in terminal state, value function returns 0
            if next_states[i] == -1 or next_states[i] == 10: #TODO this only works for randomwalk of size 10
                next_values[i] = 0.0

        # Compute targets by bootstrap estimates
        targets = (rewards + self.discount*next_values)

        # Compute error for updating priorities
        pred_values = self.model.predict([state_features, np.zeros(state_features.shape[0])])
        final_targets = np.copy(pred_values)
        np.put_along_axis(final_targets, np.expand_dims(actions,axis = 1),targets[:,np.newaxis],axis = 1)
        pred = np.choose(actions, pred_values.T)
        error = np.abs(pred - targets)

        # Priority update
        for i in range(batch_size):
            self.replay_buffer.update(idxs[i], error[i])

        # train on samples
        self.model.fit([state_features, ratios], final_targets, batch_size=batch_size, verbose=0)
Example #4
class MAD4PG:
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e6),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 update_every=3,
                 num_mc_steps=5,
                 num_agents=2):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = tau
        self.UPDATE_EVERY = update_every
        self.num_mc_steps = num_mc_steps
        self.experiences = [
            ExperienceQueue(num_mc_steps) for _ in range(num_agents)
        ]
        self.memory = Memory(buffer_size)
        self.t_step = 0
        self.train_start = batch_size
        self.mad4pg_agent = [
            D4PG(state_size,
                 action_size,
                 seed,
                 device,
                 num_atoms=N_ATOMS,
                 q_min=Vmin,
                 q_max=Vmax),
            D4PG(state_size,
                 action_size,
                 seed,
                 device,
                 num_atoms=N_ATOMS,
                 q_min=Vmin,
                 q_max=Vmax)
        ]

    def acts(self, states, add_noise=0.0):
        acts = []
        for s, a in zip(states, self.mad4pg_agent):
            acts.append(a.act(np.expand_dims(s, 0), add_noise))
        return np.vstack(acts)

    # borrow from https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/tree/master/Chapter14
    def distr_projection(self, next_distr_v, rewards_v, dones_mask_t, gamma):
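        # Categorical (C51-style) projection: the target distribution lives on atoms
        # z_j = Vmin + j*DELTA_Z; after the Bellman update each atom moves to
        # Tz_j = r + gamma*z_j (clipped to [Vmin, Vmax]), which generally falls between
        # two neighbouring atoms l = floor(b_j) and u = ceil(b_j).  Its probability mass
        # is split between those neighbours in proportion to distance, and terminal
        # transitions collapse the whole distribution onto the atom nearest the reward.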
        next_distr = next_distr_v.data.cpu().numpy()
        rewards = rewards_v.data.cpu().numpy()
        dones_mask = dones_mask_t.cpu().numpy().astype(bool)  # np.bool is removed in recent NumPy
        batch_size = len(rewards)
        proj_distr = np.zeros((batch_size, N_ATOMS), dtype=np.float32)
        dones_mask = np.squeeze(dones_mask)
        rewards = rewards.reshape(-1)

        for atom in range(N_ATOMS):
            tz_j = np.minimum(
                Vmax,
                np.maximum(Vmin, rewards + (Vmin + atom * DELTA_Z) * gamma))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            eq_mask = u == l

            proj_distr[eq_mask, l[eq_mask]] += next_distr[eq_mask, atom]
            ne_mask = u != l

            proj_distr[ne_mask,
                       l[ne_mask]] += next_distr[ne_mask,
                                                 atom] * (u - b_j)[ne_mask]
            proj_distr[ne_mask,
                       u[ne_mask]] += next_distr[ne_mask,
                                                 atom] * (b_j - l)[ne_mask]

        if dones_mask.any():
            proj_distr[dones_mask] = 0.0
            tz_j = np.minimum(Vmax, np.maximum(Vmin, rewards[dones_mask]))
            b_j = (tz_j - Vmin) / DELTA_Z
            l = np.floor(b_j).astype(np.int64)
            u = np.ceil(b_j).astype(np.int64)
            eq_mask = u == l
            if dones_mask.shape == ():
                if dones_mask:
                    proj_distr[0, l] = 1.0
                else:
                    ne_mask = u != l
                    proj_distr[0, l] = (u - b_j)[ne_mask]
                    proj_distr[0, u] = (b_j - l)[ne_mask]
            else:
                eq_dones = dones_mask.copy()

                eq_dones[dones_mask] = eq_mask
                if eq_dones.any():
                    proj_distr[eq_dones, l[eq_mask]] = 1.0
                ne_mask = u != l
                ne_dones = dones_mask.copy()
                ne_dones[dones_mask] = ne_mask
                if ne_dones.any():
                    proj_distr[ne_dones, l[ne_mask]] = (u - b_j)[ne_mask]
                    proj_distr[ne_dones, u[ne_mask]] = (b_j - l)[ne_mask]

        return torch.FloatTensor(proj_distr).to(device)

    def step(self, states, actions, rewards, next_states, dones):

        for agent_index in range(len(self.mad4pg_agent)):
            agent_experiences = self.experiences[agent_index]
            agent_experiences.states.appendleft(states[agent_index])
            agent_experiences.rewards.appendleft(rewards[agent_index] *
                                                 self.GAMMA**self.num_mc_steps)
            agent_experiences.actions.appendleft(actions[agent_index])
            if len(agent_experiences.rewards) == self.num_mc_steps or dones[
                    agent_index]:  # N-steps return: r= r1+gamma*r2+..+gamma^(t-1)*rt
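                # Rewards enter the deque pre-scaled by GAMMA**num_mc_steps and are
                # divided by GAMMA once each time this block runs, so once the deque is
                # full the oldest entry carries GAMMA**0, the next GAMMA**1, and so on;
                # summing the deque then gives the N-step return for the oldest stored
                # state/action pair (states[-1], actions[-1]).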
                done_tensor = torch.tensor(
                    dones[agent_index]).float().to(device)
                condition = True
                while condition:
                    for i in range(len(agent_experiences.rewards)):
                        agent_experiences.rewards[i] /= self.GAMMA
                    state = torch.tensor(
                        agent_experiences.states[-1]).float().unsqueeze(0).to(
                            device)
                    next_state = torch.tensor(
                        next_states[agent_index]).float().unsqueeze(0).to(
                            device)
                    action = torch.tensor(
                        agent_experiences.actions[-1]).float().unsqueeze(0).to(
                            device)
                    sum_reward = torch.tensor(sum(
                        agent_experiences.rewards)).float().unsqueeze(0).to(
                            device)
                    with evaluating(
                            self.mad4pg_agent[agent_index]) as cur_agent:
                        q_logits_expected = cur_agent.critic_local(
                            state, action)
                        action_next = cur_agent.actor_target(next_state)
                        q_target_logits_next = cur_agent.critic_target(
                            next_state, action_next)
                        q_target_distr_next = F.softmax(q_target_logits_next,
                                                        dim=1)
                    q_target_distr_next_projected = self.distr_projection(
                        q_target_distr_next, sum_reward, done_tensor,
                        self.GAMMA**self.num_mc_steps)
                    cross_entropy = -F.log_softmax(
                        q_logits_expected,
                        dim=1) * q_target_distr_next_projected
                    error = cross_entropy.sum(dim=1).mean().cpu().data
                    self.memory.add(
                        error,
                        (states[agent_index], actions[agent_index], sum_reward,
                         next_states[agent_index], dones[agent_index]))
                    agent_experiences.states.pop()
                    agent_experiences.rewards.pop()
                    agent_experiences.actions.pop()
                    # NOTE: the leading False short-circuits this expression, so the
                    # loop body runs exactly once per call and any remaining partial
                    # n-step returns are discarded (the queues are cleared below).
                    condition = False and dones[agent_index] and len(
                        agent_experiences.states) > 0
            if dones[agent_index]:
                agent_experiences.states.clear()
                agent_experiences.rewards.clear()
                agent_experiences.actions.clear()

        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            # print(self.memory.tree.n_entries)
            if self.memory.tree.n_entries > self.train_start:
                for agent_index in range(len(self.mad4pg_agent)):
                    sampled_experiences, idxs = self.sample()
                    self.learn(self.mad4pg_agent[agent_index],
                               sampled_experiences, idxs)

    def sample(self):
        # prioritized experience replay
        mini_batch, idxs, is_weights = self.memory.sample(self.BATCH_SIZE)
        mini_batch = np.array(mini_batch).transpose()
        statess = np.vstack([m for m in mini_batch[0] if m is not None])
        actionss = np.vstack([m for m in mini_batch[1] if m is not None])
        rewardss = np.vstack([m for m in mini_batch[2] if m is not None])
        next_statess = np.vstack([m for m in mini_batch[3] if m is not None])
        doness = np.vstack([m for m in mini_batch[4] if m is not None])
        # bool to binary
        doness = doness.astype(int)
        statess = torch.from_numpy(statess).float().to(device)
        actionss = torch.from_numpy(actionss).float().to(device)
        rewardss = torch.from_numpy(rewardss).float().to(device)
        next_statess = torch.from_numpy(next_statess).float().to(device)
        doness = torch.from_numpy(doness).float().to(device)
        return (statess, actionss, rewardss, next_statess, doness), idxs

    def learn(self, agent, experiences, idxs):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        # Compute critic loss
        q_logits_expected = agent.critic_local(states, actions)
        actions_next = agent.actor_target(next_states)
        q_targets_logits_next = agent.critic_target(next_states, actions_next)
        q_targets_distr_next = F.softmax(q_targets_logits_next, dim=1)
        q_targets_distr_projected_next = self.distr_projection(
            q_targets_distr_next, rewards, dones,
            self.GAMMA**self.num_mc_steps)
        cross_entropy = -F.log_softmax(q_logits_expected,
                                       dim=1) * q_targets_distr_projected_next
        critic_loss = cross_entropy.sum(dim=1).mean()
        with torch.no_grad():
            errors = cross_entropy.sum(dim=1).cpu().data.numpy()
        # update priority
        for i in range(self.BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        # Minimize the loss
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        agent.critic_optimizer.step()

        # Compute actor loss
        actions_pred = agent.actor_local(states)
        crt_distr_v = agent.critic_local(states, actions_pred)
        actor_loss = -agent.critic_local.distr_to_q(crt_distr_v)
        actor_loss = actor_loss.mean()
        # Minimize the loss
        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        agent.soft_update(agent.critic_local, agent.critic_target, self.TAU)
        agent.soft_update(agent.actor_local, agent.actor_target, self.TAU)
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 random_seed,
                 buffer_size=BUFFER_SIZE,
                 batch_size=BATCH_SIZE):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.buffer_size = buffer_size
        self.memory = Memory(
            capacity=self.buffer_size)  # internal memory using SumTree
        self.batch_size = batch_size

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self,
             state,
             action,
             reward,
             next_state,
             done,
             batch_size=BATCH_SIZE,
             update_every=UPDATE_EVERY):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward

        self.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            if self.memory.tree.n_entries >= batch_size:
                experiences, idxs, is_weights = self.sample()
                self.learn(experiences, idxs, is_weights)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            #action = [act + self.noise.sample() for act in action]
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self,
              experiences,
              idxs,
              is_weights,
              batch_size=BATCH_SIZE,
              gamma=GAMMA):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        #Loss calculation
        critic_loss = (torch.from_numpy(is_weights).float().to(device) *
                       F.mse_loss(Q_expected, Q_targets)).mean()

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()

        #Introducing gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)

        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        #.......................update priorities in prioritized replay buffer.......#
        #Calculate errors used in prioritized replay buffer
        errors = (Q_expected - Q_targets).squeeze().cpu().data.numpy()

        # update priority
        for i in range(batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def add(self, state, action, reward, next_state, done, gamma=GAMMA):
        """Add a new experience to memory."""

        next_state_torch = torch.from_numpy(next_state).float().to(device)
        reward_torch = torch.unsqueeze(
            torch.from_numpy(np.array(reward)).float().to(device), 1)
        done_torch = torch.unsqueeze(
            torch.from_numpy(np.array(done).astype(
                np.uint8)).float().to(device), 1)
        state_torch = torch.from_numpy(state).float().to(device)
        action_torch = torch.from_numpy(action).float().to(device)

        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()
        with torch.no_grad():
            action_next = self.actor_target(next_state_torch)
            Q_target_next = self.critic_target(next_state_torch, action_next)
            Q_target = reward_torch + (gamma * Q_target_next *
                                       (1 - done_torch))
            Q_expected = self.critic_local(state_torch, action_torch)
        self.actor_target.train()  # restore train mode on the modules set to eval above
        self.critic_target.train()
        self.critic_local.train()

        #Error used in prioritized replay buffer
        error = (Q_expected - Q_target).squeeze().cpu().data.numpy()

        #Adding experiences to prioritized replay buffer
        for i in np.arange(len(reward)):
            self.memory.add(
                error[i],
                (state[i], action[i], reward[i], next_state[i], done[i]))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences, idxs, is_weights = self.memory.sample(self.batch_size)

        states = np.vstack([e[0] for e in experiences])
        states = torch.from_numpy(states).float().to(device)

        actions = np.vstack([e[1] for e in experiences])
        actions = torch.from_numpy(actions).float().to(device)

        rewards = np.vstack([e[2] for e in experiences])
        rewards = torch.from_numpy(rewards).float().to(device)

        next_states = np.vstack([e[3] for e in experiences])
        next_states = torch.from_numpy(next_states).float().to(device)

        dones = np.vstack([e[4] for e in experiences]).astype(np.uint8)
        dones = torch.from_numpy(dones).float().to(device)

        return (states, actions, rewards, next_states, dones), idxs, is_weights
Example #6
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = Memory(BUFFER_SIZE)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
    
    def step(self, state, action, reward, next_state, done):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        
        self.qnetwork_local.eval()
        self.qnetwork_target.eval()
        with torch.no_grad():
            target_action_values = self.qnetwork_target(next_state)
            expected_action_values = self.qnetwork_local(state)
        
        self.qnetwork_local.train()
        self.qnetwork_target.train()
        
        old_val = expected_action_values[0][action]
        new_val = reward
        if not done:
            new_val += GAMMA * torch.max(target_action_values)
        error = abs(old_val - new_val)
        
        # Save experience in replay memory
        self.memory.add(error, (state, action, reward, next_state, done))
        
        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.tree.n_entries > BATCH_SIZE:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy()).astype(int)
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        mini_batches, idxs, is_weights = experiences

        states = torch.from_numpy(np.vstack([mini_batch[0] for mini_batch in mini_batches])).float().to(device)
        actions = torch.from_numpy(np.vstack([mini_batch[1] for mini_batch in mini_batches])).long().to(device)
        rewards = torch.from_numpy(np.vstack([mini_batch[2] for mini_batch in mini_batches])).float().to(device)
        next_states = torch.from_numpy(np.vstack([mini_batch[3] for mini_batch in mini_batches])).float().to(device)
        dones = torch.from_numpy(np.vstack([int(mini_batch[4]) for mini_batch in mini_batches])).float().to(device)

        ## TODO: compute and minimize the loss
        "*** YOUR CODE HERE ***"
        Q_source_next = self.qnetwork_local(next_states).detach().max(1)[1].unsqueeze(1)
        
        Q_target = self.qnetwork_target(next_states)
        
        Q_double_target = torch.tensor([Q_target[i][max_index] for i, max_index in enumerate(Q_source_next)]).detach().unsqueeze(1)
        
        Q_observed = rewards + (gamma * Q_double_target * (1 - dones))
        
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        
        errors = torch.abs(Q_expected - Q_observed).detach().cpu().numpy()
        
        # update priority
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, errors[i])
        
        loss = (torch.FloatTensor(is_weights).to(device) * F.mse_loss(Q_expected, Q_observed)).mean()
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
Example #7
class DQNAgent():
    def __init__(self, state_size, action_size):
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.lr_step_size = 10
        self.lr_gamma = 0.9
        self.memory_size = 2**15
        self.epsilon = 1.0
        self.epsilon_min = 0.05
        self.explore_step = 1000
        self.epsilon_decay = 0.99995
        self.batch_size = 64
        self.train_start = 10000

        # create prioritized replay memory using SumTree
        self.memory = Memory(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)
        self.scheduler = StepLR(self.optimizer,
                                step_size=self.lr_step_size,
                                gamma=self.lr_gamma)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/per_dqn')
        self.model.train()

    # weight xavier initialize
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state).float()
            q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # save sample (error,<s,a,r,s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        target = self.model(torch.tensor(state).float()).data
        old_val = target[0][action]
        target_val = self.target_model(torch.tensor(next_state).float()).data
        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + \
                self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])

        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon, self.epsilon_min)

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4]

        # bool to binary
        dones = dones.astype(int)

        # Q function of current state
        states = torch.tensor(states).float()
        pred = self.model(states)

        # one-hot encoding
        a = torch.tensor(actions, dtype=torch.long).view(-1, 1)

        one_hot_action = torch.zeros(self.batch_size, self.action_size)
        one_hot_action.scatter_(1, a, 1)

        pred = torch.sum(pred.mul(one_hot_action), dim=1)

        # Q function of next state
        next_states = torch.tensor(next_states, dtype=torch.float)
        next_pred = self.target_model(next_states.float()).data

        rewards = torch.tensor(rewards, dtype=torch.float)
        dones = torch.tensor(dones, dtype=torch.float)

        # Q Learning: get maximum Q value at s' from target model
        target = rewards + (1 - dones) * \
            self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).data.numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        self.optimizer.zero_grad()

        # MSE Loss function
        loss = (torch.tensor(is_weights).float() *
                F.mse_loss(pred, target)).mean()
        loss.backward()

        # and train
        self.optimizer.step()
        return loss.item()
Example #8
class IRAgent_FourRooms(OffPolicyAgent_FourRooms):
    # construct agent's model separately, so it can be sized according to problem
    def __init__(self,
                 n_replay,
                 env,
                 target_policy,
                 behavior_policy,
                 lr,
                 discount,
                 type='BC'):
        super().__init__(n_replay, env, target_policy, behavior_policy, lr,
                         discount, type)

    # reseed numpy, reset weights of network
    # Reset must be performed before every episode
    def reset(self, seed=0):
        # Reset time
        self.t = 0

        # Set seed value
        np.random.seed(seed)

        # Reset replay buffer
        self.replay_buffer = Memory(self.n_replay)

        # Rebuild model
        self.build_model(self.env.size[0] * self.env.size[1], 1)

    # instead of generating one episode of experience, take 16 steps of experience
    def generate_experience(self, k=16):

        # Initialize environment
        s = self.env.reset()
        done = False
        steps = 0  # counting to k steps

        while steps < k:

            # choose action according to policy
            a = np.random.choice(a=self.actions,
                                 p=self.behavior_policy[s[0], s[1]])

            # Take a step in environment based on chosen action
            (s2, r, done, _) = self.env.step(a)

            # Compute importance ratios
            ratio = self.target_policy[s[0], s[1],
                                       a] / self.behavior_policy[s[0], s[1], a]

            # Add experience to IR replay buffer
            self.replay_buffer.add(ratio, (s, a, r, s2))

            # Set for next step
            s = s2
            self.t += 1
            steps += 1

            # If episode ends, reset environment
            if done:
                s = self.env.reset()
                done = False

    # do batch of training using replay buffer
    def train_batch(self, batch_size):

        # Sample a minibatch from replay buffer
        data_samples, _, _, buffer_total = self.replay_buffer.sample(
            batch_size)

        # Extract rewards, states, next states from samples
        rewards = extract_transition_components(data_samples,
                                                TransitionComponent.reward)
        next_states = extract_transition_components(
            data_samples, TransitionComponent.next_state)
        next_state_features = self.construct_features(next_states)
        states = extract_transition_components(data_samples,
                                               TransitionComponent.state)
        state_features = self.construct_features(states)

        # Importance ratios for update equation - IR does not use this
        ratios = np.ones(len(states))

        # In case of Bias Correction, pre-multiply bias corrector to update
        if self.name == "BC":
            ratios = ratios * (buffer_total /
                               self.replay_buffer.tree.n_entries)

        # Get value estimate for next state
        next_values = self.model.predict(
            [next_state_features,
             np.zeros(next_state_features.shape[0])]).flatten()

        # v(s') is zero for terminal state, so need to fix model prediction
        for i in range(batch_size):
            # if experience ends in terminal state then s==s2
            if (states[i] == next_states[i]).all():
                next_values[i] = 0.0

        # Compute targets by bootstrap estimates
        targets = (rewards + self.discount * next_values)

        # Train on samples
        self.model.fit([state_features, ratios],
                       targets,
                       batch_size=batch_size,
                       verbose=0)
Example #9
class Dqn():
    def __init__(self):
        self.eval_net, self.target_net = Net(), Net()
        self.eval_net.cuda()
        self.target_net.cuda()

        # create prioritized replay memory using SumTree
        self.memory = Memory(Train_Configs.MEMORY_CAPACITY)
        self.learn_counter = 0
        self.optimizer = optim.Adam(self.eval_net.parameters(), lr=Train_Configs.LR,betas=(0.9, 0.99), eps=1e-08, weight_decay=2e-5)
        self.loss = nn.MSELoss(reduction='none')  # element-wise loss, weighted per sample by IS weights in learn()

        self.fig, self.ax = plt.subplots()
        self.discount_factor = Train_Configs.GAMMA

    def store_trans(self, state_path, action, reward, next_state_path,done):
        ## action type: id
        x, y, c = my_utils.translate_actionID_to_XY_and_channel(action)
        trans = state_path+'#'+str(action)+'#'+str(reward)+'#'+next_state_path#np.hstack((state, [action], [reward], next_state))
        #------ calculate TD errors from (s,a,r,s'), #--only from the first depth image, without considering other 9 rotated depth images
        state_d = state_path
        next_state_d = next_state_path
        if c > 0:
            state_d = my_utils.get_rotate_depth(c,state_d)
            next_state_d = my_utils.get_rotate_depth(c, next_state_d)
        state_depth = my_utils.copy_depth_to_3_channel(state_d).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])
        next_state_depth = my_utils.copy_depth_to_3_channel(next_state_d).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])

        if c == 0:
            state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(state_path.replace('npy','png').replace('state_depth','state_image'))).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])
            next_state_rgb = my_utils.trans_HWC_to_CHW(cv2.imread(next_state_path.replace('npy','png').replace('state_depth', 'state_image'))).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])
        else:
            state_rgb = my_utils.get_rotate_rgb(c,state_path.replace('npy','png').replace('state_depth','state_image')).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])
            next_state_rgb = my_utils.get_rotate_rgb(c,next_state_path.replace('npy','png').replace('state_depth','state_image')).reshape(1, 3, DIM_STATES[0], DIM_STATES[1])

        # # normlize
        # state_depth = (state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR)
        # next_state_depth = (next_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR)
        # numpy to tensor
        state_depth = torch.cuda.FloatTensor(state_depth)
        next_state_depth = torch.cuda.FloatTensor(next_state_depth)
        state_rgb = torch.cuda.FloatTensor(state_rgb)
        next_state_rgb = torch.cuda.FloatTensor(next_state_rgb)

        target_singleChannel_q_map = self.eval_net.forward(state_rgb,state_depth)#dim:[1,1,224,224],CHANNEL=1
        # x,y,c = my_utils.translate_actionID_to_XY_and_channel(action)
        old_val = target_singleChannel_q_map[0][0][x][y]
        # old_val = target[0][action]
        target_val_singleChannel_q_map = self.target_net.forward(next_state_rgb,next_state_depth)#dim:[1,1,224,224]

        if done == 1:
            target_q = reward # target[0][action] = reward
        else:
            target_q = reward + self.discount_factor * torch.max(target_val_singleChannel_q_map) # target[0][action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target_q)
        self.memory.add(float(error), trans)

    def choose_action(self, state_path,EPSILON):
        state_rgb = []
        state_depth = []
        state_rgb.append(my_utils.trans_HWC_to_CHW(cv2.imread(state_path.replace('npy','png').replace('state_depth','state_image'))))
        state_depth.append(my_utils.copy_depth_to_3_channel(state_path))#dim:[3, DIM_STATES[0], DIM_STATES[1]]#.reshape(1, 3, DIM_STATES[0], DIM_STATES[1]))
        for i in range(1,Train_Configs.ROTATION_BINS):
            state_rotate_rgb = my_utils.get_rotate_rgb(i,state_path.replace('npy','png').replace('state_depth','state_image'))
            state_rgb.append(state_rotate_rgb)
            #------------------------
            state_rotate_depth = my_utils.get_rotate_depth(i,state_path)
            state_rotate_3_depth = my_utils.copy_depth_to_3_channel(state_rotate_depth)
            state_depth.append(state_rotate_3_depth)

        state_rgb = np.array(state_rgb)
        state_depth = np.array(state_depth)
        # # normlize
        # state_depth = (state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR)
        # numpy to tensor
        state_rgb = torch.cuda.FloatTensor(state_rgb)  # dim:[INPUT_IMAGE,3,224,224]
        state_depth = torch.cuda.FloatTensor(state_depth) #dim:[INPUT_IMAGE,3,224,224]

        # random exploration
        prob = np.min((EPSILON,1))
        p_select = np.array([prob, 1 - prob])
        selected_ac_type = np.random.choice([0, 1], p=p_select.ravel())

        if selected_ac_type == 0:#origin predicted action
            target_multiChannel_q_map = self.eval_net.forward(state_rgb,state_depth)  # dim:[INPUT_IMAGES,1,224,224]
            action = my_utils.find_maxQ_in_qmap(target_multiChannel_q_map.cpu().detach().numpy())
            ac_ty = '0'
        else:
            if np.random.randn() <= 0.5:#sample action according to depth image
                action = my_utils.select_randpID_from_mask(state_path)
                ac_ty = '1'
            else:# random sample
                action = np.random.randint(0,DIM_ACTIONS)
                ac_ty = '2'

        return ac_ty,action # the id of action

    def plot(self, ax, x):
        ax.cla()
        ax.set_xlabel("episode")
        ax.set_ylabel("total reward")
        ax.plot(x, 'b-')
        plt.pause(0.000000000000001)

    def load_batch_data(self,batch_list):#batch_list.dim:[batch_size]
        # print(batch_list)
        batch_state_rgb = []
        batch_state_depth = []
        batch_action = []
        batch_reward = []
        batch_next_state_rgb = []
        batch_next_state_depth = []

        for item in batch_list:
            data = item.split('#')#state+'#'+str(action)+'#'+str(reward)+'#'+next_state
            action_id = int(data[1])
            batch_state_rgb.append(my_utils.get_rotate_rgb(action_id,data[0].replace('npy','png').replace('state_depth','state_image')))
            batch_state_depth.append(my_utils.copy_depth_to_3_channel(my_utils.get_rotate_depth(action_id,data[0])).reshape((3,DIM_STATES[0],DIM_STATES[1])))
            batch_action.append([int(data[1])])
            batch_reward.append([float(data[2])])
            batch_next_state_rgb.append(my_utils.get_rotate_rgb(action_id, data[3].replace('npy','png').replace('state_depth', 'state_image')))
            batch_next_state_depth.append(my_utils.copy_depth_to_3_channel(my_utils.get_rotate_depth(action_id,data[3])).reshape((3,DIM_STATES[0],DIM_STATES[1])))

        batch_state_depth = np.array(batch_state_depth)
        batch_next_state_depth = np.array(batch_next_state_depth)
        # # normlize
        # batch_state_depth = (batch_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR)
        # batch_next_state_depth = (batch_next_state_depth - Train_Configs.MIN_DEPTH_ARR) / (Train_Configs.MAX_DEPTH_ARR - Train_Configs.MIN_DEPTH_ARR)

        return torch.cuda.FloatTensor(batch_state_rgb),torch.cuda.FloatTensor(batch_state_depth),torch.cuda.LongTensor(batch_action),torch.cuda.FloatTensor(batch_reward),torch.cuda.FloatTensor(batch_next_state_rgb),torch.cuda.FloatTensor(batch_next_state_depth)

    def learn(self):
        # learn 100 times then the target network update
        if self.learn_counter % Train_Configs.Q_NETWORK_ITERATION ==0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_counter+=1

        mini_batch, idxs, is_weights = self.memory.sample(Train_Configs.BATCH_SIZE)#
        batch_state_rgb,batch_state_depth,batch_action,batch_reward,batch_next_state_rgb,batch_next_state_depth = self.load_batch_data(mini_batch)#dim:[1]

        eval_singleChannel_q_map = self.eval_net(batch_state_rgb,batch_state_depth)  # dim:[BATCH_SIZE,1,224,224]
        x_y_c_list = my_utils.translate_actionID_to_XY_and_channel_batch(batch_action)
        # old_val = target_multiChannel_q_map[0][c][x][y]
        batch_q = []
        for i in range(len(x_y_c_list)):
            xyc = x_y_c_list[i]
            batch_q.append(eval_singleChannel_q_map[i][0][xyc[0]][xyc[1]])
        # stack the selected Q-values without leaving the autograd graph; rebuilding a
        # fresh tensor here (e.g. torch.cuda.FloatTensor(batch_q)) would detach q_eval
        # from eval_net and no gradient would ever reach the network
        q_eval = torch.stack(batch_q).unsqueeze(1)  # equivalent of self.eval_net(batch_state).gather(1, batch_action)
        target_singleChannel_q_map = self.target_net(batch_next_state_rgb,batch_next_state_depth).cpu().detach().numpy()#q_next,dim:[BATCH_SIZE,1,224,224]
        batch_q_next = []
        for b_item in target_singleChannel_q_map:#dim:[1,224,224]
            batch_q_next.append([np.max(b_item)])
        q_next = torch.cuda.FloatTensor(batch_q_next)
        # q_next = Variable(q_next.cuda(), requires_grad=True)

        q_target = batch_reward + Train_Configs.GAMMA*q_next  # targets are constants, no gradient flows through them
        # self.average_q = q_eval.mean()
        weight_tensor = torch.cuda.FloatTensor(is_weights).reshape(
            (Train_Configs.BATCH_SIZE, 1))  # IS weights are constants as well

        loss = (weight_tensor * self.loss(q_eval, q_target)).mean()##(torch.FloatTensor(is_weights) * F.mse_loss(pred, target)).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return float(loss),float(q_eval.mean())
Example #10
class Agent:
    """
    Interacts with and learns from the environment.
    Learns using a Deep Q-Network with prioritised experience replay.
    Two models are instantiated, one for use during evaluation and updating (qnetwork_local) and one to be used for the
    target values in the learning algorithm (qnetwork_target)
    """

    BUFFER_SIZE = int(1e5)  # prioritised experience replay buffer size
    BATCH_SIZE = 64  # minibatch size
    TAU = 1e-3  # for soft update of target parameters
    LR = 5e-4  # learning rate
    UPDATE_EVERY = 4  # how often to update the network
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def __init__(self,
                 state_size: int = 37,
                 action_size: int = 4,
                 seed: int = 44,
                 gamma: float = 0.99,
                 tau: float = 1e-3):
        """
        Initialize an Agent object.

        :param state_size: dimension of each state
        :param action_size: dimension of each action
        :param seed: random seed for network initialisation
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.gamma = gamma
        self.tau = tau

        self.max_w = 0

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.LR)

        # Prioritised Experience Replay memory
        self.memory = Memory(self.BUFFER_SIZE)

        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self,
             state: np.ndarray,
             action: int,
             reward: float,
             next_state: np.ndarray,
             done: bool,
             gamma: Optional[float] = None,
             tau: Optional[float] = None):
        """
        An agent step takes the current experience and stores it in the replay memory, then samples from the memory and
        calls the learning algorithm.

        :param state: the state vector
        :param action: the action performed on the state
        :param reward: the reward given upon performing the action
        :param next_state: the next state after doing the action
        :param done: True if the episode has ended
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        gamma_value = gamma if gamma is not None else self.gamma
        tau_value = tau if tau is not None else self.tau

        self.memory.add((state, action, reward, next_state,
                         done))  # Save experience in replay memory

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.memory.tree.n_entries > self.BATCH_SIZE:
                experiences, idxs, importance_weights = self.memory.sample(
                    self.BATCH_SIZE)
                self.learn(experiences, idxs, importance_weights, gamma_value,
                           tau_value)

    def act(self, state: np.ndarray, eps: float = 0.0):
        """
        Returns actions for given state as per current policy. Uses the local copy of the model.

        :param state: current state
        :param eps: epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.int32(np.argmax(action_values.cpu().data.numpy()))
        else:
            return np.int32(random.choice(np.arange(self.action_size)))

    def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor,
                                       torch.Tensor, torch.Tensor,
                                       torch.Tensor], indices: np.ndarray,
              importance_weights: torch.Tensor, gamma: float, tau: float):
        """
        Update value parameters using given batch of experience tuples.

        :param experiences: tuple of (s, a, r, s', done) tuples
        :param indices:
            indices of the SumTree that contain the priority values for these experiences. Used for updating the
            priority values after error has been found
        :param importance_weights: the weighting that each experience carries when used in updating the network
        :param gamma: discount factor
        :param tau: lag for soft update of target network parameters
        """
        states, actions, rewards, next_states, dones = experiences

        # For Double-DQN, get action with the highest q-value (for next_states) from the local model
        next_action = self.qnetwork_local(next_states).detach().max(
            1)[1].unsqueeze(1)
        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).gather(
            1, next_action)
        # Compute Q targets for current states
        q_targets = rewards + (gamma * q_targets_next * (1 - dones))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions)

        error = torch.abs(q_targets - q_expected).detach().cpu().numpy()

        # update priorities
        self.memory.batch_update(indices, error)

        # Compute element-wise TD errors and weight them by the importance-sampling weights
        t_mse = F.mse_loss(q_expected, q_targets, reduction='none')
        loss = (importance_weights.reshape(-1, 1) * t_mse).mean()
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # update target network with model parameters approaching those of the local network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, tau)

    @staticmethod
    def soft_update(local_model: torch.nn.Module,
                    target_model: torch.nn.Module, tau: float):
        """
        Soft update model parameters. Every learning step the target network is updated to bring its parameters nearer
        by a factor TAU to those of the improving local network.

        If TAU = 1 the target network becomes a copy of the local network.
        If TAU = 0 the target network is not updated.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        :param local_model: weights will be copied from
        :param target_model: weights will be copied to
        :param tau: interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
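
# A hedged usage sketch (not from the original code): demonstrates the effect of
# Agent.soft_update on two small throw-away networks. With tau = 0.1, every target
# parameter moves 10% of the way towards the corresponding local parameter.
import torch
import torch.nn as nn

def _soft_update_demo():
    local_net, target_net = nn.Linear(4, 2), nn.Linear(4, 2)
    before = target_net.weight.detach().clone()
    Agent.soft_update(local_net, target_net, tau=0.1)
    after = target_net.weight.detach()
    # theta_target <- tau * theta_local + (1 - tau) * theta_target
    assert torch.allclose(after, 0.1 * local_net.weight.detach() + 0.9 * before)
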
class DDQN_Agent:
    def __init__(self, useDepth=False):
        self.useDepth = useDepth
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 30000
        self.gamma = 0.8
        self.learning_rate = 0.001
        self.batch_size = 512
        self.memory = Memory(10000)
        self.max_episodes = 10000
        self.save_interval = 2
        self.test_interval = 10
        self.network_update_interval = 10
        self.episode = -1
        self.steps_done = 0
        self.max_steps = 34

        self.policy = DQN()
        self.target = DQN()
        self.test_network = DQN()
        self.target.eval()
        self.test_network.eval()
        self.updateNetworks()

        self.env = DroneEnv(useDepth)
        self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate)

        if torch.cuda.is_available():
            print('Using device:', device)
            print(torch.cuda.get_device_name(0))
        else:
            print("Using CPU")

        # LOGGING
        cwd = os.getcwd()
        self.save_dir = os.path.join(cwd, "saved models")
        if not os.path.exists(self.save_dir):
            os.mkdir("saved models")
        if not os.path.exists(os.path.join(cwd, "videos")):
            os.mkdir("videos")

        if torch.cuda.is_available():
            self.policy = self.policy.to(device)  # to use GPU
            self.target = self.target.to(device)  # to use GPU
            self.test_network = self.test_network.to(device)  # to use GPU

        # model backup
        files = glob.glob(os.path.join(self.save_dir, '*.pt'))
        if len(files) > 0:
            files.sort(key=os.path.getmtime)
            file = files[-1]
            checkpoint = torch.load(file)
            self.policy.load_state_dict(checkpoint['state_dict'])
            self.episode = checkpoint['episode']
            self.steps_done = checkpoint['steps_done']
            self.updateNetworks()
            print("Saved parameters loaded"
                  "\nModel: ", file,
                  "\nSteps done: ", self.steps_done,
                  "\nEpisode: ", self.episode)


        else:
            if os.path.exists("log.txt"):
                open('log.txt', 'w').close()
            if os.path.exists("last_episode.txt"):
                open('last_episode.txt', 'w').close()
            if os.path.exists("last_episode.txt"):
                open('saved_model_params.txt', 'w').close()

        self.optimizer = optim.Adam(self.policy.parameters(), self.learning_rate)
        obs, _ = self.env.reset()
        tensor = self.transformToTensor(obs)
        writer.add_graph(self.policy, tensor)

    def updateNetworks(self):
        self.target.load_state_dict(self.policy.state_dict())

    def transformToTensor(self, img):
        tensor = torch.FloatTensor(img).to(device)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.unsqueeze(0)
        tensor = tensor.float()
        return tensor

    def convert_size(self, size_bytes):
        if size_bytes == 0:
            return "0B"
        size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
        i = int(math.floor(math.log(size_bytes, 1024)))
        p = math.pow(1024, i)
        s = round(size_bytes / p, 2)
        return "%s %s" % (s, size_name[i])

    def act(self, state):
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -1.0 * self.steps_done / self.eps_decay
        )
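        # Worked example of the schedule above (values rounded): with eps_start=0.9,
        # eps_end=0.05 and eps_decay=30000 the threshold is ~0.90 at steps_done=0,
        # ~0.36 at steps_done=30000 (0.05 + 0.85*exp(-1)) and ~0.09 at steps_done=90000,
        # so the chance of a random action decays smoothly towards eps_end.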
        self.steps_done += 1
        if random.random() > self.eps_threshold:
            # print("greedy")
            if torch.cuda.is_available():
                action = np.argmax(self.policy(state).cpu().data.squeeze().numpy())
            else:
                action = np.argmax(self.policy(state).data.squeeze().numpy())
        else:
            action = random.randrange(0, 4)
        return int(action)

    def append_sample(self, state, action, reward, next_state):
        next_state = self.transformToTensor(next_state)

        current_q = self.policy(state).squeeze().cpu().detach().numpy()[action]
        next_q = self.target(next_state).squeeze().cpu().detach().numpy()[action]
        expected_q = reward + (self.gamma * next_q)

        error = abs(current_q - expected_q)

        self.memory.add(error, state, action, reward, next_state)

    def learn(self):
        if self.memory.tree.n_entries < self.batch_size:
            return

        states, actions, rewards, next_states, idxs, is_weights = self.memory.sample(self.batch_size)

        states = tuple(states)
        next_states = tuple(next_states)

        states = torch.cat(states)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        next_states = torch.cat(next_states)

        current_q = self.policy(states)[[range(0, self.batch_size)], [actions]]
        next_q = self.target(next_states).cpu().detach().numpy()[[range(0, self.batch_size)], [actions]]
        expected_q = torch.FloatTensor(rewards + (self.gamma * next_q)).to(device)

        errors = torch.abs(current_q.squeeze() - expected_q.squeeze()).cpu().detach().numpy()

        # update priority
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])

        loss = F.smooth_l1_loss(current_q.squeeze(), expected_q.squeeze())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def train(self):
        print("Starting...")

        score_history = []
        reward_history = []

        if self.episode == -1:
            self.episode = 1

        for e in range(1, self.max_episodes + 1):
            start = time.time()
            state, _ = self.env.reset()
            steps = 0
            score = 0
            while True:
                state = self.transformToTensor(state)

                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)

                if steps == self.max_steps:
                    done = 1

                #self.memorize(state, action, reward, next_state)
                self.append_sample(state, action, reward, next_state)
                self.learn()

                state = next_state
                steps += 1
                score += reward
                if done:
                    print("----------------------------------------------------------------------------------------")
                    if self.memory.tree.n_entries < self.batch_size:
                        print("Training will start after ", self.batch_size - self.memory.tree.n_entries, " steps.")
                        break

                    print(
                        "episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}".format(
                            self.episode, reward, round(score / steps, 2), score, self.eps_threshold, self.steps_done))
                    score_history.append(score)
                    reward_history.append(reward)
                    with open('log.txt', 'a') as file:
                        file.write(
                            "episode:{0}, reward: {1}, mean reward: {2}, score: {3}, epsilon: {4}, total steps: {5}\n".format(
                                self.episode, reward, round(score / steps, 2), score, self.eps_threshold,
                                self.steps_done))

                    if torch.cuda.is_available():
                        print('Total Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory))
                        print('Allocated Memory:', self.convert_size(torch.cuda.memory_allocated(0)))
                        print('Cached Memory:', self.convert_size(torch.cuda.memory_reserved(0)))
                        print('Free Memory:', self.convert_size(torch.cuda.get_device_properties(0).total_memory - (
                                torch.cuda.max_memory_allocated() + torch.cuda.max_memory_reserved())))

                        # tensorboard --logdir=runs
                        memory_usage_allocated = np.float64(round(torch.cuda.memory_allocated(0) / 1024 ** 3, 1))
                        memory_usage_cached = np.float64(round(torch.cuda.memory_reserved(0) / 1024 ** 3, 1))

                        writer.add_scalar("memory_usage_allocated", memory_usage_allocated, self.episode)
                        writer.add_scalar("memory_usage_cached", memory_usage_cached, self.episode)

                    writer.add_scalar('epsilon_value', self.eps_threshold, self.episode)
                    writer.add_scalar('score_history', score, self.episode)
                    writer.add_scalar('reward_history', reward, self.episode)
                    writer.add_scalar('Total steps', self.steps_done, self.episode)
                    writer.add_scalars('General Look', {'score_history': score,
                                                        'reward_history': reward}, self.episode)

                    # save checkpoint
                    if self.episode % self.save_interval == 0:
                        checkpoint = {
                            'episode': self.episode,
                            'steps_done': self.steps_done,
                            'state_dict': self.policy.state_dict()
                        }
                        torch.save(checkpoint, os.path.join(self.save_dir, 'EPISODE{}.pt'.format(self.episode)))

                    if self.episode % self.network_update_interval == 0:
                        self.updateNetworks()

                    self.episode += 1
                    end = time.time()
                    stopWatch = end - start
                    print("Episode is done, episode time: ", stopWatch)

                    if self.episode % self.test_interval == 0:
                        self.test()

                    break
        writer.close()

    def test(self):
        self.test_network.load_state_dict(self.target.state_dict())

        start = time.time()
        steps = 0
        score = 0
        image_array = []
        state, next_state_image = self.env.reset()
        image_array.append(next_state_image)

        while True:
            state = self.transformToTensor(state)

            action = int(np.argmax(self.test_network(state).cpu().data.squeeze().numpy()))
            next_state, reward, done, next_state_image = self.env.step(action)
            image_array.append(next_state_image)

            if steps == self.max_steps:
                done = 1

            state = next_state
            steps += 1
            score += reward

            if done:
                print("----------------------------------------------------------------------------------------")
                print("TEST, reward: {}, score: {}, total steps: {}".format(
                    reward, score, self.steps_done))

                with open('tests.txt', 'a') as file:
                    file.write("TEST, reward: {}, score: {}, total steps: {}\n".format(
                        reward, score, self.steps_done))

                writer.add_scalars('Test', {'score': score, 'reward': reward}, self.episode)

                end = time.time()
                stopWatch = end - start
                print("Test is done, test time: ", stopWatch)

                # Convert images to video
                frameSize = (256, 144)
                import cv2
                video = cv2.VideoWriter("videos\\test_video_episode_{}_score_{}.avi".format(self.episode, score), cv2.VideoWriter_fourcc(*'DIVX'), 7, frameSize)

                for img in image_array:
                    video.write(img)

                video.release()

                break
class A2CAgent:
    def __init__(self,
                 replay_size,
                 memory_size=10000,
                 prioritized=False,
                 load_models=False,
                 actor_model_file='',
                 critic_model_file='',
                 is_eval=False):
        self.state_size = 2
        self.action_size = 3
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.prioritized = prioritized
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

        # Hyper parameters for learning
        self.value_size = 1
        self.layer_size = 16
        self.discount_factor = 0.99
        self.actor_learning_rate = 0.0005
        self.critic_learning_rate = 0.005
        self.is_eval = is_eval

        # Create actor and critic neural networks
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        #self.actor.summary()

        if load_models:
            if actor_model_file:
                self.actor.load_weights(actor_model_file)
            if critic_model_file:
                self.critic.load_weights(critic_model_file)

    # The actor takes a state and outputs probabilities of each possible action
    def build_actor(self):

        layer1 = Dense(self.layer_size,
                       input_dim=self.state_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size,
                       input_dim=self.layer_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        # Use softmax activation so that the sum of probabilities of the actions becomes 1
        layer3 = Dense(self.action_size,
                       activation='softmax',
                       kernel_initializer='he_uniform')  # self.action_size = 2

        actor = Sequential(layers=[layer1, layer2, layer3])

        # Print a summary of the network
        actor.summary()

        # We use categorical crossentropy loss since we have a probability distribution
        actor.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=self.actor_learning_rate))
        return actor

    # The critic takes a state and outputs the predicted value of the state
    def build_critic(self):

        layer1 = Dense(self.layer_size,
                       input_dim=self.state_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer2 = Dense(self.layer_size,
                       input_dim=self.layer_size,
                       activation='relu',
                       kernel_initializer='he_uniform')
        layer3 = Dense(self.value_size,
                       activation='linear',
                       kernel_initializer='he_uniform')  # self.value_size = 1

        critic = Sequential(layers=[layer1, layer2, layer3])

        # Print a summary of the network
        critic.summary()

        critic.compile(loss='mean_squared_error',
                       optimizer=Adam(lr=self.critic_learning_rate))
        return critic

    def act(self, state):
        # Get probabilities for each action
        policy = self.actor.predict(np.array([state]), batch_size=1).flatten()

        # Randomly choose an action
        if not self.is_eval:
            return np.random.choice(self.action_size, 1, p=policy).take(0)
        else:
            return np.argmax(policy)  # 20191117- for evaluation

    def store_transition(self, s, a, r, s_, dd):
        if self.prioritized:  # prioritized replay
            transition = np.hstack((s, [a, r], s_, dd))
            self.memory.store(
                transition)  # have high priority for newly arrived transition
        else:
            #self.replay_queue.append((s, [a, r], s_, dd))
            transition = np.hstack((s, [a, r], s_, dd))
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):
        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_prevBatch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        s_currBatch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        td_error = np.zeros((d.shape[0], ), dtype=float)
        for i in range(d.shape[0]):
            q_prev = self.critic.predict(np.array([s_prevBatch[i, :]]))
            q_curr = self.critic.predict(np.array([s_currBatch[i, :]]))
            if int(d[i]) == 1:
                q_curr = r[i]
            q_realP = r[i] + factor * q_curr
            advantages = np.zeros((1, self.action_size))
            advantages[0, int(a[i])] = q_realP - q_prev

            if self.prioritized:
                td_error[i] = abs(advantages[0, int(a[i])])

            self.actor.fit(np.array([s_prevBatch[i, :]]),
                           advantages,
                           epochs=1,
                           verbose=0)
            self.critic.fit(np.array([s_prevBatch[i, :]]),
                            np.reshape(q_realP, (1, 1)),
                            epochs=1,
                            verbose=0)

        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)
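
# Illustrative note (not part of the class above): A2CAgent.expReplay performs a
# one-step advantage update per sample. Writing V for the critic's prediction, it
# forms a target q_realP = r + factor * V(s') and an advantage q_realP - V(s); the
# actor is fit with categorical cross-entropy against a vector that is zero except
# for this advantage at the taken action, the critic is regressed towards q_realP,
# and the absolute advantage doubles as the TD error used for the prioritised
# memory update.
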
class DoubleDQN(object):
    def __init__(self, replay_size, memory_size=10000, prioritized=False):
        self.step = 0
        self.replay_size = replay_size
        self.replay_queue = deque(maxlen=self.replay_size)
        self.memory_size = memory_size
        self.tau = 1e-2  #MountainCar-v0
        self.model = self.create_model()
        self.prioritized = prioritized
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        if self.prioritized:
            self.memory = Memory(capacity=memory_size)

    def create_model(self):

        STATE_DIM, ACTION_DIM = 2, 3
        model = models.Sequential([
            layers.Dense(100, input_dim=STATE_DIM, activation='relu'),
            layers.Dense(ACTION_DIM, activation="linear")
        ])
        model.compile(loss='mean_squared_error',
                      optimizer=optimizers.Adam(0.001))
        return model

    def act(self, s, epsilon=0.1):

        #
        if np.random.uniform() < epsilon - self.step * 0.0002:
            return np.random.choice([0, 1, 2])
        return np.argmax(self.model.predict(np.array([s]))[0])

    def save_model(self, file_path='MountainCar-v0-Ddqn.h5'):
        print('model saved')
        self.model.save(file_path)

    def store_transition(self, s, a, r, s_, dd):
        if self.prioritized:  # prioritized replay
            transition = np.hstack((s, [a, r], s_, dd))  # transition -> 7x1
            self.memory.store(
                transition)  # have high priority for newly arrived transition
        else:
            #self.replay_queue.append((s, [a, r], s_, dd))
            transition = np.hstack((s, [a, r], s_, dd))  # transition -> 7x1
            self.replay_queue.append(transition)

    def expReplay(self, batch_size=64, lr=1, factor=0.95):

        if self.prioritized:
            tree_idx, batch_memory, ISWeights = self.memory.sample(batch_size)
        else:
            batch_memory = random.sample(self.replay_queue, batch_size)

        s_batch = np.array([replay[[0, 1]] for replay in batch_memory])
        a = np.array([replay[[2]] for replay in batch_memory])
        r = np.array([replay[[3]] for replay in batch_memory])
        next_s_batch = np.array([replay[[4, 5]] for replay in batch_memory])
        d = np.array([replay[[6]] for replay in batch_memory])

        Q = self.model.predict(s_batch)
        Q_next = self.model.predict(next_s_batch)
        Q_targ = self.target_model.predict(next_s_batch)

        #update Q value
        td_error = np.zeros((d.shape[0], ), dtype=float)
        for i in range(d.shape[0]):
            old_q = Q[i, int(a[i])]
            if int(d[i]) == 1:
                Q[i, int(a[i])] = r[i]
            else:
                next_best_action = np.argmax(Q_next[i, :])
                Q[i, int(a[i])] = r[i] + factor * Q_targ[i, next_best_action]

            if self.prioritized:
                td_error[i] = abs(old_q - Q[i, int(a[i])])

        if self.prioritized:
            self.memory.batch_update(tree_idx, td_error)

        self.model.fit(s_batch, Q, verbose=0)

    def transfer_weights(self):
        """ Transfer Weights from Model to Target at rate Tau
        """
        W = self.model.get_weights()
        tgt_W = self.target_model.get_weights()
        for i in range(len(W)):
            tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
        self.target_model.set_weights(tgt_W)
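
# A minimal standalone sketch (hypothetical helper, not part of the class above) of
# the Double DQN target construction used in DoubleDQN.expReplay: the online model
# selects the next action, while the target model evaluates it.
import numpy as np

def double_dqn_targets(q, q_next_online, q_next_target, actions, rewards, dones, factor=0.95):
    # q, q_next_online, q_next_target: (batch, n_actions) arrays of predicted Q values
    # actions, rewards, dones: (batch,) arrays for the sampled transitions
    targets = q.copy()
    for i in range(q.shape[0]):
        a = int(actions[i])
        if int(dones[i]) == 1:
            targets[i, a] = rewards[i]
        else:
            next_best_action = int(np.argmax(q_next_online[i]))
            targets[i, a] = rewards[i] + factor * q_next_target[i, next_best_action]
    return targets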
Example No. 14
class Agent:
    def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
                 lambda_2, lambda_3, n_steps, l_margin):
        # Input Parameters
        self.eps = eps  # eps-greedy
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.tau = tau  # frequency of target replacement
        self.ed = 0.005  # priority bonus for demonstration transitions (currently unused)
        self.ea = 0.001  # priority bonus for agent transitions (currently unused)
        self.l_margin = l_margin
        self.n_steps = n_steps
        self.lambda1 = lambda_1  # n-step return
        self.lambda2 = lambda_2  # supervised loss
        self.lambda3 = lambda_3  # L2

        self.counter = 0  # target replacement counter # todo change to iter_counter
        self.replay = Memory(capacity=max_memory)
        self.loss = nn.MSELoss()
        self.policy = Policy()  # todo: avoid having to pass the architecture here
        self.opt = optim.Adam(self.policy.predictNet.parameters(),
                              lr=lr,
                              weight_decay=lambda_3)

        self.replay.e = 0
        self.demoReplay = ddict(list)

        self.noisy = hasattr(self.policy.predictNet, "sample")

    def choose_action(self, state):
        state = torch.Tensor(state)
        A = self.policy.sortedA(state)

        if self.noisy:
            self.policy.predictNet.sample()
            return A[0]

        if np.random.random() < self.eps:
            return random.sample(A, 1)[0]
        return A[0]

    def sample(self):
        return self.replay.sample(self.batch_size)

    def store_demonstration(self, s, a, r, s_, done, episode):
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        episodeReplay = self.demoReplay[
            episode]  # replay of certain demo episode
        index = len(episodeReplay)
        data = (s, a, r, s_, done, (episode, index))
        episodeReplay.append(data)
        self.replay.add(transition=data, demonstration=True)

    def store_transition(self, s, a, r, s_, done):
        s = torch.Tensor(s)
        s_ = torch.Tensor(s_)
        data = (s, a, r, s_, done, None)
        self.replay.add(transition=data, demonstration=False)

    def calculate_td_errors(self, samples):
        if self.noisy:
            self.policy.predictNet.sample()  # for choosing action
        alls, alla, allr, alls_, alldone, *_ = zip(*samples)
        maxA = [self.policy.sortedA(s_)[0] for s_ in alls_]
        if self.noisy:
            self.policy.predictNet.sample()  # for prediction
            self.policy.targetNet.sample()  # for target

        Qtarget = torch.Tensor(allr)
        Qtarget[torch.tensor(alldone) != 1] += self.gamma * self.policy.calcQ(
            self.policy.targetNet, alls_, maxA)[torch.tensor(alldone) != 1]
        Qpredict = self.policy.calcQ(self.policy.predictNet, alls, alla)
        return Qpredict, Qtarget

    def JE(self, samples):
        loss = torch.tensor(0.0)
        count = 0  # number of demo
        for s, aE, *_, isdemo in samples:
            if isdemo is None:
                continue
            A = self.policy.sortedA(s)
            if len(A) == 1:
                continue
            QE = self.policy.calcQ(self.policy.predictNet, s, aE)
            A1, A2 = np.array(A)[:
                                 2]  # action with largest and second largest Q
            maxA = A2 if (A1 == aE).all() else A1
            Q = self.policy.calcQ(self.policy.predictNet, s, maxA)
            if (Q + self.l_margin) < QE:
                continue
            else:
                loss += (Q - QE)
                count += 1
        return loss / count if count != 0 else loss
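    # Note on JE above: it approximates the DQfD-style large-margin supervised loss
    # max_a[Q(s, a) + l(a_E, a)] - Q(s, a_E). For each demonstration action aE it
    # compares the best non-expert action's value Q (plus l_margin) against QE and
    # only adds Q - QE to the loss when that margin is violated, averaging over the
    # demonstration samples in the batch.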

    def Jn(self, samples, Qpredict):
        # needs refactoring; cannot be used with noisy layers yet
        loss = torch.tensor(0.0)
        count = 0
        for i, (s, a, r, s_, done, isdemo) in enumerate(samples):
            if isdemo is None:
                continue
            episode, idx = isdemo
            nidx = idx + self.n_steps
            lepoch = len(self.demoReplay[episode])
            if nidx > lepoch:
                continue
            count += 1
            ns, na, nr, ns_, ndone, _ = zip(
                *self.demoReplay[episode][idx:nidx])
            ns, na, ns_, ndone = ns[-1], na[-1], ns_[-1], ndone[-1]
            discountedR = reduce(
                lambda x, y: (x[0] + self.gamma**x[1] * y, x[1] + 1), nr,
                (0, 0))[0]
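            # The reduce above accumulates the n-step discounted return
            #   R = r_0 + gamma * r_1 + ... + gamma**(n-1) * r_{n-1}
            # by carrying a (partial_sum, exponent) pair over the rewards; e.g. for
            # nr = (1, 2, 3) and gamma = 0.9 it gives 1 + 0.9*2 + 0.81*3 = 5.23.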
            maxA = self.policy.sortedA(ns_)[0]
            target = discountedR if ndone else discountedR + self.gamma**self.n_steps * self.policy.calcQ(
                self.policy.targetNet, ns_, maxA)
            predict = Qpredict[i]
            loss += (target - predict)**2
        return loss / count if count != 0 else loss

    def L2(self, parameters):
        loss = 0
        for p in parameters:
            loss += (p**2).sum()
        return loss

    def learn(self):
        self.opt.zero_grad()
        samples, idxs = self.sample()
        Qpredict, Qtarget = self.calculate_td_errors(samples)

        for i in range(self.batch_size):
            error = math.fabs(float(Qpredict[i] - Qtarget[i]))
            self.replay.update(idxs[i], error)

        JDQ = self.loss(Qpredict, Qtarget)
        JE = self.JE(samples)
        Jn = self.Jn(samples, Qpredict)
        L2 = self.L2(self.policy.predictNet.parameters())
        J = JDQ + self.lambda2 * JE + self.lambda1 * Jn + self.lambda3 * L2
        J.backward()
        self.opt.step()

        self.counter += 1
        if self.counter % self.tau == 0:
            self.policy.updateTargetNet()
Example No. 15
class QLearning:
    def __init__(
        self,
        k,
        d,
        env_name,
        env_dir,
        env_fixed_xo,
        n_hidden,
        save_and_load_path,
        load,
        tensorboard_path,
        logger_path,
        learn_wall_time_limit,
        prioritized,
        trial_size,
        learning_rate=0.005,
        # we have finite horizon, so we don't worry about reward explosion
        # see: https://goo.gl/Ew4629 (Other Prediction Problems and Update Rules)
        reward_decay=1.0,
        e_greedy=0.8,
        save_model_iter=5000,
        memory_capacity=300000,
        memory_capacity_start_learning=10000,
        batch_size=64,
        e_greedy_increment=0.0005,
        replace_target_iter=500,
        planning=False,
        random_seed=None,
    ):
        self.env_name = env_name
        self.env, self.n_features, self.n_actions = self.get_env(
            env_name, env_dir, env_fixed_xo, k, d)
        self.save_and_load_path = save_and_load_path
        self.load = load

        self.path_check(load)

        # create a graph for model variables and session
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        if not load:
            self.random_seed = random_seed
            numpy.random.seed(self.random_seed)
            tf.set_random_seed(self.random_seed)

            self.tensorboard_path = tensorboard_path
            self.logger_path = logger_path
            self.tb_writer = TensorboardWriter(
                folder_name=self.tensorboard_path, session=self.sess)
            self.logger = Logger(self.logger_path)

            self.n_hidden = n_hidden
            self.lr = learning_rate
            self.gamma = reward_decay
            self.epsilon_max = e_greedy
            self.save_model_iter = save_model_iter
            self.memory_capacity = memory_capacity
            self.memory_capacity_start_learning = memory_capacity_start_learning
            self.learn_wall_time_limit = learn_wall_time_limit
            self.batch_size = batch_size
            self.epsilon_increment = e_greedy_increment
            self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
            self.prioritized = prioritized  # decide to use prioritized experience replay or not
            self.trial_size = trial_size
            self.replace_target_iter = replace_target_iter
            self.planning = planning  # decide to use planning for additional learning

            with self.graph.as_default():
                self._build_net()
                self.sess.run(tf.global_variables_initializer())
            self.memory = Memory(prioritized=self.prioritized,
                                 capacity=self.memory_capacity,
                                 n_features=self.n_features,
                                 n_actions=self.n_actions,
                                 batch_size=self.batch_size,
                                 planning=self.planning,
                                 qsa_feature_extractor=self.env.step_state,
                                 qsa_feature_extractor_for_all_acts=self.env.
                                 all_possible_next_states)
            self.learn_iterations = 0
            self.learn_wall_time = 0.
            self.sample_iterations = 0
            self.sample_wall_time = 0.
            self.last_cpu_time = 0.
            self.last_wall_time = 0.
            self.last_save_time = time.time()
            self.last_test_learn_iterations = 0
        else:
            self.load_model()

        self.memory_lock = multiprocessing.Lock(
        )  # lock for memory modification

    def get_env(self, env_name, env_dir, env_fixed_xo, k, d):
        # n_actions: # of one-card modification + 1 for not changing any card
        # n_features: input dimension to qlearning network (x_o and x_p plus time step as a feature)
        if env_name == 'env_nn':
            from environment.env_nn import Environment
            if env_dir:
                env = Environment.load(env_dir)
            else:
                raise NotImplementedError(
                    'we enforce environment has been created')
            n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1
        elif env_name == 'env_nn_noisy':
            from environment.env_nn_noisy import Environment
            if env_dir:
                env = Environment.load(env_dir)
            else:
                raise NotImplementedError(
                    'we enforce environment has been created')
            n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1
        elif env_name == 'env_greedymove':
            from environment.env_greedymove import Environment
            if env_dir:
                env = Environment.load(env_dir)
            else:
                raise NotImplementedError(
                    'we enforce environment has been created')
            n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1
        elif env_name == 'env_gamestate':
            from environment.env_gamestate import Environment
            if env_dir:
                env = Environment.load(env_dir)
            else:
                raise NotImplementedError(
                    'we enforce environment has been created')
            n_features, n_actions = 2 * env.k + 1, env.d * (env.k - env.d) + 1
        else:
            raise ValueError('unsupported env_name: {}'.format(env_name))

        return env, n_features, n_actions

    def path_check(self, load):
        save_and_load_path_dir = os.path.dirname(self.save_and_load_path)
        if load:
            assert os.path.exists(
                save_and_load_path_dir
            ), "model path not exist:" + save_and_load_path_dir
        else:
            os.makedirs(save_and_load_path_dir, exist_ok=True)
            # remove old existing models if any
            files = glob.glob(save_and_load_path_dir + '/*')
            for file in files:
                os.remove(file)

    def save_model(self):
        # save tensorflow
        with self.graph.as_default():
            saver = tf.train.Saver()
            path = saver.save(self.sess, self.save_and_load_path)
        # save memory
        self.memory_lock.acquire()
        with open(self.save_and_load_path + '_memory.pickle', 'wb') as f:
            pickle.dump(self.memory, f, protocol=-1)  # -1: highest protocol
        self.memory_lock.release()
        # save variables
        with open(self.save_and_load_path + '_variables.pickle', 'wb') as f:
            pickle.dump(
                (self.random_seed, self.tensorboard_path, self.logger_path,
                 self.n_hidden, self.lr, self.gamma, self.epsilon_max,
                 self.save_model_iter, self.memory_capacity,
                 self.memory_capacity_start_learning,
                 self.learn_wall_time_limit, self.batch_size,
                 self.epsilon_increment, self.epsilon, self.prioritized,
                 self.trial_size, self.replace_target_iter, self.planning,
                 self.learn_iterations, self.sample_iterations,
                 self.learn_wall_time, self.sample_wall_time, self.cpu_time,
                 self.wall_time, self.last_test_learn_iterations),
                f,
                protocol=-1)
        self.last_save_time = time.time()
        print('save model to', path)

    def load_model(self):
        # load tensorflow
        with self.graph.as_default():
            saver = tf.train.import_meta_graph(self.save_and_load_path +
                                               '.meta')
            saver.restore(
                self.sess,
                tf.train.latest_checkpoint(
                    os.path.dirname(self.save_and_load_path)))
            # placeholders
            self.s = self.graph.get_tensor_by_name('s:0')  # Q(s,a) feature
            self.s_ = self.graph.get_tensor_by_name('s_:0')  # Q(s',a') feature
            self.rewards = self.graph.get_tensor_by_name('reward:0')  # reward
            self.terminal_weights = self.graph.get_tensor_by_name(
                'terminal:0')  # terminal
            # variables
            self.q_eval = self.graph.get_tensor_by_name('eval_net/q_eval:0')
            self.eval_w1 = self.graph.get_tensor_by_name('eval_net/l1/w1:0')
            self.eval_b1 = self.graph.get_tensor_by_name('eval_net/l1/b1:0')
            self.eval_w2 = self.graph.get_tensor_by_name('eval_net/l2/w2:0')
            self.eval_b2 = self.graph.get_tensor_by_name('eval_net/l2/b2:0')
            self.q_next = self.graph.get_tensor_by_name('eval_net/q_next:0')
            self.q_target = self.graph.get_tensor_by_name("q_target:0")
            self.is_weights = self.graph.get_tensor_by_name("is_weights:0")
            self.loss = self.graph.get_tensor_by_name("loss:0")
            self.abs_errors = self.graph.get_tensor_by_name("abs_errors:0")
            # operations
            self.train_op = self.graph.get_operation_by_name('train_op')
        # load memory
        with open(self.save_and_load_path + '_memory.pickle', 'rb') as f:
            self.memory = pickle.load(f)
        # load variables
        with open(self.save_and_load_path + '_variables.pickle', 'rb') as f:
            self.random_seed, \
            self.tensorboard_path, self.logger_path, \
            self.n_hidden, \
            self.lr, self.gamma, \
            self.epsilon_max, self.save_model_iter, \
            self.memory_capacity, self.memory_capacity_start_learning, \
            self.learn_wall_time_limit, self.batch_size, \
            self.epsilon_increment, self.epsilon, \
            self.prioritized, self.trial_size, \
            self.replace_target_iter, self.planning, \
            self.learn_iterations, \
            self.sample_iterations, \
            self.learn_wall_time, \
            self.sample_wall_time, \
            self.last_cpu_time, \
            self.last_wall_time, \
            self.last_test_learn_iterations = pickle.load(f)

        numpy.random.seed(self.random_seed)
        tf.set_random_seed(self.random_seed)

        self.tb_writer = TensorboardWriter(folder_name=self.tensorboard_path,
                                           session=self.sess)
        self.logger = Logger(self.logger_path)
        self.last_save_time = time.time()

    def _build_net(self):
        self.s = tf.placeholder(tf.float32, [None, self.n_features],
                                name='s')  # Q(s,a) feature
        self.s_ = tf.placeholder(tf.float32,
                                 [None, self.n_actions, self.n_features],
                                 name='s_')  # Q(s',a') feature
        self.rewards = tf.placeholder(tf.float32, [None],
                                      name='reward')  # reward
        self.terminal_weights = tf.placeholder(tf.float32, [None],
                                               name='terminal')  # terminal

        w_initializer, b_initializer = \
            tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

        # ------------------ build evaluate_net ------------------
        with tf.variable_scope('eval_net'):
            # s is Q(s,a) feature, shape: (n_sample, n_features)
            # s_ Q(s',a') for all a' feature, shape: (n_sample, n_actions, n_features)
            with tf.variable_scope('l1'):
                self.eval_w1 = tf.get_variable(
                    'w1', [self.n_features, self.n_hidden],
                    initializer=w_initializer)
                self.eval_b1 = tf.get_variable('b1', [self.n_hidden],
                                               initializer=b_initializer)
                # l1 shape: (n_sample, n_hidden)
                l1 = tf.nn.relu(tf.matmul(self.s, self.eval_w1) + self.eval_b1)
                # l1_ shape: shape: (n_sample, n_actions, n_hidden)
                l1_ = tf.nn.relu(
                    tf.einsum('ijk,kh->ijh', self.s_, self.eval_w1) +
                    self.eval_b1)
            with tf.variable_scope('l2'):
                self.eval_w2 = tf.get_variable('w2', [self.n_hidden, 1],
                                               initializer=w_initializer)
                self.eval_b2 = tf.get_variable('b2', [1],
                                               initializer=b_initializer)
                # out shape: (n_sample, 1)
                out = tf.matmul(l1, self.eval_w2) + self.eval_b2
                # out_ shape: (n_sample, n_actions, 1), Q(s',a') for all a' feature
                out_ = tf.einsum('ijh,ho->ijo', l1_,
                                 self.eval_w2) + self.eval_b2
            self.q_eval = tf.squeeze(out, name='q_eval')
            self.q_next = tf.squeeze(out_, name='q_next')

        # ------------------ loss function ----------------------
        self.q_target = tf.add(
            self.rewards,
            self.terminal_weights *
            (self.gamma * tf.reduce_max(self.q_next, axis=1)),
            name='q_target')
        # We do not want the target to be used for computing the gradient
        self.q_target = tf.stop_gradient(self.q_target)
        # importance sampling weight
        self.is_weights = tf.placeholder(tf.float32, [None], name='is_weights')
        self.loss = tf.reduce_mean(
            self.is_weights *
            tf.squared_difference(self.q_target, self.q_eval),
            name='loss')
        self.abs_errors = tf.abs(self.q_target - self.q_eval,
                                 name='abs_errors')
        self.train_op = tf.train.RMSPropOptimizer(self.lr).minimize(
            self.loss, name='train_op')
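        # Note on the loss above: with prioritised replay, each sample's squared TD
        # error is scaled by an importance-sampling weight (typically proportional to
        # (N * P(i)) ** -beta, normalised by the largest weight) before averaging, and
        # the absolute TD errors are returned so the sampled priorities can be updated.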

    def store_transition(self, s, a, r, s_, terminal):
        self.memory_lock.acquire()
        # transition is a tuple (current_state, action, reward, next_state, whether_terminal)
        self.memory.store((s, a, r, s_, terminal))
        self.memory_lock.release()

    def update_memory_priority(self, exp_ids, abs_errors):
        """ update memory priority """
        self.memory_lock.acquire()
        self.memory.update_priority(exp_ids, abs_errors)
        self.memory_lock.release()

    def choose_action(self,
                      state,
                      next_possible_states,
                      next_possible_actions,
                      epsilon_greedy=True):
        pred_q_values = self.sess.run(self.q_eval,
                                      feed_dict={
                                          self.s: next_possible_states
                                      }).flatten()
        if not epsilon_greedy or np.random.uniform() < self.epsilon:
            action_idx = np.argmax(pred_q_values)
        else:
            action_idx = np.random.choice(
                numpy.arange(len(next_possible_actions)))
        action = next_possible_actions[action_idx]
        pred_q_value = pred_q_values[action_idx]
        return action, pred_q_value

    # def _replace_target_params(self):
    #     with self.graph.as_default():
    #         t_params = tf.get_collection('target_net_params')
    #         e_params = tf.get_collection('eval_net_params')
    #         self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    #         print('target_params_replaced')

    def planning_learn(self, qsa_next_features, qsa_features):
        """ additional learning from planning """
        raise NotImplementedError

    @property
    def cpu_time(self):
        cpu_time = psutil.Process().cpu_times()
        return cpu_time.user + cpu_time.system + cpu_time.children_system + cpu_time.children_user + self.last_cpu_time

    @property
    def wall_time(self):
        return time.time() - psutil.Process().create_time(
        ) + self.last_wall_time

    def learn(self):
        while True:
            if self.wall_time > self.learn_wall_time_limit:
                break

            if self.memory_size() < self.memory_capacity_start_learning:
                print('LEARN:{}:wait for more samples:wall time:{}'.format(
                    self.learn_iterations, self.wall_time))
                time.sleep(2)
                continue

            # don't learn too fast
            if self.learn_iterations > self.sample_iterations > 0:
                time.sleep(0.2)
                continue

            learn_time = time.time()
            qsa_feature, qsa_next_features, rewards, terminal_weights, is_weights, exp_ids \
                = self.memory.sample()

            _, loss, abs_errors = self.sess.run(
                [self.train_op, self.loss, self.abs_errors],
                feed_dict={
                    self.s: qsa_feature,
                    self.s_: qsa_next_features,
                    self.rewards: rewards,
                    self.terminal_weights: terminal_weights,
                    self.is_weights: is_weights
                })

            if self.prioritized:
                self.update_memory_priority(exp_ids, abs_errors)
                mem_total_p = self.memory.memory.tree.total_p
            else:
                mem_total_p = -1

            if self.planning:
                self.planning_learn()

            self.epsilon = self.cur_epsilon()

            learn_time = time.time() - learn_time
            self.learn_iterations += 1
            self.learn_wall_time += learn_time

            print(
                'LEARN:{}:mem_size:{}:virtual:{}:wall_t:{:.2f}:total:{:.2f}:cpu_time:{:.2f}:pid:{}:wall_t:{:.2f}:mem_p:{:.2f}'
                .format(self.learn_iterations, self.memory_size(),
                        self.memory_virtual_size(),
                        learn_time, self.learn_wall_time, self.cpu_time,
                        os.getpid(), self.wall_time, mem_total_p))

    def cur_epsilon(self):
        return self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max

    def tb_write(self, tags, values, step):
        """ write to tensorboard """
        if self.tb_writer:
            self.tb_writer.write(tags, values, step)

    def get_logger(self):
        return self.logger

    def memory_size(self):
        return self.memory.size

    def memory_virtual_size(self):
        return self.memory.virtual_size

    def function_call_counts_training(self):
        """ number of function calls during training, which equals to memory virtual size """
        return self.memory.virtual_size

    def collect_samples(self, EPISODE_SIZE, TEST_PERIOD):
        """ collect samples in a process """
        for i_episode in range(self.sample_iterations, EPISODE_SIZE):
            if self.wall_time > self.learn_wall_time_limit:
                self.save_model()
                break

            # don't sample too fast
            while 0 < self.learn_iterations < self.sample_iterations - 3:
                time.sleep(0.2)

            sample_wall_time = time.time()
            cur_state = self.env.reset()

            for i_episode_step in range(self.trial_size):
                # prevent wall time over limit during sampling
                if self.wall_time > self.learn_wall_time_limit:
                    self.save_model()
                    break

                # save every 6 min
                if time.time() - self.last_save_time > 6 * 60:
                    self.save_model()

                next_possible_states, next_possible_actions = self.env.all_possible_next_state_action(
                    cur_state)
                action, _ = self.choose_action(cur_state,
                                               next_possible_states,
                                               next_possible_actions,
                                               epsilon_greedy=True)
                cur_state_, reward = self.env.step(action)
                terminal = True if i_episode_step == self.trial_size - 1 else False
                self.store_transition(cur_state, action, reward, cur_state_,
                                      terminal)
                cur_state = cur_state_

            sample_wall_time = time.time() - sample_wall_time
            self.sample_iterations += 1
            self.sample_wall_time += sample_wall_time

            # end_state distilled output = reward (might be noisy)
            end_output = self.env.still(reward)
            mem_total_p = -1 if not self.prioritized else self.memory.memory.tree.total_p
            print(
                'SAMPLE:{}:finished output:{:.5f}:cur_epsilon:{:.5f}:mem_size:{}:virtual:{}:wall_t:{:.2f}:total:{:.2f}:pid:{}:wall_t:{:.2f}:mem_p:{:.2f}'
                .format(self.sample_iterations, end_output, self.cur_epsilon(),
                        self.memory_size(), self.memory_virtual_size(),
                        sample_wall_time, self.sample_wall_time, os.getpid(),
                        self.wall_time, mem_total_p))

            # test every once a while
            if self.memory_virtual_size() >= self.memory_capacity_start_learning \
                    and self.learn_iterations % TEST_PERIOD == 0 \
                    and self.learn_iterations > self.last_test_learn_iterations \
                    and self.wall_time < self.learn_wall_time_limit:
                #self.env.test(TRIAL_SIZE, RANDOM_SEED, self.learn_step_counter, self.wall_time, self.env_name,
                #               rl_model=self)
                max_val_rl, max_state_rl, end_val_rl, end_state_rl, duration_rl, _, _ = self.exp_test(
                )

                max_val_mc, max_state_mc, _, _, duration_mc, _ = self.env.monte_carlo(
                )
                self.logger.log_test(output_mc=max_val_mc,
                                     state_mc=max_state_mc,
                                     duration_mc=duration_mc,
                                     output_rl=max_val_rl,
                                     state_rl=max_state_rl,
                                     duration_rl=duration_rl,
                                     learn_step_counter=self.learn_iterations,
                                     wall_time=self.wall_time)

                self.tb_write(
                    tags=[
                        'Prioritized={0}, gamma={1}, seed={2}, env={3}, fixed_xo={4}/(Max_RL-MC)'
                        .format(self.prioritized, self.gamma, self.random_seed,
                                self.env_name, self.env.if_set_fixed_xo()),
                        'Prioritized={0}, gamma={1}, seed={2}, env={3}, fixed_xo={4}/Ending Output (RL)'
                        .format(self.prioritized, self.gamma, self.random_seed,
                                self.env_name, self.env.if_set_fixed_xo()),
                    ],
                    values=[max_val_rl - max_val_mc,
                            end_val_rl],  # note we record end value for RL
                    step=self.learn_iterations)

                self.last_test_learn_iterations = self.learn_iterations

    def exp_test(self, debug=True):
        """
        If debug is true, find the max output along the search.
        If debug is false, only return the end output
        """
        cur_state = self.env.reset()
        duration = time.time()
        start_state = cur_state.copy()
        end_output = max_output = -99999.
        max_state = None

        for i in range(self.trial_size):
            next_possible_states, next_possible_actions = self.env.all_possible_next_state_action(
                cur_state)
            action, q_val = self.choose_action(cur_state,
                                               next_possible_states,
                                               next_possible_actions,
                                               epsilon_greedy=False)
            if debug:
                # reward is noisy output
                cur_state, reward = self.env.step(action)
                # noiseless, stilled end output
                end_output = self.env.still(
                    self.env.output_noiseless(cur_state))
                print(
                    'TEST  :{}:output: {:.5f}, qval: {:.5f}, reward {:.5f}, at {}'
                    .format(i, end_output, q_val, reward, cur_state))
                if end_output > max_output:
                    max_output = end_output
                    max_state = cur_state.copy()
            else:
                cur_state = self.env.step_without_reward(action)
                print('TEST  :{}:qval: {:.5f}, at {}'.format(
                    i, q_val, cur_state))

        duration = time.time() - duration
        end_state = cur_state
        if not debug:
            end_output = self.env.still(self.env.output_noiseless(cur_state))

        if_set_fixed_xo = self.env.if_set_fixed_xo()

        return max_output, max_state, end_output, end_state, duration, if_set_fixed_xo, start_state

    # very adhoc methods to query environment's information
    def set_env_fixed_xo(self, x_o):
        self.env.set_fixed_xo(x_o)

    def get_env_if_set_fixed_xo(self):
        return self.env.if_set_fixed_xo()

    def get_learn_iteration(self):
        return self.learn_iterations

    def get_wall_time(self):
        return self.wall_time
Example No. 16
class Agent():
    """
    The Agent interacts with the environment and learns from the interactions and the environment
    """
    def __init__(self, state_size, action_size, buffer_size, batch_size, gamma, tau, lr_actor, lr_critic, \
        weight_decay, epsilon, epsilon_decay, update_every, update_times, start_learning, random_seed, \
        mu ,theta, sigma):
        """
        Parameters
        ==========
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            epsilon (float): start value of epsilon, default 1.0
            buffer_size (int): size of the memorybuffer
            batch_size (int): size of the batch
            gamma (float): discounted value, must be between 0 and 1
            tau (float): blend parameter for the soft update, has to be between 0 and 1
            lr_actor (float): learning rate for the actor dnn
            lr_critic (float): learning rate for the critic dnn
            weight_decay (float): L2 penalty
            epsilon_decay (float): factor used to decay epsilon over time
            update_every (int): update frequency
            update_times (int): how many times the weights should be updated at one update step
            start_learning (float): multiple of the batch size that must be stored in memory before learning starts
            mu, theta, sigma (float): parameters of the Ornstein-Uhlenbeck noise process

        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.random_seed = random_seed
        self.epsilon = epsilon
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay = weight_decay
        self.epsilon_decay = epsilon_decay
        self.update_every = update_every
        self.update_times = update_times
        self.start_learning = start_learning
        self.theta = theta
        self.sigma = sigma
        self.mu = mu

        self.update_every_x = 0
        self.noise_getter_mean = 0.0

        # The Actor
        ###########
        self.actor_local  = Actor(self.state_size, self.action_size, self.random_seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor)

        # The Critic
        ############
        self.critic_local = Critic(self.state_size, self.action_size, self.random_seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic, weight_decay = self.weight_decay)

        # for target_param, param in zip(self.actor_target.parameters(), self.actor_local.parameters()):
        #     target_param.data.copy_(param.data)
        # for target_param, param in zip(self.critic_target.parameters(), self.critic_local.parameters()):
        #     target_param.data.copy_(param.data)
            
        # The Replay Buffer
        ###################
        self.PER = Memory(self.buffer_size)

        # The "Ornstein-Uhlenbeck-Noise"
        ################################
        self.noise = OUNoise(self.action_size, self.random_seed, self.mu, self.theta, self.sigma)

    def step(self, state, action, reward, next_state, done, learn_reset=True):
        """
        add info to the ringbuffer, if enough entries are available --> learn
        """
        # calculate the initial TD error used as the priority for this experience
        state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
        action_t = torch.from_numpy(np.asarray(action)).float().unsqueeze(0).to(device)
        next_state_t = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        self.critic_local.eval()
        self.critic_target.eval()
        self.actor_target.eval()
        with torch.no_grad():

            old_val = self.critic_local(state_t, action_t).cpu().data.numpy()
            actions_next = self.actor_target(next_state_t)
            target_val = self.critic_target(next_state_t, actions_next).cpu().data.numpy()

            if done:
                target = reward
            else:
                target = reward + self.gamma * target_val

            error = float(abs(old_val - target))

        self.critic_local.train()
        self.critic_target.train()
        self.actor_target.train()

        self.PER.add(error, (state, action, reward, next_state, done))
        #print("Length buffer: " + str(len(self.RepMem)))
        self.update_every_x = (self.update_every_x+1) % self.update_every
        if self.update_every_x == 0:
            # start learning when buffer is half-full
            self.reset()
            if len(self.PER) > int(self.batch_size * self.start_learning):
                #print ("len_buff: {}\t threshold: {}".format(len(self.RepMem),int(self.batch_size * self.start_learning)))
                for _ in range(int(self.update_times)):
                    sample_batch, idxs, is_weights = self.PER.sample(self.batch_size)
                    


                    #states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
                    #actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
                    #rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
                    #next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
                    #dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

                    #print (sample_batch)
                    #print("----------")
                    self.learn(sample_batch,idxs, is_weights, learn_reset)
                    #print ("Learned")
                    #print ("++++++++++")
        


    def act(self, state, add_noise = True):
        """
        choose an action due to the given state and policy
        Parameters:
        ===========
        state: the current state
        add_noise (bool): True: adds a noise for exploration
        
        return: the estimate action
        """
        ### adapt state and send it to device
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            noise_ = self.noise.sample()
            #print (action)
            self.noise_getter_mean = np.mean(noise_)
            action += self.epsilon * noise_
            #print (action)
            #print ("------------------------------------------")
        ### make sure that, even with the noise added, the action stays between -1 and 1
        return np.clip(action, -1.0, 1.0)

    def get_epsilon(self):
        """
        getter function: used to monitor epsilon
        """
        return self.epsilon

    def get_noise_mean(self):
        """
        getter function: used to monitor the noise
        """
        return self.noise_getter_mean

    def learn(self, sample_batch, idxs, is_weights, noise_reset=True):
        """
        update the DNNs
        Parameters:
        ==========
        experiences: batch sample 
        """
        sample_batch = np.array(sample_batch).transpose()

        states = torch.from_numpy(np.vstack(sample_batch[0])).float().to(device)
        actions = torch.from_numpy(np.vstack(sample_batch[1])).float().to(device)
        rewards = torch.from_numpy(np.vstack(sample_batch[2])).float().to(device)
        next_states = torch.from_numpy(np.vstack(sample_batch[3])).float().to(device)
        dones = torch.from_numpy(np.vstack(sample_batch[4]).astype(np.uint8)).float().to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        #print ("Q_targets_next"+ str(Q_targets_next.shape))
        #print ("rewards"+str(rewards.shape))
        #print ("dones"+str(dones.shape))
        #print("---------")
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
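        # Note (added): the fresh TD errors |Q_expected - Q_targets| computed here
        # could be written back to the prioritized buffer through `idxs`, provided
        # the Memory class exposes a priority-update method; as written, priorities
        # are only set once, in step(), when an experience is first added.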
        critic_loss = (torch.FloatTensor(is_weights).reshape(-1, 1).to(device)
                       * F.mse_loss(Q_expected, Q_targets, reduction='none')).mean()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)  

        self.epsilon = max(0.001, self.epsilon * self.epsilon_decay)
        if noise_reset:
            self.noise.reset()



    def soft_update(self, local_model, target_model, tau):
        """
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Parameters
        ==========
        local_model (Actor or Critic object): from that model the parameters are used
        target_model (Actor or Critic object): to that model the parameters are updated 
        tau (float): blend parameter, has to be between 0 and 1
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

    def reset(self):
        """reset the Ornstein-Uhlenbeck noise process"""
        self.noise.reset()
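
# A minimal, hedged usage sketch for the DDPG agent above (not part of the original
# example). It assumes a Gym-style continuous-control environment `env` whose step()
# returns (next_state, reward, done, info); the hyperparameter values below are
# illustrative defaults, not the author's.
#
# agent = Agent(state_size=env.observation_space.shape[0],
#               action_size=env.action_space.shape[0],
#               buffer_size=int(1e6), batch_size=128, gamma=0.99, tau=1e-3,
#               lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0,
#               epsilon=1.0, epsilon_decay=0.999, update_every=20,
#               update_times=10, start_learning=2, random_seed=0,
#               mu=0.0, theta=0.15, sigma=0.2)
#
# for episode in range(500):
#     state = env.reset()
#     agent.reset()                          # re-centre the OU noise each episode
#     for t in range(1000):
#         action = agent.act(state)          # actor output + epsilon-scaled OU noise
#         next_state, reward, done, _ = env.step(action)
#         agent.step(state, action, reward, next_state, done)
#         state = next_state
#         if done:
#             break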