class DQNagent:

    def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):

        self.epsilon = epsilon
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma

        self.update_counter = 0

        self.net = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        ).float()

        self.net_target = copy.deepcopy(self.net)

        self.net = self.net.cuda()
        self.net_target = self.net_target.cuda()


        self.replay_memory = ReplayMemory(max_size=mem_size)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
    
    def get_action(self, obs, mode='e-greedy'):
        if mode == 'random':
            action = random.choice([0, 1, 2])
        elif mode == 'greedy':
            obs = torch.tensor(obs, dtype=torch.float).cuda()
            with torch.no_grad():
                action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        elif mode == 'e-greedy':
            action = random.choice([0, 1, 2])
            if random.random() >= self.epsilon:
                obs = torch.tensor(obs, dtype=torch.float).cuda()
                with torch.no_grad():
                    action = torch.argmax(self.net(obs)).cpu().numpy().tolist()
        else:
            raise ValueError("Unknown action-selection mode: {}".format(mode))

        assert isinstance(action, int)
        return action
    
    def store_transition(self, obs, action, reward, new_obs, done):
        self.replay_memory.push(obs, action, reward, new_obs, done)
    
    def update(self):
        
        if len(self.replay_memory) < self.mini_batch_size:
            return

        obs_batch, action_batch, reward_batch, new_obs_batch, done_batch = self.replay_memory.sample(self.mini_batch_size)

        new_obs_batch = torch.tensor(new_obs_batch, dtype=torch.float).cuda()

        with torch.no_grad():
            # start from the immediate rewards
            target_batch = torch.tensor(reward_batch, dtype=torch.float).cuda()
            # bootstrap with the target network's best Q-value for non-terminal transitions
            vals_new_obs = torch.max(self.net_target(new_obs_batch), dim=1)[0]
            for i in range(self.mini_batch_size):
                if not done_batch[i]:
                    target_batch[i] += self.gamma * vals_new_obs[i]
        
        obs_batch = torch.tensor(obs_batch, dtype=torch.float).cuda()
        pred_batch = self.net(obs_batch)
        action_batch = torch.tensor(action_batch, dtype=torch.long).cuda()
        # Q-values of the actions that were actually taken
        pred_batch_ = pred_batch.gather(1, action_batch.unsqueeze(1)).squeeze(1)

        loss = self.criterion(pred_batch_, target_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter % 20 == 0:
            self.update_counter = 0
            # hard update: copy the online network's weights into the target network
            for target_param, param in zip(self.net_target.parameters(), self.net.parameters()):
                target_param.data.copy_(param.data)
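A minimal usage sketch for the agent above, not part of the original listing: the 2-input / 3-output network matches a MountainCar-style task, so the loop below assumes the classic gym API, a CUDA device (the agent calls .cuda()), and purely illustrative hyperparameters.

import gym

env = gym.make("MountainCar-v0")          # assumed environment: 2-dim state, 3 actions
agent = DQNagent(mem_size=50000, epsilon=0.1, mini_batch_size=64,
                 learning_rate=1e-3, gamma=0.99)

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        action = agent.get_action(obs, mode='e-greedy')
        new_obs, reward, done, info = env.step(action)
        agent.store_transition(obs, action, reward, new_obs, done)
        agent.update()                     # one gradient step per environment step
        obs = new_obs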
Example #2
class EGreedyAgent(MAEAgent):
    """Epsilon greedy agent.
    """
    def __init__(self,
                 default_reward,
                 name,
                 color,
                 env,
                 agent_type,
                 features_n,
                 memory_capacity,
                 init_value=0.0,
                 batch_size=64,
                 gamma=0.99,
                 eps_start=0.9,
                 eps_end=0.01,
                 eps_decay=50,
                 need_reload=False,
                 reload_path=None,
                 need_exploit=True):
        super(EGreedyAgent, self).__init__((0, 0),
                                           default_reward=default_reward,
                                           color=color,
                                           env=env,
                                           name=name,
                                           default_type=agent_type,
                                           default_value=init_value)
        self.actions_n = env.action_space.n
        # discounted value
        self.gamma = gamma
        self.batch_size = batch_size
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.features_n = features_n
        self.memory_capacity = memory_capacity
        self.memory = ReplayMemory(self.memory_capacity)
        self.steps_count = 0
        self.device = 'cpu'
        # policy network used to estimate Q-values
        self.policy_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
        # target network used for the Q-learning targets
        self.target_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
        if need_reload:
            self.restore(reload_path)
        # give the target net the same parameters as the policy net
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.001)
        self.save_file_path = './model/'
        self.need_exploit = need_exploit

    def act(self, state):
        """Chose action greedily.
        """
        # Convert the list state to a tensor of shape (1, 4), e.g. [[1, 2, 3, 4]]
        state = torch.FloatTensor([state])
        sample = random.random()
        # explore a lot at the beginning, then gradually favour the max-Q action
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_count / self.eps_decay) \
                if self.need_exploit else 0.01
        self.steps_count += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1)[1].view(1, 1).item()
        else:
            return random.randrange(self.actions_n)

    def optimize_model(self):
        """
        Train model.
        """
        if len(self.memory) < self.batch_size:
            return 0.0
        transitions = self.memory.sample(self.batch_size)
        # batch is ([state], [action], [next_state], [reward])
        batch = Transition(*zip(*transitions))
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device)
        non_final_next_states = torch.cat([
            torch.tensor([s], dtype=torch.float) for s in batch.next_state
            if s is not None
        ])
        state_batch = torch.cat(
            [torch.tensor([s], dtype=torch.float) for s in batch.state])
        action_batch = torch.cat(
            [torch.tensor([[s]], dtype=torch.long) for s in batch.action])
        reward_batch = torch.cat(
            [torch.tensor([[s]], dtype=torch.float) for s in batch.reward])
        q_eval = self.policy_net(state_batch).gather(1, action_batch)
        q_next = torch.zeros(self.batch_size, device=self.device)
        q_next[non_final_mask] = self.target_net(non_final_next_states).max(
            1)[0].detach()
        q_target = (q_next * self.gamma) + reward_batch.squeeze()

        loss = F.mse_loss(q_eval, q_target.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def save(self, name):
        """
        Save the trained model to model/`name`.
        """
        torch.save(self.target_net.state_dict(), self.save_file_path + name)

    def restore(self, path):
        """
        Restore the model from `path`.
        """
        params = torch.load(path)
        self.target_net.load_state_dict(params)
        self.policy_net.load_state_dict(params)
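For reference, act above anneals epsilon from eps_start toward eps_end with time constant eps_decay. A quick check of the default schedule (a standalone sketch, values shown only for illustration):

import math

eps_start, eps_end, eps_decay = 0.9, 0.01, 50
for step in (0, 50, 150, 500):
    eps = eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)
    print(step, round(eps, 3))   # roughly 0.9, 0.337, 0.054, 0.01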
Example #3
class Agent(object):
    """RL agent for the Atari game"""
    def __init__(
        self,
        player_id: int = 1,
        name: str = "Ugo",
        batch_size: int = 128,
        gamma: float = 0.98,
        memory_size: int = 40000,
    ) -> None:
        """Initialization for the DQN agent

        Args:
            player_id (int, optional): Side of the board on which to play. Defaults to 1.
            name (str, optional): Name of the player. Defaults to "Ugo".
            batch_size (int, optional): Batch size of the update. Defaults to 128.
            gamma (float, optional): Discount factor for the Q-learning update. Defaults to 0.98.
            memory_size (int, optional): Experience memory capacity. Defaults to 40000.
        """
        # list of parameters of the agent
        self.player_id = player_id
        self.name = name
        self.batch_size = batch_size  # size of batch for update
        self.gamma = gamma  # discount factor
        self.memory_size = memory_size  # size of replay memory
        self.memory = ReplayMemory(self.memory_size,
                                   train_buffer_capacity=4,
                                   test_buffer_capacity=4)

        # networks
        self.policy_net = DQN(action_space_dim=3,
                              hidden_dim=256).to(torch.device(device))
        self.target_net = DQN(action_space_dim=3,
                              hidden_dim=256).to(torch.device(device))
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)

    def update_policy_net(self) -> None:
        """Update policy_net via Q-learning approximation"""

        # check if memory has enough elements to sample
        if len(self.memory) < self.batch_size:
            return

        # get transitions
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # get elements from batch
        non_final_mask = 1 - torch.tensor(batch.done, dtype=torch.uint8).to(
            torch.device(device))
        non_final_mask = non_final_mask.type(torch.bool)
        non_final_next_obs = torch.stack([
            ob for nonfinal, ob in zip(non_final_mask, batch.next_ob)
            if nonfinal
        ]).to(torch.device(device))
        ob_batch = torch.stack(batch.ob).to(torch.device(device))
        rew_batch = torch.stack(batch.rew).to(torch.device(device))
        action_batch = torch.stack(batch.action).to(torch.device(device))

        # estimate Q(st, a) with the policy network
        state_action_values = (self.policy_net.forward(ob_batch).gather(
            1, action_batch).squeeze())

        # estimate V(st+1) with target network
        next_state_values = torch.zeros(self.batch_size).to(
            torch.device(device))
        next_state_values[non_final_mask] = (
            self.target_net.forward(non_final_next_obs).max(1)[0].detach())

        # expected Q value
        expected_state_action_values = (rew_batch.squeeze() +
                                        self.gamma * next_state_values)

        # loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # optimize the network
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-0.1, 0.1)
        self.optimizer.step()

    def update_target_net(self) -> None:
        """Update target net"""
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def get_action(self,
                   ob: np.ndarray,
                   epsilon: float = 0.1,
                   train: bool = False) -> int:
        """Interface function that returns the action that the agent took based
        on the observation ob

        Args:
            ob (np.ndarray, optional): Current observation from the game.
            epsilon (float, optional): Epsilon for epsilon greedy. Defaults to 0.1.
            train (bool, optional): Identifies if the agent is in testing or training phase. Defaults to False.

        Returns:
            int: the action taken by the agent policy
        """

        # epsilon greedy action selection
        if train and np.random.rand() < epsilon:
            action = np.random.randint(0, 3)
        else:
            # get stack of observations
            if train:
                ob_stack = self.get_stack_from_train_buffer(ob)
            else:
                ob_stack = self.get_stack_from_test_buffer(ob)
            ob_stack = ob_stack.unsqueeze(0)

            # predict best action
            with torch.no_grad():
                action = self.policy_net.forward(ob_stack).argmax().item()

        if not train:
            self.push_to_test_buffer(ob)

        return action

    def get_name(self) -> str:
        """Return name of the agent

        Returns:
            str: name of the agent
        """
        return self.name

    def reset(self) -> None:
        """Clean the buffers of the memory"""
        self.memory.test_buffer = []
        self.memory.train_buffer = []

    def load_model(
        self,
        path_ai: str = "weights/hibrid_tuned_best.ai",
        path_optm: str = None,
    ) -> None:
        """Load model weights and optimizer from a certain path

        Args:
            path_ai (str, optional): Path to model weights. Defaults to "weights/hibrid_tuned_best.ai".
            path_optm (str, optional): Path to optimizer weights. Defaults to None.
        """
        # load model weights
        self.policy_net.load_state_dict(
            torch.load(path_ai, map_location=torch.device(device)))
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        # load optimizer parameters
        if path_optm is not None:
            try:
                self.optimizer.load_state_dict(
                    torch.load(path_optm, map_location=torch.device(device)))
            except Exception:
                print(
                    "WARNING: No optimizer state_dict found! Remember to load the optimizer state_dict when retraining the model!"
                )

    def save_model(self, dir: str, ep: int) -> None:
        """Save model to file

        Args:
            dir (str): Directory to where save the model
            ep (int): episode number
        """
        torch.save(self.policy_net.state_dict(), dir + f"/DQN_{ep+1}.ai")
        torch.save(self.optimizer.state_dict(), dir + f"/DQN_{ep+1}.optm")

    def push_to_train_buffer(self, ob: np.ndarray, action: int, reward: int,
                             next_ob: np.ndarray, done: bool) -> None:
        """Push a transition to the memory train buffer

        Args:
            ob (np.ndarray): Observation/state at time t
            action (int): Action at time t
            reward (int): Reward for taking action a in state s at time t
            next_ob (np.ndarray): Observation/state at time t+1
            done (bool): Defines if the game is finished or not
        """
        # preprocess observations
        ob = self.preprocess_ob(ob)
        next_ob = self.preprocess_ob(next_ob)

        # save to buffer
        action = torch.Tensor([action]).long().to(torch.device(device))
        reward = torch.tensor([reward],
                              dtype=torch.float32).to(torch.device(device))
        self.memory.push_to_train_buffer(ob, action, next_ob, reward, done)

        # check if I need to push to memory
        if len(self.memory.train_buffer
               ) == self.memory.train_buffer_capacity or done:

            # get the buffer and transition elements to push into memory
            buffer = self.memory.train_buffer
            ob_stack = torch.stack((buffer[0].ob, buffer[1].ob, buffer[2].ob,
                                    buffer[3].ob)).to(torch.device(device))
            next_ob_stack = torch.stack((
                buffer[0].next_ob,
                buffer[1].next_ob,
                buffer[2].next_ob,
                buffer[3].next_ob,
            )).to(torch.device(device))

            # push to memory
            self.memory.push_to_memory(
                ob_stack,
                buffer[3].action,
                next_ob_stack,
                buffer[3].rew,
                buffer[3].done,
            )

            # if not done, delete the first row in the buffer
            if not done:
                self.memory.train_buffer = self.memory.train_buffer[1:]

            # if done reset everything
            if done:
                self.reset()

    def push_to_test_buffer(self, ob: np.ndarray) -> None:
        """Push a transition to the train buffer

        Args:
            ob (np.ndarray): Observation to push to the buffer
        """
        # preprocess observation and push to test buffer
        ob = self.preprocess_ob(ob)
        self.memory.push_to_test_buffer(ob)

        # if the buffer is full, drop the oldest observation
        if len(self.memory.test_buffer) == self.memory.test_buffer_capacity:
            self.memory.test_buffer = self.memory.test_buffer[1:]

    def get_stack_from_train_buffer(self, ob: np.ndarray) -> Tensor:
        """Get stack of preprocessed observations/states from train buffer

        Args:
            ob (np.ndarray): Current observation/state

        Returns:
            Tensor: Stack of preprocessed observations/states
        """
        ob = self.preprocess_ob(ob)

        # get observations from train buffer
        obs = ([x.ob for x in self.memory.train_buffer]
               if len(self.memory.train_buffer) != 0 else [ob])
        obs.append(ob)

        # complete the sequence
        while len(obs) != self.memory.train_buffer_capacity:
            obs.append(obs[-1])

        # stack observations and return them
        ob_stack = torch.stack(obs).to(torch.device(device))

        return ob_stack

    def get_stack_from_test_buffer(self, ob: np.ndarray) -> Tensor:
        """Get stack of preprocessed observations/states from test buffer

        Args:
            ob (np.ndarray): Current observation/state

        Returns:
            Tensor: Stack of preprocessed observations/states
        """
        ob = self.preprocess_ob(ob)

        # get observations from test buffer
        obs = ([x for x in self.memory.test_buffer]
               if len(self.memory.test_buffer) != 0 else [ob])
        obs.append(ob)

        # complete the sequence
        while len(obs) != self.memory.test_buffer_capacity:
            obs.append(obs[-1])

        # stack observations and return them
        ob_stack = torch.stack(obs).to(torch.device(device))

        return ob_stack

    def preprocess_ob(self, ob: np.ndarray) -> Tensor:
        """Preprocess observation:\n
        - shrink the image to 100x100\n
        - transform it to black and white\n
        - transform it into a Tensor\n

        Args:
            ob (np.ndarray): Observation to preprocess

        Returns:
            Tensor: Preprocessed observation
        """
        # shrink image
        ob = Image.fromarray(ob)
        ob = ob.resize((100, 100))
        ob = np.asarray(ob)

        # grayscale, then binarize: background pixels (the value at ob[0][0]) become 0,
        # everything else becomes 1
        ob = rgb2grayscale(ob)
        ob[ob != ob[0][0]] = 1
        ob[ob == ob[0][0]] = 0

        # Tensor definition
        ob = torch.from_numpy(ob).float().to(torch.device(device))

        return ob
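preprocess_ob relies on an rgb2grayscale helper that is not shown in this example; a plausible stand-in (an assumption, using the standard luminance weights) would be:

import numpy as np

def rgb2grayscale(img: np.ndarray) -> np.ndarray:
    # assumed helper: collapse an HxWx3 RGB frame to HxW with luminance weighting
    return np.dot(img[..., :3], [0.2989, 0.5870, 0.1140])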
Example #4
class DQNAgent(Agent):
    def __init__(self, model, env, **kwargs):
        Agent.__init__(self, **kwargs)
        self.update_step = 0
        self.eps = self.EPS_START
        self.global_step = 0
        self.model = model
        self.target_model = copy.deepcopy(model)
        self.in_size = model.in_size
        self.out_size = model.out_size
        self.memory = ReplayMemory(self.REPLAY_CAPACITY)
        self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
        self.env = env
        self.container = Container(self.model.SAVE_MODEL_NAME)

    def select_action(self, state):
        if self.is_training:
            self.global_step += 1
            self.eps = self.EPS_START - (self.EPS_START - self.EPS_END
                                         ) / self.EPS_DECAY * self.global_step
            if self.eps < self.EPS_END:
                self.eps = self.EPS_END

        if self.is_training and np.random.rand() < self.eps:
            return LongTensor([[np.random.randint(self.out_size)]])
        else:
            var = Variable(state).type(FloatTensor)
            out = self.model(var)
            return out.max(1)[1].data.view(1, 1)

    def _DQ_loss(self, y_pred, reward_batch, non_final_mask,
                 non_final_next_states):
        q_next = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor))
        target_q = self.target_model(non_final_next_states)
        if self.DOUBLE_DQN:
            max_act = self.model(non_final_next_states).max(1)[1].view(-1, 1)
            q_next[non_final_mask] = target_q.gather(1, max_act).data.view(
                target_q.gather(1, max_act).data.shape[0])
        else:
            q_next[non_final_mask] = target_q.max(1)[0].data

        y = q_next * self.GAMMA + reward_batch
        loss = nn.functional.mse_loss(y_pred, y)
        return loss

    def _calc_loss(self):
        batch = self.memory.sample(self.BATCH_SIZE)
        non_final_mask = ByteTensor(
            tuple([s is not None for s in batch.next_state]))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]))

        state_batch = Variable(
            torch.cat([s for s in batch.state if s is not None]))
        action_batch = Variable(
            torch.cat([s for s in batch.action if s is not None]))
        reward_batch = Variable(
            torch.cat([s for s in batch.reward if s is not None]))

        y_pred = self.model(state_batch).gather(1, action_batch).squeeze()
        loss = self._DQ_loss(y_pred, reward_batch, non_final_mask,
                             non_final_next_states)
        self.container.add("y_pred", torch.mean(y_pred.data))
        self.container.add("loss", loss.data.item())
        return loss

    def update_policy(self):
        loss = self._calc_loss()
        self.opt.zero_grad()
        loss.backward()
        if self.GRADIENT_CLIPPING:
            for param in self.model.parameters():
                param.grad.data.clamp_(-self.GRADIENT_CLIPPING,
                                       self.GRADIENT_CLIPPING)
        self.opt.step()

    def update_target_network(self):
        if not self.SOFT_UPDATE:
            self.update_step = (self.update_step + 1) % self.TARGET_UPDATE_FREQ
            if self.update_step == 0:
                state_dict = self.model.state_dict()
                self.target_model.load_state_dict(copy.deepcopy(state_dict))
        else:
            tw = self.target_model.state_dict().values()
            sw = self.model.state_dict().values()
            for t, s in zip(tw, sw):
                t.add_(self.TARGET_UPDATE_FREQ * (s - t))

    def _forward(self, obs, is_train, update_memory):
        if self.state_processor:
            state = self.state_processor(obs)
        else:
            temp = obs[None, :] if len(obs.shape) == 1 else obs[None, None, :]
            state = torch.from_numpy(temp).type(FloatTensor)

        if self.GET_DEMO:
            action = self.rule_processor(obs)
        else:
            action = self.select_action(state)

        act = action.numpy().squeeze()
        if self.VERBOSE:
            print("action: {}".format(act))
        action_step = self.ACTION_REPEAT
        reward = 0
        done = False
        while action_step > 0:
            action_step -= 1
            next_obs, r, done, _ = self.env.step(act)

            # CartPole reward
            # x, x_dot, theta, theta_dot = next_obs
            # r1 = (self.env.x_threshold - abs(x)) / self.env.x_threshold - 0.8
            # r2 = (self.env.theta_threshold_radians - abs(theta)) / self.env.theta_threshold_radians - 0.5
            # r = r1 + r2

            # MountainCar reward
            # position, velocity = next_obs
            # r = abs(position - (-0.5))

            reward += r
            if done:
                break

        self.reward_episode += reward
        if update_memory:
            reward = FloatTensor([reward])
            self.memory.push(state, action, reward)
            if done:
                self.memory.push(None, None, None)

        if len(self.memory) >= self.REPLAY_START and is_train:
            self.update_policy()
            self.update_target_network()

        if self.is_render:
            self.env.render()

        return next_obs, done

    def fit(self,
            is_train,
            update_memory=True,
            num_step=np.inf,
            num_episode=np.inf,
            max_episode_length=np.inf,
            is_render=False):
        if num_step == np.inf and num_episode == np.inf:
            raise Exception("Specify either num_step or num_episode")
        if num_step != np.inf and num_episode != np.inf:
            raise Exception("Specify only one of num_step and num_episode")

        self.is_render = is_render
        while self.i_episode < num_episode and self.i_step < num_step:
            self.i_episode += 1
            print("------------------------")
            print("episode: {}, step: {}".format(self.i_episode, self.i_step))
            obs = self.env.reset()
            self.reward_episode = 0
            episode_step = 0
            while episode_step < max_episode_length:
                episode_step += 1
                self.i_step += 1
                obs, done = self._forward(obs, is_train, update_memory)
                if done:
                    self.reward_step_pairs.push(self.reward_episode,
                                                self.i_step)
                    if self.is_test:
                        self.container.add("reward", self.reward_episode,
                                           self.record_i_step)
                    self.print(is_train)
                    break

    def train(self, **kwargs):
        self.is_training = True
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Training starts...")
        self.fit(True, **kwargs)
        # self.model.save()
        self.container.save()

    def run(self, **kwargs):
        self.is_training = False
        if kwargs.pop("clear", True):
            self.i_episode = 0
            self.i_step = 0
            self.reward_step_pairs.reset()
        print("Running starts...")
        self.fit(False, **kwargs)

    def _test(self, num_step):
        self.record_i_episode = self.i_episode
        self.record_i_step = self.i_step
        self.is_test = True
        self.run(num_step=num_step)
        self.i_episode = self.record_i_episode
        self.i_step = self.record_i_step
        self.is_test = False

    def train_test(self, num_step, test_period=1000, test_step=100):
        self.i_episode = 0
        self.i_step = 0
        while self.i_step < num_step:
            self._test(test_step)
            self.train(num_step=self.record_i_step + test_period, clear=False)
        self._test(test_step)

    def print(self, is_train):
        print("reward_episode {}".format(self.reward_episode))
        print("eps {}".format(self.eps))
        if is_train:
            print("loss_episode {}".format(self.container.get("loss")))
            print("y_pred_episode {}".format(self.container.get("y_pred")))
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        """ Test two different features for state representations
        """
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1a: TODO: Use (s, abs(s)) as features # handcrafted feature vector: s = [1, -2, 3, -4], then (s, abs(s)) = [1, -2, 3, -4, 1, 2, 3, 4] (see slack discussion)
        #return np.concatenate((state, abs(state)), axis=1)
        # Task 1b: RBF features # radial basis function representations
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current and next state
        # Task 1: TODO: Set the feature state and feature next state
        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        predictions = []
        for q_func in self.q_functions:  # one function approximator per action
            predictions.append(
                q_func.predict(featurized_next_state)
            )  # prediction from each per-action approximator
        next_qs = np.max(predictions)  # take the highest predicted value

        # Calculate the updated target Q- values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:  # terminal state
            target = [reward + self.gamma * 0]
        else:  # not terminal state
            target = [reward + self.gamma * next_qs]

        # Update Q-value estimation
        self.q_functions[action].partial_fit(
            featurized_state,
            target)  # partial_fit() for mini-batch learning (see sklearn docs)

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(
                self.batch_size
            )  # random sample of batch_size transitions

        # Task 2: TODO: Reformat data in the minibatch
        states = np.array(
            [sample.state for sample in samples]
        )  # gather the states stored in the sampled transitions
        action = np.array([
            sample.action for sample in samples
        ])  # one action per sampled transition (batch_size entries)
        next_states = np.array([sample.next_state for sample in samples])
        rewards = np.array([sample.reward for sample in samples])
        dones = np.array([sample.done for sample in samples])

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        # we need to do the same for next_qs as in single_update but for every sample in the batch
        next_qs = []  # one max-Q estimate per sample in the batch
        for s in featurized_next_states:
            arr = np.array([q.predict([s]) for q in self.q_functions])
            next_qs.append(np.max(arr))
        next_qs = np.array(next_qs)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * (1 - dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)

        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a not present in the batch, skip and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]

                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
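The observation limits in initialize_model suggest CartPole; a rough episode loop for this RBF agent (a sketch assuming the classic gym API, that the Transition namedtuple stores fields in the order (state, action, next_state, reward, done), and an illustrative epsilon schedule) could look like:

import gym

env = gym.make("CartPole-v0")
agent = Agent(num_actions=env.action_space.n)

for episode in range(200):
    state, done = env.reset(), False
    epsilon = max(0.05, 0.5 * 0.99 ** episode)   # illustrative decay only
    while not done:
        action = agent.get_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        agent.store_transition(state, action, next_state, reward, done)
        state = next_state
    agent.update_estimator()                     # one minibatch fit per episode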
Example #6
class Agent:
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='dqn_network_')
        self.target_net = GenericNetwork(state_space,
                                         n_actions,
                                         hidden_size,
                                         name='target_dqn_network_')
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.action = {}
        self.j = 0

    def learn(self):
        """
        Learning function
        :return:
        """
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8)

        # resample if every transition in the batch is terminal (the mask would be all zeros)
        test_tensor = T.zeros(self.batch_size)
        while T.all(T.eq(test_tensor, non_final_mask)).item():
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = 1 - T.tensor(batch.done, dtype=T.uint8)

        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = T.stack(non_final_next_states)
        state_batch = T.stack(batch.state)
        action_batch = T.cat(batch.action)
        reward_batch = T.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = T.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # Compute mse loss
        loss = F.mse_loss(state_action_values.squeeze(),
                          expected_state_action_values)
        # Optimize the model
        self.policy_net.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.policy_net.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        """
        Used to select actions
        :param state:
        :param epsilon:
        :return: action
        """
        sample = random.random()
        if sample > epsilon:
            with T.no_grad():
                state = T.from_numpy(state).float()
                q_values = self.policy_net(state)
                self.action[self.j] = {
                    'list_of_actions': q_values,
                    'max': T.argmax(q_values).item()
                }
                self.j += 1
                return T.argmax(q_values).item() + 1
        else:
            action = random.randrange(self.n_actions)
            return action + 1

    def update_target_network(self):
        """
        Used to update target networks
        :return:
        """
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, reward, next_state, done):
        """
        Used for memory replay purposes
        :param state:
        :param action:
        :param reward:
        :param next_state:
        :param done:
        :return:
        """
        action = T.Tensor([[action]]).long()
        reward = T.tensor([reward], dtype=T.float32)
        next_state = T.from_numpy(next_state).float()
        state = T.from_numpy(state).float()
        self.memory.push(state, action, reward, next_state, done)

    def save_models(self):
        """
        Used to save models
        :return:
        """
        self.policy_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        """
        Used to load models
        :return:
        """
        self.policy_net.load_checkpoint()
        self.target_net.load_checkpoint()
Example #7
class DQNagent(object):
    def __init__(self, filename='dqn0'):
        self.filename = './trained_agents/' + filename
        self.policy_net = DQN(self.filename + '.cfg')
        self.target_net = DQN(self.filename + '.cfg')
        self.memory = ReplayMemory(16384)
        self.gamma = 0.999

    def select_action(self, state, epsilon):
        if np.random.rand() < epsilon:
            idx = LongTensor([[random.randrange(self.policy_net.output_size)]])
        else:
            idx = self.policy_net(
                Variable(state,
                         volatile=True).type(FloatTensor)).data.max(1)[1].view(
                             1, 1)
        return idx

    def update(self, batch_size=16):
        if len(self.memory.memory) < batch_size:
            batch_size = len(self.memory.memory)

        transitions = self.memory.sample(batch_size)
        batch = Transition(*zip(*transitions))

        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]),
                                         volatile=True)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = Variable(torch.zeros(batch_size).type(Tensor))
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0]

        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        expected_state_action_values = Variable(
            expected_state_action_values.data)

        loss = F.mse_loss(state_action_values, expected_state_action_values)

        old_params = freeze_as_np_dict(self.policy_net.state_dict())
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            logging.debug(param.grad.data.sum())
            param.grad.data.clamp_(-1., 1.)
        self.optimizer.step()

        new_params = freeze_as_np_dict(self.policy_net.state_dict())
        check_params_changed(old_params, new_params)
        return loss.data[0]

    def train(self,
              env,
              n_epochs=30,
              epsilon_init=1.,
              epsilon_schedule='exp',
              eps_decay=None,
              lr=0.001,
              batch_size=32):
        if epsilon_schedule == 'linear':
            eps_range = np.linspace(epsilon_init, 0., n_epochs)
        elif epsilon_schedule == 'constant':
            eps_range = [epsilon_init for _ in range(n_epochs)]
        elif epsilon_schedule == 'exp':
            if not eps_decay:
                eps_decay = n_epochs // 4
            eps_range = [
                epsilon_init * math.exp(-1. * i / eps_decay)
                for i in range(n_epochs)
            ]

        history_file = open(self.filename + 'history', mode='a+')
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)

        losses, rewards, change_history = [], [], []

        for epoch in range(n_epochs):
            env.reset()
            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen
            done = False
            epoch_losses = []
            epoch_rewards = []
            video = []

            while not done:
                if epoch % 10 == 1:
                    video.append(last_screen)
                action = self.select_action(state, eps_range[epoch])

                _, reward, done, _ = env.step(action[0, 0])

                last_screen = current_screen
                current_screen = get_screen(env)

                reward = Tensor([reward])
                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                self.memory.push(state, action, next_state, reward)
                state = next_state
                loss = self.update(batch_size=batch_size)

                epoch_losses.append(loss)
                epoch_rewards.append(reward)

            history_file.write(
                'Epoch {}: loss= {}, reward= {}, duration= {}\n'.format(
                    epoch, np.mean(epoch_losses), np.sum(epoch_rewards),
                    len(epoch_rewards)))

            losses.append(np.mean(epoch_losses))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 10 == 1:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                self.save(ext=str(epoch))
                self.make_video(video, ext='_train_' + str(epoch))

                with open(self.filename + '.train_losses', 'a+') as f:
                    for l in losses:
                        f.write(str(l) + '\n')
                losses = []
                with open(self.filename + '.train_rewards', 'a+') as f:
                    for r in rewards:
                        f.write(str(r) + '\n')
                rewards = []
        self.save()

    def test(self, env, n_epochs=30, verbose=False):
        rewards = []
        self.policy_net = self.policy_net.cuda()
        self.target_net = self.target_net.cuda()
        self.target_net.eval()

        for epoch in range(n_epochs):
            env.reset()
            done = False
            epoch_rewards = []
            video = []

            last_screen = get_screen(env)
            current_screen = get_screen(env)
            state = current_screen - last_screen

            while not done:
                if epoch % 5 == 0:
                    video.append(last_screen)
                action = self.select_action(state, 0.)

                _, reward, done, _ = env.step(action[0, 0])
                last_screen = current_screen
                current_screen = get_screen(env)

                if not done:
                    next_state = current_screen - last_screen
                else:
                    next_state = None

                epoch_rewards.append(reward)
                reward = Tensor([reward])
                state = next_state

                logging.debug(
                    'Test epoch {} :  reward= {}, duration= {}'.format(
                        epoch, np.sum(epoch_rewards), len(epoch_rewards)))
            rewards.append(np.sum(epoch_rewards))

            if epoch % 5 == 0:
                self.make_video(video, ext='_test_' + str(epoch))

            logging.info('Performance estimate : {} pm {}'.format(
                np.mean(rewards), np.std(rewards)))

    def make_video(self, replay, ext=''):
        n_frames = len(replay)
        b_s, n_channels, n_w, n_h = replay[0].shape
        writer = VideoWriter(self.filename + ext + '.mp4')
        for i in range(n_frames):
            writer.writeFrame(replay[i][0][[1, 2, 0]] * 255)
        writer.close()

    def save(self, ext=''):
        torch.save(self.policy_net.state_dict(),
                   self.filename + ext + '.pol.ckpt')
        torch.save(self.target_net.state_dict(),
                   self.filename + ext + '.tgt.ckpt')

    def load(self, filename):
        self.policy_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.pol.ckpt'))
        self.target_net.load_state_dict(
            torch.load('./trained_agents/' + filename + '.tgt.ckpt'))
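get_screen is not defined in this example; a minimal stand-in (an assumption, following the common pattern of rendering the frame, scaling it to [0, 1] and downsizing it; the 40x40 target size is illustrative) might be:

import numpy as np
import torch
import torch.nn.functional as F

def get_screen(env):
    # assumed helper: return the rendered frame as a (1, C, H, W) float tensor in [0, 1]
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))       # HWC -> CHW
    screen = torch.from_numpy(np.ascontiguousarray(screen)).float() / 255.0
    return F.interpolate(screen.unsqueeze(0), size=(40, 40),
                         mode='bilinear', align_corners=False)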
Example #8
class DDPG_Agent:
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer

    def get_action(self, state):
        if self.args.use_ounoise:
            noise = self.noise.sample()[0]
        else:
            noise = np.random.normal(
                0, self.epsilon_scheduler.value(self.n_steps))
        st = torch.from_numpy(state).view(1, -1).float()
        action = self.policy(st)
        action_with_noise = np.clip(action.item() + noise, self.alow,
                                    self.ahigh)
        if self.args.use_writer:
            self.writer.add_scalar("action mean", action.item(), self.n_steps)
            self.writer.add_scalar("action noise", noise, self.n_steps)
            self.writer.add_scalar("epsilon",
                                   self.epsilon_scheduler.value(self.n_steps),
                                   self.n_steps)
            self.writer.add_scalar("action", action_with_noise, self.n_steps)
        self.n_steps += 1
        return action_with_noise

    def store_transition(self, state, action, reward, next_state, done):

        self.memory.push(torch.from_numpy(state), torch.tensor(action),
                         torch.tensor(reward), torch.from_numpy(next_state),
                         torch.tensor(done))

    def reset(self):
        self.noise.reset()

    def train(self):
        batch = self.memory.sample(min(BATCH_SIZE, len(self.memory)))
        b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))]
        states, actions, rewards, next_states, dones = \
            b_dict[0], b_dict[1].view(-1, 1), \
            b_dict[2].view(-1, 1).float().to(device), b_dict[3], \
            b_dict[4].view(-1, 1).float().to(device)

        #  CRITIC LOSS: minimize (Q(s, a) - (r + gamma * Q'(s', pi'(s'))))^2
        # inputs computation
        inputs_critic = self.qnet(states, actions)
        # targets (no gradient flows through the target networks)
        with torch.no_grad():
            policy_acts = self.policy_targ(next_states)
            targ_values = self.qnet_targ(next_states, policy_acts)
        targets_critics = rewards + GAMMA * (1 - dones) * targ_values
        loss_critic = self.MSE_loss(inputs_critic, targets_critics)
        self.q_optimizer.zero_grad()
        loss_critic.backward()
        # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP)
        self.q_optimizer.step()

        # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø
        actor_loss = -self.qnet(states, self.policy(states)).mean()
        self.p_optimizer.zero_grad()
        actor_loss.backward()
        # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP)
        self.p_optimizer.step()
        soft_update(self.policy_targ, self.policy, TAU)
        soft_update(self.qnet_targ, self.qnet, TAU)
        if self.args.use_writer:
            self.writer.add_scalar("critic_loss", loss_critic.item(),
                                   self.n_updates)
            self.writer.add_scalar("actor_loss", actor_loss.item(),
                                   self.n_updates)
        self.n_updates += 1
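hard_update and soft_update are not shown in this example; typical Polyak-style implementations (sketched here as an assumption, with TAU as the interpolation factor) look like:

import torch

def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # copy the source parameters into the target network verbatim
    target.load_state_dict(source.state_dict())

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- (1 - tau) * target + tau * source
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)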
Example #9
class Agent:
    """DQN Agent class for training a OpenAI-gym environment
    """
    def __init__(self,
                 learning_rate,
                 gamma,
                 state_shape,
                 actions,
                 batch_size,
                 epsilon_initial=0.9,
                 epsilon_decay=1e-3,
                 epsilon_final=0.01,
                 replay_buffer_capacity=1000000,
                 model_name='dqn_model.h5',
                 model_dir='models/dqn_model',
                 ckpt_dir='models/dqn_model/checkpoints',
                 log_dir='logs'):
        """Initialize DQN agent

        Args:
            learning_rate (float): Optimizer learning rate
            gamma (float): Discount factor in Bellman equation
            state_shape (np.shape): Shape of state space of the environment
            actions (int): Number of actions
            batch_size (int): Size of batch from which agent would learn
            epsilon_initial (float): Initial value of epsilon
            epsilon_decay (float): Decay rate of epsilon
            epsilon_final (float): Final value of epsilon after complete decay
            replay_buffer_capacity (int): Maximum size of experience replay
                                          buffer
            model_name (str): Name of the model file to save/load
            model_dir (str): Directory in which model file is stored
            ckpt_dir (str): Model Checkpoint directory
            log_dir (str): Directory where tensorflow logs are stored
        """
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.actions = actions
        self.batch_size = batch_size
        self.epsilon = epsilon_initial
        self.epsilon_decay = epsilon_decay
        self.epsilon_final = epsilon_final
        self.buffer = ReplayMemory(replay_buffer_capacity, state_shape)
        self.q_network = self._get_model()

        self.model_file = f'{model_dir}/{model_name}'
        self.checkpoint_dir = ckpt_dir

    def select_action(self, state):
        """Select action according to epsilon greedy policy
        
        Args:
            state (list|np.array): Current state of the environment
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(range(self.actions))
        else:
            return np.argmax(self.q_network.predict(np.array([state])))

    def train(self):
        """Optimize the model for the current batch"""
        if self.buffer.current_size >= self.batch_size:
            states, actions, rewards, next_states, dones = self.buffer.sample(
                self.batch_size)

            q_target = np.copy(self.q_network.predict(states))  # Q*(s,a)
            q_values_next = self.q_network.predict(next_states)  # Q*(s',a')

            batch = np.arange(self.batch_size, dtype=np.int32)

            # Bellman update; `dones` is assumed to hold the continuation mask
            # (0 for terminal transitions) as stored by ReplayMemory
            q_target[batch, actions] = rewards + (
                self.gamma * np.max(q_values_next, axis=1) * dones)

            # Train using fixed q-targets
            self.q_network.train_on_batch(states, q_target)

            # Update epsilon
            if self.epsilon > self.epsilon_final:
                self.epsilon -= self.epsilon_decay
            else:
                self.epsilon = self.epsilon_final

    def store_experience(self, state, action, reward, next_state, done):
        """Store tuple <s, a, r, s', done> to the buffer"""
        self.buffer.store(state, action, reward, next_state, done)

    def save_model(self):
        self.q_network.save(self.model_file)

    def load_model(self):
        self.q_network = keras.models.load_model(self.model_file)

    def save_checkpoint(self, id):
        self.q_network.save(f'{self.checkpoint_dir}/{id}.h5')

    def load_checkpoint(self, id):
        self.q_network = keras.models.load_model(
            f'{self.checkpoint_dir}/{id}.h5')

    def _get_model(self):
        # two fully-connected hidden layers plus a linear output layer
        model = keras.Sequential([
            keras.layers.Dense(256, activation='relu'),
            keras.layers.Dense(256, activation='relu'),
            keras.layers.Dense(self.actions, activation=None)
        ])

        # Use Adam optimizer
        optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss='mean_squared_error')

        return model
class Agent(nn.Module):
    def __init__(self, q_models, target_model, hyperbolic, k, gamma,
                 model_params, replay_buffer_size, batch_size, inp_dim, lr):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = torch.nn.ModuleList(q_models)
            self.target_models = torch.nn.ModuleList(target_model)
        else:
            self.q_models = q_models
            self.target_models = target_model
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        self.gamma = gamma
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    @staticmethod
    def get_hyperbolic_train_coeffs(k, num_models):
        coeffs = []
        gamma_intervals = np.linspace(0, 1, num_models + 2)
        for i in range(1, num_models + 1):
            coeffs.append(((gamma_intervals[i + 1] - gamma_intervals[i]) *
                           (1 / k) * gamma_intervals[i]**((1 / k) - 1)))
        return torch.tensor(coeffs) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05):
        model_outputs = []
        take_random_action = random.random()
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if take_random_action < epsilon:
            return random.randrange(self.n_actions)
        elif self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32).view(
                                               -1, self.inp_dim)
                for ind, mdl in enumerate(self.q_models):
                    model_outputs.append(mdl(state_batch))
                coeff = self.get_hyperbolic_train_coeffs(
                    self.k, len(self.q_models))
                model_outputs = torch.cat(model_outputs, 1).reshape(
                    -1, len(self.q_models))
                model_outputs = (model_outputs * coeff).sum(dim=1)
                return torch.argmax(model_outputs).item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            model_outputs = []
            for ind, mdl in enumerate(self.q_models):
                model_outputs.append(mdl(state_batch).gather(1, action_batch))
            model_outputs = torch.cat(model_outputs,
                                      1).reshape(-1, len(self.q_models))
            coeffs = self.get_hyperbolic_train_coeffs(self.k,
                                                      len(self.q_models))
            model_outputs = model_outputs * coeffs
            return model_outputs.sum(dim=1).reshape(-1, 1)
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            target_outputs = []
            # Each target head bootstraps with its own discount factor gamma_i.
            gammas = torch.tensor(np.linspace(0, 1,
                                              len(self.q_models) + 1),
                                  dtype=torch.float)[1:]
            for ind, mdl in enumerate(self.target_models):
                next_state_values = torch.zeros(self.batch_size)
                next_state_values[non_final_mask] = mdl(
                    non_final_next_states).max(1)[0].detach()
                target_outputs.append(next_state_values)
            # Stack along dim=1 so rows index batch elements and columns index models.
            target_outputs = torch.stack(target_outputs, dim=1)
            target_outputs = target_outputs * gammas
            return target_outputs

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch)
        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        state_action_values = state_action_values.view(-1, 1).repeat(
            1, len(self.q_models))
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        expected_state_action_values = next_state_values + reward_batch.view(
            -1, 1).repeat(1, len(self.q_models))
        loss = (state_action_values - expected_state_action_values)**2
        coefs = self.get_hyperbolic_train_coeffs(self.k, len(self.q_models))
        loss = torch.sum(loss * coefs)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
class ActorCritic:
    def __init__(self,
                 sess,
                 training_steps=5000000,
                 learning_rate=0.0001,
                 momentum=0.95,
                 memory_size=100000,
                 discount_rate=0.95,
                 eps_min=0.05):
        self.activation = tf.nn.relu
        self.optimizer = tf.train.MomentumOptimizer
        self.learning_rate = learning_rate
        self.momentum = momentum

        self._build_graph()

        self.memory_size = memory_size
        self.memory = ReplayMemory(self.memory_size)
        '''
        The discount rate controls how strongly future rewards are weighted when evaluating
        the value of a given action.
        A value of 0 means the agent only considers the immediate reward, and a value close to 1
        means the agent also weights rewards far in the future.
        '''
        self.discount_rate = discount_rate
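        # For intuition: an exponential discount gives an effective planning horizon of
        # roughly 1 / (1 - discount_rate) steps, e.g. about 20 steps for 0.95 and about
        # 100 steps for 0.99.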

        self.eps_min = eps_min
        self.eps_decay_steps = int(training_steps / 2)

        self.sess = sess
        self.init = tf.global_variables_initializer()

    def cnn_model(self, X_state, name):
        """
        Creates a CNN network with two convolutional layers followed by two fully connected layers.
        
        :param X_state: Placeholder for the state of the game
        :param name: Name of the network (actor or critic)
        :return: The output (logits) layer and the trainable variables
        """

        initializer = tf.contrib.layers.variance_scaling_initializer()

        conv1_fmaps = 32
        conv1_ksize = 8
        conv1_stride = 2
        conv1_pad = 'SAME'

        conv2_fmaps = 64
        conv2_ksize = 4
        conv2_stride = 2
        conv2_pad = 'SAME'

        n_fc1 = 256

        with tf.variable_scope(name) as scope:

            conv1 = tf.layers.conv2d(X_state,
                                     filters=conv1_fmaps,
                                     kernel_size=conv1_ksize,
                                     activation=self.activation,
                                     strides=conv1_stride,
                                     padding=conv1_pad,
                                     name='conv1')

            conv2 = tf.layers.conv2d(conv1,
                                     filters=conv2_fmaps,
                                     kernel_size=conv2_ksize,
                                     activation=self.activation,
                                     strides=conv2_stride,
                                     padding=conv2_pad,
                                     name='conv2')

            conv2_flat = tf.reshape(conv2, shape=[-1, conv2_fmaps * 5 * 5])

            fc1 = tf.layers.dense(conv2_flat,
                                  n_fc1,
                                  activation=self.activation,
                                  name='fc1',
                                  kernel_initializer=initializer)

            logits = tf.layers.dense(fc1,
                                     N_OUTPUTS,
                                     kernel_initializer=initializer)

            trainable_vars = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)

            trainable_vars_by_name = {
                var.name[len(scope.name):]: var
                for var in trainable_vars
            }
        return logits, trainable_vars_by_name

    def _build_graph(self):
        """
        Creates the Tensorflow graph of the CNN network.
        Two networks will be used, one for the actor, and one for the critic.
        """

        X_state = tf.placeholder(tf.float32, shape=[None, 20, 20, CHANNELS])
        actor_q_values, actor_vars = self.cnn_model(X_state, name="actor")
        critic_q_values, critic_vars = self.cnn_model(X_state, name="critic")

        with tf.variable_scope("train"):
            X_action = tf.placeholder(tf.int32, shape=[None])
            y = tf.placeholder(tf.float32, shape=[None, 1])
            '''A one-hot vector (tf.one_hot) is used to keep only the Q-value corresponding to the action stored in memory.
            Multiplying the one-hot vector with actor_q_values zeroes out all of the Q-values except
            the one corresponding to the memorized action. Summing along the first axis (axis=1)
            then yields the desired Q-value prediction for each memory.
            '''
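            # Worked example (hypothetical numbers): with N_OUTPUTS = 3, X_action = 2 and
            # actor_q_values = [1.2, 0.4, 2.0], the one-hot vector is [0, 0, 1], the
            # product is [0.0, 0.0, 2.0], and the sum over axis=1 yields 2.0, the Q-value
            # of the memorized action.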
            q_value = tf.reduce_sum(actor_q_values *
                                    tf.one_hot(X_action, N_OUTPUTS),
                                    axis=1,
                                    keep_dims=True)
            error = tf.abs(y - q_value)
            loss = tf.reduce_mean(clipped_error(error))
            global_step = tf.Variable(0, trainable=False,
                                      name='global_step')  # iteration step
            optimizer = self.optimizer(self.learning_rate,
                                       self.momentum,
                                       use_nesterov=True)
            training_op = optimizer.minimize(loss, global_step=global_step)

        self.saver = tf.train.Saver()
        self.X_state = X_state
        self.X_action = X_action
        self.y = y
        self.training_op = training_op
        self.loss = loss
        self.actor_q_values, self.actor_vars = actor_q_values, actor_vars
        self.critic_q_values, self.critic_vars = critic_q_values, critic_vars
        self.global_step = global_step

        with tf.variable_scope('summary'):
            self.loss_summary = tf.summary.scalar('loss', loss)
            self.mean_score = tf.placeholder(tf.float32, None)
            self.score_summary = tf.summary.scalar('mean score',
                                                   self.mean_score)
            self.summary_merged = tf.summary.merge(
                [self.loss_summary, self.score_summary])

    def start(self, checkpoint_path):
        """
        Initialize the model, or restore it if a checkpoint already exists.
        
        :return: The iteration at which training should start
        """
        if os.path.isfile(checkpoint_path + '.index'):
            self.saver.restore(self.sess, checkpoint_path)
            training_start = 1
            print('Restoring model...')
        else:
            # Make the model warm up before training
            training_start = 10000
            self.init.run()
            self.make_copy().run()
            print('New model...')
        return training_start

    def train(self, checkpoint_path, file_writer, mean_score):
        """
        Trains the agent and regularly writes a training summary.

        :param checkpoint_path: The path where the model will be saved
        :param file_writer: The file where the training summary will be written for Tensorboard visualization
        :param mean_score: The mean game score
        """
        copy_steps = 5000
        save_steps = 2000
        summary_steps = 500

        cur_states, actions, rewards, next_states, dones = self.sample_memories(
        )

        next_q_values = self.critic_q_values.eval(
            feed_dict={self.X_state: next_states})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
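        # The targets below follow the standard one-step backup, assuming `dones` is a
        # 0/1 flag with 1 at terminal transitions so the bootstrap term is zeroed there:
        #   y = r + (1 - done) * gamma * max_a' Q_critic(s', a')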
        y_vals = rewards + (1 - dones) * self.discount_rate * max_next_q_values
        _, loss_val = self.sess.run([self.training_op, self.loss],
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals
                                    })

        step = self.global_step.eval()

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            self.make_copy().run()

        # Save the model regularly
        if step % save_steps == 0:
            self.saver.save(self.sess, checkpoint_path)

        # Write the training summary regularly
        if step % summary_steps == 0:
            summary = self.sess.run(self.summary_merged,
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals,
                                        self.mean_score: mean_score
                                    })

            file_writer.add_summary(summary, step)

    def predict(self, cur_state):
        """
        Makes the actor predict q-values based on the current state of the game.
        
        :param cur_state: Current state of the game
        :return: The Q-values predicted by the actor
        """
        q_values = self.actor_q_values.eval(
            feed_dict={self.X_state: [cur_state]})
        return q_values

    def remember(self, cur_state, action, reward, new_state, done):
        self.memory.append([cur_state, action, reward, new_state, done])

    def act(self, cur_state, step):
        """
        :param cur_state: Current state of the game
        :param step: Training step
        :return: Action selected by the agent
        """
        eps_max = 1.0
        epsilon = max(
            self.eps_min, eps_max -
            (eps_max - self.eps_min) * 2 * step / self.eps_decay_steps)
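        # Linear schedule: because of the factor 2 and eps_decay_steps = training_steps / 2,
        # epsilon reaches eps_min roughly a quarter of the way through training (with the
        # default settings) and stays there afterwards.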
        if np.random.rand() < epsilon:
            return np.random.randint(N_OUTPUTS), epsilon  # Random action
        else:
            q_values = self.predict(cur_state)
            return np.argmax(q_values), epsilon  # Optimal action

    def make_copy(self):
        """
        Copies the trainable variables from the online (actor) network to the target (critic) network.
        Credit goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        
        :return: A TensorFlow op that copies the online variables to the target variables
        """
        copy_ops = [
            target_var.assign(self.actor_vars[var_name])
            for var_name, target_var in self.critic_vars.items()
        ]
        copy_online_to_target = tf.group(*copy_ops)
        return copy_online_to_target

    def sample_memories(self, batch_size=32):
        """
        Extracts memories from the agent's memory.
        Credit goes to https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        
        :param batch_size: Size of the batch that we extract from the memory
        :return: State, action, reward, next_state, and done values as np.arrays
        """
        cols = [[], [], [], [], []]  # state, action, reward, next_state, done
        for memory in self.memory.sample(batch_size):
            for col, value in zip(cols, memory):
                col.append(value)
        cols = [np.array(col) for col in cols]
        return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3],
                cols[4].reshape(-1, 1))
class Model:
    def __init__(self, device, state_size, action_size, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])

        self.state_size = state_size
        self.action_size = action_size

        self.critic = Critic(self.state_size, self.action_size, self.device,
                             self.config)
        self.actor = Actor(self.state_size, self.action_size, self.device,
                           self.config)

    def select_action(self, state):
        action = self.actor.select_action(state)
        return action

    def optimize(self):

        if len(self.memory) < self.config["BATCH_SIZE"]:
            return None, None

        transitions = self.memory.sample(self.config["BATCH_SIZE"])
        batch = list(zip(*transitions))

        # Divide memory into different tensors
        states = torch.FloatTensor(batch[0]).to(self.device)
        actions = torch.FloatTensor(batch[1]).to(self.device)
        rewards = torch.FloatTensor(batch[2]).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(batch[3]).to(self.device)
        done = torch.FloatTensor(batch[4]).unsqueeze(1).to(self.device)

        # Compute Q(s,a) using critic network
        current_Q = self.critic(states, actions)

        # Compute deterministic next state action using actor target network
        next_actions = self.actor.target(next_states)

        # Compute next state values at t+1 using target critic network
        target_Q = self.critic.target(next_states, next_actions).detach()
        # Compute expected state action values y[i]= r[i] + Q'(s[i+1], a[i+1])
        target_Q = rewards + done * self.config["GAMMA"] * target_Q
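        # Note: this form assumes the replay buffer stores `done` already inverted
        # (1 for non-terminal, 0 for terminal transitions); with the usual 0/1 terminal
        # flag the bootstrap term would instead be (1 - done) * GAMMA * target_Q.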

        # Critic loss by mean squared error
        loss_critic = F.mse_loss(current_Q, target_Q)

        # Optimize the critic network
        self.critic.update(loss_critic)

        # Optimize actor
        loss_actor = -self.critic(states, self.actor(states)).mean()
        self.actor.update(loss_actor)

        # Soft parameter update
        update_targets(self.critic.target_nn, self.critic.nn,
                       self.config["TAU"])
        update_targets(self.actor.target_nn, self.actor.nn, self.config["TAU"])
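        # update_targets presumably performs the usual Polyak (soft) update,
        #   theta_target <- TAU * theta + (1 - TAU) * theta_target,
        # so the target networks track the online networks slowly.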

        return loss_actor.item(), loss_critic.item()

    def evaluate(self, environement, n_ep=10):
        rewards = []
        try:
            for i in range(n_ep):
                print('Episode number', i + 1, 'out of', n_ep,
                      'keep waiting...')
                state = environement.reset()
                reward = 0
                done = False
                steps = 0
                while not done and steps < self.config["MAX_STEPS"]:
                    action = self.select_action(state)
                    state, r, done = environement.step(action)
                    reward += r
                    steps += 1
                rewards.append(reward)
                print('Episode reward:', reward)
        except KeyboardInterrupt:
            pass
        if rewards:
            score = sum(rewards) / len(rewards)
        else:
            score = 0

        return score

    def save(self):
        self.actor.save(self.folder)
        self.critic.save(self.folder)

    def load(self):
        try:
            self.actor.load(self.folder)
            self.critic.load(self.folder)
        except FileNotFoundError:
            raise Exception("No model has been saved!") from None
class Agent:
    """Definition of the Agent that will interact with the environment.

    Attributes:
        REPLAY_MEM_SIZE (:obj:`int`): max capacity of Replay Memory

        BATCH_SIZE (:obj:`int`): Batch size. Default is 40 as specified in the paper.

        GAMMA (:obj:`float`): The discount, should be a constant between 0 and 1
            that ensures the sum converges. It also controls the importance of future
            expected reward.

        EPS_START(:obj:`float`): initial value for epsilon of the e-greedy action
            selection

        EPS_END(:obj:`float`): final value for epsilon of the e-greedy action
            selection

        LEARNING_RATE(:obj:`float`): learning rate of the optimizer
            (Adam)

        INPUT_DIM (:obj:`int`): input dimensionality, without considering batch size.

        HIDDEN_DIM (:obj:`int`): hidden layer dimensionality (for Linear models only)

        ACTION_NUMBER (:obj:`int`): dimensionality of the output layer of the Q network

        TARGET_UPDATE (:obj:`int`): period of Q target network updates

        MODEL (:obj:`string`): type of the model.

        DOUBLE (:obj:`bool`): Type of Q function computation.
    """
    def __init__(self,
                 REPLAY_MEM_SIZE=10000,
                 BATCH_SIZE=40,
                 GAMMA=0.98,
                 EPS_START=1,
                 EPS_END=0.12,
                 EPS_STEPS=300,
                 LEARNING_RATE=0.001,
                 INPUT_DIM=24,
                 HIDDEN_DIM=120,
                 ACTION_NUMBER=3,
                 TARGET_UPDATE=10,
                 MODEL='ddqn',
                 DOUBLE=True):

        self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_STEPS = EPS_STEPS
        self.LEARNING_RATE = LEARNING_RATE
        self.INPUT_DIM = INPUT_DIM
        self.HIDDEN_DIM = HIDDEN_DIM
        self.ACTION_NUMBER = ACTION_NUMBER
        self.TARGET_UPDATE = TARGET_UPDATE
        self.MODEL = MODEL  # deep Q network (dqn) or dueling deep Q network (ddqn)
        self.DOUBLE = DOUBLE  # whether to use the 'Double' DQN update (regularization)
        self.TRAINING = True  # avoid picking random actions during testing
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("Agent is using device:\t" + str(self.device))
        '''elif self.MODEL == 'lin_ddqn':
            self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'lin_dqn':
            self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        '''

        if self.MODEL == 'ddqn':
            self.policy_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'dqn':
            self.policy_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.LEARNING_RATE)
        self.memory = ReplayMemory(self.REPLAY_MEM_SIZE)
        self.steps_done = 0
        self.training_cumulative_reward = []

    def select_action(self, state):
        """ the epsilon-greedy action selection"""
        state = state.unsqueeze(0).unsqueeze(1)
        sample = random.random()
        if self.TRAINING:
            if self.steps_done > self.EPS_STEPS:
                eps_threshold = self.EPS_END
            else:
                eps_threshold = self.EPS_START
        else:
            eps_threshold = self.EPS_END

        self.steps_done += 1
        # [Exploitation] pick the best action according to current Q approx.
        if sample > eps_threshold:
            with torch.no_grad():
                # Return the number of the action with highest non normalized probability
                # TODO: decide if diverge from paper and normalize probabilities with
                # softmax or at least compare the architectures
                return torch.tensor([self.policy_net(state).argmax()],
                                    device=self.device,
                                    dtype=torch.long)

        # [Exploration]  pick a random action from the action space
        else:
            return torch.tensor([random.randrange(self.ACTION_NUMBER)],
                                device=self.device,
                                dtype=torch.long)

    def optimize_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which of the sampled states are non-final
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        next_state_values = next_state_values.view(self.BATCH_SIZE, -1)

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch
        # print("expected_state_action_values.shape:\t%s"%str(expected_state_action_values.shape))

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values
                          )  # expected_state_action_values.unsqueeze(1)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def optimize_double_dqn_model(self):
        if len(self.memory) < self.BATCH_SIZE:
            # return without doing anything if there is not enough data to sample
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        # Transition is the named tuple defined above.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        #
        # non_final_mask is a boolean vector marking which of the sampled states are non-final
        # non_final_next_states contains all the non-final states sampled
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        nfns = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(nfns).view(len(nfns), -1)
        non_final_next_states = non_final_next_states.unsqueeze(1)

        state_batch = torch.cat(batch.state).view(self.BATCH_SIZE, -1)
        state_batch = state_batch.unsqueeze(1)
        action_batch = torch.cat(batch.action).view(self.BATCH_SIZE, -1)
        reward_batch = torch.cat(batch.reward).view(self.BATCH_SIZE, -1)
        # print("state_batch shape: %s\nstate_batch[0]:%s\nactionbatch shape: %s\nreward_batch shape: %s"%(str(state_batch.view(40,-1).shape),str(state_batch.view(40,-1)[0]),str(action_batch.shape),str(reward_batch.shape)))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # ---------- D-DQN Extra Line ---------------
        # Double DQN: select the greedy action for the *next* states with the online
        # policy network, then evaluate that action with the target network:
        #   y = r + gamma * Q_target(s', argmax_a Q_policy(s', a))
        with torch.no_grad():
            _, next_state_action = self.policy_net(
                non_final_next_states).max(1, keepdim=True)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the actions given by policynet.
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        # detach removes the tensor from the graph -> no gradient computation is
        # required
        next_state_values = torch.zeros(self.BATCH_SIZE,
                                        device=self.device).view(
                                            self.BATCH_SIZE, -1)

        out = self.target_net(non_final_next_states)
        next_state_values[non_final_mask] = out.gather(
            1, next_state_action).detach()
        # next_state_values = next_state_values.view(self.BATCH_SIZE, -1)
        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        # Compute MSE loss
        loss = F.mse_loss(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def train(self, env, path, num_episodes=40):
        self.TRAINING = True
        cumulative_reward = [0 for t in range(num_episodes)]
        print("Training:")
        for i_episode in tqdm(range(num_episodes)):
            # Initialize the environment and state
            # reset the env so that it starts at the beginning of the time series
            env.reset()
            self.steps_done = 0
            state = env.get_state()
            for t in range(len(env.data)):  # while not env.done

                # Select and perform an action
                action = self.select_action(state)
                reward, done, _ = env.step(action)

                cumulative_reward[i_episode] += reward.item()

                # Observe the new state: it will be None if env.done is True. It is the
                # next state, since env.step() has already been called above.
                next_state = env.get_state()

                # Store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization (on the policy network): note that
                # it returns without doing anything if there is not enough data to sample

                if self.DOUBLE:
                    self.optimize_double_dqn_model()
                else:
                    self.optimize_model()

                if done:
                    break

            # Update the target network, copying all weights and biases of policy_net
            if i_episode % self.TARGET_UPDATE == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

        # save the model
        if self.DOUBLE:
            base_name = env.reward_f + '_reward_double_' + self.MODEL + '_model'
        else:
            base_name = env.reward_f + '_reward_' + self.MODEL + '_model'

        # avoid overriding existing models: append an increasing numeric suffix
        model_name = base_name
        count = 0
        while os.path.exists(path + model_name):
            count += 1
            model_name = base_name + "_" + str(count)

        torch.save(self.policy_net.state_dict(), path + model_name)

        return cumulative_reward

    def test(self, env_test, model_name=None, path=None):
        self.TRAINING = False
        cumulative_reward = [0 for t in range(len(env_test.data))]
        reward_list = [0 for t in range(len(env_test.data))]

        if model_name is None:
            pass
        elif path is not None:
            if re.match(".*_dqn_.*", model_name):
                self.policy_net = ConvDQN(self.INPUT_DIM,
                                          self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            elif re.match(".*_ddqn_.*", model_name):
                self.policy_net = ConvDuelingDQN(
                    self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
                if str(self.device) == "cuda":
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name))
                else:
                    self.policy_net.load_state_dict(
                        torch.load(path + model_name,
                                   map_location=torch.device('cpu')))
            else:
                raise RuntimeError(
                    "Please provide a valid model name or a valid path.")
        else:
            raise RuntimeError(
                'Path cannot be None if model_name is not None.')

        # reset the env so that it starts at the beginning of the time series
        env_test.reset()
        state = env_test.get_state()
        for t in tqdm(range(len(env_test.data))):  # while not env.done

            # Select and perform an action
            action = self.select_action(state)

            reward, done, _ = env_test.step(action)

            cumulative_reward[t] += reward.item(
            ) + cumulative_reward[t - 1 if t - 1 > 0 else 0]
            reward_list[t] = reward

            # Observe the new state: it will be None if env.done is True. It is the
            # next state, since env.step() has already been called above.
            next_state = env_test.get_state()

            # Move to the next state
            state = next_state

            if done:
                break

        return cumulative_reward, reward_list
    def train(self, config: TrainConfig):
        # experience replay memory
        replay_mem = ReplayMemory(config.memmory_capacity)
        # reward history
        reward = 0
        reward_history = []
        reward_avg = []
        # learning rate related
        alpha = config.lrn_rate
        eps = config.epsilon
        eps_delta = (config.epsilon -
                     config.epsilon_final) / config.warmup_episodes

        step = 0
        for epi in range(config.total_episodes):
            obs = self.env.reset()
            done = False
            traj = []
            reward = 0
            while not done:
                # random choose action with epsilon-greedy
                action = self.act(obs, eps)
                obs_next, r, done, info = self.env.step(action)
                reward += r
                step += 1
                # record trajectories
                traj.append(
                    Transition(obs.flatten(), action, r, obs_next.flatten(),
                               done))
                obs = obs_next
                if replay_mem.size < self.batch_size:
                    continue
                # update q networks with mini-batch replay samples
                batch_data = replay_mem.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: alpha,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.next_states: batch_data['s_next'],
                    self.dones: batch_data['done'],
                    self.epi_reward: reward_history[-1]
                }
                _, q, q_target, loss, summary = self.session.run([
                    self.optimizer, self.Q, self.Q_target, self.loss,
                    self.merged_summary
                ], feed_dict)
                # update target q networks hardly
                if step % config.target_update_every_steps == 0:
                    self._update_target_q_net()
                self.writer.add_summary(summary)

            replay_mem.add(traj)
            # one episode done
            reward_history.append(reward)
            reward_avg.append(np.mean(reward_history[-10:]))

            # update training param
            alpha *= config.lrn_rate_decay
            if eps > config.epsilon_final:
                eps -= eps_delta

            # report progress
            # if reward_history and config.log_every_episodes and epi % config.log_every_episodes == 0 :
            print(
                "[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lrn_rate:{:.4f}, eps:{:.4f}"
                .format(epi, step, np.max(reward_history),
                        np.mean(reward_history[-10:]), reward_history[-5:],
                        alpha, eps))

        self.save_checkpoint(step=step)
        print(
            "[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
                len(reward_history), np.max(reward_history),
                np.mean(reward_history)))
        return {'rwd': reward_history, 'rwd_avg': reward_avg}
class Agent(object):
    def __init__(self,
                 env_name,
                 state_space,
                 n_actions,
                 replay_buffer_size=500000,
                 batch_size=32,
                 hidden_size=64,
                 gamma=0.99):
        self.env_name = env_name
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.train_device = device
        self.n_actions = n_actions
        self.state_space_dim = state_space
        if "CartPole" in self.env_name:
            self.policy_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net = CartpoleDQN(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
        elif "WimblepongVisualSimpleAI-v0" in self.env_name:
            self.policy_net = Policy(state_space, n_actions, 4)
            self.target_net = Policy(state_space, n_actions, 4)
            self.target_net.load_state_dict(self.policy_net.state_dict())
            self.target_net.eval()
            self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
        else:
            raise ValueError(
                "Wrong environment. An agent has not been specified for %s" %
                env_name)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool).to(
            self.train_device)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal > 0
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.train_device)
        state_batch = torch.stack(batch.state).to(self.train_device)
        action_batch = torch.cat(batch.action).to(self.train_device)
        reward_batch = torch.cat(batch.reward).to(self.train_device)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch).to(self.train_device)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: TODO: Compute the expected Q values
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        #print('initial get action',state.shape)

        #print('final get action',state.shape)
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                #print('a',state)
                state = torch.from_numpy(state)
                #print('b',state)
                state = state.unsqueeze(0)
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(3)

    def preprocessing(self, observation):
        """ Preprocess the received information: 1) Grayscaling 2) Reducing quality (resizing)
        Params:
            observation: image of pong
        """
        # Grayscaling
        #img_gray = rgb2gray(observation)
        img_gray = np.dot(observation,
                          [0.2989, 0.5870, 0.1140]).astype(np.uint8)
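        # The weights above are the standard ITU-R BT.601 luma coefficients used to
        # convert an RGB frame to grayscale.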

        # Normalize pixel values
        img_norm = img_gray / 255.0

        # Downsampling: we receive a square image (e.g. 200x200) and downsample it by 2.5x to 80x80
        img_resized = cv2.resize(img_norm, dsize=(80, 80))
        #img_resized = img_norm[::2.5,::2.5]
        return img_resized

    def stack_images(self, observation, img_collection, timestep):
        """ Stack up to four frames together
        """
        # image preprocessing
        img_preprocessed = self.preprocessing(observation)

        if (timestep == 0):  # start of new episode
            # img_collection gets filled with zeros again
            # (use the builtin int: np.int is deprecated/removed in recent NumPy)
            img_collection = deque(
                [np.zeros((80, 80), dtype=int) for i in range(4)], maxlen=4)
            # fill img_collection 4x with the first frame
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            img_collection.append(img_preprocessed)
            # Stack the images in img_collection
            img_stacked = np.stack(img_collection, axis=2)
        else:
            # Delete first/oldest entry and append new image
            #img_collection.pop(0)
            img_collection.append(img_preprocessed)

            # Stack the images in img_collection
            img_stacked = np.stack(img_collection,
                                   axis=2)  # TODO: right axis??

        return img_stacked, img_collection

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long().to(self.train_device)
        reward = torch.tensor([reward],
                              dtype=torch.float32).to(self.train_device)
        next_state = torch.from_numpy(next_state).float().to(self.train_device)
        state = torch.from_numpy(state).float().to(self.train_device)
        self.memory.push(state, action, next_state, reward, done)

    def load_model(self):
        #load_path = '/home/isaac/codes/autonomous_driving/highway-env/data/2020_09_03/Intersection_egoattention_dqn_ego_attention_1_22:00:25/models'
        #policy.load_state_dict(torch.load("./model50000ep_WimblepongVisualSimpleAI-v0_0.mdl"))
        """ Load already created model
        return:
            none
        """
        weights = torch.load("FROM2100v2WimblepongVisualSimpleAI-v0_1900.mdl",
                             map_location=self.train_device)
        self.policy_net.load_state_dict(weights, strict=False)

    def get_name(self):
        """ Interface function to retrieve the agents name
        """
        return self.name

    def reset(self):
        """ Resets the agent’s state after an episode is finished
class Agent(nn.Module):
    def __init__(self,
                 q_models,
                 target_model,
                 hyperbolic,
                 k,
                 gamma,
                 model_params,
                 replay_buffer_size,
                 batch_size,
                 inp_dim,
                 lr,
                 no_models,
                 act_space,
                 hidden_size,
                 loss_type,
                 target_update=False):
        super(Agent, self).__init__()
        if hyperbolic:
            self.q_models = DQN(state_space_dim=inp_dim,
                                action_space_dim=act_space,
                                hidden=hidden_size,
                                no_models=no_models)
            self.target_models = DQN(state_space_dim=inp_dim,
                                     action_space_dim=act_space,
                                     hidden=hidden_size,
                                     no_models=no_models)
            self.target_models.load_state_dict(self.q_models.state_dict())
            self.target_models.eval()
        else:
            self.q_models = q_models
        self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
        self.hyperbolic = hyperbolic
        self.n_actions = model_params.act_space
        self.k = k
        # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:]
        self.gammas = np.sort(
            np.random.uniform(0, 1, self.q_models.no_models + 1))
        self.gammas = np.append(self.gammas, 0.98)
        self.gammas = torch.tensor(np.sort(self.gammas))
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.inp_dim = inp_dim
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.target_models.to(self.device)
        self.q_models.to(self.device)
        self.gammas = self.gammas.to(self.device)
        self.loss_type = loss_type
        self.criterion = nn.MSELoss()
        self.use_target_network = target_update

    def update_network(self, updates=1):
        for _ in range(updates):
            loss = self._do_network_update()
        return loss

    def get_hyperbolic_train_coeffs(self, k, num_models):
        coeffs = []
        for i in range(1, num_models + 1):
            coeffs.append(((self.gammas[i + 1] - self.gammas[i]) * (1 / k) *
                           self.gammas[i]**((1 / k) - 1)))
        return torch.tensor(coeffs).to(self.device) / sum(coeffs)

    def get_action(self, state_batch, epsilon=0.05, get_among_last=False):
        # epsilon gets smaller as time goes by.
        # (glie_a/(glie_a + eps)) with eps in range(0, no_episodes)
        take_random_action = random.random()
        if take_random_action < epsilon:
            return random.randrange(self.n_actions)
        elif get_among_last:
            state_batch = torch.tensor(state_batch,
                                       dtype=torch.float32,
                                       device=self.device).view(
                                           -1, self.inp_dim)
            model_outputs = self.q_models(state_batch).reshape(
                2, self.q_models.no_models)
            return torch.argmax(model_outputs[:, -10].view(-1)).item()
            model_outputs = model_outputs * self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            actions = torch.argmax(torch.sum(model_outputs, dim=1))
            return actions.item()
        elif self.hyperbolic:
            with torch.no_grad():
                state_batch = torch.tensor(state_batch,
                                           dtype=torch.float32,
                                           device=self.device).view(
                                               -1, self.inp_dim)
                model_outputs = self.q_models(state_batch.double()).reshape(
                    -1, 2)
                coeffs = self.get_hyperbolic_train_coeffs(
                    self.k, self.q_models.no_models).reshape(-1, 1)
                model_outputs = model_outputs * coeffs
                actions = torch.argmax(torch.sum(model_outputs, dim=0))
            return actions.item()

    def get_state_act_vals(self, state_batch, action_batch=None):
        if self.hyperbolic:
            action_batch = action_batch.repeat(
                1, self.q_models.no_models).reshape(-1, 1)
            model_outputs = self.q_models(state_batch.to(self.device).double())
            model_outputs = model_outputs.reshape(-1, self.n_actions)
            model_outputs = model_outputs.gather(1, action_batch)
            # .reshape(self.q_models.no_models * state_batch.shape[0],
            #          2).gather(1, action_batch.reshape(-1))
            return model_outputs
        else:
            model_output = self.q_models(state_batch).gather(1, action_batch)
            return model_output

    def get_max_next_state_vals(self, non_final_mask, non_final_next_states):
        if self.hyperbolic:
            with torch.no_grad():
                next_state_values = torch.zeros(self.batch_size).to(
                    self.device)
                # doing it like this, the model_no will come first and then the batch_no (b1m1, b1m2, b1m3..., b2m1,
                # ...b10m1, b10m2...
                # if False in non_final_mask:
                #     print(non_final_mask)
                #     print(len(non_final_next_states))
                non_final_mask = non_final_mask.reshape(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                # if False in non_final_mask:
                #     print([nf for nf in non_final_mask])
                next_state_values = next_state_values.view(-1, 1).repeat(
                    1, self.q_models.no_models).view(-1)
                if self.use_target_network:
                    # [b1m1o1, b1m1o2], -> max -> [b1m1]
                    # [b1m2o1, b1m2o2],           [b1m2]
                    # [b1m3o1, b1m3o3],           [b1m3]
                    # ...                         ...
                    #
                    next_state_values[non_final_mask] = \
                        self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                    # if False in non_final_mask:
                    #     print("first", self.target_models(non_final_next_states.to(self.device)))
                    #     print("after reshaping", self.target_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions))
                    #     print(self.target_models(non_final_next_states.to(self.device)).shape)
                    #     print("next_state_values", next_state_values)
                else:
                    next_state_values[non_final_mask] = \
                        self.q_models(non_final_next_states.to(self.device)).reshape(-1, self.n_actions).max(1)[0]
                target_outputs = next_state_values
                return target_outputs * self.gammas[2:].repeat(self.batch_size)

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ]
        non_final_next_states = torch.stack(non_final_next_states).to(
            self.device)
        state_batch = torch.stack(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        state_action_values = self.get_state_act_vals(state_batch,
                                                      action_batch).view(-1)
        next_state_values = self.get_max_next_state_vals(
            non_final_mask, non_final_next_states)
        # this should be perfect
        expected_state_action_values = next_state_values + \
                                       reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1)
        # print(reward_batch.view(-1, 1).repeat(1, self.q_models.no_models).view(-1).shape)
        if self.loss_type == "weighted_loss":
            loss = (state_action_values - expected_state_action_values)**2
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models).repeat(self.batch_size)
            loss = (loss.reshape(-1).view(-1) * hyp_coef).view(-1)
            loss = torch.mean(loss)
        elif self.loss_type == "separate_summarized_loss":
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values).double()
            # loss = (state_action_values - expected_state_action_values) ** 2
            # loss = torch.sum(loss)
        elif self.loss_type == "one_output_loss":
            hyp_coef = self.get_hyperbolic_train_coeffs(
                self.k, self.q_models.no_models)
            state_action_values = state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            state_action_values = torch.sum(state_action_values, dim=1)
            expected_state_action_values = expected_state_action_values.reshape(
                self.batch_size, -1) * hyp_coef
            expected_state_action_values = torch.sum(
                expected_state_action_values, dim=1)
            loss = self.criterion(state_action_values,
                                  expected_state_action_values)

        loss_item = loss.item()
        # print(hyp_coef.repeat(self.batch_size).shape)
        # print(loss.shape)
        # loss = (state_action_values - expected_state_action_values) ** 2 * self.get_hyperbolic_train_coeffs(self.k,
        #                                                                                                     self.q_models.no_models).repeat(
        #     self.batch_size)
        # # loss = torch.sum(loss)
        # loss = F.smooth_l1_loss(state_action_values.squeeze(),
        #                         expected_state_action_values)
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.q_models.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()
        return loss_item

    def update_target_network(self):
        self.target_models.load_state_dict(self.q_models.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
Example #17
class Agent(object):
    def __init__(self,
                 state_space,
                 n_actions,
                 replay_buffer_size=50000,
                 batch_size=32,
                 hidden_size=12,
                 gamma=0.98):
        self.n_actions = n_actions
        self.state_space_dim = state_space
        self.policy_net = DQN(state_space, n_actions, hidden_size)
        self.target_net = DQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
        self.memory = ReplayMemory(replay_buffer_size)
        self.batch_size = batch_size
        self.gamma = gamma

    def update_network(self, updates=1):
        for _ in range(updates):
            self._do_network_update()

    def _do_network_update(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = ~torch.tensor(batch.done, dtype=torch.bool)
        non_final_next_states = [
            s for nonfinal, s in zip(non_final_mask, batch.next_state)
            if nonfinal
        ]
        non_final_next_states = torch.stack(non_final_next_states)
        state_batch = torch.stack(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()

        # Task 4: TODO: Compute the expected Q values
        expected_state_action_values = reward_batch + self.gamma * next_state_values

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values.squeeze(),
                                expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1e-1, 1e-1)
        self.optimizer.step()

    def get_action(self, state, epsilon=0.05):
        sample = random.random()
        if sample > epsilon:
            with torch.no_grad():
                state = torch.from_numpy(state).float()
                q_values = self.policy_net(state)
                return torch.argmax(q_values).item()
        else:
            return random.randrange(self.n_actions)

    def update_target_network(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

    def store_transition(self, state, action, next_state, reward, done):
        action = torch.Tensor([[action]]).long()
        reward = torch.tensor([reward], dtype=torch.float32)
        next_state = torch.from_numpy(next_state).float()
        state = torch.from_numpy(state).float()
        self.memory.push(state, action, next_state, reward, done)
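# A minimal training-loop sketch for the Agent above (an assumption, not part of the
# original listing): it presumes a classic-Gym CartPole environment with the 4-tuple
# step API and the DQN / ReplayMemory / Transition helpers defined elsewhere in this file.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = Agent(state_space=env.observation_space.shape[0],
                  n_actions=env.action_space.n)
    for episode in range(200):
        state, done = env.reset(), False
        while not done:
            action = agent.get_action(state, epsilon=0.05)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, next_state, reward, done)
            agent.update_network()           # one gradient step per environment step
            state = next_state
        if episode % 10 == 0:
            agent.update_target_network()    # periodic hard target update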
Example #18
class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        if is_play:
            return self.config.play.ep
        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kawrgs):
        log = ""
        for name, value in kawrgs.items():
            log += f"{name}: {value}, "
        print(log)

    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config

        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0
        update_net = 0; C = config.train.network_update_freq
        t = 0; T = config.max_episode_length

        while not is_done and t < T:
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)
            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))
            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                if steps % C == 0:
                    self.model.update_qhat()
                    update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward, network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes+1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps, is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()
        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
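# Hypothetical driver for the Execute wrapper above (an assumption, not part of the
# original listing): "config.yaml" is a placeholder for whatever file format
# Configuration.construct expects.
if __name__ == "__main__":
    executor = Execute("config.yaml")
    try:
        executor.train(debug=True)   # train, checkpointing every save_model_episode
        executor.play(debug=True)    # then render one greedy episode
    finally:
        executor.close()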
def train(agent, env, num_episode=50, test_interval=25, num_test=20, num_iteration=200, iteration_cutoff=0, 
          BATCH_SIZE=128, num_sample=50, action_space=[-1,1], debug=True, memory=None, seed=2020,
          update_mode=UPDATE_PER_ITERATION, reward_mode=FUTURE_REWARD_NO, gamma=0.99, 
          loss_history=[], loss_historyA=[], lr_history=[], lr_historyA=[], reward_mean_var=(0,-1),
          save_sim_intv=50, save_sim_fnames=[], imdir='screencaps/', useVid=False, save_intm_models=False,
          not_use_rand_in_action=False, not_use_rand_in_test=True, 
         return_memory=False):
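    # Note: the list-valued defaults (loss_history, loss_historyA, lr_history, lr_historyA)
    # are mutable and shared across calls; callers typically pass fresh lists when they want
    # independent per-run histories.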
    test_hists = []
    steps = 0
    if memory is None:
        ### Update 11/05: changed memory size based on the number of agents
        memory = ReplayMemory(1000 * env.N)
    if iteration_cutoff <= 0:
        iteration_cutoff = num_iteration # Save all iterations into the memory
    
    # Values that would be useful
    N = env.N
    # Note that the seed only controls the numpy random, which affects the environment.
    # To affect pytorch, refer to further documentations: https://github.com/pytorch/pytorch/issues/7068
    np.random.seed(seed)
#     torch.manual_seed(seed)
    test_seeds = np.random.randint(0, 5392644, size=int(num_episode // test_interval)+1)
    
#     rmean = 0
#     rvar = -1
    (rmean, rvar) = reward_mean_var

    for e in range(num_episode):
        steps = 0
        state = env.reset()
        if agent.centralized:
            state = env.state
        state = torch.from_numpy(state).float()
        state = Variable(state)
        if debug:
            env.render()
        # Train History
        state_pool = []
        action_pool = []
        reward_pool = []
        next_state_pool = []
        loss_history.append([])
        loss_historyA.append([])

        for t in range(num_iteration):
#             agent.net.train()
            agent.set_train(True)
            # Try to pick an action, react, and store the resulting behavior in the pool here
            if agent.centralized:
                action = agent.select_action(state, **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    }).T
            else:
                actions = []
                for i in range(N):
                    action = agent.select_action(state[i], **{
                        'steps_done':t, 'num_sample':50, 'action_space':action_space, 'rand':not_use_rand_in_action
                    })
                    actions.append(action)
                if torch.is_tensor(action):
                    action = torch.cat(actions).view(-1,env.N)#.T
                else:
                    action = np.array(actions).T # Shape would become (2,N)

            if torch.is_tensor(action):
                next_state, reward, done, _ = env.step(action.detach().numpy())
            else:
                next_state, reward, done, _ = env.step(action)
                
            if agent.centralized:
                next_state = env.state
            next_state = Variable(torch.from_numpy(next_state).float()) # The float() probably avoids bug in net.forward()
            action = action.T # Turn shape back to (N,2)

            if agent.needsExpert:
                # If we need to use expert input during training, then we consult it and get the best action for this state
                actions = env.controller()
                action = actions.T # Shape should already be (2,N), so we turn it into (N,2)
            
            if not(agent.centralized):
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     for i in range(N):
                #         memory.push(state[i], action[i], next_state[i], reward[i])
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                pass
            else:
                # if reward_mode & FUTURE_REWARD_YES == 0:
                #     # Push everything directly inside if we don't use future discounts
                #     memory.push(state, action, next_state, reward)
                # else:
                #     # Store and push them outside the loop
                #     state_pool.append(state)
                #     action_pool.append(action)
                #     reward_pool.append(reward)
                #     next_state_pool.append(next_state)
                # Centralized training should directly use the real states, instead of observations
                reward = np.sum(reward)

            # Update 1028: Moved this training step outside the loop
            if update_mode == UPDATE_PER_ITERATION:
                # Added 1214: Push the samples to memory if no need for extra processing
                if reward_mode & FUTURE_REWARD_YES == 0 and reward_mode & FUTURE_REWARD_NORMALIZE == 0:
                    if agent.centralized:
                        memory.push(state, action, next_state, reward, reward)
                    else:
                        for i in range(N):
                            memory.push(state[i], action[i], next_state[i], reward[i], reward[i])
                # Learn
                if len(memory) >= BATCH_SIZE:
                    transitions = memory.sample(BATCH_SIZE)
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':BATCH_SIZE})
                elif len(memory) > 0:
                    transitions = memory.sample(len(memory))
                    batch = Transition(*zip(*transitions))
                    agent.optimize_model(batch, **{'B':len(memory)})
                loss_history[-1].append(agent.losses[:])
#                 print(e,t,agent.losses)
                agent.losses=[]
                # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
                # we can know from the learning rate if we're in a flatter area.
                # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
                # The scheduler requires the validation loss - can I just use the average training loss instead?
#                 try:
#                     agent.scheduler.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizer.param_groups[0]['lr'])
#                 except:
#                     agent.schedulerC.step(np.mean(loss_history[-1]))
#                     lr_history.append(agent.optimizerC.param_groups[0]['lr'])
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
#                     agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                     lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
                except:
                    pass
            elif update_mode == UPDATE_ON_POLICY:
                # This case would ditch sampling, and just update by the current thing.
                # Note that methods that use future cumulative reward would be highly incompatible with this...
                if not(agent.centralized) or reward_mode & FUTURE_REWARD_YES != 0:
                    print("Error: Update-on-policy might be incompatible with decentralized planning or cumulative reward")
                    return None
                if rvar == -1 and rmean == 0 and reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                    rvar = np.abs(reward)
                    rmean = reward
                reward = (reward - rmean) / rvar
                
                batch = Transition(state, action, next_state, [[reward]], [[reward]])
                agent.optimize_model(batch, **{'B':1})
#                 batch = Transition(state, action, next_state, reward, reward)
# #                 transitions = [batch,batch]
# #                 agent.optimize_model(Transition(*zip(*transitions)), **{'B':2})
#                 transitions = [batch,batch]
#                 agent.optimize_model(batch, **{'B':1})
                loss_history[-1].append(agent.losses[:])
                agent.losses=[]
                try:
                    loss_historyA[-1].append(agent.lossesA[:])
                    agent.lossesA=[]
                except:
                    pass
                
            else:
                # Store and push them outside the loop
                state_pool.append(state)
                if torch.is_tensor(action):
                    action_pool.append(action.detach().numpy())
                else:
                    action_pool.append(action)
                reward_pool.append(reward)
                next_state_pool.append(next_state)
                    
            state = next_state
            steps += 1

            if debug:
                env.render()

            if debug and done:
                print("Took ", t, " steps to converge")
                break
        
        # Now outside the iteration loop - prepare for per-episode trainings
        if update_mode == UPDATE_ON_POLICY:
            pass
        elif update_mode == UPDATE_PER_EPISODE:
            inst_reward = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_YES != 0:
                for j in range(len(reward_pool)): ### was previously miswritten as "reward"; a subtle bug that may have had effects
                    if j > 0:
                        reward_pool[-j-1] += gamma * reward_pool[-j]
            reward_pool = torch.tensor(reward_pool)
            if reward_mode & FUTURE_REWARD_NORMALIZE != 0:
                if rvar == -1 and rmean == 0:
                    rmean = reward_pool.mean()
                    rvar = reward_pool.std()
                    print("Updated mean and stdev: {0} and {1}".format(rmean.numpy(), rvar.numpy()))
                reward_pool = (reward_pool - rmean) / rvar
                inst_reward = (inst_reward - rmean) / rvar

            # Update: 0106 added option to only push the first few iterations into the memory.
            # if agent.centralized:
            # #             print(state_pool[0].shape, action_pool[0].shape)
            #     for j in range(len(reward_pool)):
            #         memory.push(state_pool[-j-1], action_pool[-j-1], 
            #                     next_state_pool[-j-1], reward_pool[-j-1], inst_reward[-j-1])
            # else:
            #     for j in range(len(reward_pool)):
            #         for i in range(N):
            #             memory.push(state_pool[-j-1][i], action_pool[-j-1][i], 
            #                         next_state_pool[-j-1][i], reward_pool[-j-1][i], inst_reward[-j-1][i])
            # Guard against early termination leaving fewer stored transitions than the cutoff
            n_push = min(iteration_cutoff, len(reward_pool))
            if agent.centralized:
                for j in range(n_push):
                    memory.push(state_pool[j], action_pool[j],
                                next_state_pool[j], reward_pool[j], inst_reward[j])
            else:
                for j in range(n_push):
                    for i in range(N):
                        memory.push(state_pool[j][i], action_pool[j][i],
                                    next_state_pool[j][i], reward_pool[j][i], inst_reward[j][i])
            

        if update_mode == UPDATE_PER_EPISODE:
            if len(memory) >= BATCH_SIZE:
                transitions = memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':BATCH_SIZE})
            elif len(memory) > 0:
                transitions = memory.sample(len(memory))
                batch = Transition(*zip(*transitions))
                agent.optimize_model(batch, **{'B':len(memory)})
            loss_history[-1].append(agent.losses[:])
            agent.losses=[]
            # Also record scheduler history for learning rate. If the scheduler is a Plateau one, then
            # we can know from the learning rate if we're in a flatter area.
            # https://discuss.pytorch.org/t/how-to-retrieve-learning-rate-from-reducelronplateau-scheduler/54234/2
#             try:
#                 agent.scheduler.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizer.param_groups[0]['lr'])
#             except:
#                 agent.schedulerC.step(np.mean(loss_history[-1]))
#                 lr_history.append(agent.optimizerC.param_groups[0]['lr'])
            try:
                loss_historyA[-1].append(agent.lossesA[:])
                agent.lossesA=[]
#                 agent.schedulerA.step(np.mean(loss_historyA[-1]))
#                 lr_historyA.append(agent.optimizerA.param_groups[0]['lr'])
            except:
                pass
        
        if debug:
            print("Episode ", e, " finished; t = ", t)
        
        if e % test_interval == 0:
            print("Test result at episode ", e, ": ")
            test_hist = test(agent, env, num_test, num_iteration, num_sample, action_space, 
                             seed=test_seeds[int(e/test_interval)], debug=debug, not_use_rand_in_action=not_use_rand_in_test)
            test_hists.append(test_hist)
        
        # Save demos of simulation if wanted
        if e % save_sim_intv == (save_sim_intv-1) and e > 0:
            try:
                fnames = [f+'_{0}'.format(e) for f in save_sim_fnames]
                plot_test(agent, env, fnames=fnames,
                    num_iteration=num_iteration, action_space=action_space, imdir=imdir,
                    debug=debug, useVid=useVid, not_use_rand=not_use_rand_in_test)
                for f in fnames:
                    os.system('ffmpeg -y -pattern_type glob -i "'+imdir+f+'*.jpg" '+f+'.gif')
            except:
                print("Failed to save simulation at e={0}".format(e))
            if save_intm_models and len(save_sim_fnames) > 0:
                agent.save_model(save_sim_fnames[0]+'_{0}'.format(e))
    if return_memory:
        return test_hists, memory
    else:
        return test_hists
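# Hypothetical call sketch (an assumption, not part of the original source): it presumes an
# env exposing N / reset / step / render / controller and an agent exposing select_action /
# optimize_model / losses, as used inside train() above.
# test_hists = train(agent, env, num_episode=100, test_interval=25, num_iteration=200,
#                    BATCH_SIZE=128, update_mode=UPDATE_PER_EPISODE,
#                    reward_mode=FUTURE_REWARD_YES, gamma=0.99,
#                    loss_history=[], loss_historyA=[])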
class DqnPolicy(BaseTFModel):
    def __init__(self,
                 env,
                 training,
                 name=None,
                 model_path=None,
                 gamma=0.99,
                 lr=0.001,
                 lr_decay=1.0,
                 epsilon=1.0,
                 epsilon_final=0.02,
                 batch_size=32,
                 memory_capacity=100000,
                 model_params={},
                 layer_sizes=[32, 32],
                 target_update_type='hard',
                 target_update_params={},
                 double_q=True,
                 dueling=True,
                 **kwargs):
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name
        if model_path is None:
            self.model_path = os.path.join('model', self.name)
        else:
            self.model_path = model_path
        self.env = env
        self.training = training
        self.gamma = gamma
        self.lr = lr
        self.lr_decay = lr_decay
        self.epsilon = epsilon
        self.epsilon_final = epsilon_final
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.model_params = model_params
        self.layer_sizes = layer_sizes
        self.double_q = double_q
        self.dueling = dueling

        self.target_update_type = target_update_type
        self.target_update_every_step = target_update_params.get(
            'every_step', 100)
        self.target_update_tau = target_update_params.get('tau', 0.05)

        self.memory = ReplayMemory(capacity=memory_capacity)

        self.action_size = self.env.action_space.n
        self.state_size = np.prod(list(self.env.observation_space.shape))
        print('action_size: {a}, state_size: {s}'.format(a=self.action_size,
                                                          s=self.state_size))

        if self.training:
            # clear existing model files
            if os.path.exists(self.model_path):
                print('deleting existing model files at {}'.format(
                    self.model_path))
                if os.path.isdir(self.model_path):
                    shutil.rmtree(self.model_path)
                else:
                    os.remove(self.model_path)

        BaseTFModel.__init__(self,
                             self.name,
                             self.model_path,
                             saver_max_to_keep=5)

        print('building graph ...')
        with self.graph.as_default():
            self.__build_graph()

    def act(self, state, epsilon=0.1):
        """
        :param state: 1d np.ndarray
        :param epsilon:
        :return: int
        """
        assert isinstance(state, np.ndarray) and state.ndim == 1
        if self.training and np.random.random() < epsilon:
            return self.env.action_space.sample()

        with self.sess.as_default():
            return self.actions_selected_by_q.eval(
                {self.states: state.reshape((1, -1))})[0]

    def train(self,
              n_episodes=500,
              annealing_episodes=450,
              every_episode=10,
              **kwargs):
        if self.training is False:
            raise Exception(
                'prohibited to call train() for a non-training model')

        reward_history = [0.0]
        reward_averaged = []
        lr = self.lr
        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes
        print("eps_drop: {}".format(eps_drop))
        step = 0

        # calling the property method of BaseTFModel to start a session
        self.sess.run(self.init_vars)
        self.__init_target_q_net()

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            done = False
            traj = []
            reward = 0.
            while not done:
                a = self.act(ob, eps)
                assert a >= 0
                new_ob, r, done, _ = self.env.step(a)
                step += 1
                reward += r
                traj.append(Transition(ob, a, r, new_ob, done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if self.memory.size < self.batch_size:
                    continue
                # Training with a mini batch of samples
                batch_data = self.memory.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done']
                }

                if self.double_q:
                    actions_next = self.sess.run(
                        self.actions_selected_by_q,
                        {self.states: batch_data['s_next']})
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run(
                    [
                        self.optimizer, self.q, self.q_target, self.loss,
                        self.merged_summary
                    ],
                    feed_dict=feed_dict)
                self.writer.add_summary(summ_str, step)

                # update the target q net if necessary
                self.__update_target_q_net(step)

            self.memory.add(traj)
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))

            # Anneal the learning rate and exploration rate after every episode
            lr *= self.lr_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                print("[episodes: {}/step: {}], best: {}, avg: {:.2f}:{}, lr: {:.4f}, eps: {:.4f}".format(
                    n_episode, step, np.max(reward_history),
                    np.mean(reward_history[-10:]), reward_history[-5:], lr,
                    eps))

        self.save_model(step=step)
        print("[training completed] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history),
            np.mean(reward_history)))

        fig_path = os.path.join(self.model_path, 'figs')
        makedirs(fig_path)
        fig_file = os.path.join(
            fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time())))
        plot_learning_curve(fig_file, {
            'reward': reward_history,
            'reward_avg': reward_averaged
        },
                            xlabel='episode')

    def evaluate(self, n_episodes):
        if self.training:
            raise Exception(
                'prohibited to call evaluate() for a training model')

        reward_history = []
        for episode in range(n_episodes):
            state = self.env.reset()
            reward_episode = 0.
            while True:
                action = self.act(state)
                new_state, reward, done, _ = self.env.step(action)
                reward_episode += reward
                state = new_state
                if done:
                    break
            reward_history.append(reward_episode)
        return reward_history

    def __build_graph(self):
        self.__create_q_networks()

        # q is the Q(s, a) of the behavior policy
        self.actions_selected_by_q = tf.argmax(self.q,
                                               axis=-1,
                                               name='action_selected')
        action_one_hot = tf.one_hot(self.actions,
                                    self.action_size,
                                    dtype=tf.float32,
                                    name='action_one_hot')
        pred = tf.reduce_sum(self.q * action_one_hot, axis=-1, name='pred')
        # q_target is the Q(s, a) of the target network that we are learning towards.
        if self.double_q:
            action_next_one_hot = tf.one_hot(self.actions_next,
                                             self.action_size,
                                             dtype=tf.float32,
                                             name='action_next_one_hot')
            max_q_next_target = tf.reduce_sum(self.q_target *
                                              action_next_one_hot,
                                              axis=-1,
                                              name='max_q_next_target')
        else:
            max_q_next_target = tf.reduce_max(self.q_target, axis=-1)
        y = self.rewards + (1. -
                            self.done_flags) * self.gamma * max_q_next_target

        self.loss = tf.reduce_mean(tf.square(pred - tf.stop_gradient(y)),
                                   name="loss_mse_train")
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(
            self.loss, name="adam")
        self.init_vars = tf.global_variables_initializer()
        with tf.variable_scope('summary'):
            q_summ = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.action_size):
                q_summ.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
            self.q_summ = tf.summary.merge(q_summ, 'q_summary')

            self.q_y_summ = tf.summary.histogram("batch/y", y)
            self.q_pred_summ = tf.summary.histogram("batch/pred", pred)
            self.loss_summ = tf.summary.scalar("loss", self.loss)

            self.merged_summary = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)

    def __create_q_networks(self):
        # mini-batch
        self.states = tf.placeholder(tf.float32,
                                     shape=(None, self.state_size),
                                     name='state')
        self.states_next = tf.placeholder(tf.float32,
                                          shape=(None, self.state_size),
                                          name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None, ), name='action')
        # actions_next is not the actual actions in the next step;
        # it is used to predict the action value in the Bellman equation.
        self.actions_next = tf.placeholder(tf.int32,
                                           shape=(None, ),
                                           name='action_next')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, ),
                                      name='reward')
        self.done_flags = tf.placeholder(tf.float32,
                                         shape=(None, ),
                                         name='done')
        self.learning_rate = tf.placeholder(tf.float32,
                                            shape=None,
                                            name='learning_rate')

        if self.dueling:
            with tf.variable_scope('Q_primary'):
                self.q_hidden = dense_nn(self.states,
                                         self.layer_sizes[:-1],
                                         name='q_hidden',
                                         training=self.training)
                # advantage function A(s, a)
                self.adv = dense_nn(self.q_hidden,
                                    [self.layer_sizes[-1], self.action_size],
                                    name='adv',
                                    training=self.training)
                # state value function V(s)
                self.v = dense_nn(self.q_hidden, [self.layer_sizes[-1], 1],
                                  name='v',
                                  training=self.training)
                self.q = self.v + (self.adv - tf.reduce_mean(
                    self.adv, reduction_indices=1, keep_dims=True))

            with tf.variable_scope('Q_target'):
                self.q_target_hidden = dense_nn(self.states_next,
                                                self.layer_sizes[:-1],
                                                name='q_hidden',
                                                training=self.training)
                self.adv_target = dense_nn(
                    self.q_target_hidden,
                    [self.layer_sizes[-1], self.action_size],
                    name='adv',
                    training=self.training)
                self.v_target = dense_nn(self.q_target_hidden,
                                         [self.layer_sizes[-1], 1],
                                         name='v',
                                         training=self.training)
                self.q_target = self.v_target + (
                    self.adv_target - tf.reduce_mean(
                        self.adv_target, reduction_indices=1, keep_dims=True))
        else:
            self.q = dense_nn(self.states,
                              self.layer_sizes + [self.action_size],
                              name='Q_primary',
                              training=self.training)
            self.q_target = dense_nn(self.states_next,
                                     self.layer_sizes + [self.action_size],
                                     name='Q_target',
                                     training=self.training)

        self.q_vars = self.scope_vars('Q_primary')
        self.q_target_vars = self.scope_vars('Q_target')
        assert len(self.q_vars) == len(
            self.q_target_vars), "The two Q-networks do not have the same structure."

    def __init_target_q_net(self):
        self.__update_target_q_net_hard()

    def __update_target_q_net_hard(self):
        self.sess.run(
            [v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)])

    def __update_target_q_net_soft(self, tau=0.05):
        self.sess.run([
            v_t.assign(v_t * (1. - tau) + v * tau)
            for v_t, v in zip(self.q_target_vars, self.q_vars)
        ])

    def __update_target_q_net(self, step):
        if self.target_update_type == 'hard':
            if step % self.target_update_every_step == 0:
                self.__update_target_q_net_hard()
        else:
            self.__update_target_q_net_soft(self.target_update_tau)
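# Hypothetical usage sketch (an assumption, not part of the original source): it presumes a
# discrete-action Gym environment plus the TF1-style BaseTFModel / dense_nn helpers above.
# env = gym.make('CartPole-v0')
# policy = DqnPolicy(env, training=True, double_q=True, dueling=True)
# policy.train(n_episodes=500, annealing_episodes=450, every_episode=10)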
Example #21
class DQfDAgent(DQNAgent):
    def __init__(self, model, env, demo_memory, **kwargs):
        DQNAgent.__init__(self, model, env, **kwargs)
        self.EXPERT_MARGIN = kwargs.pop("expert_margin", 0.8)
        self.DEMO_PER = kwargs.pop("demo_percent", 0.3)
        self.N_STEP = kwargs.pop("n_step", 5)
        self.LAMBDA_1 = kwargs.pop("lambda_1", 0.1)
        self.LAMBDA_2 = kwargs.pop("lambda_2", 0.5)
        self.LAMBDA_3 = kwargs.pop("lambda_3", 0)
        self.memory = ReplayMemory(self.REPLAY_CAPACITY, self.N_STEP,
                                   self.GAMMA)
        self.demo_memory = demo_memory
        self.demo_memory.n_step = self.N_STEP
        self.demo_memory.gamma = self.GAMMA
        self.is_pre_train = False

    def _n_step_loss(self, y_pred, n_returns_batch, non_final_n_mask,
                     non_final_n_states):
        q_n = Variable(torch.zeros(self.BATCH_SIZE).type(FloatTensor))
        target_q_n = self.target_model(non_final_n_states)
        if self.DOUBLE_DQN:
            max_act_n = self.model(non_final_n_states).max(1)[1].view(-1, 1)
            q_n[non_final_n_mask] = target_q_n.gather(1, max_act_n).data.view(
                target_q_n.gather(1, max_act_n).data.shape[0])
        else:
            q_n[non_final_n_mask] = target_q_n.max(1)[0].data

        y_n_step = q_n * np.power(self.GAMMA, self.N_STEP) + n_returns_batch
        return nn.functional.mse_loss(y_pred, y_n_step)

    def _expert_loss(self, q_pred, action_batch, non_demo_mask):
        y_pred = q_pred.gather(1, action_batch).squeeze()
        expert_margin = torch.zeros(self.BATCH_SIZE, self.out_size)
        expert_margin[:, action_batch.data] = self.EXPERT_MARGIN
        q_l = q_pred + Variable(expert_margin)
        j_e = q_l.max(1)[0] - y_pred
        j_e[non_demo_mask] = 0
        return j_e.sum()

    def _collect_batch(self):
        non_demo_mask = ByteTensor([False] * self.BATCH_SIZE)
        if self.is_pre_train:
            batch, n_returns, n_step_states = self.demo_memory.sample(
                self.BATCH_SIZE)
        else:
            demo_num = int(self.BATCH_SIZE * self.DEMO_PER)
            replay_demo, n_returns_demo, n_step_states_demo = \
                self.demo_memory.sample(demo_num)
            replay_agent, n_returns_agent, n_step_states_agent = \
                self.memory.sample(self.BATCH_SIZE - demo_num)
            # extend() mutates in place and returns None, so keep the reference explicitly
            replay_demo.extend(replay_agent)
            batch = replay_demo
            if demo_num != self.BATCH_SIZE:
                non_demo_mask[demo_num:] = 1
            n_returns_demo.extend(n_returns_agent)
            n_returns = n_returns_demo
            n_step_states = np.concatenate(
                [n_step_states_demo, n_step_states_agent])

        return batch, n_returns, n_step_states, non_demo_mask

    def _calc_loss(self):
        batch, n_returns, n_step_states, non_demo_mask = self._collect_batch()

        non_final_mask = ByteTensor(
            tuple([s is not None for s in batch.next_state]))
        non_final_next_states = Variable(
            torch.cat([s for s in batch.next_state if s is not None]))
        non_final_n_mask = ByteTensor(
            tuple([s is not None for s in n_step_states]))
        non_final_n_states = Variable(
            torch.cat([s for s in n_step_states if s is not None]))

        state_batch = Variable(
            torch.cat([s for s in batch.state if s is not None]))
        action_batch = Variable(
            torch.cat([s for s in batch.action if s is not None]))
        reward_batch = Variable(
            torch.cat([s for s in batch.reward if s is not None]))
        n_returns_batch = Variable(torch.cat(n_returns))

        q_pred = self.model(state_batch)
        y_pred = q_pred.gather(1, action_batch).squeeze()

        dq_loss = self._DQ_loss(y_pred, reward_batch, non_final_mask,
                                non_final_next_states)
        n_step_loss = self._n_step_loss(y_pred, n_returns_batch,
                                        non_final_n_mask, non_final_n_states)
        expert_loss = self._expert_loss(q_pred, action_batch, non_demo_mask)
        loss = dq_loss + self.LAMBDA_1 * n_step_loss + self.LAMBDA_2 * expert_loss
        self.container.add("dq_loss", torch.mean(dq_loss.data))
        self.container.add("expert_loss", torch.mean(expert_loss.data))
        self.container.add("y_pred", torch.mean(y_pred.data))
        self.container.add("loss", torch.mean(loss.data))
        return loss

    def pre_train(self, steps):
        self.i_episode = 0
        self.i_step = 0
        self.is_pre_train = True
        print("Pre training...")
        for i in range(steps):
            if i % 500 == 0:
                print("Pre train steps: {}".format(i))
            self.update_policy()
            self.update_target_network()
        print("Pre train done")
        self.is_pre_train = False
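# Hypothetical usage note (an assumption, not part of the original source): a DQfD run would
# first call agent.pre_train(steps) on demonstration data alone, then continue with the
# regular DQNAgent loop so that _collect_batch() mixes demo and self-generated transitions.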
Example #22
class EGreedyAgent(AgentWithWheel):
    """
    This agent uses an epsilon-greedy DQN to optimize its policy network.
    """
    def __init__(self,
                 x,
                 y,
                 r,
                 color,
                 agent_type,
                 features_n,
                 actions_n,
                 discounted_value,
                 memory_capacity=4096,
                 batch_size=512,
                 learning_rate=0.0001,
                 need_restore=False):
        super(EGreedyAgent, self).__init__(x, y, r, color, agent_type)
        self.gamma = discounted_value
        self.features_n = features_n
        self.actions_n = actions_n
        self.lr = learning_rate
        self.save_file_path = 'model/dqn.pkl'
        self.device = 'cpu'
        self.policy_net = DQNet(self.features_n, self.actions_n)
        self.target_net = DQNet(self.features_n, self.actions_n)
        # give the target net the same parameters as the policy net
        self.target_net.eval()
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                       lr=self.lr)
        self.memory = []
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 5000
        self.steps_count = 0
        self.batch_size = batch_size
        self.memory = ReplayMemory(memory_capacity)
        self.need_exploit = True
        if need_restore:
            self.restore()

    def act(self, state):
        """
        Choose an action epsilon-greedily.
        """
        state = torch.FloatTensor([state])
        sample = random.random()
        # choose actions randomly at the beginning, then gradually favor the max-Q action
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
                        math.exp(-1. * self.steps_count / self.eps_decay) \
            if self.need_exploit else 0.01
        self.steps_count += 1
        if sample > eps_threshold:
            with torch.no_grad():
                left_v, right_v = self.policy_net(state)
                l = left_v.max(1)[1].view(1, 1).item()
                r = right_v.max(1)[1].view(1, 1).item()
                # print('left: %d\tright: %d' % (l, r))
                return l, r
        else:
            l, r = random.randrange(self.actions_n), random.randrange(
                self.actions_n)
            return l, r

    def optimize_model(self):
        """
        Train model.
        """
        if len(self.memory) < self.batch_size:
            return 0.0
        transitions = self.memory.sample(self.batch_size)
        # batch is ([state], [left_v, right_v], [next_state], [reward])
        batch = Transition(*zip(*transitions))
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device)
        non_final_next_states = torch.cat([
            torch.tensor([s], dtype=torch.float) for s in batch.next_state
            if s is not None
        ])
        state_batch = torch.cat(
            [torch.tensor([s], dtype=torch.float) for s in batch.state])
        left_batch = torch.cat(
            [torch.tensor([[s[0]]], dtype=torch.long) for s in batch.action])
        right_batch = torch.cat(
            [torch.tensor([[s[1]]], dtype=torch.long) for s in batch.action])
        reward_batch = torch.cat(
            [torch.tensor([[s]], dtype=torch.float) for s in batch.reward])
        left_eval, right_eval = self.policy_net(state_batch)
        left_q_eval = left_eval.gather(1, left_batch)
        right_q_eval = right_eval.gather(1, right_batch)
        left_q_next, right_q_next = self.target_net(non_final_next_states)
        left_q_next = left_q_next.max(1)[0].detach()
        right_q_next = right_q_next.max(1)[0].detach()
        left_q_target = (left_q_next * self.gamma) + reward_batch.squeeze()
        right_q_target = (right_q_next * self.gamma) + reward_batch.squeeze()

        loss = F.mse_loss(left_q_eval,
                          left_q_target.unsqueeze(1)) + F.mse_loss(
                              right_q_eval, right_q_target.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def save(self):
        """
        Save trained model.
        """
        torch.save(self.policy_net.state_dict(), self.save_file_path)
        print('Model saved succeed!')

    def restore(self):
        """
        Restore model from saved file.
        """
        self.policy_net.load_state_dict(torch.load(self.save_file_path))
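# Hypothetical usage note (an assumption, not part of the original source): each simulation
# step would call agent.act(state) for the (left, right) wheel actions, push
# (state, (l, r), next_state, reward) into agent.memory, and call agent.optimize_model();
# save()/restore() checkpoint the policy net at model/dqn.pkl.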
Example #23
class Agent(object):
    def __init__(self,
                 num_actions,
                 gamma=0.98,
                 memory_size=5000,
                 batch_size=32):
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_actions = num_actions
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        # Draw some samples from the observation range and initialize the scaler
        obs_limit = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-obs_limit, obs_limit,
                                    (1000, obs_limit.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        self.featurizer = FeatureUnion([
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ])
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = [
            SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3)
            for _ in range(self.num_actions)
        ]

        # Initialize it to whatever values; implementation detail
        for q_a in self.q_functions:
            q_a.partial_fit(self.featurize(samples),
                            np.zeros((samples.shape[0], )))

    def featurize(self, state):
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        # Task 1: TODO: Use (s, abs(s)) as features
        #return np.concatenate((state, np.abs(state)), axis=1)
        # RBF features
        return self.featurizer.transform(self.scaler.transform(state))

    def get_action(self, state, epsilon=0.0):
        if np.random.random() < epsilon:
            a = int(np.random.random() * self.num_actions)
            return a
        else:
            featurized = self.featurize(state)
            qs = [q.predict(featurized)[0] for q in self.q_functions]
            qs = np.array(qs)
            a = np.argmax(qs, axis=0)
            return a

    def single_update(self, state, action, next_state, reward, done):
        # Calculate feature representations of the current and next state
        # Task 1: TODO: Set the feature state and feature next state

        featurized_state = self.featurize(state)
        featurized_next_state = self.featurize(next_state)

        # Task 1:  TODO Get Q(s', a) for the next state
        next_qs = [
            q.predict(featurized_next_state)[0] for q in self.q_functions
        ]

        # Calculate the updated target Q-values
        # Task 1: TODO: Calculate target based on rewards and next_qs
        if done:
            target = reward
        else:
            target = reward + self.gamma * np.max(next_qs)
        # Update Q-value estimation
        self.q_functions[action].partial_fit(featurized_state, [target])

    def update_estimator(self):
        if len(self.memory) < self.batch_size:
            # Use the whole memory
            samples = self.memory.memory
        else:
            # Sample some data
            samples = self.memory.sample(self.batch_size)
        # Task 2: TODO: Reformat data in the minibatch
        states = []
        action = []
        next_states = []
        rewards = []
        dones = []
        for s in samples:
            states.append(s.state)
            action.append(s.action)
            next_states.append(s.next_state)
            rewards.append(s.reward)
            dones.append(s.done)
        states = np.array(states)
        next_states = np.array(next_states)
        action = np.array(action)
        rewards = np.array(rewards)
        dones = np.array(dones)

        # Task 2: TODO: Calculate Q(s', a)
        featurized_next_states = self.featurize(next_states)
        next_qs = np.max(np.array(
            [q.predict(featurized_next_states) for q in self.q_functions]).T,
                         axis=1)

        # Calculate the updated target values
        # Task 2: TODO: Calculate target based on rewards and next_qs
        targets = rewards + self.gamma * next_qs * np.invert(dones)

        # Calculate featurized states
        featurized_states = self.featurize(states)
        # Get new weights for each action separately
        for a in range(self.num_actions):
            # Find states where a was taken
            idx = action == a

            # If a not present in the batch, skip and move to the next action
            if np.any(idx):
                act_states = featurized_states[idx]
                act_targets = targets[idx]
                # Perform a single SGD step on the Q-function params
                self.q_functions[a].partial_fit(act_states, act_targets)

    def store_transition(self, *args):
        self.memory.push(*args)
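# A minimal CartPole loop for the RBF-feature Agent above (an assumption, not part of the
# original listing): it presumes the classic 4-tuple Gym step API and the ReplayMemory /
# Transition(state, action, next_state, reward, done) helpers defined elsewhere in this file.
if __name__ == "__main__":
    import gym

    env = gym.make("CartPole-v0")
    agent = Agent(num_actions=env.action_space.n)
    for episode in range(100):
        state, done = env.reset(), False
        while not done:
            action = agent.get_action(state, epsilon=0.1)
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, next_state, reward, done)
            state = next_state
        agent.update_estimator()    # one minibatch refit per episode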
Example #24
class DQN(object):
    def __init__(self, game_name, gamma, batch_size, eps_start, eps_end,
                 eps_decay, mem_size, device):
        if batch_size > mem_size:
            print(
                "Error: training would fail because the batch size is larger than the memory size."
            )
            return
        self.gamma = gamma
        self.batch_size = batch_size
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.env = Environment(game_name)
        self.step_done = 0
        self.device = device
        self.memory = ReplayMemory(mem_size)
        # define the policy net and target net
        _, _, height, width = self.env.get_screen().shape
        self.policy_net = Net(height, width,
                              self.env.num_action).to(self.device)
        self.target_net = Net(height, width,
                              self.env.num_action).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())

    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end)\
                        * np.exp(-1 * self.step_done / self.eps_decay)
        self.step_done += 1
        # decide whether to exploit or explore
        if sample > eps_threshold:
            with torch.no_grad():
                # return the action with the largest expected reward
                # similar to a classification task, but not the same:
                # both score all options and pick the argmax
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
            return torch.tensor([[random.randrange(self.env.num_action)]],
                                device=self.device,
                                dtype=torch.long)

    def optimize(self):
        # see https://stackoverflow.com/a/19343/3343043
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        # create masks
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # use the policy_net as the behavior network
        # use the target_net as the Q-values fitting network
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(
            non_final_next_states).max(1)[0].detach()
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch
        # compute the loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def plot_duration(self):
        pass
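# Hypothetical outer loop (an assumption, not part of the original source): the Environment
# wrapper is expected to expose get_screen() plus a reset()/step() interface; each step would
# select an action with select_action(state), push the transition into self.memory, call
# self.optimize(), and periodically copy policy_net into target_net, e.g.
# dqn = DQN("CartPole-v0", gamma=0.999, batch_size=128, eps_start=0.9, eps_end=0.05,
#           eps_decay=200, mem_size=10000, device="cpu")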