Example #1

import math
import os
import random
import time

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Project-local modules assumed by this example: DQN, ReplayMemory, HuberLoss,
# Transition, PyCar.


class DQNAgent:
    def __init__(self):
        # self.config = config
        self.gamma = 0.75

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(),
                                      lr=0.0001)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = os.path.join(os.getcwd(), "model") + "/"
        if not os.path.isdir(self.savepath):
            os.makedirs(self.savepath)

        t = time.localtime()
        self.save_tensorboard_path = os.path.join(
            os.getcwd(), "tensorboard_record") + "/run_" + time.strftime(
                "%d_%m_%Y_%H_%M", t) + "/"
        if not os.path.isdir(self.save_tensorboard_path):
            os.makedirs(self.save_tensorboard_path)
        self.writer = SummaryWriter(self.save_tensorboard_path)

    def run(self):
        """
        Entry point: runs the training loop until interrupted.
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state, random_only=False):
        """
        The action selection function: it either uses the policy model to choose an action or samples one uniformly at random.
        :param state: current state of the environment
        :param random_only: if True, always sample a random action
        :return: chosen action as a (1, 1) long tensor
        """

        self.eps_start = 0.90
        self.eps_end = 0.35
        self.eps_decay = 500

        if self.cuda:
            state = state.cuda()
        sample = random.random()
        eps_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
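        # Note: eps_threshold is the probability of taking the greedy (model)
        # action. It starts near eps_end and rises toward eps_start as
        # current_iteration grows, so the random-exploration rate decays from
        # (1 - eps_end) toward (1 - eps_start).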

        self.writer.add_scalar('epsilon', eps_threshold,
                               self.current_iteration)
        # print("Eps thresh: ", eps_threshold)
        if sample < eps_threshold and not random_only:
            # print("Model step")
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            # print("Random step")
            return torch.tensor([[random.randrange(5)]],
                                device=self.device,
                                dtype=torch.long)

    def get_action(self, state):

        if self.cuda:
            state = state.cuda()
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)  # size (1,1)

    def optimize_policy_model(self):
        """
        Performs one optimization pass over the replay memory, iterating over it in batches.
        :return:
        """
        if self.memory.length() < self.batch_size:
            return

        self.memory.setup_epoch_training()

        total_loss = 0.0
        training_len = math.ceil(self.memory.length() / self.batch_size)
        for i in range(training_len):
            # sample a batch
            transitions = self.memory.sample_batch(self.batch_size, i)
            len_transitions = len(transitions)

            one_batch = Transition(*zip(*transitions))

            non_final_mask = torch.tensor(tuple(
                map(lambda s: s is not None, one_batch.next_state)),
                                          device=self.device,
                                          dtype=torch.bool)
            non_final_next_states = torch.cat(
                [s for s in one_batch.next_state if s is not None])

            state_batch = torch.cat(one_batch.state)
            action_batch = torch.cat(one_batch.action)
            reward_batch = torch.cat(one_batch.reward)

            state_batch = state_batch.to(self.device)
            non_final_next_states = non_final_next_states.to(self.device)

            curr_state_values = self.policy_model(state_batch)  # [batch, n_actions]
            curr_state_action_values = curr_state_values.gather(
                1, action_batch)  # [batch, 1]

            next_state_values = torch.zeros(len_transitions,
                                            device=self.device)  # [batch]
            next_state_values[non_final_mask] = self.target_model(
                non_final_next_states).max(1)[0].detach()  # non-final entries only

            # Get the expected Q values
            expected_state_action_values = (next_state_values *
                                            self.gamma) + reward_batch  # [batch]
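            # Bellman target: r + gamma * max_a' Q_target(s', a'); terminal
            # transitions keep next_state_values at zero, so their target is
            # just the reward.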
            # compute loss: temporal difference error
            loss = self.loss(curr_state_action_values,
                             expected_state_action_values.unsqueeze(1))

            # accumulate the detached scalar loss for logging
            total_loss += loss.item()

            # optimizer step
            self.optim.zero_grad()
            loss.backward()
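            # clamp each gradient element to [-1, 1] to stabilize training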
            for param in self.policy_model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optim.step()

        self.writer.add_scalar('loss', total_loss / training_len,
                               self.current_iteration)
        # return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """

        self.num_episodes = 2000
        self.target_update = 1
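
        # Warm-up: fill the replay memory with 100 random-policy episodes and
        # log their scores at step 0 as a baseline.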

        mean_score, max_score, min_score = self.run_sim(100, random_only=True)

        self.writer.add_scalar('mean_score', mean_score, 0)
        self.writer.add_scalar('max_score', max_score, 0)
        self.writer.add_scalar('min_score', min_score, 0)

        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_iteration += 1
            self.current_episode = episode
            # reset environment
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 25 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def run_sim(self, count=20, random_only=False):
        score_list = []
        for i in range(count):
            self.env.reset_game()
            episode_duration = 0

            curr_state = torch.Tensor(self.env.get_state()).permute(
                2, 0, 1).unsqueeze(0)

            while True:
                # time.sleep(0.1)
                episode_duration += 1

                # select action
                action = self.select_action(curr_state, random_only)

                images, reward, done, score = self.env.step(
                    action.item())  #TODO

                reward = torch.Tensor([reward]).to(self.device)

                # assign next state
                if done:
                    next_state = None
                else:
                    next_state = torch.Tensor(images).permute(
                        2, 0, 1).unsqueeze(0)  #TODO

                # add this transition into memory
                self.memory.push_transition(curr_state, action, next_state,
                                            reward)

                curr_state = next_state

                if done:
                    score_list.append(score)
                    break

        scores = np.array(score_list)
        return scores.mean(), scores.max(), scores.min()

    def train_one_epoch(self):
        """
        One training epoch: runs a batch of simulation episodes to fill the replay memory, logs their scores, then optimizes the policy model over it.
        :return:
        """

        mean_score, max_score, min_score = self.run_sim()
        # print(mean_score)
        self.writer.add_scalar('mean_score', mean_score,
                               self.current_iteration)
        self.writer.add_scalar('max_score', max_score, self.current_iteration)
        self.writer.add_scalar('min_score', min_score, self.current_iteration)

        # Policy model optimization step
        self.optimize_policy_model()

    def validate(self):

        self.env.reset_game()
        episode_duration = 0

        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while True:
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.get_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            curr_state = next_state

            if done:
                print(score)
                break
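

# A minimal, hypothetical driver for the agent above (assumes the imports and
# project-local modules listed at the top of this example are available):
if __name__ == "__main__":
    agent = DQNAgent()
    agent.run()       # train for num_episodes episodes, checkpointing periodically
    agent.validate()  # roll out one greedy episode with the trained policy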


Example #2
# This example assumes the same imports and project-local modules as Example #1.
class DQNAgent:
    def __init__(self):
        # self.config = config
        self.gamma = 0.4

        # self.logger = logging.getLogger("DQNAgent")

        self.screen_width = 600

        # define models (policy and target)
        self.policy_model = DQN()
        self.target_model = DQN()

        # define memory
        self.memory = ReplayMemory()

        # define loss
        self.loss = HuberLoss()

        # define optimizer
        self.optim = torch.optim.Adam(self.policy_model.parameters(), lr=0.01)

        # define environment
        self.env = PyCar()  #TODO
        # self.cartpole = PyCar(self.screen_width)

        # initialize counter
        self.current_episode = 0
        self.current_iteration = 0
        self.episode_durations = []

        self.batch_size = 250

        # set cuda flag
        self.is_cuda = torch.cuda.is_available()

        self.cuda = self.is_cuda

        if self.cuda:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.policy_model = self.policy_model.to(self.device)
        self.target_model = self.target_model.to(self.device)
        self.loss = self.loss.to(self.device)

        # Initialize Target model with policy model state dict
        self.target_model.load_state_dict(self.policy_model.state_dict())
        self.target_model.eval()

        self.savepath = "/home/sk002/Documents/RL-Project/model/"

    def run(self):
        """
        Entry point: runs the training loop until interrupted.
        :return:
        """
        try:
            self.train()

        except KeyboardInterrupt as e:
            print(e)

    def select_action(self, state):
        """
        The action selection function: it either uses the policy model to choose an action or samples one uniformly at random.
        :param state: current state of the environment
        :return:
        """

        self.eps_start = 0.95
        self.eps_end = 0.65
        self.eps_decay = 2000

        if self.cuda:
            state = state.cuda()
        sample = random.random()
        eps_threshold = self.eps_start - (
            self.eps_start - self.eps_end) * math.exp(
                -1. * self.current_iteration / self.eps_decay)
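        # Note: eps_threshold is the probability of taking the greedy (model)
        # action; it rises from roughly eps_end toward eps_start as
        # current_iteration grows, so the exploration rate decays over time.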
        self.current_iteration += 1
        # print("Eps thresh: ", eps_threshold)
        if sample < eps_threshold:
            # print("Model step")
            with torch.no_grad():
                return self.policy_model(state).max(1)[1].view(1,
                                                               1)  # size (1,1)
        else:
            # print("Random step")
            return torch.tensor([[random.randrange(5)]],
                                device=self.device,
                                dtype=torch.long)

    def get_action(self, state):

        if self.cuda:
            state = state.cuda()
        with torch.no_grad():
            return self.policy_model(state).max(1)[1].view(1, 1)  # size (1,1)

    def optimize_policy_model(self):
        """
        Performs a single optimization step for the policy model on one sampled batch.
        :return:
        """
        if self.memory.length() < self.batch_size:
            return
        # sample a batch
        transitions = self.memory.sample_batch(self.batch_size)

        one_batch = Transition(*zip(*transitions))

        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        non_final_next_states = torch.cat(
            [s for s in one_batch.next_state if s is not None])

        state_batch = torch.cat(one_batch.state)
        action_batch = torch.cat(one_batch.action)
        reward_batch = torch.cat(one_batch.reward)

        state_batch = state_batch.to(self.device)
        non_final_next_states = non_final_next_states.to(self.device)

        curr_state_values = self.policy_model(state_batch)  # [batch, n_actions]
        curr_state_action_values = curr_state_values.gather(
            1, action_batch)  # [batch, 1]

        next_state_values = torch.zeros(self.batch_size,
                                        device=self.device)  # [batch]
        next_state_values[non_final_mask] = self.target_model(
            non_final_next_states).max(1)[0].detach()  # non-final entries only

        # Get the expected Q values
        expected_state_action_values = (next_state_values *
                                        self.gamma) + reward_batch  # [batch]
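        # Bellman target: r + gamma * max_a' Q_target(s', a'); terminal
        # transitions keep next_state_values at zero, so their target is just
        # the reward.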
        # compute loss: temporal difference error
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # optimizer step
        self.optim.zero_grad()
        loss.backward()
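        # clamp each gradient element to [-1, 1] to stabilize training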
        for param in self.policy_model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return loss

    def train(self):
        """
        Training loop based on the number of episodes
        :return:
        """

        self.num_episodes = 1000
        self.target_update = 5

        for episode in tqdm(range(self.current_episode, self.num_episodes)):
            self.current_episode = episode
            # reset environment
            self.env.reset_game()
            self.train_one_epoch()
            # The target network has its weights kept frozen most of the time
            if self.current_episode % self.target_update == 0:
                self.target_model.load_state_dict(
                    self.policy_model.state_dict())

            if self.current_episode % 50 == 0:
                torch.save(
                    self.policy_model.state_dict(), self.savepath +
                    "policy_epoch" + str(self.current_episode) + ".pth")
                torch.save(
                    self.target_model.state_dict(), self.savepath +
                    "target_epoch" + str(self.current_episode) + ".pth")

    def train_one_epoch(self):
        """
        One episode of training: at each step it selects an action, observes the next screen, stores the transition, and optimizes the model once.
        :return:
        """
        episode_duration = 0

        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while True:
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.select_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            # add this transition into memory
            self.memory.push_transition(curr_state, action, next_state, reward)

            curr_state = next_state

            # Policy model optimization step
            curr_loss = self.optimize_policy_model()
            if curr_loss is not None:
                if self.cuda:
                    curr_loss = curr_loss.cpu()

            if done:
                print(score)
                break

    def validate(self):

        self.env.reset_game()
        episode_duration = 0

        curr_state = torch.Tensor(self.env.get_state()).permute(2, 0,
                                                                1).unsqueeze(0)

        while True:
            # time.sleep(0.1)

            episode_duration += 1
            # select action
            action = self.get_action(curr_state)

            images, reward, done, score = self.env.step(action.item())  #TODO

            reward = torch.Tensor([reward]).to(self.device)

            # assign next state
            if done:
                next_state = None
            else:
                next_state = torch.Tensor(images).permute(2, 0, 1).unsqueeze(
                    0)  #TODO

            curr_state = next_state

            if done:
                print(score)
                break