Example 1
class Agent():
    def __init__(self, render=False, method='Duel'):

        # Create an instance of the network itself, as well as the memory.
        # Here is also a good place to set environmental parameters,
        # as well as training parameters - number of episodes / iterations, etc.
        self.render = render
        if render:
            self.env = gym.make('NEL-render-v0')
        else:
            self.env = gym.make('NEL-v0')
        #self.test_env = gym.make('NEL-v0')
        self.an = self.env.action_space.n  # No. of actions in env
        self.epsilon = 0.5
        self.training_time = PARAM.TRAINING_TIME  # Training Time
        self.df = PARAM.DISCOUNT_FACTOR  # Discount Factor
        self.batch_size = PARAM.BATCH_SIZE
        self.method = method
        self.test_curr_state = None
        self.log_time = 100.0
        self.test_time = 1000.0
        self.prioritized_replay = PARAM.PRIORITIZED_REPLAY
        self.prioritized_replay_eps = 1e-6
        #self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_alpha = 0.8
        self.prioritized_replay_beta0 = 0.4
        self.burn_in = PARAM.BURN_IN

        # Create Replay Memory and initialize with burn_in transitions
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                PARAM.REPLAY_MEMORY_SIZE, alpha=self.prioritized_replay_alpha)
            self.beta_schedule = LinearSchedule(
                float(self.training_time),
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)
            self.beta_schedule = None

        # Create QNetwork instance
        if self.method == 'Duel':
            print('Using Duel Network.')
            self.net = DuelQNetwork(self.an)
        elif self.method == 'DoubleQ':
            print('Using DoubleQ Network.')
            self.net = DoubleQNetwork(self.an)
        else:
            raise NotImplementedError

        cur_dir = os.getcwd()
        self.dump_dir = cur_dir + '/tmp_' + self.method + '_' + time.strftime(
            "%Y%m%d-%H%M%S") + '/'
        # Create output directory
        if not os.path.exists(self.dump_dir):
            os.makedirs(self.dump_dir)
        self.train_file = open(self.dump_dir + 'train_rewards.txt', 'w')
        self.test_file = open(self.dump_dir + 'test_rewards.txt', 'w')

    def update_epsilon(self):
        ''' Linearly decay epsilon by (0.5 - 0.1)/200000 per step, with a floor of 0.05. '''
        if self.epsilon <= 0.05:
            self.epsilon = 0.05
            return

        self.epsilon = self.epsilon - (0.5 - 0.1) / 200000.0

    def epsilon_greedy_policy(self, q_values, epsilon):
        # Creating epsilon greedy probabilities to sample from.
        val = np.random.rand(1)
        if val <= epsilon:
            return np.random.randint(q_values.shape[1])
        return np.argmax(q_values)

    def greedy_policy(self, q_values):
        # Creating greedy policy for test time.
        return np.argmax(q_values)

    def train(self):
        train_rewards = []
        test_rewards = []
        count = 0
        steps = 0
        test_steps = 0

        cum_reward = 0.0
        elapsed = 0.0

        curr_state = self.env.reset()
        curr_state = self.burn_in_memory(curr_state)
        prev_action = -1
        if self.render:
            self.env.render()
        for i in range(self.training_time):
            # Get q_values based on the current state
            Vt, St = self.get_input_tensor(curr_state)
            q_values = self.net.get_Q_output(Vt, St)

            # Selecting an action based on the policy
            action = self.epsilon_greedy_policy(q_values, self.epsilon)
            #if not curr_state['moved'] and action == prev_action and self.epsilon > 0.1:
            #  action = self.epsilon_greedy_policy(q_values, 0.5)

            # Executing action in simulator
            nextstate, reward, _, _ = self.env.step(action)
            steps = steps + 1
            test_steps = test_steps + 1
            if self.render:
                self.env.render()

            # Store Transition
            if nextstate['moved'] or prev_action != action:
                self.replay_buffer.add(curr_state, action, reward / 100.0,
                                       nextstate, 0)
            prev_action = action

            # Sample random minibatch from experience replay
            if self.prioritized_replay:
                batch, weights, batch_idxes = self.replay_buffer.sample(
                    self.batch_size, beta=self.beta_schedule.value(i))
            else:
                batch = self.replay_buffer.sample(self.batch_size)
                weights, batch_idxes = np.ones(self.batch_size), None

            # Train the Network with mini batches
            xVT, xST = self.get_input_tensors(batch)
            yT = self.get_output_tensors(batch)

            # Mask to select the actions from the Q network output
            mT = torch.zeros(self.batch_size, self.an, dtype=torch.uint8)
            for k, tran in enumerate(batch):
                mT[k, tran[1]] = 1
            td_errors = self.net.train(xVT, xST, yT, mT, weights)

            if self.prioritized_replay:
                # New priorities are the absolute TD errors, plus a small
                # epsilon so no transition ends up with zero priority.
                new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)

            # Decay epsilon
            self.update_epsilon()

            cum_reward += reward
            curr_state = nextstate

            if steps == 100:
                cum_reward = cum_reward / float(self.log_time)
                train_rewards.append(cum_reward)
                self.train_file.write(str(cum_reward))
                self.train_file.write('\n')
                self.train_file.flush()
                cum_reward = 0.0
                print('Train Reward: %.4f' % (train_rewards[-1]))
                steps = 0

                x = list(range(len(train_rewards)))
                plt.plot(x, train_rewards, '-bo')
                plt.xlabel('Time')
                plt.ylabel('Average Reward')
                plt.title('Training Curve')
                plt.savefig(self.dump_dir + 'Training_Curve_' + self.method +
                            '.png')
                plt.close()

                plot(self.dump_dir + self.method, train_rewards)


#      if test_steps == 500:
#        self.net.set_eval()
#        test_rewards.append(self.test())
#        self.test_file.write(str(test_rewards[-1]))
#        self.test_file.write('\n')
#        self.test_file.flush()
#        self.net.set_train()
#        count = count + 1
#        print('\nTest Reward: %.4f\n' % (test_rewards[-1]))
#        test_steps = 0
#
#        x = list(range(len(test_rewards)))
#        plt.plot(x, test_rewards, '-bo')
#        plt.xlabel('Time')
#        plt.ylabel('Average Reward')
#        plt.title('Testing Curve')
#        plt.savefig(self.dump_dir + 'Testing_Curve_' + self.method + '.png')
#        plt.close()

            if count > 0 and count % 30 == 0:
                self.net.save_model_weights(count, self.dump_dir)

    def test(self, testing_steps=100, model_file=None, capture=False):
        if model_file is not None:
            self.net.load_model(model_file)

        if not hasattr(self, 'test_env'):
            # Create the evaluation env on first use (it is not created in __init__).
            self.test_env = gym.make('NEL-v0')

        if capture:
            self.test_env = gym.wrappers.Monitor(self.test_env, './')

        epsilon = 0.05
        rewards = []

        self.test_curr_state = self.test_env.reset()
        #if self.render:
        #  self.test_env.render()
        cum_reward = 0.0
        for i in range(testing_steps):
            # Initializing the episodes
            Vt, St = self.get_input_tensor(self.test_curr_state)
            q_values = self.net.get_Q_output(Vt, St)
            action = self.epsilon_greedy_policy(q_values, epsilon)

            # Executing action in simulator
            nextstate, reward, _, _ = self.test_env.step(action)
            #if self.render:
            #  self.test_env.render()

            cum_reward += reward
            self.test_curr_state = nextstate
        avg_reward = cum_reward / float(testing_steps)
        rewards.append(avg_reward)

        return avg_reward

    def burn_in_memory(self, curr_state):
        # Initialize the replay memory with burn_in transitions.
        cnt = 0
        while cnt < self.burn_in:
            # Select random actions to seed the replay buffer.
            action = self.env.action_space.sample()
            next_state, reward, _, _ = self.env.step(action)

            self.replay_buffer.add(curr_state, action, reward / 100.0,
                                   next_state, 0)

            curr_state = next_state
            cnt = cnt + 1
        return curr_state

    def get_input_tensor(self, obs):
        ''' Returns an input tensor from the observation. '''
        iV = np.zeros((1, 3, 11, 11))
        iS = np.zeros((1, 4))

        iV[0] = np.moveaxis(obs['vision'], -1, 0)
        iS[0] = np.concatenate((obs['scent'], np.array([int(obs['moved'])])),
                               axis=0)
        iVt, iSt = torch.from_numpy(iV).float(), torch.from_numpy(iS).float()
        return iVt, iSt

    def get_input_tensors(self, batch, next_state=False):
        ''' Returns an input tensor created from the sampled batch. '''
        V = np.zeros((self.batch_size, 3, 11, 11))
        S = np.zeros((self.batch_size, 4))
        for i, tran in enumerate(batch):
            if next_state:
                obs = tran[3]  # next state
            else:
                obs = tran[0]  # current state

            V[i] = np.moveaxis(obs['vision'], -1, 0)
            S[i] = np.concatenate(
                (obs['scent'], np.array([int(obs['moved'])])), axis=0)
        Vt, St = torch.from_numpy(V).float(), torch.from_numpy(S).float()
        return Vt, St

    def get_output_tensors(self, batch):
        ''' Returns an output tensor created from the sampled batch. '''
        Y = np.zeros(self.batch_size)
        Vt, St = self.get_input_tensors(batch, next_state=True)
        q_values_a = self.net.get_Q_output(Vt, St)
        q_values_e = self.net.get_target_output(Vt, St)
        for i, tran in enumerate(batch):
            action = self.greedy_policy(q_values_a[i])
            Y[i] = tran[2] + self.df * q_values_e[i][action]

        Yt = torch.from_numpy(Y).float()
        return Yt
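
The prioritized-replay branch above anneals beta through a LinearSchedule helper that is not included in the listing. A minimal sketch matching the interface used above (total timesteps in the constructor, initial_p/final_p keywords, and a value(t) method), assuming baselines-style semantics rather than the author's exact implementation:

class LinearSchedule:
    ''' Linear interpolation from initial_p to final_p over schedule_timesteps,
        held constant at final_p afterwards (interface assumed from the usage above). '''

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)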
Example 2
class DQNAgent:
    def __init__(self, gamma, action_number, minibatch, episodes, begin_train,
                 train_step, begin_copy, copy_step, epsilon_delta,
                 epsilon_start, epsilon_end, load_model, path_to_load,
                 path_to_save, episode_steps, episode_to_save, max_buffer_len):

        # Epsilon

        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch
        self.action_number = action_number
        self.gamma = gamma

        # Episode Params

        self.begin_train = begin_train
        self.begin_copy = begin_copy
        self.copy_step = copy_step
        self.train_step = train_step
        self.episodes = episodes
        self.episode_steps = episode_steps
        self.episode_to_save = episode_to_save

        # I/O params

        self.path_to_load = path_to_load
        self.path_to_save = path_to_save
        self.load_model = load_model

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')
        # self.device = torch.device('cpu')
        self.model = BoxModel((150, 100, 1), action_number).to(self.device)
        if self.load_model:
            self.model.load_state_dict(torch.load(self.path_to_load))

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []

    def reduce_epsilon(self, episode):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            state = torch.autograd.Variable(
                torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(observation):
        rgb = observation[30:180, 30:130] / 255
        r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
        gray = 0.2989 * r + 0.5870 * g + 0.1140 * b
        return gray.reshape(1, 150, 100)

    def transition_process(self, o_state, o_act, o_reward, o_next_state,
                           o_done):

        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state)
        q_next = self.model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        # Detach the target so gradients only flow through the online Q estimate.
        loss = (q.gather(1, o_act.unsqueeze(1)).squeeze(1) -
                y_hat.detach()).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()

    def print(self, episode, reward_black, reward_white, epsilon):
        print(f"For episode {episode} reward white - "
              f"{reward_white} and black - {reward_black}, "
              f"epsilon - {epsilon}")

    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        start = time()
        print("Begin to Train")

        for episode in range(self.episodes):
            observation = env.reset()
            self.state = self.preprocess_observation(observation)
            reward_black, reward_white, total_reward = 0, 0, 0
            for episode_steps in range(self.episode_steps):
                action = self.epsilon_greedy()
                next_observation, reward, done, _ = env.step(action)
                reward_black += (reward < 0) * abs(reward)
                reward_white += (reward > 0) * reward
                total_reward += reward
                next_state = self.preprocess_observation(next_observation)
                self.replay_buffer.push(self.state, action, reward, next_state,
                                        done)
                self.state = next_state
                if len(self.replay_buffer) >= self.begin_train:
                    self.train_model()
                # if (episode_step >= self.begin_copy) and (episode_step % self.copy_step == 0):
                #     plt.plot(total_reward)
                #     plt.show()
                # self.const_model = self.model.clone()
                if done:
                    break

            self.reduce_epsilon(episode)
            if episode != 0 and episode % self.episode_to_save == 0:
                torch.save(self.model.state_dict(), self.path_to_save)
                plt.plot(self.rewards)
                plt.show()

            self.rewards_black.append(reward_black)
            self.rewards_white.append(reward_white)
            self.rewards.append(total_reward)
            self.print(episode,
                       reward_black=reward_black,
                       reward_white=reward_white,
                       epsilon=self.epsilon)
            print(time() - start)

    def play(self, env: gym.wrappers.time_limit.TimeLimit):
        observation = env.reset()
        reward_black, reward_white, total_reward = 0, 0, 0
        for episode_steps in range(self.episode_steps):
            state = self.preprocess_observation(observation)
            state = torch.autograd.Variable(
                torch.FloatTensor(state).to(self.device).unsqueeze(0))
            print(self.model(state))
            action = self.model(state).max(1)[1].item()
            observation, reward, done, _ = env.step(action)
            reward_black += (reward < 0) * abs(reward)
            reward_white += (reward > 0) * reward
            total_reward += reward
            sleep(0.01)
            env.render()
            if done:
                break
        print(total_reward)
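
This agent (and the one below) assumes a ReplayBuffer with push, sample, and __len__; the class itself is not part of the listing. A minimal uniform-sampling sketch consistent with how transition_process consumes the sample, under those assumptions:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    ''' Minimal uniform experience replay (interface assumed from the usage above). '''

    def __init__(self, max_len):
        self.buffer = deque(maxlen=max_len)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, float(done)))

    def sample(self, batch_size):
        # Uniformly sample batch_size transitions and regroup them by field.
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.stack(states), actions, rewards, np.stack(next_states), dones

    def __len__(self):
        return len(self.buffer)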
Example 3
class DDQNAgentCnn(GeneralAgent):
    def __init__(self,
                 gamma,
                 action_number,
                 minibatch,
                 episodes,
                 begin_train,
                 copy_step,
                 epsilon_delta,
                 epsilon_start,
                 epsilon_end,
                 load_model,
                 path_to_load,
                 path_to_save,
                 plots_to_save,
                 episode_steps,
                 episode_to_save,
                 max_buffer_len,
                 model_type
                 ):

        super().__init__(gamma=gamma,
                         action_number=action_number,
                         path_to_load=path_to_load,
                         path_to_save=path_to_save,
                         plots_to_save=plots_to_save,
                         load_model=load_model,
                         episode_to_save=episode_to_save,
                         episodes=episodes,
                         model_type=model_type)
        # Epsilon

        self.epsilon_delta = epsilon_delta
        self.epsilon_end = epsilon_end
        self.epsilon_start = epsilon_start
        self.epsilon = epsilon_start

        # Main Params

        self.minibatch = minibatch

        # Episode Params

        self.begin_train = begin_train
        self.copy_step = copy_step
        self.episode_steps = episode_steps

        # Model Fields

        self.action = None
        self.state = None
        self.replay_buffer = ReplayBuffer(max_buffer_len)

        # Model
        self.target_model = model_type(action_number).to(self.device)
        self.update_target()

        # Rewards

        self.rewards_white, self.rewards_black, self.rewards = [], [], []
        self.losses = []
        self.periodic_reward = 0
        self.periodic_rewards = []

    def update_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def reduce_epsilon(self, episode):
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            np.exp(-1. * episode / self.epsilon_delta)

    def epsilon_greedy(self):
        if (1 - self.epsilon) <= np.random.random():
            self.action = np.random.randint(self.action_number)
        else:
            state = torch.autograd.Variable(torch.FloatTensor(self.state).to(self.device).unsqueeze(0))
            self.action = self.model(state).max(1)[1].item()
        return self.action

    @staticmethod
    def preprocess_observation(obs):
        img = resize(rgb2gray(obs[0:188, 23:136, :]), (28, 28), mode='constant')
        img = img.reshape(1, 28, 28)
        return img

    def transition_process(self, o_state, o_act, o_reward, o_next_state, o_done):

        return \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_state)).to(self.device)), \
            torch.autograd.Variable(torch.LongTensor(o_act).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_reward).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(np.float32(o_next_state)).to(self.device)), \
            torch.autograd.Variable(torch.FloatTensor(o_done).to(self.device))

    def train_model(self):
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
        q_next = self.target_model(o_next_state)
        y_hat = o_reward + self.gamma * q_next.max(1)[0] * (1 - o_done)
        loss = (q - y_hat.detach()).pow(2).mean()

        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()
        return loss

    def init_new_episode(self, env):
        observation = env.reset()
        self.state = self.preprocess_observation(observation)

    def episode_check(self, episode, loss):

        if episode % self.copy_step == 0:
            # Store a plain float so the later plt.plot call does not receive
            # tensors that still require grad.
            self.losses.append(float(loss))
            self.update_target()

        if episode % self.episode_steps == 0:
            self.periodic_rewards.append(self.periodic_reward / self.episode_steps)
            self.periodic_reward = 0

        if episode % self.episode_to_save == 0:
            torch.save(self.model.state_dict(), self.path_to_save)
            fig = plt.figure()
            plt.plot(self.rewards)
            fig.savefig(self.plots_to_save + '_reward.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.losses)
            fig.savefig(self.plots_to_save + '_loss.png')
            plt.close(fig)
            fig = plt.figure()
            plt.plot(self.periodic_rewards)
            fig.savefig(self.plots_to_save + '_periodic_reward.png')
            plt.close(fig)
            
    def train(self, env: gym.wrappers.time_limit.TimeLimit):
        self.init_new_episode(env)
        total_reward = 0
        episode_reward = 0
        loss = 0
        for episode in self.trangle:
            self.trangle.set_description(
                f"Episode: {episode} | Episode Reward {episode_reward} | Periodic reward "
                f"{self.periodic_reward / self.episode_steps} | Average Reward {total_reward / (episode + 1)}"
            )
            self.trangle.refresh()
            action = self.epsilon_greedy()
            next_observation, reward, done, _ = env.step(action)
            total_reward += reward
            episode_reward += reward
            self.periodic_reward += reward
            next_state = self.preprocess_observation(next_observation)
            self.replay_buffer.push(self.state, action, reward, next_state, done)
            self.state = next_state
            if len(self.replay_buffer) >= self.begin_train:
                loss = self.train_model()

            self.reduce_epsilon(episode)
            self.episode_check(episode, loss)

            if done:
                self.init_new_episode(env)
                self.rewards.append(episode_reward)
                episode_reward = 0
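
As written, train_model bootstraps from q_next.max(1)[0] of the target network, which is the standard target-network DQN update; a Double-DQN target (as the class name DDQNAgentCnn suggests) selects the next action with the online network and evaluates it with the target network. A sketch of that variant of train_model, assuming the same fields (model, target_model, replay_buffer, minibatch, gamma) as above:

    def train_model(self):
        ''' Double-DQN update sketch: the online net picks the action, the target net scores it. '''
        o_state, o_act, o_reward, o_next_state, o_done = \
            self.transition_process(*self.replay_buffer.sample(self.minibatch))
        q = self.model(o_state).gather(1, o_act.unsqueeze(1)).squeeze(1)
        # Action selection with the online network ...
        next_actions = self.model(o_next_state).max(1)[1]
        # ... evaluation with the target network.
        q_next = self.target_model(o_next_state).gather(1, next_actions.unsqueeze(1)).squeeze(1)
        y_hat = o_reward + self.gamma * q_next * (1 - o_done)
        loss = (q - y_hat.detach()).pow(2).mean()
        self.model.optimizer.zero_grad()
        loss.backward()
        self.model.optimizer.step()
        return loss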