Example #1
 def __init__(self,
              agent,
              name,
              environment,
              network,
              global_dict,
              report_frequency,
              step_size=0.9,
              epsilon_annealing_start=0.9,
              epsilon_annealing_end=0,
              load_model_path=None,
              using_e_greedy=True):
     super().__init__(agent=agent,
                      name=name,
                      environment=environment,
                      network=network,
                      global_dict=global_dict,
                      report_frequency=report_frequency)
     self.step_size = step_size
     self.epsilon_start = epsilon_annealing_start
     self.epsilon_end = epsilon_annealing_end
     self.load_model_path = load_model_path
     self.using_e_greedy = using_e_greedy
     r, _ = environment.get_state_space().get_range()
     self.num_of_states = len(r)
     self.table = [0.5 for _ in range(self.num_of_states)]
     self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                      epsilon_annealing_end,
                                      self.agent.max_training_steps)
     self.current_epsilon = epsilon_annealing_start
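
Every example on this page builds an Annealer(start, end, steps) and later calls anneal(step) or anneal_to(step) on it, but the class itself is never listed. The following is only a minimal sketch of a linear schedule consistent with that usage; the method names, the aliasing of anneal/anneal_to, and the steps=None behaviour are assumptions inferred from these examples, not the library's actual implementation.

class Annealer:
    # Minimal sketch: linearly interpolate from `start` to `end` over `steps` steps.
    def __init__(self, start, end, steps):
        self.start = start
        self.end = end
        self.steps = steps

    def anneal_to(self, current_step):
        if self.steps is None:
            # Example #6 passes steps=None together with a "disable annealing"
            # comment, so a None schedule is assumed to return the start value.
            return self.start
        if current_step >= self.steps:
            return self.end
        fraction = current_step / float(self.steps)
        return self.start + (self.end - self.start) * fraction

    # The examples call both anneal() and anneal_to(); they are treated as aliases here.
    anneal = anneal_to

# Usage consistent with the constructors above:
epsilon_annealer = Annealer(1.0, 0.1, 1e6)
current_epsilon = epsilon_annealer.anneal(250000)  # 0.775, a quarter of the way from 1.0 to 0.1
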
Example #2
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 async_update_steps=1,
                 reward_clip_vals=None,
                 using_e_greedy=True,
                 epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=100000,
                 global_epsilon_annealing=True,
                 report_frequency=1):
        MOBaseLearner.__init__(self,
                               network.get_config().get_num_of_objectives())
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)

        self.current_epsilon = epsilon_annealing_start
        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.reward_clip_vals = reward_clip_vals
        self.name = name
        self.agent = agent

        self.num_actions = len(range)

        self.network = network
        self.config = network.network_config
        self.history_length = self.config.get_history_length()
        if self.history_length > 1:
            self.frame_buffer = StateBuffer([1] +
                                            self.config.get_input_shape())

        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.report_frequency = report_frequency

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = False
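
When history_length > 1 the learners wrap observations in a StateBuffer([1] + input_shape) and call add_state, get_buffer, get_buffer_add_state and reset on it. The sketch below shows one way such a rolling frame buffer could work; it assumes frames are stacked along the last axis of a [1, height, width, history] array, which is a guess about the shape convention rather than the library's real StateBuffer.

import numpy as np

class StateBuffer:
    # Sketch of a rolling buffer that keeps the most recent `history` frames.
    def __init__(self, shape):
        self.shape = shape
        self.buffer = np.zeros(shape, dtype=np.float32)

    def reset(self):
        self.buffer.fill(0)

    def add_state(self, state):
        # Drop the oldest frame and append the newest one.
        self.buffer = np.roll(self.buffer, shift=-1, axis=-1)
        self.buffer[..., -1] = state

    def get_buffer(self):
        return self.buffer

    def get_buffer_add_state(self, state):
        # Peek at the stack as it would look after adding `state`, without mutating it.
        peek = np.roll(self.buffer, shift=-1, axis=-1)
        peek[..., -1] = state
        return peek
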
Example #3
 def __init__(self, agent, name, environment, network, global_dict, report_frequency,
              network_update_steps=5, reward_clip_thresholds=(-1, 1)
              ):
     super().__init__(agent=agent, name=name, environment=environment, network=network,
                      global_dict=global_dict,
                      report_frequency=report_frequency)
     self.async_update_steps = network_update_steps
     self.reward_clip_thresholds = reward_clip_thresholds
     self.initial_learning_rate = network.get_config().get_initial_learning_rate()
     self.current_learning_rate = self.initial_learning_rate
     self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0, self.agent.max_training_steps)
Example #4
 def __init__(self,
              agent,
              name,
              environment,
              network,
              global_dict,
              report_frequency,
              network_update_steps=5,
              reward_clip_thresholds=(-1, 1),
              auxiliary_model_path=None,
              alpha=0.2,
              epsilon=0.02):
     super().__init__(agent=agent,
                      name=name,
                      environment=environment,
                      network=network,
                      global_dict=global_dict,
                      report_frequency=report_frequency)
     self.async_update_steps = network_update_steps
     self.reward_clip_thresholds = reward_clip_thresholds
     self.initial_learning_rate = network.get_config().get_initial_learning_rate()
     self.current_learning_rate = self.initial_learning_rate
     self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0,
                                            self.agent.max_training_steps)
     self.auxiliary_model_path = auxiliary_model_path
     self.alpha = alpha
     self.epsilon = epsilon
     self.load_model()
Example #5
 def __init__(self,
              agent,
              name,
              environment,
              network,
              global_dict,
              report_frequency,
              batch_size=32,
              warmup_steps=50000,
              training_frequency=4,
              experience_replay_size=2**19,
              epsilon_annealing_start=1,
              epsilon_annealing_end=0.1,
              initial_beta=0.4,
              prioritized_alpha=0.6,
              epsilon_annealing_steps=1e6,
              reward_clip_thresholds=(-1, 1)):
     super().__init__(agent=agent,
                      name=name,
                      environment=environment,
                      network=network,
                      global_dict=global_dict,
                      report_frequency=report_frequency)
     global experience_replay
     with global_dict[AgentMonitor.Q_LOCK]:
         if experience_replay is None:
             experience_replay = SyncSumTree(
                 alpha=prioritized_alpha,
                 size=experience_replay_size,
                 state_history=network.get_config().get_history_length(),
                 debug=False)
     self.replay = experience_replay
     self.batch_size = batch_size
     self.warmup_steps = warmup_steps
     self.training_frequency = training_frequency
     self.reward_clip_thresholds = reward_clip_thresholds
     self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                      epsilon_annealing_end,
                                      epsilon_annealing_steps)
     self.current_learning_rate = network.get_config().get_initial_learning_rate()
     self.current_epsilon = epsilon_annealing_start
     self.beta_annealer = Annealer(initial_beta, 1,
                                   self.agent.max_training_steps)
     self.current_beta = initial_beta
     self.initial_beta = initial_beta
     self.prioritized_alpha = prioritized_alpha
Example #6
    def __init__(self, environment, num_of_epochs=10, steps_per_epoch=100000,
                 log_dir='./train/moq', using_e_greedy=True, report_frequency=100,
                 summary_frequency=900000, discounted_factor=0.9, learning_rate=0.9, traces_factor=0.9, batch_size=5,
                 epsilon_annealing_start=0.9, load_model_path=None, thresholds=None, target_reward=None, is_linear=False):
        super().__init__(None, environment, num_of_threads=1, num_of_epochs=num_of_epochs,
                         steps_per_epoch=steps_per_epoch, log_dir=log_dir, using_e_greedy=using_e_greedy,
                         anneal_learning_rate=False, report_frequency=report_frequency,
                         save_frequency=summary_frequency
                         )

        self.initial_learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.load_model_path = load_model_path

        self.gamma = discounted_factor
        self.traces_factor = traces_factor

        self.is_linear = is_linear

        # Disable learning-rate annealing
        self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0, None)

        self.num_of_objectives = environment.get_num_of_objectives()

        self.init_q_values = [0.] * self.num_of_objectives
        self.thresholds = [0.] * (self.num_of_objectives - 1)

        if not is_linear:
            self.table = TLOLookupTable(environment=environment, init_value=0., thresholds=self.thresholds)
        else:
            self.table = LinearLookupTable(environment=environment, init_value=0., thresholds=self.thresholds)

        self.env_pool = [self.env.clone() for _ in range(self.num_of_threads)]

        self.thread_host = AgentMonitor(self, network=None, log_dir=self.log_dir, save_interval=summary_frequency,
                                        max_training_epochs=self.num_of_epochs, steps_per_epoch=self.steps_per_epoch,
                                        multi_objectives=True, idle_time=0)

        self.thread_pool = [MOQWorker(self, name='MOQWorker: ' + str(t), environment=self.env_pool[t],
                                      global_dict=self.thread_host.shared_dict, num_of_objs=self.num_of_objectives,
                                      async_update_steps=1,
                                      using_e_greedy=self.using_e_greedy,
                                      report_frequency=report_frequency,
                                      epsilon_annealing_start=epsilon_annealing_start,
                                      epsilon_annealing_choices=[0],
                                      epsilon_annealing_probabilities=[1.0],
                                      epsilon_annealing_steps=num_of_epochs * steps_per_epoch,
                                      global_epsilon_annealing=True,
                                      gamma=discounted_factor,
                                      traces_factor=traces_factor,
                                      batch_size=batch_size,
                                      load_model_path=load_model_path,
                                      lookup_table=self.table,
                                      thresholds=thresholds,
                                      target_reward=target_reward,
                                      is_linear=is_linear
                                      )
                                for t in range(self.num_of_threads)]
Example #7
 def __init__(self,
              agent,
              name,
              environment,
              network,
              global_dict,
              report_frequency,
              batch_size=32,
              warmup_steps=50000,
              training_frequency=4,
              experience_replay_size=2**19,
              epsilon_annealing_start=1,
              epsilon_annealing_end=0.1,
              epsilon_annealing_steps=1e6,
              reward_clip_thresholds=(-1, 1)):
     super().__init__(agent=agent,
                      name=name,
                      environment=environment,
                      network=network,
                      global_dict=global_dict,
                      report_frequency=report_frequency)
     global experience_replay
     with global_dict[AgentMonitor.Q_LOCK]:
         if experience_replay is None:
             experience_replay = SyncExperienceReplay(
                 experience_replay_size,
                 state_history=network.network_config.get_history_length())
     self.replay = experience_replay
     self.batch_size = batch_size
     self.warmup_steps = warmup_steps
     self.training_frequency = training_frequency
     self.reward_clip_thresholds = reward_clip_thresholds
     self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                      epsilon_annealing_end,
                                      epsilon_annealing_steps)
     self.current_learning_rate = network.get_config().get_initial_learning_rate()
     self.current_epsilon = epsilon_annealing_start
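
This constructor lazily creates one module-level SyncExperienceReplay shared by all learner instances, guarded by AgentMonitor.Q_LOCK. Only append(...) and get_mini_batch(batch_size=...) are exercised in these examples; a thread-safe uniform replay buffer matching that interface might look like the sketch below. It is a simplified stand-in rather than the library's implementation, and state_history is accepted only for interface compatibility.

import random
import threading
from collections import deque

class SyncExperienceReplay:
    # Sketch of a thread-safe, uniformly sampled replay buffer.
    def __init__(self, size, state_history=1):
        self.buffer = deque(maxlen=int(size))
        self.state_history = state_history  # unused in this sketch
        self.lock = threading.Lock()

    def append(self, state, action, reward, next_state, terminal):
        with self.lock:
            self.buffer.append((state, action, reward, next_state, terminal))

    def get_mini_batch(self, batch_size):
        with self.lock:
            batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        # Transpose the transitions into five parallel tuples (s, a, r, n, t).
        states, actions, rewards, next_states, terminals = zip(*batch)
        return states, actions, rewards, next_states, terminals
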
Example #8
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 report_frequency,
                 batch_size=5,
                 discounted_factor=0.9,
                 learning_rate=0.9,
                 traces_factor=0.9,
                 epsilon_annealing_start=0.9,
                 epsilon_annealing_end=0,
                 load_model_path=None,
                 thresholds=None,
                 target_reward=None,
                 is_linear=False,
                 using_e_greedy=True,
                 async_update_steps=1):
        super().__init__(agent=agent,
                         name=name,
                         environment=environment,
                         network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)

        self.load_model_path = load_model_path
        self.target_reward = target_reward
        self.is_linear = is_linear
        self.discounted_factor = discounted_factor
        self.traces_factor = traces_factor
        self.using_e_greedy = using_e_greedy
        self.async_update_steps = async_update_steps

        self.num_of_objectives = environment.get_number_of_objectives()
        self.init_q_values = [0.] * self.num_of_objectives
        if thresholds is None:
            if not is_linear:
                self.thresholds = [0.] * (self.num_of_objectives - 1)
            else:
                self.thresholds = [1. / self.num_of_objectives
                                   ] * self.num_of_objectives
        else:
            self.thresholds = thresholds

        global table
        with global_dict[AgentMonitor.Q_LOCK]:
            if table is None:
                if not is_linear:
                    table = TLOLookupTable(environment=environment,
                                           init_value=0.,
                                           thresholds=self.thresholds)
                else:
                    table = LinearLookupTable(environment=environment,
                                              init_value=0.,
                                              thresholds=self.thresholds)

        self.table = table
        self.batch_size = batch_size
        self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                         epsilon_annealing_end,
                                         self.agent.max_training_steps)
        self.current_learning_rate = learning_rate
        self.current_epsilon = epsilon_annealing_start
        self.converged = False
Example #9
class MOQLearner(Learner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 report_frequency,
                 batch_size=5,
                 discounted_factor=0.9,
                 learning_rate=0.9,
                 traces_factor=0.9,
                 epsilon_annealing_start=0.9,
                 epsilon_annealing_end=0,
                 load_model_path=None,
                 thresholds=None,
                 target_reward=None,
                 is_linear=False,
                 using_e_greedy=True,
                 async_update_steps=1):
        super().__init__(agent=agent,
                         name=name,
                         environment=environment,
                         network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)

        self.load_model_path = load_model_path
        self.target_reward = target_reward
        self.is_linear = is_linear
        self.discounted_factor = discounted_factor
        self.traces_factor = traces_factor
        self.using_e_greedy = using_e_greedy
        self.async_update_steps = async_update_steps

        self.num_of_objectives = environment.get_number_of_objectives()
        self.init_q_values = [0.] * self.num_of_objectives
        if thresholds is None:
            if not is_linear:
                self.thresholds = [0.] * (self.num_of_objectives - 1)
            else:
                self.thresholds = [1. / self.num_of_objectives
                                   ] * self.num_of_objectives
        else:
            self.thresholds = thresholds

        global table
        with global_dict[AgentMonitor.Q_LOCK]:
            if table is None:
                if not is_linear:
                    table = TLOLookupTable(environment=environment,
                                           init_value=0.,
                                           thresholds=self.thresholds)
                else:
                    table = LinearLookupTable(environment=environment,
                                              init_value=0.,
                                              thresholds=self.thresholds)

        self.table = table
        self.batch_size = batch_size
        self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                         epsilon_annealing_end,
                                         self.agent.max_training_steps)
        self.current_learning_rate = learning_rate
        self.current_epsilon = epsilon_annealing_start
        self.converged = False

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def load_model(self):
        self.table.load_value_function(self.load_model_path)
        print("Load values:")
        self.table.print_values()

    def save_model(self, file_name):
        print("Save values:")
        self.table.print_values()
        self.table.save_value_function(file_name)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count,
              'Episode reward:', reward, 'Steps:',
              self.environment.get_current_steps(), 'Step count:',
              self.step_count, 'Learning rate:', self.current_learning_rate,
              'Epsilon:', self.current_epsilon, 'Thresholds:', self.thresholds)

        # Testing/evaluation: run one greedy episode and compare it against the target reward
        if self.target_reward is not None and self.thresholds is not None:
            backup_epsilon = self.current_epsilon
            self.current_epsilon = 0
            greedy_reward = self.run_episode()
            self.global_dict[AgentMonitor.Q_ADD_REWARD](
                greedy_reward, self.environment.get_current_steps())
            self.current_epsilon = backup_epsilon
            converged = True
            for i in range(len(greedy_reward)):
                if greedy_reward[i] != self.target_reward[i]:
                    converged = False
                    break
            if converged:
                print("Converged")
                self.converged = True

    def update(self, state, action, reward, next_state, terminal):
        self.step_count += 1
        self.global_dict['counter'] += 1

        if not self.testing:
            if self.step_count % self.async_update_steps == 0:
                if not terminal:
                    greedy = self.get_action(state)
                    self.table.calculate_td_errors(action, state, greedy,
                                                   next_state,
                                                   self.discounted_factor,
                                                   reward)
                else:
                    self.table.calculate_terminal_td_errors(
                        action, state, self.discounted_factor, reward)
                self.table.update(action, state, 1.0,
                                  self.current_learning_rate)

                self.current_epsilon = self.epsilon_annealer.anneal(
                    self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
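
MOQLearner delegates greedy action selection to a TLOLookupTable (or a LinearLookupTable in the linear case) configured with one threshold per objective except the last. Those table classes are not listed here; the sketch below only illustrates the thresholded lexicographic ordering (TLO) idea the name suggests, i.e. clip every objective except the last at its threshold and then compare lexicographically. The function names and data layout are illustrative assumptions, not the library's API.

def tlo_key(q_vector, thresholds):
    # Clip all but the last objective at its threshold; the last objective breaks ties.
    clipped = [min(q, t) for q, t in zip(q_vector[:-1], thresholds)]
    return tuple(clipped) + (q_vector[-1],)

def select_greedy_action_tlo(q_vectors_per_action, thresholds):
    # q_vectors_per_action: one Q-value vector (one entry per objective) for each action.
    keys = [tlo_key(q, thresholds) for q in q_vectors_per_action]
    return max(range(len(keys)), key=lambda i: keys[i])

# Two objectives with a threshold of 0.5 on the first: both actions saturate the first
# objective, so the second objective decides and action 1 is selected.
print(select_greedy_action_tlo([[0.9, 0.2], [0.6, 0.7]], thresholds=[0.5]))  # -> 1
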
Example #10
class TDLearner(Learner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 report_frequency,
                 step_size=0.9,
                 epsilon_annealing_start=0.9,
                 epsilon_annealing_end=0,
                 load_model_path=None,
                 using_e_greedy=True):
        super().__init__(agent=agent,
                         name=name,
                         environment=environment,
                         network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)
        self.step_size = step_size
        self.epsilon_start = epsilon_annealing_start
        self.epsilon_end = epsilon_annealing_end
        self.load_model_path = load_model_path
        self.using_e_greedy = using_e_greedy
        r, _ = environment.get_state_space().get_range()
        self.num_of_states = len(r)
        self.table = [0.5 for _ in range(self.num_of_states)]
        self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                         epsilon_annealing_end,
                                         self.agent.max_training_steps)
        self.current_epsilon = epsilon_annealing_start

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count,
              'Episode reward:', reward, 'Steps:',
              self.environment.get_current_steps(), 'Step count:',
              self.step_count, 'Learning rate:', self.step_size, 'Epsilon:',
              self.current_epsilon)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def update(self, state, action, reward, next_state, terminal):
        self.step_count += 1
        self.global_dict[AgentMonitor.Q_GLOBAL_STEPS] += 1

        if not self.testing:

            if terminal:
                self.data_dict['returns'].append(0)
                for i in range(len(self.data_dict['states'])):
                    new_val = self.data_dict['returns'][-i - 2] = \
                        self.data_dict['rewards'][-i - 1] + \
                        self.discounted_factor * self.data_dict['returns'][-i - 1]

                    cur_val = self.table.get_q_values(
                        self.data_dict['actions'][-i - 1],
                        self.data_dict['states'][-i - 1])

                    new_val = (cur_val * self.eps_count +
                               new_val) / (self.eps_count + 1)

                    self.table.set_q_values(self.data_dict['actions'][-i - 1],
                                            self.data_dict['states'][-i - 1],
                                            new_val)

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
Example #11
class A3CLearner(Learner):
    def __init__(self, agent, name, environment, network, global_dict, report_frequency,
                 network_update_steps=5, reward_clip_thresholds=(-1, 1)
                 ):
        super().__init__(agent=agent, name=name, environment=environment, network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)
        self.async_update_steps = network_update_steps
        self.reward_clip_thresholds = reward_clip_thresholds
        self.initial_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_learning_rate = self.initial_learning_rate
        self.learning_rate_annealer = Annealer(self.initial_learning_rate, 0, self.agent.max_training_steps)

    @staticmethod
    def get_default_number_of_learners():
        return multiprocessing.cpu_count()

    def reset(self):
        super().reset()
        self.reset_batch()

    def reset_batch(self):
        self.data_dict['states'] = []
        self.data_dict['actions'] = []
        self.data_dict['rewards'] = []
        self.data_dict['next_states'] = []
        self.data_dict['terminals'] = []

    def get_action(self, state):
        probs = self.get_probs(state)
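        # Subtract a tiny epsilon so floating-point rounding cannot push the probability
        # sum above 1, which would make np.random.multinomial raise ValueError (hence the
        # greedy fallback in the except branch below).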
        action_probs = probs - np.finfo(np.float32).epsneg
        try:
            sample = np.random.multinomial(1, action_probs)
            action_index = int(np.nonzero(sample)[0])
        except ValueError:
            print('Select greedy action', action_probs)
            action_index = np.argmax(probs)
        return action_index

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0], self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state
            self.data_dict['states'].append(current_s)
            self.data_dict['actions'].append(action)
            self.data_dict['rewards'].append(reward)
            self.data_dict['next_states'].append(next_s)
            self.data_dict['terminals'].append(terminal)

        self.step_count += 1
        self.global_dict[AgentMonitor.Q_GLOBAL_STEPS] += 1

        if not self.testing:
            if self.step_count % self.async_update_steps == 0 or terminal:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]
                self.current_learning_rate = self.learning_rate_annealer.anneal(
                    self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.global_dict[AgentMonitor.Q_LEARNING_RATE] = self.current_learning_rate
                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER].add_summary(
                        summary, global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)
                self.reset_batch()
Example #12
class PrioritizedDQNLearner(Learner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 report_frequency,
                 batch_size=32,
                 warmup_steps=50000,
                 training_frequency=4,
                 experience_replay_size=2**19,
                 epsilon_annealing_start=1,
                 epsilon_annealing_end=0.1,
                 initial_beta=0.4,
                 prioritized_alpha=0.6,
                 epsilon_annealing_steps=1e6,
                 reward_clip_thresholds=(-1, 1)):
        super().__init__(agent=agent,
                         name=name,
                         environment=environment,
                         network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)
        global experience_replay
        with global_dict[AgentMonitor.Q_LOCK]:
            if experience_replay is None:
                experience_replay = SyncSumTree(
                    alpha=prioritized_alpha,
                    size=experience_replay_size,
                    state_history=network.get_config().get_history_length(),
                    debug=False)
        self.replay = experience_replay
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.training_frequency = training_frequency
        self.reward_clip_thresholds = reward_clip_thresholds
        self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                         epsilon_annealing_end,
                                         epsilon_annealing_steps)
        self.current_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_epsilon = epsilon_annealing_start
        self.beta_annealer = Annealer(initial_beta, 1,
                                      self.agent.max_training_steps)
        self.current_beta = initial_beta
        self.initial_beta = initial_beta
        self.prioritized_alpha = prioritized_alpha

    def initialize(self):
        self.data_dict = {
            'states': [],
            'actions': [],
            'rewards': [],
            'next_states': [],
            'terminals': [],
            'learning_rate':
            self.network.get_config().get_initial_learning_rate(),
            'logging': False,
            'global_step': 0,
            'is_weights': None
        }

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def get_action(self, state):
        probs = self.get_probs(state)
        if self.current_epsilon is not None:
            if np.random.uniform(0, 1) < self.current_epsilon:
                return np.random.randint(0, len(probs))
            else:
                return np.argmax(probs)
        else:
            return np.argmax(probs)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count,
              'Episode reward:', reward, 'Steps:',
              self.environment.get_current_steps(), 'Step count:',
              self.step_count, 'Learning rate:',
              self.global_dict[AgentMonitor.Q_LEARNING_RATE], 'Epsilon:',
              self.current_epsilon, 'Beta:', self.current_beta)

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0],
                             self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state
            self.replay.append(current_s, action, reward, next_s, terminal)

        self.step_count += 1
        self.global_dict['counter'] += 1

        if self.step_count < self.warmup_steps:
            return

        if not self.testing:
            if self.step_count % self.training_frequency == 0:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]
                s, a, r, n, t, e, w, p, mw = self.replay.get_mini_batch(
                    batch_size=self.batch_size, current_beta=self.current_beta)
                self.current_beta = self.beta_annealer.anneal(
                    self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                self.data_dict['states'] = s
                self.data_dict['actions'] = a
                self.data_dict['rewards'] = r
                self.data_dict['next_states'] = n
                self.data_dict['terminals'] = t
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.data_dict['global_step'] = self.global_dict[
                    AgentMonitor.Q_GLOBAL_STEPS]
                self.data_dict['is_weights'] = w
                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER].add_summary(
                        summary, global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)

                td_errors = self.network.network_config.get_td_errors(
                    self.network.get_session(), self.data_dict)
                if self.reward_clip_thresholds is not None:
                    td_errors = np.clip(td_errors,
                                        self.reward_clip_thresholds[0],
                                        self.reward_clip_thresholds[1])

                self.replay.update_mini_batch(e, td_errors)

                if self.step_count % 100000 == 0:
                    print(
                        '###################################################################'
                    )
                    print('TD Errors:', td_errors)
                    print('Beta:', self.current_beta)
                    print('Mini Batches:', e)
                    print('Weights:', w)
                    print('Max Weight:', mw)
                    print('Probability:', p)
                    print(
                        '###################################################################'
                    )

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
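
PrioritizedDQNLearner anneals current_beta towards 1 and passes it to get_mini_batch, which returns importance-sampling weights (w), sampling probabilities (p) and a maximum weight (mw) alongside the transitions. The SyncSumTree itself is not listed; the sketch below only illustrates the standard prioritized-replay correction those values presumably correspond to, normalising by the batch maximum for simplicity. It is an assumption based on the usual formulation, not the library's code.

import numpy as np

def importance_sampling_weights(probabilities, buffer_size, beta):
    # w_i = (N * P(i)) ** (-beta), normalised so the largest weight in the batch is 1.
    probabilities = np.asarray(probabilities, dtype=np.float64)
    weights = (buffer_size * probabilities) ** (-beta)
    max_weight = weights.max()
    return weights / max_weight, max_weight

# As beta is annealed from 0.4 towards 1, the correction applied to rarely sampled
# (low-probability) transitions becomes progressively stronger.
w, mw = importance_sampling_weights([0.2, 0.05, 0.01], buffer_size=1000, beta=0.4)
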
Example #13
    def __init__(self,
                 agent,
                 name,
                 environment,
                 global_dict,
                 num_of_objs=1,
                 async_update_steps=5,
                 using_e_greedy=True,
                 epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=10000,
                 global_epsilon_annealing=True,
                 report_frequency=1,
                 gamma=0.9,
                 traces_factor=0.9,
                 batch_size=5,
                 load_model_path=None,
                 lookup_table=None,
                 thresholds=None,
                 target_reward=None,
                 is_linear=False):
        MOBaseLearner.__init__(self, num_of_objs)
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)

        self.current_epsilon = epsilon_annealing_start
        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.name = name
        self.agent = agent
        self.gamma = gamma
        self.traces_factor = traces_factor
        self.batch_size = batch_size
        self.load_model_path = load_model_path

        self.num_actions = len(range)

        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.report_frequency = report_frequency

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = False
        self.target_reward = target_reward
        self.is_linear = is_linear

        self.thresholds = thresholds
        if self.thresholds is None:
            self.thresholds = [0] * (self.num_of_objs - 1)

        self.pareto_solutions = self.environment.get_pareto_solutions()
        # if self.pareto_solutions is not None:
        #    for i in range(len(self.pareto_solutions)):

        self.table = lookup_table
        self.table.set_threshold(self.thresholds)
        if self.load_model_path is not None:
            self.agent.load_model()

        self.alpha = self.agent.get_current_learning_rate()
Example #14
class MOBaseThreadLearner(threading.Thread, MOBaseLearner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 global_dict,
                 num_of_objs=1,
                 async_update_steps=5,
                 using_e_greedy=True,
                 epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=10000,
                 global_epsilon_annealing=True,
                 report_frequency=1,
                 gamma=0.9,
                 traces_factor=0.9,
                 batch_size=5,
                 load_model_path=None,
                 lookup_table=None,
                 thresholds=None,
                 target_reward=None,
                 is_linear=False):
        MOBaseLearner.__init__(self, num_of_objs)
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)

        self.current_epsilon = epsilon_annealing_start
        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.name = name
        self.agent = agent
        self.gamma = gamma
        self.traces_factor = traces_factor
        self.batch_size = batch_size
        self.load_model_path = load_model_path

        self.num_actions = len(range)

        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.report_frequency = report_frequency

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = False
        self.target_reward = target_reward
        self.is_linear = is_linear

        self.thresholds = thresholds
        if self.thresholds is None:
            self.thresholds = [0] * (self.num_of_objs - 1)

        self.pareto_solutions = self.environment.get_pareto_solutions()
        # if self.pareto_solutions is not None:
        #    for i in range(len(self.pareto_solutions)):

        self.table = lookup_table
        self.table.set_threshold(self.thresholds)
        if self.load_model_path is not None:
            self.agent.load_model()

        self.alpha = self.agent.get_current_learning_rate()

    def reset(self):
        self.testing = self.agent.is_testing_mode

        self.reset_minibatch()

        # self.environment.render()

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.eps_count += 1
            if self.target_reward is None:
                self.global_dict['add_reward'](
                    reward, self.environment.get_current_steps())

            if self.eps_count % self.report_frequency == 0:
                current_epsilon = ''
                if self.using_e_greedy:
                    current_epsilon = 'Current epsilon: {0}'.format(
                        self.current_epsilon)
                print(self.name, 'Episode Count:', self.eps_count,
                      'Episode reward:', reward, 'Steps:',
                      self.environment.get_current_steps(), 'Step count:',
                      self.step_count, current_epsilon)

                # Testing/evaluation: run one greedy episode and compare it against the target reward
                if self.target_reward is not None and self.thresholds is not None:
                    backup_epsilon = self.current_epsilon
                    self.current_epsilon = 0
                    greedy_reward = self.run_episode(self.environment)
                    self.global_dict['add_reward'](
                        greedy_reward, self.environment.get_current_steps())
                    self.current_epsilon = backup_epsilon
                    converged = True
                    for i in range(len(greedy_reward)):
                        if greedy_reward[i] != self.target_reward[i]:
                            converged = False
                            break
                    if converged:
                        print("Converged")
                        self.agent.converged = True

            if not self.testing:
                print(self.current_epsilon)
                self.anneal_epsilon()

    def update(self, *args, **kwargs):
        raise NotImplementedError

    def anneal_epsilon(self):
        if self.using_e_greedy:
            anneal_step = (self.global_dict['counter']
                           if self.global_epsilon_annealing else self.step_count)
            self.current_epsilon = self.epsilon_annealer.anneal_to(anneal_step)

    def get_action(self, state):
        if self.using_e_greedy:
            # print(self.table.select_greedy_action(state))
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                return self.table.select_greedy_action(state)
        else:
            return self.table.select_greedy_action(state)

    def reset_minibatch(self):
        pass
Example #15
class MOExpReplayBaseThreadLearner(threading.Thread, MOBaseLearner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 async_update_steps=1,
                 reward_clip_vals=None,
                 using_e_greedy=True,
                 epsilon_annealing_start=1,
                 epsilon_annealing_choices=[0.1, 0.01, 0.5],
                 epsilon_annealing_probabilities=[0.4, 0.3, 0.3],
                 epsilon_annealing_steps=100000,
                 global_epsilon_annealing=True,
                 report_frequency=1):
        MOBaseLearner.__init__(self,
                               network.get_config().get_num_of_objectives())
        threading.Thread.__init__(self)

        range, is_range = environment.get_action_space().get_range()
        if not is_range:
            raise ValueError("Does not support this type of action space")

        self.using_e_greedy = using_e_greedy
        if using_e_greedy:
            end_rand = np.random.choice(epsilon_annealing_choices,
                                        p=epsilon_annealing_probabilities)
            self.epsilon_annealer = Annealer(epsilon_annealing_start, end_rand,
                                             epsilon_annealing_steps)

        self.current_epsilon = epsilon_annealing_start
        self.step_count = 0
        self.eps_count = 0
        self.environment = environment
        self.reward_clip_vals = reward_clip_vals
        self.name = name
        self.agent = agent

        self.num_actions = len(range)

        self.network = network
        self.config = network.network_config
        self.history_length = self.config.get_history_length()
        if self.history_length > 1:
            self.frame_buffer = StateBuffer([1] +
                                            self.config.get_input_shape())

        self.async_update_step = async_update_steps
        self.global_dict = global_dict
        self.global_epsilon_annealing = global_epsilon_annealing

        self.report_frequency = report_frequency

        self.minibatch_vars = {}
        self.reset_minibatch()

        self.testing = False

    def reset(self):

        self.testing = self.agent.is_testing_mode

        self.reset_minibatch()

        self.network.reset_network()

        if self.history_length > 1:
            self.frame_buffer.reset()

        state = self.environment.get_state()

        if self.history_length > 1:
            for _ in range(self.history_length):
                self.frame_buffer.add_state(state)

    def run(self):
        while not self.global_dict['done']:
            reward = self.run_episode(self.environment)
            self.eps_count += 1
            #self.global_dict['add_reward'](reward, self.environment.get_current_steps())

            if self.eps_count % self.report_frequency == 0:
                current_epsilon = ''
                if self.using_e_greedy:
                    current_epsilon = 'Current epsilon: {0}'.format(
                        self.current_epsilon)
                print(self.name, 'Episode Count:', self.eps_count,
                      'Episode reward:', reward, 'Steps:',
                      self.environment.get_current_steps(), 'Step count:',
                      self.step_count, current_epsilon)

                # Testing/evaluation: run one greedy episode with epsilon = 0 and report its reward
                backup_epsilon = self.current_epsilon
                self.current_epsilon = 0
                greedy_reward = self.run_episode(self.environment)

                print("Greedy reward:", greedy_reward)

                self.global_dict['add_reward'](
                    greedy_reward, self.environment.get_current_steps())
                self.current_epsilon = backup_epsilon

    def update(self, *args, **kwargs):
        raise NotImplementedError

    def anneal_epsilon(self):
        if self.using_e_greedy:
            anneal_step = (self.global_dict['counter']
                           if self.global_epsilon_annealing else self.step_count)
            self.current_epsilon = self.epsilon_annealer.anneal_to(anneal_step)

    def get_action(self, state):
        if self.using_e_greedy:
            if np.random.uniform(0, 1) <= self.current_epsilon:
                e_greedy = np.random.randint(self.num_actions)
                return e_greedy
            else:
                if self.history_length > 1:
                    return self.network.get_output(
                        self.frame_buffer.get_buffer_add_state(state))
                else:
                    return self.network.get_output(state)
        else:
            if self.history_length > 1:
                return self.network.get_output(
                    self.frame_buffer.get_buffer_add_state(state))
            else:
                return self.network.get_output(state)

    def reset_minibatch(self):
        pass
Example #16
class DQNLearner(Learner):
    def __init__(self,
                 agent,
                 name,
                 environment,
                 network,
                 global_dict,
                 report_frequency,
                 batch_size=32,
                 warmup_steps=50000,
                 training_frequency=4,
                 experience_replay_size=2**19,
                 epsilon_annealing_start=1,
                 epsilon_annealing_end=0.1,
                 epsilon_annealing_steps=1e6,
                 reward_clip_thresholds=(-1, 1)):
        super().__init__(agent=agent,
                         name=name,
                         environment=environment,
                         network=network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)
        global experience_replay
        with global_dict[AgentMonitor.Q_LOCK]:
            if experience_replay is None:
                experience_replay = SyncExperienceReplay(
                    experience_replay_size,
                    state_history=network.network_config.get_history_length())
        self.replay = experience_replay
        self.batch_size = batch_size
        self.warmup_steps = warmup_steps
        self.training_frequency = training_frequency
        self.reward_clip_thresholds = reward_clip_thresholds
        self.epsilon_annealer = Annealer(epsilon_annealing_start,
                                         epsilon_annealing_end,
                                         epsilon_annealing_steps)
        self.current_learning_rate = network.get_config().get_initial_learning_rate()
        self.current_epsilon = epsilon_annealing_start

    @staticmethod
    def get_default_number_of_learners():
        return 1

    def get_action(self, state):
        probs = self.get_probs(state)
        if self.current_epsilon is not None:
            if np.random.uniform(0, 1) < self.current_epsilon:
                return np.random.randint(0, len(probs))
            else:
                return np.argmax(probs)
        else:
            return np.argmax(probs)

    def report(self, reward):
        print(self.name, 'Episode Count:', self.eps_count,
              'Episode reward:', reward, 'Steps:',
              self.environment.get_current_steps(), 'Step count:',
              self.step_count, 'Learning rate:',
              self.global_dict[AgentMonitor.Q_LEARNING_RATE], 'Epsilon:',
              self.current_epsilon)

    def update(self, state, action, reward, next_state, terminal):
        if self.history_length > 1:
            self.frame_buffer.add_state(state)

        if self.reward_clip_thresholds is not None:
            reward = np.clip(reward, self.reward_clip_thresholds[0],
                             self.reward_clip_thresholds[1])

        if not self.testing:
            if self.history_length > 1:
                current_s = self.frame_buffer.get_buffer()[0]
                next_s = self.frame_buffer.get_buffer_add_state(next_state)[0]
            else:
                current_s = state
                next_s = next_state
            self.replay.append(current_s, action, reward, next_s, terminal)

        self.step_count += 1
        self.global_dict['counter'] += 1

        if self.step_count < self.warmup_steps:
            return

        if not self.testing:
            if self.step_count % self.training_frequency == 0:
                logging = self.global_dict[AgentMonitor.Q_LOGGING]
                s, a, r, n, t = self.replay.get_mini_batch(
                    batch_size=self.batch_size)
                self.data_dict['states'] = s
                self.data_dict['actions'] = a
                self.data_dict['rewards'] = r
                self.data_dict['next_states'] = n
                self.data_dict['terminals'] = t
                self.data_dict['learning_rate'] = self.current_learning_rate
                self.data_dict['global_step'] = self.global_dict[
                    AgentMonitor.Q_GLOBAL_STEPS]
                if logging:
                    self.global_dict[AgentMonitor.Q_LOGGING] = False
                    self.data_dict['logging'] = True
                    summary = self.network.train_network(self.data_dict)
                    self.global_dict[AgentMonitor.Q_WRITER].add_summary(
                        summary, global_step=self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])
                else:
                    self.data_dict['logging'] = False
                    self.network.train_network(self.data_dict)

            self.current_epsilon = self.epsilon_annealer.anneal(
                self.global_dict[AgentMonitor.Q_GLOBAL_STEPS])