Example #1
 def __init__(self, sess, policy_name, learning_params, curriculum,
              num_features, num_states, num_actions):
     # initialize attributes
     self.sess = sess
     self.learning_params = learning_params
     self.use_double_dqn = learning_params.use_double_dqn
     self.use_priority = learning_params.prioritized_replay
     self.policy_name = policy_name
     self.tabular_case = learning_params.tabular_case
     # This proxy adds the machine state representation to the MDP state
     self.feature_proxy = FeatureProxy(num_features, num_states,
                                       self.tabular_case)
     self.num_actions = num_actions
     self.num_features = self.feature_proxy.get_num_features()
     # create dqn network
     self._create_network(learning_params.lr, learning_params.gamma,
                          learning_params.num_neurons,
                          learning_params.num_hidden_layers)
     # create experience replay buffer
     if self.use_priority:
         self.replay_buffer = PrioritizedReplayBuffer(
             learning_params.buffer_size,
             alpha=learning_params.prioritized_replay_alpha)
         if learning_params.prioritized_replay_beta_iters is None:
             learning_params.prioritized_replay_beta_iters = curriculum.total_steps
         self.beta_schedule = LinearSchedule(
             learning_params.prioritized_replay_beta_iters,
             initial_p=learning_params.prioritized_replay_beta0,
             final_p=1.0)
     else:
         self.replay_buffer = ReplayBuffer(learning_params.buffer_size)
         self.beta_schedule = None
     # count of the number of environmental steps
     self.step = 0
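
A rough sketch of how the replay buffer and beta schedule set up above are typically consumed during learning, following the OpenAI Baselines signatures for sample()/update_priorities()/value(); the _train call and the batch_size hyperparameter are hypothetical stand-ins for this class's own update op and settings (numpy assumed imported as np):

    def _learn_step(self):
        # Sample a batch; the prioritized buffer also returns IS weights and indices,
        # with beta annealed from beta0 towards 1.0 over the curriculum steps.
        if self.use_priority:
            beta = self.beta_schedule.value(self.step)
            s1, a, r, s2, done, weights, idxes = self.replay_buffer.sample(
                self.learning_params.batch_size, beta)
        else:
            s1, a, r, s2, done = self.replay_buffer.sample(self.learning_params.batch_size)
            weights, idxes = None, None
        # Hypothetical update op: one gradient step returning per-sample absolute TD errors.
        td_errors = self._train(s1, a, r, s2, done, weights)
        if self.use_priority:
            # Refresh priorities so large-error transitions are replayed more often.
            self.replay_buffer.update_priorities(idxes, np.abs(td_errors) + 1e-6)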

Example #2

    def __init__(self, sess, policy_name, options, option2file, rm, use_rm, learning_params, num_features, num_states, show_print, epsilon=0.1):
        
        self.show_print = show_print
        self.options = options
        self.option2file = option2file
        self.epsilon = epsilon
        self.gamma = learning_params.gamma
        self.rm = rm 
        self.use_rm = use_rm
        self.tabular_case = learning_params.tabular_case

        # This proxy adds the machine state representation to the MDP state
        self.feature_proxy = FeatureProxy(num_features, num_states, self.tabular_case)
        self.num_actions  = len(options)
        self.num_features = self.feature_proxy.get_num_features()        
        
        # network parameters
        num_hidden_layers = 2                 # this has no effect on the tabular case
        num_neurons = 64                      # this has no effect on the tabular case
        self.target_network_update_freq = 100 # this has no effect on the tabular case
        if self.tabular_case:
            lr = 0.7
            buffer_size = 1
            self.batch_size = 1
            self.learning_starts = 0 
        else:
            lr = 1e-3 
            buffer_size = 50000
            self.batch_size =  32
            self.learning_starts = 100

        # create dqn network
        self.neuralnet = MCNet(sess, self.num_actions, self.num_features, policy_name, self.tabular_case, learning_params.use_double_dqn, lr, num_neurons, num_hidden_layers)

        # create experience replay buffer
        self.er_buffer = MCReplayBuffer(buffer_size)
        self.step = 0

        # preprocessing action masks (for pruning useless options)
        self.mask = {}
        for u in self.rm.get_states():
            a_mask = np.ones(self.num_actions, dtype=float)
            if use_rm and not self.rm.is_terminal_state(u):
                a_mask = np.zeros(self.num_actions, dtype=float)
                # Options that would move the RM to another state are useful
                useful_options = self.rm.get_useful_transitions(u)
                # looking for an exact match
                for i in range(self.num_actions):
                    if _is_match(option2file[i].split("&"), useful_options, True):
                        a_mask[i] = 1
                # if no exact match is found, we relax this condition and use any option that might be useful
                if np.sum(a_mask) < 1:
                    a_mask = np.zeros(self.num_actions, dtype=float)
                    for i in range(self.num_actions):
                        if _is_match(option2file[i].split("&"), useful_options, False):
                            a_mask[i] = 1
            self.mask[u] = a_mask
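
At action-selection time, the precomputed mask for the current RM state u is typically applied by pushing the Q-values of pruned options to minus infinity before the argmax. A minimal sketch under that assumption (add_state_features and get_q_values are hypothetical helper names, not part of the snippet above):

    def _choose_option(self, s, u):
        # Append the RM state to the MDP features, as the FeatureProxy is meant to do.
        features = self.feature_proxy.add_state_features(s, u)  # hypothetical helper name
        mask = self.mask[u]
        valid = np.where(mask > 0)[0]
        # Epsilon-greedy restricted to the options the mask left enabled.
        if np.random.random() < self.epsilon:
            return int(np.random.choice(valid))
        q_values = self.neuralnet.get_q_values(features)  # hypothetical accessor on MCNet
        q_values = np.where(mask > 0, q_values, -np.inf)
        return int(np.argmax(q_values))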