class MetaController:
    def __init__(self, sess, policy_name, options, option2file, rm, use_rm,
                 learning_params, num_features, num_states, show_print, epsilon=0.1):
        self.show_print = show_print
        self.options = options
        self.option2file = option2file
        self.epsilon = epsilon
        self.gamma = learning_params.gamma
        self.rm = rm
        self.use_rm = use_rm
        self.tabular_case = learning_params.tabular_case
        # This proxy adds the machine state representation to the MDP state
        self.feature_proxy = FeatureProxy(num_features, num_states, self.tabular_case)
        self.num_actions = len(options)
        self.num_features = self.feature_proxy.get_num_features()

        # network parameters
        num_hidden_layers = 2                   # this has no effect on the tabular case
        num_neurons = 64                        # this has no effect on the tabular case
        self.target_network_update_freq = 100   # this has no effect on the tabular case
        if self.tabular_case:
            lr = 0.7
            buffer_size = 1
            self.batch_size = 1
            self.learning_starts = 0
        else:
            lr = 1e-3
            buffer_size = 50000
            self.batch_size = 32
            self.learning_starts = 100

        # create dqn network
        self.neuralnet = MCNet(sess, self.num_actions, self.num_features, policy_name,
                               self.tabular_case, learning_params.use_double_dqn,
                               lr, num_neurons, num_hidden_layers)

        # create experience replay buffer
        self.er_buffer = MCReplayBuffer(buffer_size)
        self.step = 0

        # preprocessing action masks (for pruning useless options)
        self.mask = {}
        for u in self.rm.get_states():
            a_mask = np.ones(self.num_actions, dtype=float)
            if use_rm and not self.rm.is_terminal_state(u):
                a_mask = np.zeros(self.num_actions, dtype=float)
                # Options that would move the RM to another state are useful
                useful_options = self.rm.get_useful_transitions(u)
                # looking for an exact match
                for i in range(self.num_actions):
                    if _is_match(option2file[i].split("&"), useful_options, True):
                        a_mask[i] = 1
                # if no exact match is found, we relax this condition and use any option that might be useful
                if np.sum(a_mask) < 1:
                    a_mask = np.zeros(self.num_actions, dtype=float)
                    for i in range(self.num_actions):
                        if _is_match(option2file[i].split("&"), useful_options, False):
                            a_mask[i] = 1
            self.mask[u] = a_mask

    def _get_mask(self, u):
        return self.mask[u]

    def finish_option(self, option_id, true_props):
        option = self.options[option_id]
        u0 = option.get_initial_state()
        return u0 != option.get_next_state(u0, true_props)

    def get_option(self, option_id):
        option = self.options[option_id]
        rm_id, rm_u = option_id, option.get_initial_state()
        return rm_id, rm_u

    def learn(self, s1, u1, a, r, s2, u2, done, steps):
        # adding this experience to the buffer
        s1 = self.feature_proxy.add_state_features(s1, u1)
        s2 = self.feature_proxy.add_state_features(s2, u2)
        self.er_buffer.add(s1, a, r, s2, self._get_mask(u2),
                           1.0 if done else 0.0, self.gamma**steps)

        if len(self.er_buffer) > self.learning_starts:
            if self.show_print:
                print("MC: Learning", self.step)
            # Learning
            s1, a, r, s2, s2_mask, done, gamma = self.er_buffer.sample(self.batch_size)
            self.neuralnet.learn(s1, a, r, s2, s2_mask, done, gamma)
            self.step += 1
            # Updating the target network
            if self.step % self.target_network_update_freq == 0:
                if self.show_print:
                    print("MC: Update network", self.step)
                self.neuralnet.update_target_network()

    def get_action_epsilon_greedy(self, s, u):
        # Before learning starts, the agent behaves completely at random
        if len(self.er_buffer) <= self.learning_starts or random.random() < self.epsilon:
            # we have to pick a random option whose mask is 1.0
            mask = self._get_mask(u)
            useful_options = [i for i in range(self.num_actions) if mask[i] > 0]
            return random.choice(useful_options)
        return self.get_best_action(s, u)

    def get_best_action(self, s, u):
        s = self.feature_proxy.add_state_features(s, u).reshape((1, self.num_features))
        action_id = self.neuralnet.get_best_action(s, self._get_mask(u))
        return int(action_id)
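# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one way a hierarchical
# agent could drive the MetaController above. The environment interface
# (get_features, get_rm_state, get_true_propositions, execute_action) and the
# per-option low-level policies in `option_policies` are assumed, hypothetical
# names; only the MetaController methods come from this file. The idea shown:
# pick an option epsilon-greedily, execute it until finish_option() reports an
# RM transition (or the episode ends), then call learn() with the reward and
# step count accumulated while the option was running.
# ----------------------------------------------------------------------------
def run_meta_controller_episode(env, mc, option_policies, max_steps=1000):
    s1, u1 = env.get_features(), env.get_rm_state()
    total_steps = 0
    while total_steps < max_steps:
        option_id = mc.get_action_epsilon_greedy(s1, u1)
        rm_id, rm_u = mc.get_option(option_id)   # which option policy to execute, and its initial RM state
        r, steps, done = 0.0, 0, False
        while True:
            # hypothetical low-level call: the option's own policy picks the primitive action
            a = option_policies[rm_id].get_best_action(env.get_features(), rm_u)
            reward, done = env.execute_action(a)
            r += (mc.gamma**steps) * reward      # discounted return collected by this option
            steps += 1
            total_steps += 1
            if done or mc.finish_option(option_id, env.get_true_propositions()):
                break
        s2, u2 = env.get_features(), env.get_rm_state()
        mc.learn(s1, u1, option_id, r, s2, u2, done, steps)
        if done:
            break
        s1, u1 = s2, u2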
class DQN:
    """
    This baseline solves the problem using standard q-learning over the
    cross product between the RM and the MDP.
    """

    def __init__(self, sess, policy_name, learning_params, curriculum,
                 num_features, num_states, num_actions):
        # initialize attributes
        self.sess = sess
        self.learning_params = learning_params
        self.use_double_dqn = learning_params.use_double_dqn
        self.use_priority = learning_params.prioritized_replay
        self.policy_name = policy_name
        self.tabular_case = learning_params.tabular_case
        # This proxy adds the machine state representation to the MDP state
        self.feature_proxy = FeatureProxy(num_features, num_states, self.tabular_case)
        self.num_actions = num_actions
        self.num_features = self.feature_proxy.get_num_features()

        # create dqn network
        self._create_network(learning_params.lr, learning_params.gamma,
                             learning_params.num_neurons, learning_params.num_hidden_layers)

        # create experience replay buffer
        if self.use_priority:
            self.replay_buffer = PrioritizedReplayBuffer(
                learning_params.buffer_size,
                alpha=learning_params.prioritized_replay_alpha)
            if learning_params.prioritized_replay_beta_iters is None:
                learning_params.prioritized_replay_beta_iters = curriculum.total_steps
            self.beta_schedule = LinearSchedule(
                learning_params.prioritized_replay_beta_iters,
                initial_p=learning_params.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(learning_params.buffer_size)
            self.beta_schedule = None

        # count of the number of environmental steps
        self.step = 0

    def _create_network(self, lr, gamma, num_neurons, num_hidden_layers):
        total_features = self.num_features
        total_actions = self.num_actions

        # Inputs to the network
        self.s1 = tf.placeholder(tf.float64, [None, total_features])
        self.a = tf.placeholder(tf.int32)
        self.r = tf.placeholder(tf.float64)
        self.s2 = tf.placeholder(tf.float64, [None, total_features])
        self.done = tf.placeholder(tf.float64)
        self.IS_weights = tf.placeholder(tf.float64)  # importance sampling weights for prioritized ER

        # Creating target and current networks
        with tf.variable_scope(self.policy_name):  # gives this network's variables a distinct name scope
            # Defining regular and target neural nets
            if self.tabular_case:
                with tf.variable_scope("q_network") as scope:
                    q_values, _ = create_linear_regression(self.s1, total_features, total_actions)
                    scope.reuse_variables()
                    q_target, _ = create_linear_regression(self.s2, total_features, total_actions)
            else:
                with tf.variable_scope("q_network") as scope:
                    q_values, q_values_weights = create_net(self.s1, total_features, total_actions,
                                                            num_neurons, num_hidden_layers)
                    if self.use_double_dqn:
                        scope.reuse_variables()
                        q2_values, _ = create_net(self.s2, total_features, total_actions,
                                                  num_neurons, num_hidden_layers)
                with tf.variable_scope("q_target"):
                    q_target, q_target_weights = create_net(self.s2, total_features, total_actions,
                                                            num_neurons, num_hidden_layers)
                self.update_target = create_target_updates(q_values_weights, q_target_weights)

            # Q_values -> get optimal actions
            self.best_action = tf.argmax(q_values, 1)

            # Optimizing with respect to q_target
            action_mask = tf.one_hot(indices=self.a, depth=total_actions, dtype=tf.float64)
            q_current = tf.reduce_sum(tf.multiply(q_values, action_mask), 1)

            if self.use_double_dqn:
                # DDQN: the online network selects the action, the target network evaluates it
                best_action_mask = tf.one_hot(indices=tf.argmax(q2_values, 1),
                                              depth=total_actions, dtype=tf.float64)
                q_max = tf.reduce_sum(tf.multiply(q_target, best_action_mask), 1)
            else:
                # DQN
                q_max = tf.reduce_max(q_target, axis=1)

            # Computing td-error and loss function
            q_max = q_max * (1.0 - self.done)  # dead ends must have q_max equal to zero
            q_target_value = self.r + gamma * q_max
            q_target_value = tf.stop_gradient(q_target_value)
            if self.use_priority:
                # prioritized experience replay
                self.td_error = q_current - q_target_value
                huber_loss = 0.5 * tf.square(self.td_error)  # plain squared error (no Huber clipping)
                loss = tf.reduce_mean(self.IS_weights * huber_loss)  # IS weights correct the bias introduced by prioritized sampling
            else:
                # standard experience replay
                loss = 0.5 * tf.reduce_sum(tf.square(q_current - q_target_value))

            # Defining the optimizer
            if self.tabular_case:
                optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
            else:
                optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            self.train = optimizer.minimize(loss=loss)

        # Initializing the network values
        self.sess.run(tf.variables_initializer(self._get_network_variables()))
        self.update_target_network()  # copying weights to target net

    def _train(self, s1, a, r, s2, done, IS_weights):
        if self.use_priority:
            _, td_errors = self.sess.run(
                [self.train, self.td_error], {
                    self.s1: s1,
                    self.a: a,
                    self.r: r,
                    self.s2: s2,
                    self.done: done,
                    self.IS_weights: IS_weights
                })
        else:
            self.sess.run(self.train, {
                self.s1: s1,
                self.a: a,
                self.r: r,
                self.s2: s2,
                self.done: done
            })
            td_errors = None
        return td_errors

    def get_number_features(self):
        return self.num_features

    def learn(self):
        if self.use_priority:
            experience = self.replay_buffer.sample(
                self.learning_params.batch_size,
                beta=self.beta_schedule.value(self.get_step()))
            s1, a, r, s2, done, weights, batch_idxes = experience
        else:
            s1, a, r, s2, done = self.replay_buffer.sample(self.learning_params.batch_size)
            weights, batch_idxes = None, None
        td_errors = self._train(s1, a, r, s2, done, weights)  # returns the td_errors (None when priorities are disabled)
        if self.use_priority:
            new_priorities = np.abs(td_errors) + self.learning_params.prioritized_replay_eps
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)

    def add_experience(self, s1, u1, a, r, s2, u2, done):
        s1 = self.feature_proxy.add_state_features(s1, u1)
        s2 = self.feature_proxy.add_state_features(s2, u2)
        self.replay_buffer.add(s1, a, r, s2, done)

    def get_step(self):
        return self.step

    def add_step(self):
        self.step += 1

    def get_best_action(self, s1, u1):
        s1 = self.feature_proxy.add_state_features(s1, u1).reshape((1, self.num_features))
        return self.sess.run(self.best_action, {self.s1: s1})

    def update_target_network(self):
        if not self.tabular_case:
            self.sess.run(self.update_target)

    def _get_network_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.policy_name)
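# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): a minimal training
# loop over the RM x MDP cross product using the DQN baseline above. The
# environment interface (get_features, get_rm_state, execute_action) and the
# learning_params fields learning_starts, train_freq, and
# target_network_update_freq are assumptions for illustration; only the DQN
# methods (add_experience, learn, add_step, get_step, update_target_network,
# get_best_action) come from this file.
# ----------------------------------------------------------------------------
def run_dqn_episode(env, dqn, learning_params, epsilon=0.1, max_steps=1000):
    import random
    s1, u1 = env.get_features(), env.get_rm_state()
    for _ in range(max_steps):
        # epsilon-greedy action selection over the cross-product state (s, u)
        if random.random() < epsilon:
            a = random.randrange(dqn.num_actions)
        else:
            a = int(dqn.get_best_action(s1, u1)[0])
        reward, done = env.execute_action(a)
        s2, u2 = env.get_features(), env.get_rm_state()
        dqn.add_experience(s1, u1, a, reward, s2, u2, float(done))
        dqn.add_step()
        # train from replay and periodically refresh the target network
        if dqn.get_step() > learning_params.learning_starts:
            if dqn.get_step() % learning_params.train_freq == 0:
                dqn.learn()
            if dqn.get_step() % learning_params.target_network_update_freq == 0:
                dqn.update_target_network()
        if done:
            break
        s1, u1 = s2, u2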