def online_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # Initialize value function.
    v = LinearValueFunction(env.n_states)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        V_old = 0
        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            V = v.evaluate(obs_vec)
            V_prime = v.evaluate(obs_prime_vec)
            delta = reward + gamma * V_prime - V
            # Update eligibility traces.
            z = gamma * lamda * z + (
                1 - alpha * gamma * lamda * np.dot(z, obs_vec)) * obs_vec
            # Update weights.
            v.weights += alpha * (delta + V - V_old) * z - alpha * (V - V_old) * obs_vec
            V_old = V_prime
            obs_vec = obs_prime_vec
    return v
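
# A minimal, self-contained sketch of how online_td_lambda can be driven. The
# encode_state, LinearValueFunction, and RandomWalkEnv defined here are hypothetical
# stand-ins (one-hot features, a plain linear model, and a toy Markov reward
# process) used only for illustration; the project's real helpers may differ.
import numpy as np

def encode_state(obs, n_states):
    # One-hot feature vector for a tabular state index.
    vec = np.zeros(n_states)
    vec[obs] = 1.0
    return vec

class LinearValueFunction:
    # v(s) = w . x(s), with weights initialized to zero.
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def evaluate(self, features):
        return float(np.dot(self.weights, features))

class RandomWalkEnv:
    # Toy 7-state random walk: terminals at 0 and 6, reward +1 on reaching 6.
    n_states = 7

    def reset(self):
        self.state = 3
        return self.state

    def step(self):
        self.state += np.random.choice([-1, 1])
        reward = 1.0 if self.state == 6 else 0.0
        done = self.state in (0, 6)
        return self.state, reward, done

env = RandomWalkEnv()
v = online_td_lambda(env, lamda=0.9, alpha=0.05, gamma=1.0, n_episodes=2000)
print(np.round(v.weights[1:6], 2))  # should drift toward [0.17, 0.33, 0.5, 0.67, 0.83]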
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            # Critic and actor updates, both scaled by I = gamma^t.
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * policy.eligibility_vector(a, sa_pairs)
            I *= gamma  # keep I tracking gamma^t
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
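
# The actor-critic and REINFORCE snippets in this file assume an ExponentialSoftmax
# policy whose sample_action(sa_pairs) returns an action index and whose
# eligibility_vector(a, sa_pairs) returns grad ln pi(a|s). The class below is a
# hypothetical minimal reading of that interface (a linear softmax over one-hot
# state-action features, as encode_sa_pair presumably produces), not the project's
# actual implementation.
import numpy as np

class ExponentialSoftmaxSketch:
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def _probs(self, sa_pairs):
        prefs = np.array([np.dot(self.weights, x) for x in sa_pairs])
        prefs -= prefs.max()  # for numerical stability
        exp_prefs = np.exp(prefs)
        return exp_prefs / exp_prefs.sum()

    def sample_action(self, sa_pairs):
        return int(np.random.choice(len(sa_pairs), p=self._probs(sa_pairs)))

    def eligibility_vector(self, a, sa_pairs):
        # grad ln pi(a|s) = x(s, a) - sum_b pi(b|s) x(s, b)
        probs = self._probs(sa_pairs)
        return sa_pairs[a] - sum(p * x for p, x in zip(probs, sa_pairs))

# Example: two actions with one-hot state-action features.
pi = ExponentialSoftmaxSketch(n_features=4)
sa_pairs = [np.eye(4)[0], np.eye(4)[1]]
a = pi.sample_action(sa_pairs)
grad_ln_pi = pi.eligibility_vector(a, sa_pairs)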
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th, lambda_w,
                                    gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            # Average-reward (differential) TD error; gamma is not used here.
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            R_bar += eta * delta
            # Accumulate eligibility traces for critic and actor.
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def compute_expected_future_return(self, state, action, lookahead):
    state_hash = encode_state(state)
    state_action_occurrence = self.state_action_transition_count[state_hash][action]
    next_state_occurrence_dict = self.transitions[state_hash][action]
    state_probabilities = defaultdict(float)
    for next_state_hash in next_state_occurrence_dict:
        if state_action_occurrence < self.known_threshold:
            state_probabilities[next_state_hash] = 1
        else:
            count = next_state_occurrence_dict[next_state_hash]
            state_probabilities[next_state_hash] = (
                count / state_action_occurrence)
    weighted_future_returns = list()
    for next_state_hash in state_probabilities:
        prev_action_weight = self.weights[state_hash][action]
        next_state = my_apply_action_to_state(state, action,
                                              self.services.parser)
        weighted_future_returns.append(
            self.get_max_q_value(next_state, lookahead - 1, prev_action_weight)
            * state_probabilities[next_state_hash])
    return sum(weighted_future_returns)
def make_plan(self, state):
    curr_state = copy.deepcopy(state)
    if self.active_goal is None:
        self.active_goal = self.uncompleted_goals[0]
    problem = self.services.problem_generator.generate_problem(
        self.active_goal, curr_state)
    self.plan = self.services.planner(self.services.pddl.domain_path, problem)
    for i in range(len(self.plan)):
        action = self.plan[i]
        curr_state_hash = encode_state(curr_state)
        weight = float(i + 1) / len(self.plan)
        if self.weights[curr_state_hash][action.lower()] < weight:
            self.weights[curr_state_hash][action.lower()] = weight
        curr_state = my_apply_action_to_state(curr_state, action,
                                              self.services.parser)
    local_weights = list()
    for state_hash in self.weights:
        vals = list(self.weights[state_hash].values())
        local_weights.extend(vals)
    self.state_recurrence_punish = median(local_weights)
    self.lookahead = min([4, int(len(self.plan) / 2)])
def __init__(self, cargos, trucks, warehouses, initial: FluentState, goal: list):
    self.state_map = initial.pos + initial.neg
    self.initial_state_TF = encode_state(initial, self.state_map)
    Problem.__init__(self, self.initial_state_TF, goal=goal)
    self.cargos = cargos
    self.trucks = trucks
    self.actions_list = self.get_actions()
def store_paths(self, paths):
    i1, i2 = itertools.tee(itertools.chain.from_iterable(paths))
    next(i2)
    for (__, s), (a, ns) in zip(i1, i2):
        if a is None:
            continue
        encode_state(self.states[self.index, :, :, :], s)
        self.actions[self.index, :, :] = 0.0
        self.actions[self.index, a.x, a.y] = 1.0
        self.rewards[self.index] = ns.reward
        encode_state(self.nstates[self.index, :, :, :], ns)
        if ns.status != helicopter3x3.Status.flying:
            self.done[self.index] = 1.0
        else:
            self.done[self.index] = 0.0
        self.index += 1
        if self.index >= self.size:
            self.index = 0
        if self.index > self.maxSize:
            self.maxSize = self.index
def get_reward(self, state, action):
    state_hash = encode_state(state)
    if self.state_action_rewards_count[state_hash][action] >= self.known_threshold:
        state_action_rewards = self.rewards[state_hash][action]
        reward = float(sum(state_action_rewards)) / len(state_action_rewards)
    else:
        reward = self.weights[state_hash][action]
    return reward
def semi_gradient_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # Initialize value function.
    v = LinearValueFunction(env.n_states)
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            # Update eligibility traces.
            z = gamma * lamda * z + obs_vec
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            # Update weights.
            v.weights += alpha * delta * z
            obs_vec = obs_prime_vec
    return v
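
# semi_gradient_td_lambda expects the same env/helper interface as online_td_lambda,
# so the hypothetical RandomWalkEnv / one-hot stand-ins from the sketch following
# that function can be reused unchanged:
env = RandomWalkEnv()
v = semi_gradient_td_lambda(env, lamda=0.9, alpha=0.05, gamma=1.0, n_episodes=2000)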
def get_moves_with_rewards(self):
    valid_moves = self.get_all_moves()
    valid_moves_idx = map(get_move_idx, valid_moves)
    best_move = None
    inp = encode_state(self.board.values).float()
    pred = self.model(inp)
    valid_moves_prob = [[valid_moves[i], pred[idx].item()]
                        for i, idx in enumerate(valid_moves_idx)]
    return valid_moves_prob
def result(self, state: str, action: Action):
    new_state = FluentState([], [])
    old_state = decode_state(state, self.state_map)
    for fluent in old_state.pos:
        if fluent not in action.effect_rem:
            new_state.pos.append(fluent)
    for fluent in action.effect_add:
        if fluent not in new_state.pos:
            new_state.pos.append(fluent)
    for fluent in old_state.neg:
        if fluent not in action.effect_add:
            new_state.neg.append(fluent)
    for fluent in action.effect_rem:
        if fluent not in new_state.neg:
            new_state.neg.append(fluent)
    return encode_state(new_state, self.state_map)
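
# The result() method above treats a state as a string and relies on encode_state /
# decode_state to move between that string and a FluentState. A plausible minimal
# reading, consistent with the `state: str` annotation, is a 'T'/'F' string aligned
# with state_map; the sketches below are assumptions, not the project's real helpers.
def encode_state_sketch(fs, fluent_map):
    return "".join("T" if fluent in fs.pos else "F" for fluent in fluent_map)

def decode_state_sketch(state, fluent_map):
    fs = FluentState([], [])
    for char, fluent in zip(state, fluent_map):
        (fs.pos if char == "T" else fs.neg).append(fluent)
    return fs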
def prepare_data(self):
    self.data = []
    total = 0
    for exp in self.experience[-1::-1]:
        state, move, reward = exp
        vector = encode_state(state).flatten().tolist()
        move_idx = get_move_idx(move)
        vector.append(move_idx)
        total = reward + self.discount * total
        vector.append(total)
        self.data.append(vector)
    self.data.reverse()
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size * env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        # Generate an episode, recording states, actions, and rewards.
        states = [obs]
        actions = [a]
        rewards = [None]
        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)
        for t in range(len(states)):
            # Discounted return from time step t.
            G_t = sum(gamma ** (k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma ** t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            # Actor update: theta += alpha_th * gamma^t * delta * grad ln pi(a_t|s_t).
            policy.weights += alpha_th * (gamma ** t) * delta * \
                policy.eligibility_vector(actions[t], all_sa_pairs)
        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))
def compute_max_qval_action_pair(self, state, lookahead, prev_action_weight):
    state_hash = encode_state(state)
    predicted_returns = defaultdict(float)
    actions = self.valid_actions_getter.get(state)
    for action in actions:
        # expansion...
        edge_weight = prev_action_weight * self.off_plan_punish_factor
        if self.weights[state_hash][action] < edge_weight:
            self.weights[state_hash][action] = edge_weight
    for action in actions:
        q_s_a = self.get_q_value(state, action, lookahead)
        predicted_returns[action] = q_s_a
    max_q_val = max(predicted_returns.values())
    best_actions = list()
    for action_name in predicted_returns:
        if predicted_returns[action_name] == max_q_val:
            best_actions.append(action_name)
    best_action = random.choice(best_actions)
    return max_q_val, best_action
    for region in policy.keys():
        policy[region] = [a / sum(policy[region]) for a in policy[region]]
    return policy


# Main function runs the test using a stored NN and outputs a CSV file.
if __name__ == "__main__":
    r = sys.argv[1]
    n = int(sys.argv[2])
    states = generate_all()
    inputs = np.zeros((512 * 3 * 3, 3, 3, 2))
    for s, state in enumerate(states):
        encode_state(inputs[s], state)
    for i in range(n):
        nn = tf.keras.models.load_model("nets/NN_{0}_{1}.h5".format(r, i),
                                        custom_objects={'tf': tf})
        policy = policy_test(nn, states, inputs)
        print("Policy {} {} evaluated!".format(r, i))
        with open("policy_dists/policy_{0}_{1}.csv".format(r, i), "w") as f:
            for region in policy.keys():
                f.write("".join([str(x) for x in region]) + ", " +
                        ", ".join([str(y) for y in policy[region]]) + "\n")
    print("Done!")
def next_action(self):
    # perception
    state = self.services.perception.get_state()
    state_hash = encode_state(state)
    # remember
    self.update(self.prev_state_hash, self.prev_action, state_hash)
    # check if done
    self.check_goals(state)
    if len(self.uncompleted_goals) == 0:
        save_obj(self.transitions, self.env_name + "_transitions")
        save_obj(self.state_action_transition_count,
                 self.env_name + "_state_action_transition_count")
        return None
    # choose
    if self.plan is not None:
        if self.prev_action.upper() not in self.plan and \
                self.weights[self.prev_state_hash][self.prev_action] <= \
                self.last_in_plan_transition_weight * \
                self.off_plan_punish_factor ** self.lookahead:
            self.plan = None
    if self.plan is not None:
        action = self.choose(state)
        self.prev_action = action
        self.prev_state_hash = state_hash
        return self.prev_action
    applicable_actions = self.valid_actions_getter.get(state)
    possible_next_states = defaultdict(None)
    for applicable_action in applicable_actions:
        next_state = my_apply_action_to_state(state, applicable_action,
                                              self.services.parser)
        possible_next_states[applicable_action] = encode_state(next_state)
    # Actions whose predicted successor state has not been visited yet;
    # wrapped in list() so len() and pop() work in Python 3.
    actions_leading_to_not_seen_states = list(filter(
        lambda action_key: possible_next_states[action_key] not in
        self.visited_states_hash, possible_next_states))
    if len(actions_leading_to_not_seen_states) == 0:
        self.prev_state_hash = None
        self.prev_action = None
        self.visited_states_hash = set()
        self.plan = None
        return self.next_action()
    if len(actions_leading_to_not_seen_states) == 1:
        self.prev_state_hash = state_hash
        self.prev_action = actions_leading_to_not_seen_states.pop(0)
        return self.prev_action
    if self.plan is None:
        self.make_plan(state)
    action = self.choose(state)
    self.prev_state_hash = state_hash
    self.prev_action = action
    return self.prev_action