def launch_qlearning(env, episodes, display, calculated_reward_matrix,
                     calculated_transition_matrix, calculated_valid_actions_matrix,
                     calculated_lava_states_matrix, calculated_goal_states_matrix):
    # Initialize the Q-matrix with zeros:
    # (number of cells on the map, number of possible actions)
    q_matrix = np.zeros((env.COLS * env.ROWS, 4))
    gamma = 0.8
    for step in range(episodes):
        env.reset()
        start_state = get_state(env.position, env.COLS)
        future_rewards = []
        current_state = start_state
        # Loop until a diamond (goal) or lava state is reached
        while not is_lava_or_goal(current_state, calculated_lava_states_matrix,
                                  calculated_goal_states_matrix):
            action = get_next_action(current_state, get_flat_matrix(q_matrix),
                                     calculated_valid_actions_matrix,
                                     (episodes / (step + 1)))
            next_state = calculated_transition_matrix[current_state][action]
            future_rewards.append(q_matrix[next_state][get_next_action(
                next_state, get_flat_matrix(q_matrix),
                calculated_valid_actions_matrix, (episodes / (step + 1)))])
            # Bellman equation
            q_state = calculated_reward_matrix[current_state][action] + gamma * max(future_rewards)
            q_matrix[current_state][action] = q_state
            reward, done = env.step(Action(action))
            display(calculated_reward_matrix[current_state][action], Action(action),
                    done, "## {} ##".format(step))
            current_state = next_state
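# For comparison, a minimal textbook tabular Q-learning loop over precomputed reward and
# transition matrices could look like the sketch below. This is not part of the code above:
# the names (reward_matrix, transition_matrix, terminal_states, n_states, n_actions) and the
# fixed start state are illustrative assumptions.
import numpy as np

def tabular_q_learning(reward_matrix, transition_matrix, terminal_states,
                       n_states, n_actions, episodes=500,
                       gamma=0.8, alpha=0.5, epsilon=0.1, seed=0):
    rng = np.random.default_rng(seed)
    q = np.zeros((n_states, n_actions))
    for _ in range(episodes):
        state = 0  # fixed start state for this sketch
        while state not in terminal_states:
            # Epsilon-greedy action selection
            if rng.random() < epsilon:
                action = int(rng.integers(n_actions))
            else:
                action = int(np.argmax(q[state]))
            next_state = transition_matrix[state][action]
            reward = reward_matrix[state][action]
            # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            q[state, action] += alpha * (reward + gamma * np.max(q[next_state]) - q[state, action])
            state = next_state
    return q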
def convert_input_comb_to_actions(input_comb):
    actions = []
    for action_type, action_ele in input_comb:
        action_value = input_comb[(action_type, action_ele)]
        action = Action(action_type=action_type, element=action_ele, value=action_value)
        actions.append(action)
    return actions
def choose_action(self, state):
    action2q = {a: self.q[(state, a)] for a in Action.get_actions()}
    if random.random() < self.epsilon:
        # random.choice needs a sequence, not a dict_keys view
        return random.choice(list(action2q.keys()))
    max_q = max(action2q.values())
    # Break ties randomly among the actions with the maximum Q-value
    best_actions = [a for a in action2q if action2q[a] == max_q]
    return random.choice(best_actions)
def __init__(self, epsilon=0.2, alpha=0.1, gamma=0.9):
    self.q = Counter()
    self.epsilon = epsilon
    self.alpha = alpha
    self.gamma = gamma
    self.action = Action()
    self.pool = Pool(5)
def learn(self):
    best_input_comb = self.form.best_input_comb
    positive_samples = []
    negative_samples = []
    previous_actions = []
    for (action_type, action_ele) in best_input_comb:
        value_candidates = self.form.input_candidates[(action_type, action_ele)]
        best_value = best_input_comb[(action_type, action_ele)]
        best_action = None
        for value in value_candidates:
            action = Action(action_ele, action_type, value)
            encoding = self.encode(self.form, previous_actions, action)
            if value == best_value:
                best_action = action
                positive_samples.append(encoding)
            else:
                negative_samples.append(encoding)
        if best_action:
            previous_actions.append(best_action)
    # negative_samples = random.sample(negative_samples, len(positive_samples))
    samples = positive_samples + negative_samples
    if not samples:
        return None
    samples = self.zip_inputs(samples)
    labels = [1.0] * len(positive_samples) + [0.0] * len(negative_samples)
    history = self.net.fit(x=samples, y=np.array(labels), epochs=5, verbose=0)
    # i = random.randint(0, len(positive_samples) - 1)
    # self.fe.show_image("output/positive_action_loc.png", positive_samples[i][0])
    # self.fe.show_image("output/positive_dom_sim.png", positive_samples[i][1])
    return history.history["loss"][-1]
def load_action(state, action_line):
    m = re.match(ACTION_RE, action_line)
    action_type, value, target_locator = m.group(1), m.group(2), m.group(3)
    target_ele = state.get_element_by_locator(target_locator)
    action = Action(element=target_ele, action_type=action_type, value=value)
    return action
def greedy_input_comb_rewards(self):
    greedy_input_comb_rewards = OrderedDict()
    for (action_type, action_ele) in self.input_candidates:
        values = self.input_candidates[(action_type, action_ele)]
        greedy_input_comb_rewards[(action_type, action_ele)] = {}
        for value in values:
            action = Action(action_ele, action_type, value)
            action_reward = self.simulate_actions([action])
            greedy_input_comb_rewards[(action_type, action_ele)][value] = action_reward
    return greedy_input_comb_rewards
def __init__(self, env, endeavours_bias=0.1, longterm_satisfaction_bias=0.9):
    self.endeavours_bias = endeavours_bias
    self.longterm_satisfaction_bias = longterm_satisfaction_bias
    self.env = env
    num_states = env.COLS * env.ROWS
    num_actions = len(Action.all())
    self.qvalues = np.zeros((num_states, num_actions))
def learn_q(self, last_state, action, reward, now_state):
    max_state_value = max(
        [self.q[(now_state, a)] for a in Action.get_actions()])
    # Initialize unseen entries with the immediate reward
    if (last_state, action) not in self.q:
        self.q[(last_state, action)] = reward
    self.q[(last_state, action)] += self.alpha * (
        reward + self.gamma * max_state_value - self.q[(last_state, action)])
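# For reference, learn_q implements the standard Q-learning temporal-difference update
# (the reward-based initialization of unseen entries is specific to this code):
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))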
def generate_input_comb(self, epsilon=1.0, eval_func=None):
    input_comb = OrderedDict()
    action_categories = []
    previous_actions = []
    for action_type, action_ele in self.input_candidates:
        previous_values = [
            None if previous_action.value is None else previous_action.value_text_parsed
            for previous_action in previous_actions
        ]
        value_candidates = []
        for value in self.input_candidates[(action_type, action_ele)]:
            if value is not None:
                value_text = Action(action_ele, action_type, value).value_text_parsed
                if value_text in previous_values:
                    continue
            value_candidates.append(value)
        if np.random.uniform() <= epsilon:
            action_category = "Explore"
            if np.random.uniform() <= 0.5:
                action_value = random.choice(value_candidates)
            else:
                greedy_input_comb_rewards = self.greedy_input_comb_rewards[(action_type, action_ele)]
                value_candidate_rewards = {
                    value_candidate: greedy_input_comb_rewards[value_candidate]
                    for value_candidate in value_candidates
                }
                action_value = Utils.weighted_choice(value_candidate_rewards)
        else:
            action_category = "Exploit"
            max_q_score = -1
            best_value = random.choice(value_candidates)
            for value_candidate in value_candidates:
                action_candidate = Action(element=action_ele, action_type=action_type,
                                          value=value_candidate)
                q_score = eval_func(self, previous_actions, action_candidate)
                if q_score > max_q_score:
                    max_q_score = q_score
                    best_value = value_candidate
            action_value = best_value
        previous_actions.append(Action(element=action_ele, action_type=action_type,
                                       value=action_value))
        input_comb[(action_type, action_ele)] = action_value
        action_categories.append(action_category)
    return input_comb, action_categories
def SaveGame(self, request, context):
    game_history = GameHistory()
    game_history.observations = [tf.make_ndarray(observation) for observation in request.observations]
    game_history.actions = [Action(index) for index in request.actions]
    game_history.rewards = request.rewards
    game_history.to_plays = [Player(player_id) for player_id in request.to_plays]
    game_history.root_values = request.root_values
    game_history.policies = [policy.probabilities for policy in request.policies]
    self.replay_buffer.save_history(game_history)
    print('Number of games in buffer: {}'.format(len(self.replay_buffer.buffer)))
    return replay_buffer_pb2.SaveGameResponse(success=True)
def sample_batch(self, batch_size, num_unroll_steps, td_steps, discount):
    request = replay_buffer_pb2.MiniBatchRequest(batch_size=batch_size,
                                                 num_unroll_steps=num_unroll_steps,
                                                 td_steps=td_steps,
                                                 discount=discount)
    response = self.remote_replay_buffer.SampleBatch(request)
    batch = []
    for datapoint in response.datapoints:
        observation = tf.make_ndarray(datapoint.observation)
        actions = [Action(index) for index in datapoint.actions]
        targets = [(target.value, target.reward, target.policy.probabilities)
                   for target in datapoint.targets]
        batch.append((observation, actions, targets))
    return batch
def __init__(self, name, environment_class, environment_parameters, num_players,
             action_space_size, discount):
    self.name = name
    self.environment_class = environment_class
    self.environment_parameters = environment_parameters
    self.num_players = num_players
    self.action_space_size = action_space_size
    self.discount = discount
    self.action_space = [Action(index) for index in range(action_space_size)]
def extract_input_candidates(task):
    input_candidates = {}
    selectable_values = set()
    for action in task.state.possible_actions:
        action_type, action_ele, action_value = action.action_type, action.element, action.value
        if action_type not in [Action.INPUT_TEXT, Action.SELECT]:
            continue
        if (action_type, action_ele) not in input_candidates:
            input_candidates[(action_type, action_ele)] = [None]
        if action_value in input_candidates[(action_type, action_ele)]:
            continue
        if action_type == Action.SELECT:
            action_value_useful = False
            for word in task.all_words_parsed:
                word_sim = Utils.text_similarity(word, action.value_text_parsed)
                if word_sim > 0.5:
                    selectable_values.add(word)
                    action_value_useful = True
            if not action_value_useful:
                continue
        input_candidates[(action_type, action_ele)].append(action_value)
    input_candidates = OrderedDict(sorted(input_candidates.items(), key=lambda x: x[0][1].id))
    for (action_type, action_ele) in input_candidates:
        values = input_candidates[(action_type, action_ele)]
        values_parsed = [
            None if value is None else Action(action_ele, action_type, value).value_text_parsed
            for value in values
        ]
        # keep the max-similarity value for each parameter
        filtered_values = [None]
        for word in task.all_words_parsed:
            if action_type == Action.INPUT_TEXT and word in selectable_values:
                continue
            max_score = 0
            max_score_value = None
            for i, value_parsed in enumerate(values_parsed):
                if value_parsed is None:
                    continue
                value_score = Utils.text_similarity(word, value_parsed)
                if value_score > max_score:
                    max_score = value_score
                    max_score_value = values[i]
            if max_score_value is not None and max_score_value not in filtered_values:
                filtered_values.append(max_score_value)
        values = filtered_values
        values = sorted(values, key=lambda x: str(x))
        input_candidates[(action_type, action_ele)] = values
    return input_candidates
def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100
    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []
        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()
            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)
            ns = n_s.get(s, 0)
            n_s[s] = ns + 1
            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1
        # GLIE MC control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)
        # Improve the policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs
    return q_sa
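# For reference, mc_control follows incremental-mean GLIE Monte-Carlo control: after each
# episode with terminal return G, the action values are updated as
#     Q(s, a) <- Q(s, a) + (G - Q(s, a)) / N(s, a)
# and the improved policy is epsilon-greedy with epsilon = N0 / (N0 + N(s)), so exploration
# decays as each state is visited more often.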
def simulate():
    random.seed(0)
    env = Environment()
    total_reward = 0.0
    timesteps = 1000000
    discount_rate = 0.90
    for _ in range(timesteps):
        state = State.LOW
        # Each episode lasts two steps; t indexes the step within the episode
        for t in range(2):
            action = random_policy()
            state, reward = env.step(state, Action(action))
            total_reward += pow(discount_rate, t) * reward
    print("Expected return ", total_reward / timesteps)
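# simulate estimates the expected discounted return of the random policy by Monte-Carlo
# averaging over N two-step episodes: E[G] ~= (1 / N) * sum over episodes of (r_0 + gamma * r_1).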
def random_policy():
    action_index = random.randint(0, 1)
    return Action(action_index)
def test_actions(self):
    actions = Action.all()
    self.assertEqual(4, len(actions))
    :param pose: Pose from which to plan.
    :return: Action given by the ActionProvider
    """
    return self.behavior.run(pose, self.belief)

def observe(self, pose, obs):
    """
    Observe the given (pose, observation) pair and update the agent's belief.

    :param pose: Pose
    :param obs: Observation (real number)
    """
    print("obs", obs)
    self.belief.update(pose, obs)

def optimize(self, num_iterations=20):
    """
    Run optimization routines on the agent's belief.

    :param num_iterations: number of iterations in the optimization routine (default: 20)
    """
    self.belief.optimize(num_iterations)

if __name__ == "__main__":
    from environment import ActionProvider0, state, simulator, Action
    action_provider = ActionProvider0.ActionProvider([(0, 5)], 1, "kernel_traj", False)
    world = world0.World(state.State([2]), "static", action_provider, 0.05)
    action = Action.Action([[-0.5], [-1]], state.State([2]))
    agent = Agent(action_provider, state.State([2]), state.State([2]), "static", 20.0,
                  'MCTS_cont', (3, 150, 1, True))
    print(agent.select_action(state.State([2])))
    # , init_pose, init_obs, obj_fun_type, exploration_param, behavior_alg,
    # behavior_args):
def choose_action(self):
    if np.random.random() < self.endeavours_bias:
        return np.random.choice(Action.all())
    else:
        return Action.all()[np.argmax(self.qvalues[self.get_state()])]