Code Example #1
def launch_qlearning(env, episodes, display, calculated_reward_matrix,
                     calculated_transition_matrix,
                     calculated_valid_actions_matrix,
                     calculated_lava_states_matrix,
                     calculated_goal_states_matrix):
    # Initialise the Q-matrix with zeros
    # (number of cells on the map, number of possible actions)
    q_matrix = np.zeros((env.COLS * env.ROWS, 4))
    gamma = 0.8
    for step in range(episodes):
        env.reset()
        start_state = get_state(env.position, env.COLS)
        future_rewards = []
        current_state = start_state
        # While diamond or lava not found
        while not is_lava_or_goal(current_state, calculated_lava_states_matrix,
                                  calculated_goal_states_matrix):
            action = get_next_action(current_state, get_flat_matrix(q_matrix),
                                     calculated_valid_actions_matrix,
                                     (episodes / (step + 1)))
            next_state = calculated_transition_matrix[current_state][action]
            future_rewards.append(q_matrix[next_state][get_next_action(
                next_state, get_flat_matrix(q_matrix),
                calculated_valid_actions_matrix, (episodes / (step + 1)))])
            # Bellman Equation
            q_state = calculated_reward_matrix[current_state][
                action] + gamma * max(future_rewards)
            q_matrix[current_state][action] = q_state
            reward, done = env.step(Action(action))
            display(calculated_reward_matrix[current_state][action],
                    Action(action), done, "## {} ##".format(step))
            current_state = next_state
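Note: the "# Bellman Equation" step above bootstraps from max(future_rewards), a list accumulated across the whole episode. The textbook tabular Q-learning target instead maximizes only over the Q-values of the immediate next state. Below is a minimal sketch of that textbook update, assuming NumPy arrays shaped like the matrices in Code Example #1; the helper name q_update is illustrative, not part of the project above.

import numpy as np

def q_update(q_matrix, reward_matrix, transition_matrix, state, action, gamma=0.8):
    # Bellman optimality target: immediate reward plus the discounted best
    # Q-value reachable from the deterministic successor state.
    next_state = transition_matrix[state][action]
    target = reward_matrix[state][action] + gamma * np.max(q_matrix[next_state])
    # Overwrite the entry with the target (learning rate 1), as in the snippet above.
    q_matrix[state][action] = target
    return q_matrix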
Code Example #2
 def convert_input_comb_to_actions(input_comb):
     actions = []
     for action_type, action_ele in input_comb:
         action_value = input_comb[(action_type, action_ele)]
         action = Action(action_type=action_type, element=action_ele, value=action_value)
         actions.append(action)
     return actions
Code Example #3
 def choose_action(self, state):
     action2q = {a: self.q[(state, a)] for a in Action.get_actions()}
     if random.random() < self.epsilon:
         return random.choice(list(action2q.keys()))
     max_q = max(action2q.values())
     best_actions = [a for a in action2q if action2q[a] == max_q]
     return random.choice(best_actions)
Code Example #4
 def __init__(self, epsilon=0.2, alpha=0.1, gamma=0.9):
     self.q = Counter()
     self.epsilon = epsilon
     self.alpha = alpha
     self.gamma = gamma
     self.action = Action()
     self.pool = Pool(5)
Code Example #5
 def learn(self):
     best_input_comb = self.form.best_input_comb
     positive_samples = []
     negative_samples = []
     previous_actions = []
     for (action_type, action_ele) in best_input_comb:
         value_candidates = self.form.input_candidates[(action_type, action_ele)]
         best_value = best_input_comb[(action_type, action_ele)]
         best_action = None
         for value in value_candidates:
             action = Action(action_ele, action_type, value)
             encoding = self.encode(self.form, previous_actions, action)
             if value == best_value:
                 best_action = action
                 positive_samples.append(encoding)
             else:
                 negative_samples.append(encoding)
         if best_action:
             previous_actions.append(best_action)
     # negative_samples = random.sample(negative_samples, len(positive_samples))
     samples = positive_samples + negative_samples
     if len(samples) <= 0:
         return None
     samples = self.zip_inputs(samples)
     labels = [1.0] * len(positive_samples) + [0.0] * len(negative_samples)
     history = self.net.fit(x=samples, y=np.array(labels), epochs=5, verbose=0)
     # i = random.randint(0, len(positive_samples) - 1)
     # self.fe.show_image("output/positive_action_loc.png", positive_samples[i][0])
     # self.fe.show_image("output/positive_dom_sim.png", positive_samples[i][1])
     return history.history["loss"][-1]
Code Example #6
 def load_action(state, action_line):
     m = re.match(ACTION_RE, action_line)
     action_type, value, target_locator = m.group(1), m.group(2), m.group(3)
     target_ele = state.get_element_by_locator(target_locator)
     action = Action(element=target_ele,
                     action_type=action_type,
                     value=value)
     return action
Code Example #7
 def greedy_input_comb_rewards(self):
     greedy_input_comb_rewards = OrderedDict()
     for (action_type, action_ele) in self.input_candidates:
         values = self.input_candidates[(action_type, action_ele)]
         greedy_input_comb_rewards[(action_type, action_ele)] = {}
         for value in values:
             action = Action(action_ele, action_type, value)
             action_reward = self.simulate_actions([action])
             greedy_input_comb_rewards[(action_type, action_ele)][value] = action_reward
     return greedy_input_comb_rewards
Code Example #8
File: ai.py Project: 0b11stan/q-cliff-walker
 def __init__(self,
              env,
              endeavours_bias=0.1,
              longterm_satisfaction_bias=0.9):
     self.endeavours_bias = endeavours_bias
     self.longterm_satisfaction_bias = longterm_satisfaction_bias
     self.env = env
     num_states = env.COLS * env.ROWS
     num_actions = len(Action.all())
     self.qvalues = np.zeros((num_states, num_actions))
Code Example #9
 def learn_q(self, last_state, action, reward, now_state):
     max_state_value = max(
         [self.q[(now_state, a)] for a in Action.get_actions()])
     # Initialise unseen (state, action) pairs with the immediate reward
     if (last_state, action) not in self.q:
         self.q[(last_state, action)] = reward
     self.q[(last_state, action)] += self.alpha * (
         reward + self.gamma * max_state_value - self.q[(last_state, action)])
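For context, choose_action (Code Example #3), the constructor in Code Example #4, and learn_q above are typically driven by an episode loop. The following self-contained sketch mirrors that wiring on a toy problem; the ACTIONS list, the ToyAgent class, and the 3-cell corridor environment are illustrative assumptions, not part of the project these snippets come from.

import random
from collections import Counter

ACTIONS = ["left", "right"]  # stands in for Action.get_actions() in the snippets above

class ToyAgent:
    def __init__(self, epsilon=0.2, alpha=0.1, gamma=0.9):
        self.q = Counter()  # unseen (state, action) pairs default to 0
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma

    def choose_action(self, state):
        # Epsilon-greedy selection, as in Code Example #3
        if random.random() < self.epsilon:
            return random.choice(ACTIONS)
        max_q = max(self.q[(state, a)] for a in ACTIONS)
        return random.choice([a for a in ACTIONS if self.q[(state, a)] == max_q])

    def learn_q(self, last_state, action, reward, now_state):
        # Q-learning update mirroring learn_q above
        max_next = max(self.q[(now_state, a)] for a in ACTIONS)
        self.q[(last_state, action)] += self.alpha * (
            reward + self.gamma * max_next - self.q[(last_state, action)])

# One training episode on a 3-cell corridor: start in cell 0, reward +1 in cell 2.
agent = ToyAgent()
state, done = 0, False
while not done:
    action = agent.choose_action(state)
    next_state = min(state + 1, 2) if action == "right" else max(state - 1, 0)
    reward = 1.0 if next_state == 2 else 0.0
    done = next_state == 2
    agent.learn_q(state, action, reward, next_state)
    state = next_state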
Code Example #10
 def generate_input_comb(self, epsilon=1.0, eval_func=None):
     input_comb = OrderedDict()
     action_categories = []
     previous_actions = []
     for action_type, action_ele in self.input_candidates:
         previous_values = [
             None if previous_action.value is None else previous_action.value_text_parsed
             for previous_action in previous_actions
         ]
         value_candidates = []
         for value in self.input_candidates[(action_type, action_ele)]:
             if value is not None:
                 value_text = Action(action_ele, action_type, value).value_text_parsed
                 if value_text in previous_values:
                     continue
             value_candidates.append(value)
         if np.random.uniform() <= epsilon:
             action_category = "Explore"
             if np.random.uniform() <= 0.5:
                 action_value = random.choice(value_candidates)
             else:
                 greedy_input_comb_rewards = self.greedy_input_comb_rewards[(action_type, action_ele)]
                 value_candidate_rewards = {
                     value_candidate: greedy_input_comb_rewards[value_candidate]
                     for value_candidate in value_candidates
                 }
                 action_value = Utils.weighted_choice(value_candidate_rewards)
         else:
             action_category = "Exploit"
             max_q_score = -1
             best_value = random.choice(value_candidates)
             for value_candidate in value_candidates:
                 action_candidate = Action(element=action_ele, action_type=action_type, value=value_candidate)
                 q_score = eval_func(self, previous_actions, action_candidate)
                 if q_score > max_q_score:
                     max_q_score = q_score
                     best_value = value_candidate
             action_value = best_value
         previous_actions.append(Action(element=action_ele, action_type=action_type, value=action_value))
         input_comb[(action_type, action_ele)] = action_value
         action_categories.append(action_category)
     return input_comb, action_categories
Code Example #11
File: replay_buffer.py Project: neuralsyn/muzero
 def SaveGame(self, request, context):
     game_history = GameHistory()
     game_history.observations = [tf.make_ndarray(observation) for observation in request.observations]
     game_history.actions = [Action(index) for index in request.actions]
     game_history.rewards = request.rewards
     game_history.to_plays = [Player(player_id) for player_id in request.to_plays]
     game_history.root_values = request.root_values
     game_history.policies = [policy.probabilities for policy in request.policies]
     self.replay_buffer.save_history(game_history)
     print('Number of games in buffer: {}'.format(len(self.replay_buffer.buffer)))
     return replay_buffer_pb2.SaveGameResponse(success=True)
Code Example #12
File: replay_buffer.py Project: neuralsyn/muzero
    def sample_batch(self, batch_size, num_unroll_steps, td_steps, discount):
        request = replay_buffer_pb2.MiniBatchRequest(batch_size=batch_size, num_unroll_steps=num_unroll_steps,
                                                     td_steps=td_steps, discount=discount)
        response = self.remote_replay_buffer.SampleBatch(request)

        batch = []
        for datapoint in response.datapoints:
            observation = tf.make_ndarray(datapoint.observation)
            actions = [Action(index) for index in datapoint.actions]
            targets = [(target.value, target.reward, target.policy.probabilities) for target in datapoint.targets]
            batch.append((observation, actions, targets))
        return batch
Code Example #13
    def __init__(self, name, environment_class, environment_parameters,
                 num_players, action_space_size, discount):
        self.name = name
        self.environment_class = environment_class
        self.environment_parameters = environment_parameters
        self.num_players = num_players
        self.action_space_size = action_space_size
        self.discount = discount

        self.action_space = [
            Action(index) for index in range(action_space_size)
        ]
Code Example #14
    def extract_input_candidates(task):
        input_candidates = {}
        selectable_values = set()
        for action in task.state.possible_actions:
            action_type, action_ele, action_value = action.action_type, action.element, action.value
            if action_type not in [Action.INPUT_TEXT, Action.SELECT]:
                continue
            if (action_type, action_ele) not in input_candidates:
                input_candidates[(action_type, action_ele)] = [None]
            if action_value in input_candidates[(action_type, action_ele)]:
                continue
            if action_type == Action.SELECT:
                action_value_useful = False
                for word in task.all_words_parsed:
                    word_sim = Utils.text_similarity(word, action.value_text_parsed)
                    if word_sim > 0.5:
                        selectable_values.add(word)
                        action_value_useful = True
                if not action_value_useful:
                    continue
            input_candidates[(action_type, action_ele)].append(action_value)
        input_candidates = OrderedDict(sorted(input_candidates.items(), key=lambda x: x[0][1].id))

        for (action_type, action_ele) in input_candidates:
            values = input_candidates[(action_type, action_ele)]
            values_parsed = [
                None if value is None else Action(action_ele, action_type, value).value_text_parsed
                for value in values
            ]

            # keep the max-similarity value for each parameter
            filtered_values = [None]
            for word in task.all_words_parsed:
                if action_type == Action.INPUT_TEXT and word in selectable_values:
                    continue
                max_score = 0
                max_score_value = None
                for i, value_parsed in enumerate(values_parsed):
                    if value_parsed is None:
                        continue
                    value_score = Utils.text_similarity(word, value_parsed)
                    if value_score > max_score:
                        max_score = value_score
                        max_score_value = values[i]
                if max_score_value is not None and max_score_value not in filtered_values:
                    filtered_values.append(max_score_value)
            values = filtered_values

            values = sorted(values, key=lambda x: str(x))
            input_candidates[(action_type, action_ele)] = values
        return input_candidates
Code Example #15
def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100

    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []

        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()

            episode_s.append(s)
            episode_sa.append(s + (a, ))
            state, reward = step(state, a)

            ns = n_s.get(s, 0)
            n_s[s] = ns + 1

            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1

        # GLIE MC Control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)

        # Improve policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)

            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs
    return q_sa
Code Example #16
def simulate():
    random.seed(0)
    env = Environment()
    total_reward = 0.0
    timesteps = 1000000
    discount_rate = 0.90

    for t in range(timesteps):
        state = State.LOW

        for step_idx in range(2):
            action = random_policy()
            state, reward = env.step(state, Action(action))
            total_reward += pow(discount_rate, step_idx) * reward

    print("Expected return ", total_reward / timesteps)
Code Example #17
def random_policy():
    action_index = random.randint(0, 1)
    return Action(action_index)
Code Example #18
 def test_actions(self):
     actions = Action.all()
     self.assertEqual(4, len(actions))
Code Example #19
    def select_action(self, pose):
        """
        :param pose: Pose from which to plan.
        :return: Action given by the ActionProvider
        """
        return self.behavior.run(pose, self.belief)

    def observe(self, pose, obs):
        """
        Observe the given (pose, observation) pair and update the agent's belief.
        :param pose: Pose
        :param obs: Observation (real number)
        """
        print "obs", obs
        self.belief.update(pose, obs)

    def optimize(self, num_iterations=20):
        """
        Run optimization routines on the agent's belief
        :param num_iterations: number of iterations in the optimization routine (default: 20)
        """
        self.belief.optimize(num_iterations)

if __name__ == "__main__":
    from environment import ActionProvider0, state, simulator, Action
    action_provider = ActionProvider0.ActionProvider([(0,5)], 1, "kernel_traj", False)
    world = world0.World(state.State([2]), "static", action_provider, 0.05)
    action = Action.Action([[-0.5],[-1]],state.State([2]))
    agent = Agent(action_provider, state.State([2]), state.State([2]), "static", 20.0, 'MCTS_cont',  (3, 150, 1, True))
    print(agent.select_action(state.State([2])))
    # , init_pose, init_obs, obj_fun_type, exploration_param, behavior_alg,
                #  behavior_args):
Code Example #20
File: ai.py Project: 0b11stan/q-cliff-walker
 def choose_action(self):
     if np.random.random() < self.endeavours_bias:
         return np.random.choice(Action.all())
     else:
         return Action.all()[np.argmax(self.qvalues[self.get_state()])]