def Monte_Carlo(self):
        """
        Monte_Carlo experiments

        :return: Q_table, policy
        """

        Q_table = self.table_init()  # Q_table initialization
        policy = {}  # policy table

        for k in range(self.M):  # iterations
            x = self.state_init()  # initial state
            u = self.epsilon_greedy(int(np.argmax(Q_table[x])), self.epsilon)  # initial action (epsilon-greedy)
            while x != self.xG:  # stop condition
                x_next = self.move_next(x, self.u_set[u])  # next state
                reward = env.get_reward(x_next, self.lose)  # reward observed
                u_next = self.epsilon_greedy(int(np.argmax(Q_table[x_next])),
                                             self.epsilon)
                Q_table[x][u] = (1 - self.alpha) * Q_table[x][u] + \
                                self.alpha * (reward + self.gamma * Q_table[x_next][u_next])
                x, u = x_next, u_next

        for x in Q_table:
            policy[x] = int(np.argmax(Q_table[x]))  # extract policy

        return Q_table, policy
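
Note: the epsilon_greedy helper used above is not part of this excerpt. A minimal sketch of what such a helper might look like, assuming it takes a greedy action index plus an exploration rate and that the action set is small and discrete (n_actions=4 is an assumption, not from the source):

import random

def epsilon_greedy(greedy_action, epsilon, n_actions=4):
    # With probability epsilon pick a uniformly random action index,
    # otherwise keep the greedy action chosen from the Q-table.
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)
    return greedy_action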
Example #2
    def cal_Q_value(self, x, p, table):
        """
        cal Q_value.

        :param x: next state vector
        :param p: probability of each state
        :param table: value table
        :return: Q-value
        """

        value = 0
        reward = env.get_reward(x, self.xG, self.lose)  # reward of each next state
        for i in range(len(x)):
            value += p[i] * (reward[i] + self.gamma * max(table[x[i]]))

        return value
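
Note: the loop above computes the expected one-step backup sum_i p[i] * (reward[i] + gamma * max_a table[x[i]][a]). A standalone version with toy data (every name and number below is illustrative, not taken from the source):

gamma = 0.9
x = ['s1', 's2']                                # reachable next states
p = [0.8, 0.2]                                  # transition probabilities
reward = [1.0, -1.0]                            # reward of each next state
table = {'s1': [0.5, 0.2], 's2': [0.0, 0.3]}    # toy value table

value = sum(p[i] * (reward[i] + gamma * max(table[x[i]])) for i in range(len(x)))
print(value)  # -> approximately 1.014, i.e. 0.8 * (1 + 0.9 * 0.5) + 0.2 * (-1 + 0.9 * 0.3)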
Example #3
            if video_flag:
                env.dump_image(os.path.join(img_dir, '%d.png' % t))

            view_batches = get_view(env)  # s
            actions, actions_batches = model.infer_actions(
                sess, view_batches, policy=argv.policy,
                epsilon=argv.epsilon)  # a

            env.take_action(actions)
            env.decrease_health()
            env.update_pig_pos()
            # env.update_rabbit_pos()

            if video_flag:
                env.dump_image(os.path.join(img_dir, '%d.png' % (t + 1)))

            rewards = get_reward(env)  # r, a dictionary
            env.increase_health(rewards)
            total_reward = 0
            for k, v in rewards.items():
                total_reward += v

            new_view_batches = get_view(env)  # s'
            maxQ_batches = model.infer_max_action_values(
                sess, new_view_batches)

            model.train(sess=sess,
                        view_batches=view_batches,
                        actions_batches=actions_batches,
                        rewards=rewards,
                        maxQ_batches=maxQ_batches,
                        learning_rate=argv.learning_rate)
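
Note: the train call above is fed the standard one-step Q-learning target y = r + gamma * max_a' Q(s', a'). A hedged, standalone illustration of that target for a small batch (the discount value and the arrays below are made-up placeholders, not from this code base):

import numpy as np

gamma = 0.99                              # assumed discount factor
rewards = np.array([0.0, 1.0, -0.5])      # r for three agents
max_q_next = np.array([2.0, 1.5, 0.8])    # max_a' Q(s', a'), i.e. the maxQ_batches role
targets = rewards + gamma * max_q_next    # regression targets for the Q-network update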
Example #4
                    predict = main_qn.predict(predict_arr)[0]
                    action = np.argmax(predict)
                    value = predict

                do_action(action)
                time.sleep(0.05)

            state = get_state()
            state_deque.append(state.astype('float32') / 255.0)

            print('Episode: {}, Step: {}, Epsilon: {}, Action: {}, '.format(
                episode, step, epsilon, action_name[action]),
                  end='')

            if step > warmup_steps:
                reward = get_reward(state_deque)
                if is_scrolling(state_deque) and (action == 2 or action == 3):
                    reward += 0.1  # additional reward for moving toward the right side
                if is_dead(state_deque):
                    reward = -1
                    dead = True
                print('Reward: {}'.format(reward))
                memory.add((current_state_arr, action, reward,
                            get_state_arr(state_deque)))

            print(value)

            if do_learn:
                if len(memory) >= batch_size:
                    pause_button()
Example #5
                0, top_action.data[0]] + prob_choose_top
            sampled_action = Categorical(action_probs).sample()

            print(action_probs)
            print(epsilon)

            # Act and update the current observation
            cur_observation, cur_frame, done = env.act(agent_host,
                                                       sampled_action.data[0])
            num_timesteps += 1

            if done:
                break

            # Calculate the reward based on the current task
            reward = env.get_reward(task, cur_observation, prev_observation)

            # Check for episode's termination
            if reward > 0:
                done = True
            if num_timesteps >= max_EPISODE_LENGTH:
                done = True

            episode_trajectory.append(
                Point(last_frames.numpy(), instruction.numpy(),
                      sampled_action.data.numpy(), reward,
                      action_probs.data.numpy()))

            if done:
                break
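
Note: the sampling step above uses torch.distributions.Categorical. A standalone illustration of that pattern (the probability values are made up):

import torch
from torch.distributions import Categorical

action_probs = torch.tensor([[0.1, 0.2, 0.7]])        # batch of one probability vector
sampled_action = Categorical(action_probs).sample()   # tensor of sampled action indices
print(sampled_action.item())                          # most often prints 2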
Example #6
def apply_action_in_env(states, actions):
    lats = env.convert_to_latents(states)
    new_lats = env.apply_actions(lats, actions, simplify=False)
    rewards = env.get_reward(lats, actions, new_lats, simplify=False)
    new_states = env.convert_to_obs(lats)
    return [rewards, new_states]
Example #7
# Each iteration we choose an arm (via softmax or epsilon-greedy), pass it to the reward
# function and observe the reward value. We then update the record array with this new
# observation. We repeat this process many times, and it will continually update
# the record array. The arm with the highest reward probability should eventually get
# chosen most often, since it will give out the highest average reward.
# (adapted from the book: Deep Reinforcement Learning in Action, A. Zai & B. Brown)

for i in range(500):
    if use_softmax:
        # use softmax probabilities of values to choose action
        p = softmax(record[:, 1])
        choice = np.random.choice(np.arange(n), p=p)
    else:
        # epsilon-greedy: exploit the best arm, but explore with probability eps
        if random.random() > eps:
            choice = get_best_arm(record)
        else:
            choice = np.random.randint(n)

    # compute the reward for choosing the arm
    r = get_reward(probs[choice])
    # update the record array (per-arm Q-value estimates)
    record = update_record(record, choice, r)
    #print(f"Updated record: {record}")

    # keep track of the running average of rewards
    mean_reward = ((i + 1) * rewards[-1] + r) / (i + 2)
    rewards.append(mean_reward)

# plot the cumulative rewards vs. iteration
ax.scatter(np.arange(len(rewards)), rewards)
plt.savefig("./cumulative_rewards.png")
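
Note: the loop depends on get_reward, get_best_arm, and update_record helpers that are not reproduced above. Minimal versions consistent with how they are called in the book's 10-armed bandit example might look like this, assuming record holds a pull count in column 0 and a running mean reward in column 1 for each arm:

import numpy as np

def get_reward(prob, n=10):
    # Each of n trials pays 1 with probability prob, so the reward lies in [0, n].
    return sum(1 for _ in range(n) if np.random.rand() < prob)

def get_best_arm(record):
    # Index of the arm with the highest running mean reward.
    return int(np.argmax(record[:, 1]))

def update_record(record, action, r):
    # Fold the new reward into the chosen arm's running mean, then bump its pull count.
    new_mean = (record[action, 0] * record[action, 1] + r) / (record[action, 0] + 1)
    record[action, 0] += 1
    record[action, 1] = new_mean
    return record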