def main(problem_id, map_name_base): 
    #random agent derived from lochlomond_demo.py provided by tutor Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base != "8x8-base" and map_name_base != "4x4-base":
        print("Map base can be 8x8-base or 4x4-base")
    
    reward_hole = 0.0     
    is_stochastic = True  
    EpisodeStats = namedtuple("Stats",["episode_lengths", "episode_rewards"])
    max_episodes = 10000  
    max_iter_per_episode = 1000 
    
    #generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic, map_name_base=map_name_base, reward_hole=reward_hole)

    env.action_space.sample() 


    print(env.desc)

    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

   
    np.random.seed(12)
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),episode_rewards=np.zeros(max_episodes))

    for e in range(max_episodes): 
        observation = env.reset()      

        for iter in range(max_iter_per_episode):      
          action = env.action_space.sample() #The agent goes here
          observation, reward, done, info = env.step(action) 

          stats.episode_rewards[e] += reward #collect useful stats for comparison and plotting
          stats.episode_lengths[e] = iter
          
          if(done and reward==reward_hole): 
              print("We have reached a hole :-( [we can't move so stop trying; just give up... and perhaps restart]")
              break

          if (done and reward == +1.0):
              #env.render()     
              print("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal... perhaps try again?]")
              break

    return (stats)
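# Hypothetical usage sketch for the random-agent runner above (assumes main and
# its LochLomondEnv/env2statespace imports are available in the same module).
# With reward_hole=0.0 the only non-zero reward is the +1 for reaching the goal,
# so the summed episode rewards count the successful episodes.
if __name__ == '__main__':
    stats = main(problem_id=0, map_name_base="8x8-base")
    print("Goal reached in {} of {} episodes".format(
        int(stats.episode_rewards.sum()), len(stats.episode_rewards)))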
Example 2
def main(problemID, mapID):
    problem = int(problemID)
    rewardHole = -0.02
    stochastic = True
    trainingEpisodes = 35000
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    np.random.seed(12)
    successes = 0  # records the number of successes
    totalReward = 0
    stats = {"episodes": {}}

    # set up the environment
    env = LochLomondEnv(problem_id=problem,
                        is_stochastic=stochastic,
                        map_name_base=mapBase,
                        reward_hole=rewardHole)

    qTable = generate_q(env, trainingEpisodes, iterPerEpisode)

    print("___________________________________")
    print("Training Finished")
    print("Attempting to find solution...")

    for episode in range(episodes):
        # initial params
        state = env.reset()
        step = 0
        done = False
        reward = 0
        for step in range(iterPerEpisode):
            action = np.argmax(qTable[state, :])  # take the best action
            nextState, reward, done, info = env.step(action)
            state = nextState
            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1
                totalReward += reward
                break

    successRate = ((successes / episodes) * 100)
    print("___________________________________")
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))
    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    stats["qtable"] = qTable
    return stats, qTable
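# generate_q is not defined in this snippet; below is a minimal sketch of what
# such a helper could look like, assuming plain tabular epsilon-greedy
# Q-learning over a gym-style environment. The hyperparameters (alpha, gamma,
# epsilon and its decay) are illustrative assumptions, not the original values.
import numpy as np

def generate_q(env, trainingEpisodes, iterPerEpisode,
               alpha=0.1, gamma=0.95, epsilon=1.0, epsilonDecay=0.999):
    qTable = np.zeros((env.observation_space.n, env.action_space.n))
    for episode in range(trainingEpisodes):
        state = env.reset()
        for step in range(iterPerEpisode):
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()         # explore
            else:
                action = int(np.argmax(qTable[state, :]))  # exploit
            nextState, reward, done, info = env.step(action)
            # standard Q-learning update
            qTable[state, action] += alpha * (
                reward + gamma * np.max(qTable[nextState, :]) - qTable[state, action])
            state = nextState
            if done:
                break
        epsilon = max(0.01, epsilon * epsilonDecay)        # slowly shift to exploitation
    return qTable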
Example 3
def run(problem_id=0,
        max_episodes=10000,
        max_iters_per=2000,
        reward_hole=-1.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)

    epsilon = 0.9
    lr_rate = 0.81
    gamma = 0.96
    epsilon_reduce = 1 / max_episodes

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        state = env.reset()
        print('-' * 50)
        print_headers()

        for iter in range(max_iters_per):
            action = choose_action(state, epsilon, Q, env)
            state2, reward, done, info = env.step(action)
            print(",".join([
                str(episode),
                str(iter),
                str(reward),
                str(done),
                str(info),
                str(action)
            ]))

            learn(state, state2, reward, action, Q, gamma, lr_rate)
            state = state2
            if done and reward == reward_hole:
                print('Found a hole in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': False})
                break
            if done:
                print('Found frisbee in ' + str(iter) + ' iterations')
                results.append({'iters': iter, 'success': True})
                break

        epsilon -= epsilon_reduce

    return results
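# choose_action, learn and print_headers are used above but not shown. The
# sketches below are assumptions consistent with how they are called: an
# epsilon-greedy selector, an in-place tabular Q-learning update, and the CSV
# header matching the trace printed inside run().
import numpy as np

def print_headers():
    print("episode,iter,reward,done,info,action")

def choose_action(state, epsilon, Q, env):
    if np.random.uniform(0, 1) < epsilon:
        return env.action_space.sample()   # explore
    return int(np.argmax(Q[state, :]))     # exploit

def learn(state, state2, reward, action, Q, gamma, lr_rate):
    target = reward + gamma * np.max(Q[state2, :])
    Q[state, action] += lr_rate * (target - Q[state, action])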
Example 4
def main(problemID, mapID):
    problem = int(problemID)
    reward_hole = -1.0
    stochastic = True
    episodes = 1000
    iterPerEpisode = 2000
    mapBase = mapID
    successes = 0  # records the number of successes
    stats = {"episodes": {}}
    totalReward = 0  # reward per episode

    # set up the environment
    env = LochLomondEnv(problem_id=problem,
                        is_stochastic=stochastic,
                        map_name_base=mapBase,
                        reward_hole=reward_hole)

    np.random.seed(12)

    for episode in range(episodes):  # iterate over episodes
        print("___________________________________")
        print("EPISODE: " + str(episode))
        observation = env.reset(
        )  # reset the state of the env to the starting state

        reward = 0

        for step in range(iterPerEpisode):
            action = env.action_space.sample(
            )  # your agent goes here (the current agent takes random actions)
            observation, reward, done, info = env.step(
                action)  # observe what happens when you take the action
            # Check if we are done and monitor rewards etc...

            if done:
                stats["episodes"][episode] = {"steps": step, "reward": reward}
                if reward == 1.0:
                    successes += 1  # count goal episodes so the success rate is meaningful
                totalReward += reward
                break

    successRate = ((successes / episodes) * 100)
    print("Finished")
    print("Success Rate: " + str(successRate) + "%")
    print("Total Reward: " + str(totalReward))
    # log stats
    stats["successrate"] = successRate
    stats["totalreward"] = totalReward
    return stats
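# Hypothetical usage: run the random baseline on one problem of the 8x8 map and
# read the aggregate stats it returns (keys as defined above).
if __name__ == '__main__':
    stats = main(problemID=0, mapID="8x8-base")
    print("Success rate: {:.2f}%, total reward: {}".format(
        stats["successrate"], stats["totalreward"]))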
Example 5
def train_for_one_problem(problem_id, map_name):
    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0  # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent

    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole,
                        map_name_base=map_name)
    env.reset()

    done = False
    total_test_num = 60000
    restart_times = 0
    succeed_times = 0
    shortest_path = 100
    one_map_succeed_percentage = []

    for i in range(total_test_num):
        restart_times += 1
        done = False
        n_actions_for_episode = 0
        while not done:
            n_actions_for_episode += 1
            action = env.action_space.sample(
            )  # take random action from the available actions
            observation, reward, done, info = env.step(action)

            if done:
                print("\rProblem:%s Episodes #%s / 60000" %
                      (problem_id, restart_times),
                      end='')
                if reward == 1.0:
                    if shortest_path > n_actions_for_episode:
                        shortest_path = n_actions_for_episode
                    succeed_times += 1
                env.reset()  # reset after every finished episode, not only after a failure

    print("\nSucceed Times:", succeed_times)
    print("Total Times:", total_test_num)
    print("Shortest path:", shortest_path)

    one_map_succeed_percentage = float(succeed_times) / total_test_num
    env.close()
    return one_map_succeed_percentage
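# Hypothetical driver: the random-agent success percentage for every problem id
# on the 8x8 map (each call runs 60000 episodes, so this is slow).
if __name__ == '__main__':
    for pid in range(8):
        pct = train_for_one_problem(pid, "8x8-base")
        print("Problem {}: success rate {:.4%}".format(pid, pct))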
Example 6
def run(problem_id=0, max_episodes=10000, max_iters_per=2000, reward_hole=0.0):
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)

    np.random.seed(12)

    results = []

    for episode in range(max_episodes):
        env.reset()
        print('-' * 50)
        print_headers()

        for iteration in range(max_iters_per):
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(",".join([
                str(episode),
                str(iteration),
                str(reward),
                str(done),
                str(info),
                str(action)
            ]))

            if done and reward == reward_hole:
                env.render()
                print("Hole Found in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': False})
                break

            if done and reward == 1.0:
                env.render()
                print("Frisbee acquired in " + str(iteration) + " iterations")
                results.append({'iters': iteration, 'success': True})
                break

    return results
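# Hypothetical usage: summarise the episode results returned by run(). Each
# entry records the iteration count and whether the episode ended at the goal.
if __name__ == '__main__':
    results = run(problem_id=0)
    wins = sum(1 for r in results if r['success'])
    print("Goal reached in {} of {} completed episodes".format(wins, len(results)))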
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(
    env)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

####
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset(
    )  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        #env.render() # for debugging/development you may want to visualize the individual steps by uncommenting this line
        action = env.action_space.sample(
        )  # your agent goes here (the current agent takes random actions)
        observation, reward, done, info = env.step(
            action)  # observe what happens when you take the action

        # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

        print("e,iter,reward,done =" + str(e) + " " + str(iter) + " " +
              str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc...
        if (done and reward == reward_hole):
            env.render()
            print(
                "We have reached a hole :-( [we can't move so stop trying; just give up]"
            )
            break

        if (done and reward == +1.0):
            env.render()
            print("We have reached the goal :-) [stop trying to move; we can't]")
            break
Example 8
state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(
    env)
print(state_space_locations)

# Reset the random generator to a known state (for reproducibility)
np.random.seed(12)

# Run a random/senseless agent
for e in range(max_episodes):  # iterate over episodes
    observation = env.reset(
    )  # reset the state of the env to the starting state

    for iter in range(max_iter_per_episode):
        action = env.action_space.sample()
        # observe what happens when you take the action
        observation, reward, done, info = env.step(action)

        # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

        # print("e,iter,reward,done =" + str(e) + " " +
        #       str(iter) + " " + str(reward) + " " + str(done))

        # Check if we are done and monitor rewards etc...
        if (done and reward == reward_hole):

            print("HOLE")
            break

        if (done and reward == +1.0):
            env.render()
            print("FINISHED")
            break
def main(problem_id, map_name_base):
    #RL agent referenced from the Lab 8 and 9 notebooks provided by tutor Prof. Bjorn Jensen for the AI course (2019-20), University of Glasgow
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base != "8x8-base" and map_name_base != "4x4-base":
        print("Map base can be 8x8-base or 4x4-base")

    reward_hole = -0.05  #hole penalty chosen based on analysis: it discourages falling into holes without swamping the +1 goal reward
    is_stochastic = True
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])

    np.random.seed(12)
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole,
                        map_name_base=map_name_base)

    states = env.observation_space.n
    actions = env.action_space.n
    Q = np.zeros((states, actions))

    max_episodes = 10000
    max_iter_per_episode = 1000

    alpha = 0.1  #learning rate
    gamma = 0.999  #discount rate
    epsilon = 1
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),
                         episode_rewards=np.zeros(max_episodes))

    for episode in range(max_episodes):
        state = env.reset()

        for step in range(max_iter_per_episode):
            # take best action according to Q-table if random value is greater than epsilon, otherwise take a random action
            random_value = random.uniform(0, 1)
            if random_value > epsilon:
                action = np.argmax(Q[state, :])  #Agent goes here
            else:
                action = env.action_space.sample()

            new_state, reward, done, info = env.step(action)
            Q[state, action] = Q[state, action] + alpha * (
                reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
            stats.episode_rewards[episode] += reward
            stats.episode_lengths[episode] = step
            state = new_state

            if done:
                break

        epsilon = 0.01  #after the first, fully exploratory episode, keep epsilon low so later episodes mostly exploit the learned Q-values

    print(Q)

    return (stats)
Example 10
def run_senseless_agent(problem_id, map):

    reward_hole = 0.0
    max_episodes = 10000
    max_iter_per_episode = 1000

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=True,
                        map_name_base=map,
                        reward_hole=reward_hole)

    env.render()
    env.action_space.sample()

    np.random.seed(12)

    # variables for performance evaluation
    # number of times the goal is reached out of max_episodes (i.e. episodes where the reward is collected)
    goal_episodes = []
    # number of episodes agent falls in hole
    hole_episodes = []
    # average number of iterations taken to reach goal per rewarded episode
    goal_iterations = []

    rewards = []

    # number of episodes before goal is first reached
    first_goal = 0

    for e in range(max_episodes):

        rewards_current_episode = 0
        state = env.reset()

        for iter in range(max_iter_per_episode):

            action = env.action_space.sample()
            state, reward, done, info = env.step(action)

            rewards_current_episode += reward

            if (done and reward == reward_hole):
                hole_episodes.append(e)
                break

            if (done and reward == +1.0):
                # env.render()
                goal_episodes.append(e)
                goal_iterations.append(iter+2)

                # sets first goal to episode
                if first_goal == 0:
                    first_goal = e
                break

        rewards.append(rewards_current_episode)

    # calculating steps to goal
    goal_iteration_average = mean(goal_iterations)
    goal_iteration_bestcase = mini(goal_iterations)
    goal_iteration_worstcase = maxi(goal_iterations)


    # splits collected rewards into per 100 episodes
    rewards_per_100_eps = np.split(np.array(rewards), max_episodes / 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]


    return len(goal_episodes), len(hole_episodes), goal_iteration_average, goal_iteration_bestcase, \
           goal_iteration_worstcase,  first_goal, rewards_per_100_eps
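# mean, mini and maxi are imported elsewhere in the original project; the
# definitions below are assumptions that match how they are used above
# (mini/maxi as thin wrappers so the names don't shadow the built-ins).
from statistics import mean

def mini(values):
    return min(values)

def maxi(values):
    return max(values)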
Example 11
class MyAbstractAIAgent():
    """
    Abstract agent that works as a base for all our agents.


    """
    def __init__(self, problem_id, map_name_base="8x8-base"):
        # map_name_base="4x4-base"
        if not (0 <= problem_id <= 7):
            raise ValueError("Problem ID must be 0 <= problem_id <= 7")

        self.map_name_base = map_name_base
        self.env = LochLomondEnv(problem_id=problem_id,
                                 is_stochastic=self.is_stochastic(),
                                 reward_hole=self.reward_hole(),
                                 map_name_base=map_name_base)

        self.problem_id = problem_id
        self.reset()
        self.out = 'out/'
        self.policy = {}
        self._train = []
        self.graphs = {}

    def is_stochastic(self):
        raise NotImplementedError

    def reward_hole(self):
        raise NotImplementedError

    def reset(self):
        self.rewards = 0
        self.failures = 0
        self.eval = []
        self.timeouts = 0

    def solve(self, episodes=10000, iterations=1000, seed=None, gamma=0.95):
        print('Solving with {} Agent'.format(self.name().capitalize()))
        print('Problem: ', self.problem_id)
        print('Grid: ', self.map_name_base)
        print('Episodes that will run...: ', episodes)

        self.train(episodes=episodes, iterations=iterations)
        rewards = self.rewards
        timeouts = self.timeouts
        failures = self.failures

        for e in range(1, episodes + 1):  # iterate over episodes
            state = self.env.reset()
            self.set_episode_seed(e, seed)

            if e % 1000 == 0:
                print("Eval Episode", e)

            for i in range(1, iterations + 1):
                action = self.action(state)
                state, reward, done, info = self.env.step(action)

                if done:
                    if reward == 1.0:
                        rewards += int(reward)
                    else:
                        failures += 1

                    # break the cycle
                    break

            if not done:
                timeouts += 1

            self.eval.append([
                self.problem_id, e, i,
                to_human(action),
                int(reward), rewards, rewards / e, failures, timeouts
            ])

    def action(self, i):
        raise NotImplementedError

    def train(self, episodes, iterations):
        raise NotImplementedError

    def get_env(self):
        # accessor renamed: the original `env` method was shadowed by the
        # `self.env` attribute assigned in __init__ and could never be called
        return self.env

    def set_episode_seed(self, episode, seed=None):
        # by default no seed for abstract agent
        return None

    def alias(self):
        return '{}out_{}_{}_{}'.format(self.out, self.name(), self.problem_id,
                                       self.env.ncol)

    def evaluate(self, episodes):
        self.env.reset()
        print("This is the environment: ")
        print(self.env.render())

        if (len(self.policy) > 0):
            print("This is the final policy: ")
            print_table(
                policy_to_arrows(self.policy, self.env.ncol, self.env.ncol))

        print('Saving Evaluation Files...')
        self.write_eval_files()

        # Plotting mean rewards
        print('Saving Plots...')
        labels = ['Episodes', 'Mean Reward']
        title = 'Problem {}. Plot for {} Agent'.format(
            self.problem_id,
            self.name().capitalize())

        if (len(self._train) > 0):
            subtitle = 'Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes), labels, title, subtitle, 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(1000), labels, title, subtitle,
                            'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Training Phase).'
            self.plot_train(range(episodes - 1000, episodes), labels,
                            title, subtitle, 'mr_last_1000')

        if (len(self.eval) > 0):
            subtitle = 'Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes), labels, title, subtitle,
                                 'mr')

            subtitle = 'First 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(1000), labels, title, subtitle,
                                 'mr_first_1000')

            subtitle = 'Last 1000 Episodes vs Mean Reward (Evaluation Phase).'
            self.plot_evaluation(range(episodes - 1000, episodes), labels,
                                 title, subtitle, 'mr_last_1000')

        if (len(self.graphs) > 0):
            subtitle = 'Utilities plot'
            self.plot_utilities(['Episodes', 'U'], title, subtitle)

    def write_eval_files(self):
        def data_for_file(name):
            if name == 'policy':
                return policy_to_list(self.policy)
            if name == 'u':
                return u_to_list(self.U)
            if name == 'eval':
                return self.eval
            if name == 'q':
                return self.Q
            if name == 'train':
                return self._train
            if name == 'graphs':
                return self.graphs

            return []

        for file in self.files():
            if file == 'graphs':
                filename = '{}_{}.json'.format(self.alias(), file)
                with open(filename, 'w') as outfile:
                    json.dump(data_for_file(file), outfile)
            else:
                filename = '{}_{}.csv'.format(self.alias(), file)
                data = [self.header(file)] + data_for_file(file)
                np.savetxt(filename, data, delimiter=",", fmt='%s')
            print('\tFile saved: {}'.format(filename))

    def header(self, key):
        headers = {
            'eval': [
                'id', 'episode', 'iteration', 'action', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'policy': ['x', 'y', 'action'],
            'u': ['x', 'y', 'u'],
            'train': [
                'id', 'episode', 'iteration', 'reward', 'rewards',
                'mean_rewards', 'failures', 'timeouts'
            ],
            'graphs': ['x', 'y', 'value'],
            'q': ['position', 'x', 'y', 'action', 'action_friendly', 'value']
        }

        if key in headers:
            return headers[key]

    def plot_train(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from training phase """
        train = np.array(self._train)

        x = pd.to_numeric(train[:, 1])
        y = pd.to_numeric(train[:, 5])
        filename = '{}_train_{}.png'.format(self.alias(), suffix)

        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_evaluation(self, rows, labels, title, subtitle, suffix=''):
        """ Plots mean rewards from evaluation phase """
        evaluation = np.array(self.eval)

        x = pd.to_numeric(evaluation[:, 1])
        y = pd.to_numeric(evaluation[:, 6])
        filename = '{}_eval_{}.png'.format(self.alias(), suffix)

        self.plot(x, y, rows, labels, filename, title, subtitle)

    def plot_utilities(self, labels, title, subtitle):
        for state, value in self.graphs.items():
            x, y = zip(*value)
            plt.plot(x, y, label=str(state))

        plt.ylim([-0.1, 1.05])
        plt.legend(loc='lower right')
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        filename = '{}_utilities.png'.format(self.alias())
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()

        print('\tPlot saved: {}'.format(filename))

    def plot(self, x, y, rows, labels, filename, title, subtitle):
        plt.plot(x[rows], y[rows])
        plt.xlabel(labels[0])
        plt.ylabel(labels[1])
        plt.suptitle(title, fontsize=12)
        plt.title(subtitle, fontsize=10)
        plt.savefig(filename)
        plt.close()

        print('\tPlot saved: {}'.format(filename))
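# Minimal sketch of a concrete subclass, to show which hooks MyAbstractAIAgent
# expects to be overridden. The class name, reward values and files() list are
# assumptions for illustration, not part of the original project.
class RandomAgent(MyAbstractAIAgent):

    def name(self):
        return 'random'

    def is_stochastic(self):
        return True

    def reward_hole(self):
        return 0.0

    def action(self, state):
        return self.env.action_space.sample()  # senseless baseline policy

    def train(self, episodes, iterations):
        pass  # nothing to learn for the random baseline

    def files(self):
        return ['eval']  # only the evaluation CSV is written by write_eval_files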
Example 12
def rl_agent(problem_id):

    # select a small negative reward for the RL agent to create an incentive to learn
    reward_hole = -0.01

    # generate 10 000 episodes in order to give agent chance to reach the goal
    max_episodes = 10000

    # every episode should have 2000 iterations (agent can take 2000 steps in the map)
    max_iter_per_episode = 2000

    results = []

    # setup the frozen lake loch lomond environment (uncertainty involved)
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)
    all_act = list(range(env.action_space.n))

    q_agent = QLearningAgentUofG(terminals=get_terminals(env),
                                 all_act=all_act,
                                 alpha=lambda n: 0.8,
                                 gamma=0.8,
                                 Rplus=2,
                                 Ne=5)

    print('Running Q Learning Agent for problem: {}'.format(problem_id))
    print("(will take a while)")

    for e in range(max_episodes):  # iterate over episodes

        state = env.reset(
        )  # reset the state of the environment to starting state S
        reward = 0

        # Over the total number of allowed iterations
        for iter in range(max_iter_per_episode):
            action = q_agent(state, reward, e + 1)

            # current agent takes actions
            if action is not None:
                state, reward, done, info = env.step(action)

            # Test condition to see if agent is done and associated rewards
            if done:
                q_agent(state, reward, e + 1)

                break

        results.append([e, iter + 1, int(reward)])

    # Compute the policy
    policy = {}

    for state_action, value in list(q_agent.Q.items()):
        state, action = state_action
        policy[state] = argmax(q_agent.actions_in_state(state),
                               key=lambda a: q_agent.Q[state, a])

    print('Policy: ')
    print_table(to_arrows(policy, 8, 8))

    # Save results to a CSV file
    np.savetxt('out_rl_{}.csv'.format(problem_id),
               np.array(results),
               header="episode,iterations,reward",
               delimiter=",",
               fmt='%s')

    np.savetxt('out_rl_{}_policy.txt'.format(problem_id),
               to_arrows(policy, 8, 8),
               delimiter="\t",
               fmt='%s')

    # Add a plot over all 10 000 episodes
    columns = ['episode', 'iterations', 'reward']

    dataframe = pd.DataFrame(data=np.array(results),
                             index=np.array(results)[0:, 0],
                             columns=columns)
    dataframe['cumulative_rewards'] = list(
        itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x),
                                                axis=1)

    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']

    title = 'Mean Reward vs Episodes'
    subtitle = 'RL-Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']

    add_plot(x, y, 'out_rl_{}.png'.format(problem_id), title, subtitle, labels)

    # Adding plot for the last 1000 episodes to detect potential learning
    dataframe_ac = pd.DataFrame(
        data=np.array(results)[range(max_episodes - 1000, max_episodes), :],
        columns=columns)
    dataframe_ac['episode'] = range(1000)
    dataframe_ac['cumulative_rewards'] = list(
        itertools.accumulate(dataframe_ac['reward'], operator.add))
    dataframe_ac['mean_rewards'] = dataframe_ac.apply(
        lambda x: mean_rewards(x), axis=1)

    x = range(1, len(dataframe_ac) + 1)
    y = dataframe_ac['mean_rewards']

    title = 'RL-Agent: Problem ID {}'.format(problem_id)
    subtitle = 'Last 1000 Episodes'
    labels = ['Last 1000 Episodes', 'Mean Reward']

    add_plot(x, y, 'out_rl_{}_converged.png'.format(problem_id), title,
             subtitle, labels)

    # Print involved performance measures over all 10 000 episodes
    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ',
          max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))

    # Print involved performance measures over the last 1000 episodes
    print("\n\n")
    print('Stats for the last 1000 episodes....')
    print('Max iterations per episode: ', max(dataframe_ac['iterations']))
    print('Mean iterations per episode: ', dataframe_ac['iterations'].mean())
    print('Average success per episode: ',
          max(dataframe_ac['cumulative_rewards']) / 1000)
    print('Episodes won: ', max(dataframe_ac['cumulative_rewards']))

    return dataframe
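# mean_rewards is applied row-wise to the results dataframe but not defined in
# this snippet; a plausible sketch, assuming 0-indexed episodes, is the running
# mean of the cumulative reward column.
def mean_rewards(row):
    return row['cumulative_rewards'] / (row['episode'] + 1)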
def simple_agent(problem_id):

    # since the A* agent is fully informed, a negative hole reward would make no difference, hence we chose 0
    reward_hole = 0.0

    # generate 10 000 episodes in order to give agent chance to reach the goal multiple times
    max_episodes = 10000   

    # since the A* agent always wins, limit the allowed iterations per episode to 100 (time constraint)
    max_iter_per_episode = 100
 
    actions = []
    results = []
 
    # setup the frozen lake loch lomond environment (deterministic, no uncertainty)
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=False, reward_hole=reward_hole)
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)
    
    # informed search problem 
    undirected_graph = UndirectedGraph(state_space_actions)
    undirected_graph.locations = state_space_locations
    graph_problem = GraphProblem(state_initial_id, state_goal_id, undirected_graph)

    node = astar_search(problem=graph_problem, h=None)
    best_path = node.solution()

    print('Running Simple Agent for problem: {}'.format(problem_id))

    for i in range(len(best_path)):
        if i == 0:
            previous = undirected_graph.locations[state_initial_id]
        else:
            previous = undirected_graph.locations[best_path[i - 1]]

        current = undirected_graph.locations[best_path[i]]

        action = get_action_from_location(previous, current)
        actions.append(action)

    for e in range(max_episodes): # iterate over total number of possible episodes

        observation = env.reset() # reset the state of the environment to starting state S
        
        for iter in range(max_iter_per_episode):
            
            # select action from the solution
            action = actions[iter]

            # outcome of taking a certain action
            observation, reward, done, info = env.step(action)
          
            # Test condition to see if agent is done and associated rewards
            if (done and reward==reward_hole): 
                break

            if (done and reward == +1.0):
                break

        results.append([e, iter+1, int(reward)])

    # Save results to a CSV file
    np.savetxt('out_simple_{}.csv'.format(problem_id), np.array(results), 
               header="episode,iterations,reward", delimiter=",", fmt='%s')

    columns = ['episode', 'iterations', 'reward']
    
    dataframe = pd.DataFrame(data=np.array(results), index=np.array(results)[0:,0], columns=columns)
    dataframe['cumulative_rewards'] = list(itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x), axis=1)
    
    # Plotting the results for all task environments ID 0 to 7
    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']
    
    title = 'Mean Reward vs Episodes'
    subtitle = 'Simple Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']

    add_plot(x, y, 'out_simple_{}_mean_reward.png'.format(problem_id), title, subtitle, labels)
    
    # Print involved performance measures over all 10 000 episodes
    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ', max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))
    print("\n")

    return dataframe
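# Sketch of the get_action_from_location helper used above, assuming the
# env2statespace locations are (column, row) pairs and the usual FrozenLake
# action encoding (0=LEFT, 1=DOWN, 2=RIGHT, 3=UP); the row orientation is an
# assumption and may need flipping for the real helper.
def get_action_from_location(previous, current):
    dx = current[0] - previous[0]
    dy = current[1] - previous[1]
    if dx == -1:
        return 0  # LEFT
    if dy == 1:
        return 1  # DOWN
    if dx == 1:
        return 2  # RIGHT
    return 3      # UP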
Example 14
def run_reinforcement_agent(problem_id, map):
    reward_hole = -0.5

    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=False,
                        reward_hole=reward_hole,
                        map_name_base=map)
    env.reset()

    # state_space, action_space, state_initial_id, state_goal_id = env2statespace(env)

    action_space = env.action_space.n
    state_space = env.observation_space.n

    q_table = np.zeros((state_space, action_space))

    # parameter set up
    max_episodes = 10000
    iterations = 1000
    learning_rate = 0.1  # alpha
    discount_rate = 0.95  # gamma
    epsilon = 0.05  # exploration-exploitation trade-off
    rewards = []

    hole_episode_counter = []

    # number of times goal is reached out of max_episodes/ (performance measures where reward is collected)
    goal_episodes = []
    # average number of iterations taken to reach goal per rewarded episode
    goal_iterations = []
    # number of episodes before goal is first reached
    first_goal = 0

    for episode in range(max_episodes):
        state = env.reset()

        # end learning phase at midpoint
        if episode == max_episodes / 2:
            learning_rate = 0.0

        rewards_current_episode = 0

        for step in range(iterations):

            # epsilon-greedy: usually take the action with the highest Q-value,
            # occasionally explore with a random action
            if np.random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            # np.argmax returns 0 for an untrained (all-zero) row, and 0 is also
            # the LEFT action, so fall back to a random choice in that case
            if action == 0:
                action = env.action_space.sample()

            new_state, reward, done, info = env.step(action)

            # update q table
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) \
                                     + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

            state = new_state
            rewards_current_episode += reward

            if done == True:
                if rewards_current_episode != reward_hole:
                    # set first episode that goal is reached
                    if first_goal == 0:
                        first_goal = episode

                    goal_episodes.append(episode)
                    goal_iterations.append(step + 2)
                    # print('you reached the goal in {} steps'.format(step))
                    break
                else:
                    # print('you fell in {} steps'.format(step))
                    hole_episode_counter.append(episode)

                    break

        rewards.append(rewards_current_episode)

    rewards_per_100_eps = np.split(np.array(rewards), max_episodes / 100)
    rewards_per_100_eps = [str(sum(r / 100)) for r in rewards_per_100_eps]

    return len(goal_episodes), len(hole_episode_counter), mean(goal_iterations), \
           mini(goal_iterations), maxi(goal_iterations), first_goal, rewards_per_100_eps
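# Hypothetical usage: unpack the tuple returned by run_reinforcement_agent for
# one problem on the 8x8 map.
if __name__ == '__main__':
    goals, holes, avg_steps, best, worst, first_goal, rewards_100 = \
        run_reinforcement_agent(0, "8x8-base")
    print("Goal reached {} times (first success at episode {})".format(goals, first_goal))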
Example 15
def main(p_id):
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(
        p_id
    )  # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = -1.0  # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=is_stochastic,
                        reward_hole=reward_hole)

    #q-learning variables
    epsilon = 0.5  # degree of randomness, I found a lower rate leads to better results in the long term
    max_episodes = 2000  # you can decide to rerun the problem many times, thus generating many episodes... you can learn from them all!
    max_iter_per_episode = 500  # you decide how many iterations/actions can be executed per episode

    lr_rate = 0.81
    gamma = 0.96

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    def choose_action(state):
        action = 0
        if np.random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  #make a random move
        else:
            action = np.argmax(Q[state, :])
        return action

    def learn(state, state2, reward, action):
        predict = Q[state, action]
        target = reward + gamma * np.max(Q[state2, :])
        Q[state, action] = Q[state, action] + lr_rate * (target - predict)

    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)

    #setup vars for logfile
    f = open("out_RL_{}.txt".format(problem_id), "w+")
    successes = 0
    failures = 0
    ####
    for e in range(max_episodes):  # iterate over episodes
        state = env.reset()  # reset the state of the env to the starting state

        for iter in range(max_iter_per_episode):
            #         env.render() # for debugging/development you may want to visualize the individual steps by uncommenting this line
            action = choose_action(
                state
            )  # epsilon-greedy action chosen from the Q-table

            observation, reward, done, info = env.step(
                action)  # observe what happens when you take the action

            learn(state, observation, reward, action)

            state = observation
            #         # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

            # Check if we are done and monitor rewards etc...
            if (done and reward == reward_hole):
                # env.render()
                # print("Failure")
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) +
                        " " + str(reward) + " " + str(done) + "\n")
                # f.write("We have reached a hole :-( [we can't move so stop trying; just give up]\n")
                break

            if (done and reward == +1.0):
                # env.render()
                successes += 1
                # print("Success")
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter) +
                        " " + str(reward) + " " + str(done) + "\n")
                # f.write("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal]\n")
                break

    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    successRate = successes / max_episodes * 100
    results = {
        "Success": successes,
        "Failures": failures,
        "Episodes": max_episodes,
        "SuccessRate": successRate
    }
    return results
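# Hypothetical driver: run the Q-learning main() for each of the eight problems
# and print the summary dictionary it returns.
if __name__ == '__main__':
    for pid in range(8):
        summary = main(pid)
        print("Problem {}: {:.2f}% success over {} episodes".format(
            pid, summary["SuccessRate"], summary["Episodes"]))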
Example 16
def train_for_one_model(problem_id, map_name, train_or_not):
    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = -0.01      # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = True  # should be False for A-star (deterministic search) and True for the RL agent

    if  map_name == '4x4-base':
        n_dim = 4
        num_episodes = 100000
    else:
        num_episodes = 300000
        n_dim = 8

    env = LochLomondEnv(problem_id = problem_id, is_stochastic = is_stochastic, reward_hole = reward_hole, map_name_base = map_name)

    restart_times = 0
    n_actions_for_episode = 0
    rewards_all_episodes = []
    rewards_all_episodes_per_2000 = []
    x_axis_rewardsvsepisodes = []
    episode_steps = []
    max_steps_per_episode = 10000
    epsilon = 0.3  # initial exploration rate; decayed per episode below
    q_table = np.zeros([env.observation_space.n,env.action_space.n])
    learning_rate = 0.3
    discount = 0.5
    if problem_id == 0 and n_dim == 8:
        learning_rate = 0.2
        discount = 0.8
    if problem_id == 0 and n_dim == 4:
        learning_rate = 0.4
        discount = 0.7
    epsilon_min = 0.005
    epsilon_decay_rate = 0.99995
    shortest_path = 10000
    longest_path = 0
    avg_path = []
    Train_or_not = train_or_not

    if Train_or_not == True:
        #--------------Training Process-----------------#
        for episode in range(num_episodes):
            restart_times += 1
            state = env.reset()
            done = False
            rewards_current_episode = 0
            path = [state]
            if restart_times % 5000 == 0:
                print("\ntraining in progress: #", restart_times)
            for step in range(max_steps_per_episode):
                n_actions_for_episode += 1
                # Exploration - exploitation trade-off
                exploration_exploitation_rate = random.uniform(0, 1)

                if exploration_exploitation_rate < epsilon or not q_table[state, :].any():
                    action = env.action_space.sample() # Exploration: take a random action (also whenever this state's Q-row is still all zeros)
                else:
                    action = np.argmax(q_table[state, :] + np.random.randn(1,4)) # Exploitation: pick the action with the highest Q-value (small noise breaks ties)

                new_state, reward, done, info = env.step(action)
                path.append(new_state)

                # Update Q-table
                q_table[state, action] = q_table[state, action] + learning_rate * (reward + discount * np.max(q_table[new_state, :]) - q_table[state, action])

                state = new_state
                # rewards_current_episode += reward

                if done == True and reward == 1:
                    print("\rEpisode #%s: Finish it within %d steps" % (restart_times, len(path)),end = '')
                    break
                if done == True and reward == -0.01:
                    break

            # epsilon decay
            if epsilon >= epsilon_min:
                epsilon *= epsilon_decay_rate

            # rewards_all_episodes.append(rewards_current_episode)

            # if restart_times % 2000 == 0:
            #     avg_reward_2000 = np.sum(rewards_all_episodes) / (2000 * (restart_times / 2000))
            #     rewards_all_episodes_per_2000.append(avg_reward_2000)
            #     x_axis_rewardsvsepisodes.append(2000 * (restart_times / 2000))
        #---------------SAVE THE MODEL--------------------#
        np.save('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id), q_table)

    #--------FINAL TEST-----------#
    if(train_or_not == True):
        print("\nRunning Test for 50000 times. Please wait...")
    q_table = np.load('%sx%s q_tableP%s.npy' % (n_dim, n_dim, problem_id)) # Load the trained model q table
    env.reset()
    state = env.reset()
    test_total_num = 50000
    test_fail_num = 0
    test_succeed_num = 0
    Avg_rewards_per_1000_episodes = []
    Avg_reward_per_step = []
    Avg_reward_per_episode = []


    for k in range(test_total_num):
        s = env.reset()
        j=0
        rewards_temp = 0

        while j < 1000:

            j += 1
            action = np.argmax(q_table[s,:])
            new_state,r,done,b = env.step(action)
            rewards_temp += r
            s = new_state
            if done and r == -0.01:
                test_fail_num += 1
                break
            if done and r == 1.0:
                avg_path.append(j)
                if shortest_path > j:
                    shortest_path = j
                if longest_path < j:
                    longest_path = j
                test_succeed_num += 1.0
                break
            if j == 1000:
                test_fail_num += 1
        Avg_reward_per_episode.append(rewards_temp)
        Avg_reward_per_step.append(rewards_temp / j)

        if (k + 1) % 1000 == 0:
            Avg_rewards_per_1000_episodes.append(np.sum(Avg_reward_per_episode) / float(k + 1))
            x_axis_rewardsvsepisodes.append(k + 1)
    #--------------OUTPUT FINAL RESULT-----------------#
    if (train_or_not == True):
        print("\n-------------------------------------------")
        print("Average rewards per 1000 episodes:",Avg_rewards_per_1000_episodes[-1])
        print("Average rewards per steps:", Avg_reward_per_step[-1])
        print("Success times:",test_succeed_num)
        print("Failure times:",test_fail_num)
        print("Success rate:",float(test_succeed_num / test_total_num))
        print("Success vs Failure rate:",float(test_succeed_num / test_fail_num))
        print("Steps number (Best case):",shortest_path)
        print("Steps number (Worst case):",longest_path)
        print("Steps number (On average):",np.mean(avg_path))
        print("Learning rate:",learning_rate)

    plt.cla()
    plt.plot(x_axis_rewardsvsepisodes[:], Avg_rewards_per_1000_episodes[:])
    plt.savefig('./Images/%sx%s maps: Average Rewards of Problem%s.jpg' % (n_dim,n_dim,problem_id))
    if (train_or_not == True):
        print("Figure Saved in Folder 'Images'")
        plt.show()
    return test_succeed_num, test_fail_num, shortest_path, longest_path,np.mean(avg_path), learning_rate, Avg_rewards_per_1000_episodes[-1], Avg_reward_per_step[-1]
def main(problem_id, map_name_base): 
    #simple agent referenced and adapted from the Lab 4 notebook by tutor Prof. Bjorn Jensen for the AI course (2019-20)
    if problem_id < 0 or problem_id > 7:
        print("Problem ID should be between 0 and 7")

    if map_name_base != "8x8-base" and map_name_base != "4x4-base":
        print("Map base can be 8x8-base or 4x4-base")
    
    reward_hole = -1.0     
    is_stochastic = False  

    max_episodes = 10000 

    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic, map_name_base=map_name_base, reward_hole=reward_hole)

    env.action_space.sample() 

    print(env.desc)
    EpisodeStats = namedtuple("Stats",["episode_lengths", "episode_rewards"])
    state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    frozen_lake_map = UndirectedGraph(state_space_actions)
    frozen_lake_map.locations = state_space_locations
    frozen_lake_problem = GraphProblem(state_initial_id, state_goal_id, frozen_lake_map)

    all_node_colors=[]
    iterations, all_node_colors, node = my_astar_search_graph(problem=frozen_lake_problem, h=None)

    solution_path = [node]
    cnode = node.parent
    solution_path.append(cnode)
    while cnode.state != "S_00_00":    
        cnode = cnode.parent
        if cnode is None:
            break
        solution_path.append(cnode)


    steps = solution_path[::-1]
    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    
    observation = env.reset() # reset the state of the env to the starting state     

    # episode_rewards is pre-filled with ones: the A* walk is deterministic, so after
    # the first successful episode the loop below breaks and the remaining entries keep 1.0
    stats = EpisodeStats(episode_lengths=np.zeros(max_episodes),episode_rewards=np.ones(max_episodes))
    for e in range(max_episodes): # iterate over episodes

        observation = env.reset() # reset the state of the env to the starting state     

        for i in range(len(steps)-1):
            action = get_action_from_states(steps[i], steps[i+1])  # follow the A* solution path

            observation, reward, done, info = env.step(action)  # observe what happens when you take the action
            # update stats
            stats.episode_rewards[e] = reward
            stats.episode_lengths[e] = i
    
          # Check if we are done and monitor rewards etc...
        if (done):
        
            print("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal... perhaps try again?]")
            break

    return (stats)
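# get_action_from_states is used above but not shown. A sketch consistent with
# the "S_xx_yy" state names produced by env2statespace (see the "S_00_00" start
# check above) and the usual FrozenLake action encoding (0=LEFT, 1=DOWN,
# 2=RIGHT, 3=UP); the row orientation is an assumption.
def get_action_from_states(node_from, node_to):
    _, x1, y1 = node_from.state.split("_")
    _, x2, y2 = node_to.state.split("_")
    dx, dy = int(x2) - int(x1), int(y2) - int(y1)
    if dx == -1:
        return 0  # LEFT
    if dy == 1:
        return 1  # DOWN
    if dx == 1:
        return 2  # RIGHT
    return 3      # UP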
Example 18
    # return a dict for use
    return random_agent_dict


# a simple run random agent
if __name__ == '__main__':
    agent_dict = random_agent(env, problem_id, max_episodes)
    reward_random_accumulate = 0
    reward_random_total = 0
    for episode in range(max_episodes):
        state = env.reset()
        step = 0
        reward_random = 0
        for step in range(max_iter_per_episode):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)

            reward_random_accumulate += reward

            if (step == max_iter_per_episode - 1):
                print("step over")
            if (done and reward == reward_hole):
                #print("hole :-( ")
                break
            if (done and reward == +1.0):
                reward_random_total = reward + reward_random_total
                print(
                    "We have reached the goal :-) [stop trying to move; we can't]. That's ok, we have achieved the goal"
                )
                #print("Number of steps", step)
                break
Example 19
def main(p_id):
    # Setup the parameters for the specific problem (you can change all of these if you want to)
    problem_id = int(p_id)    # problem_id \in [0:7] generates 8 different problems on which you can train/fine-tune your agent
    reward_hole = 0.0     # should be less than or equal to 0.0 (you can fine-tune this depending on your RL agent choice)
    is_stochastic = False  # should be False for A-star (deterministic search) and True for the RL agent

    max_episodes = 2000
    max_iter_per_episode = 500 # you decide how many iterations/actions can be executed per episode

    # Generate the specific problem
    env = LochLomondEnv(problem_id=problem_id, is_stochastic=is_stochastic, reward_hole=reward_hole)

    # Let's visualize the problem/env
    # print("grid= \n")
    # print(env.desc)
    # env.render
    g = Grid(env.desc)

    # Create a representation of the state space for use with AIMA A-star
    # state_space_locations, state_space_actions, state_initial_id, state_goal_id = env2statespace(env)

    # print(state_goal_id)
    # Reset the random generator to a known state (for reproducibility)
    np.random.seed(12)
    #setup vars for logfile
    f= open("out_AStar_{}.txt".format(problem_id) ,"w+")
    successes = 0
    failures = 0
    ####
    for e in range(max_episodes): # iterate over episodes
        observation = env.reset() # reset the state of the env to the starting state
        steps = aStar(g)
        for iter in range(max_iter_per_episode):
            # env.render() # for debugging/development you may want to visualize the individual steps by uncommenting this line

            action = steps[iter]
            # print(action)
            observation, reward, done, info = env.step(action) # observe what happens when you take the action

            #         # TODO: You'll need to add code here to collect the rewards for plotting/reporting in a suitable manner

            # Check if we are done and monitor rewards etc...
            if(done and reward==reward_hole):
            # env.render()
                # print("Failure")
                failures += 1
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter)+ " " + str(reward)+ " Fail\n")
                # f.write("We have reached a hole :-( [we can't move so stop trying; just give up]\n")
                break

            if (done and reward == +1.0):
                # env.render()
                successes += 1
                # print("Success")
                f.write("e,iter,reward,done = " + str(e) + " " + str(iter)+ " " + str(reward)+ " Success\n")
                # f.write("We have reached the goal :-) [stop trying to move; we can't]. That's ok we have achived the goal]\n")
                break


    f.write("Successes: " + str(successes))
    f.write("\n")
    f.write("Failures: " + str(failures))
    successRate = successes / max_episodes * 100
    dict = {"Success": successes,
            "Failures": failures,
            "Episodes": max_episodes,
            "SuccessRate": successRate}
    # print(dict)
    return dict
def random_agent(problem_id):

    # should be less than or equal to 0.0, select 0 because reaching goal state is hard enough for random agent
    reward_hole = 0.0

    # generate 10 000 episodes in order to give agent chance to reach the goal multiple times
    max_episodes = 10000

    # every episode should have 2000 iterations (agent can take 2000 steps in the map)
    max_iter_per_episode = 2000

    # setup the frozen lake loch lomond environment (uncertainty involved)
    env = LochLomondEnv(problem_id=problem_id,
                        is_stochastic=True,
                        reward_hole=reward_hole)
    results = []

    print('Running Random Agent for problem: ', problem_id)

    for e in range(
            max_episodes):  # iterate over total number of possible episodes

        # re-seed the random generator per episode: each episode is reproducible,
        # while the sampled action sequences still differ across episodes
        np.random.seed(e)

        observation = env.reset(
        )  # reset the state of the environment to starting state S

        for iter in range(max_iter_per_episode):

            # current agent takes random actions
            action = env.action_space.sample()

            # outcome of taking a certain action
            observation, reward, done, info = env.step(action)

            # Test condition to see if agent is done and associated rewards
            if (done and reward == reward_hole):
                break

            if (done and reward == +1.0):
                break

        results.append([e, iter + 1, int(reward)])

    columns = ['episode', 'iterations', 'reward']

    # Save results to a CSV file
    np.savetxt('out_random_{}.csv'.format(problem_id),
               np.array(results),
               header="episode,iterations,reward",
               delimiter=",",
               fmt='%s')

    dataframe = pd.DataFrame(data=np.array(results),
                             index=np.array(results)[0:, 0],
                             columns=columns)
    dataframe['cumulative_rewards'] = list(
        itertools.accumulate(dataframe['reward'], operator.add))
    dataframe['mean_rewards'] = dataframe.apply(lambda x: mean_rewards(x),
                                                axis=1)

    # Plotting the results for all task environments ID 0 to 7
    x = range(1, len(dataframe) + 1)
    y = dataframe['mean_rewards']

    title = 'Mean Reward vs Episodes'
    subtitle = 'Random Agent: Problem ID {}'.format(problem_id)
    labels = ['Episodes', 'Mean Reward']

    dataframe = dataframe[[
        'episode', 'iterations', 'cumulative_rewards', 'mean_rewards'
    ]]

    add_plot(x, y, 'out_random_{}_mean_reward.png'.format(problem_id), title,
             subtitle, labels)

    print('Total episodes run: ', max_episodes)
    print('Allowed iterations per episode: ', max_iter_per_episode)
    print('Max iterations per episode: ', max(dataframe['iterations']))
    print('Mean iterations per episode: ', dataframe['iterations'].mean())
    print('Average success per episode: ',
          max(dataframe['cumulative_rewards']) / max_episodes)
    print('Episodes won: ', max(dataframe['cumulative_rewards']))
    print("\n")

    return dataframe
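# add_plot is shared by several agents above but not defined in these snippets;
# a minimal sketch, assuming it mirrors the matplotlib pattern used in the
# MyAbstractAIAgent.plot method.
import matplotlib.pyplot as plt

def add_plot(x, y, filename, title, subtitle, labels):
    plt.plot(x, y)
    plt.suptitle(title, fontsize=12)
    plt.title(subtitle, fontsize=10)
    plt.xlabel(labels[0])
    plt.ylabel(labels[1])
    plt.savefig(filename)
    plt.close()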