Example #1
def main():
    args = parser.parse_args()

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    env = GridWorld(display=args.render,
                    obstacles=[np.asarray([1, 2])],
                    goal_state=np.asarray([5, 5]),
                    step_wrapper=step_wrapper,
                    reset_wrapper=reset_wrapper,
                    seed=3)
    loss_t = LBT(list_size=100, stop_threshold=1.5, log_interval=100)
    model = ActorCritic(env,
                        gamma=0.99,
                        log_interval=200,
                        max_episodes=5000,
                        max_ep_length=20,
                        termination=loss_t)

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if args.reward_net is not None:
        reward_net = RewardNet(env.reset().shape[0])
        reward_net.to('cuda')
        reward_net.load('./saved-models-rewards/0.pt')
        reward_net.eval()
    else:
        reward_net = None

    if not args.play:
        model.train_mp(n_jobs=4, reward_net=reward_net, irl=args.irl)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'

        model.generate_trajectory(args.num_trajs, './trajs/ac_gridworld/')
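main() reads several attributes from a module-level parser that the snippet does not show. A minimal argparse setup that would supply those attributes is sketched below; the option names and defaults are assumptions, not taken from the original script.

import argparse

# Hypothetical parser; the original repository may use different option names.
parser = argparse.ArgumentParser()
parser.add_argument('--render', action='store_true')
parser.add_argument('--policy_path', default=None)
parser.add_argument('--reward_net', default=None)
parser.add_argument('--play', action='store_true')
parser.add_argument('--dont_save', action='store_true')
parser.add_argument('--irl', action='store_true')
parser.add_argument('--num_trajs', type=int, default=10)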
Example #2
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--model', default='convdeconv1')
parser.add_argument('--target', type=int, default=1000)
parser.add_argument('--path', required=True)
boolean_flag(parser, 'dueling', default=True)
boolean_flag(parser, 'norm', default=True)
boolean_flag(parser, 'double', default=True)
boolean_flag(parser, 'render', default=False)
args = parser.parse_args()

n_steps = int(1e8)

train_level = 'level1'
test_levels = ['level1', 'level2', 'level3']

env = GridWorld(train_level)
coords_shape = env.unwrapped.coords_shape
set_global_seeds(args.seed)
env.seed(args.seed)

print('~~~~~~~~~~~~~~~~~~~~~~')
print(env.spec.id)
print('observations:', env.observation_space.shape)
print('coords:     ', coords_shape)
print('actions:    ', env.action_space.n)
print('walls:      ', env.unwrapped.walls.shape)
print('~~~~~~~~~~~~~~~~~~~~~~')

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
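boolean_flag here is presumably the helper from OpenAI Baselines' common.misc_util, which registers paired --name / --no-name switches on an argparse parser (note that args.seed is also read above, so the full script presumably registers a --seed option as well). A minimal stand-in with the same behaviour, sketched as an assumption rather than the original implementation:

def boolean_flag(parser, name, default=False, help=None):
    # Register --name and --no-name switches writing to the same destination.
    dest = name.replace('-', '_')
    parser.add_argument('--' + name, action='store_true', default=default,
                        dest=dest, help=help)
    parser.add_argument('--no-' + name, action='store_false', dest=dest)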
Example #3
def main():

    args = parser.parse_args()

    experiment_logger = Logger('temp_save.txt')

    experiment_logger.log_header('Arguments for the experiment :')
    experiment_logger.log_info(vars(args))

    mp.set_start_method('spawn')

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    agent_width = 10
    step_size = 10
    obs_width = 10
    grid_size = 10

    if args.feat_extractor == 'Onehot':
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    if args.feat_extractor == 'SocialNav':
        feat_ext = SocialNav(fieldList=['agent_state', 'goal_state'])
    if args.feat_extractor == 'FrontBackSideSimple':
        feat_ext = FrontBackSideSimple(
            thresh1=1,
            thresh2=2,
            thresh3=3,
            thresh4=4,
            step_size=step_size,
            agent_width=agent_width,
            obs_width=obs_width,
        )

    if args.feat_extractor == 'LocalGlobal':
        feat_ext = LocalGlobal(
            window_size=3,
            grid_size=grid_size,
            agent_width=agent_width,
            obs_width=obs_width,
            step_size=step_size,
        )
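    # Note (not in the original): if args.feat_extractor matches none of the
    # four names above, feat_ext is never bound and the logging call below
    # raises a NameError. A minimal guard, assuming a hard failure is the
    # desired behaviour:
    if args.feat_extractor not in ('Onehot', 'SocialNav',
                                   'FrontBackSideSimple', 'LocalGlobal'):
        raise ValueError('Unknown feature extractor: ' + str(args.feat_extractor))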

    experiment_logger.log_header('Parameters of the feature extractor :')
    experiment_logger.log_info(feat_ext.__dict__)
    # np.asarray([2, 2]), np.asarray([7, 4]), np.asarray([3, 5]),
    # np.asarray([5, 2]), np.asarray([8, 3]), np.asarray([7, 5]),
    # np.asarray([3, 3]), np.asarray([3, 7]), np.asarray([5, 7])
    env = GridWorld(display=args.render,
                    is_onehot=False,
                    is_random=True,
                    rows=100,
                    agent_width=agent_width,
                    step_size=step_size,
                    obs_width=obs_width,
                    width=grid_size,
                    cols=100,
                    seed=7,
                    buffer_from_obs=0,
                    obstacles=3,
                    goal_state=np.asarray([5, 5]))

    experiment_logger.log_header('Environment details :')
    experiment_logger.log_info(env.__dict__)

    model = ActorCritic(env,
                        feat_extractor=feat_ext,
                        gamma=0.99,
                        log_interval=100,
                        max_ep_length=40,
                        hidden_dims=args.policy_net_hidden_dims,
                        max_episodes=4000)

    experiment_logger.log_header('Details of the RL method :')
    experiment_logger.log_info(model.__dict__)

    pdb.set_trace()

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if not args.play and not args.play_user:
        if args.reward_path is None:
            model.train_mp(n_jobs=4)
        else:
            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train_mp(reward_net=reward_net, n_jobs=4)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'

        model.generate_trajectory(args.num_trajs,
                                  './trajs/ac_fbs_simple4_static_map7/')

    if args.play_user:
        env.tickSpeed = 200

        model.generate_trajectory_user(args.num_trajs,
                                       './trajs/ac_gridworld_user/')
Example #4
def experiment(args, agent_algorithm):
    np.random.seed()

    scores = list()
    #add timestamp to results
    ts = str(time.time())
    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            gamma_eval = args.gamma
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (1, )
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   net_type=args.net_type,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                clip_reward=False,
                                target_update_frequency=1)
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test)

        else:
            raise ValueError("Algorithm unknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators
        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma,
                            rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
            print(mdp.info.gamma)
            gamma_eval = args.gamma
        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")
        # Summary folder
        folder_name = os.path.join(
            './logs', args.alg, policy_name, update_rule, args.name,
            args.loss, str(args.n_approximators) + '_particles',
            args.init_type + '_init', str(args.learning_rate), ts)

        # Approximator
        input_shape = mdp.info.observation_space.shape
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n, ),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   net_type=args.net_type,
                                   sigma_weight=args.sigma_weight,
                                   optimizer={
                                       'name': args.optimizer,
                                       'lr': args.learning_rate,
                                       'lr_sigma': args.learning_rate,
                                       'decay': args.decay,
                                       'epsilon': args.epsilon
                                   })
        if args.load_path:
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores_" + str(ts) + ".npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)
        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=False,
            target_update_frequency=target_update_frequency // train_frequency,
        )
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type

        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator,
                                pi,
                                mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        if args.ucb:
            q = agent.approximator
            if args.alg == 'particle':

                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [
                    i * 1. / (args.n_approximators - 1)
                    for i in range(args.n_approximators)
                ]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break
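                # delta_index is the smallest quantile level >= 1 - delta; the
                # corresponding order statistic of the particle set is used in
                # quantile_func below as an optimistic (upper-confidence)
                # Q-value estimate for UCBPolicy.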

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()

                    qs = np.sort(np.array(q_list), axis=0)
                    return qs[delta_index, :]

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
                standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1)

                def mu(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    return means

                def quantile_func(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    sigmas = q_and_sigma[1]
                    return sigmas * standard_bound + means

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)
        args.count = 100
        if args.plot_qs:
            import matplotlib.pyplot as plt
            colors = ['red', 'blue', 'green']
            labels = ['left', 'nop', 'right']

            def plot_probs(qs):
                args.count += 1
                if args.count < 1:
                    return
                ax.clear()
                for i in range(qs.shape[-1]):
                    mu = np.mean(qs[..., i], axis=0)
                    sigma = np.std(qs[..., i], axis=0)
                    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20)
                    ax.plot(x,
                            stats.norm.pdf(x, mu, sigma),
                            label=labels[i],
                            color=colors[i])

                ax.set_xlabel('Q-value')
                ax.set_ylabel('Probability')
                ax.set_title('Q-distributions')
                #ax.set_ylim(bottom=0, top=1)

                plt.draw()
                plt.pause(0.02)
                #print("Plotted")
                args.count = 0
                #return probs

            plt.ion()
            fig, ax = plt.subplots()

            plot_probs(
                np.array(agent.approximator.predict(np.array(mdp.reset()))))

            input()
            args.count = 100
            qs = np.array([
                np.linspace(-1000, 0, 10),
                np.linspace(-2000, -1000, 10),
                np.linspace(-750, -250, 10)
            ])
            plot_probs(qs.T)
        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset

        print_epoch(0)
        core.learn(
            n_steps=initial_replay_size,
            n_steps_per_fit=initial_replay_size,
            quiet=args.quiet,
        )

        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.plot_qs:
            pi.set_plotter(plot_probs)
        np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)
        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)

            pi.set_epsilon(epsilon)
            # learning step
            if args.plot_qs:
                pi.set_plotter(None)
            core.learn(
                n_steps=evaluation_frequency,
                n_steps_per_fit=train_frequency,
                quiet=args.quiet,
            )

            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            if args.plot_qs:
                pi.set_plotter(plot_probs)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

    return scores
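The experiment above also relies on two helpers defined elsewhere in the script, print_epoch and get_stats. A plausible minimal version, assuming the Mushroom compute_J utility for computing episode returns (the original implementation may differ):

import numpy as np
from mushroom.utils.dataset import compute_J  # mushroom_rl.utils.dataset in newer versions

def print_epoch(epoch):
    print('#################################################################')
    print('Epoch:', epoch)
    print('-----------------------------------------------------------------')

def get_stats(dataset, gamma=1.):
    # Summary statistics of the episode returns in the evaluation dataset.
    J = compute_J(dataset, gamma)
    print('Min: %f  Max: %f  Mean: %f' % (np.min(J), np.max(J), np.mean(J)))
    return np.min(J), np.max(J), np.mean(J)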
Example #5
                argmax_action = action
    return argmax_action


def policy_iteration(policy, env, discount=1.0):
    states = env.get_state_space()

    while True:
        policy_stable = True
        states_values = policy_evaluation(policy, env)
        states_values = states_values.flatten()
        for state in states:
            tmp = dict(policy[state])  # copy, so the comparison below can detect in-place changes
            argmax_action = update_rule(policy, env, state, states_values,
                                        discount)
            for action in policy[state]:
                if action == argmax_action:
                    policy[state][action] = 1.0  # Max prob
                else:
                    policy[state][action] = 0
            if tmp != policy[state]:
                policy_stable = False
        if policy_stable:
            return policy


if __name__ == '__main__':
    env = GridWorld()
    policy = RandomPolicy(env)
    pprint(policy_iteration(policy, env).__dict__)
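Example #5 is cut off at the top: only the last two lines of its update_rule helper survive. Below is a minimal sketch of a greedy one-step-lookahead version, reconstructed from the call site in policy_iteration and assuming a hypothetical env.get_transitions(state, action) that yields (probability, next_state, reward) triples; the actual GridWorld API may differ.

def update_rule(policy, env, state, states_values, discount=1.0):
    # Pick the action with the highest one-step lookahead value.
    argmax_action, best_value = None, float('-inf')
    for action in policy[state]:
        value = 0.0
        for prob, next_state, reward in env.get_transitions(state, action):
            value += prob * (reward + discount * states_values[next_state])
        if value > best_value:
            best_value = value
            argmax_action = action
    return argmax_action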
Example #6
def generate_agent_grid_visitation_map(policy_fname_list,
                                       feature_extractor=None,
                                       store=False):

    # Given a list of policy file names and a feature extractor, creates a
    # heatmap of the agent's grid visitation based on the trajectories
    # generated by each policy. If store=True, the figure is pickled to disk.

    #list containing the points of trajectories of all the policies
    trajectory_point_master_list = []
    traj_to_plot = 2

    env = GridWorld(display=False,
                    is_onehot=False,
                    is_random=False,
                    rows=10,
                    cols=10,
                    seed=3,
                    obstacles=[np.asarray([5, 5])],
                    goal_state=np.asarray([1, 5]))

    max_ep_length = 15
    run_iterations = 50

    rl_method = ActorCritic(env,
                            feat_extractor=feature_extractor,
                            gamma=0.99,
                            max_ep_length=max_ep_length,
                            log_interval=50)

    labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

    counter = 0
    for name in policy_fname_list:
        counter += 1
        if counter == traj_to_plot:
            policy_name_to_plot = name
        #ready the policy
        rl_method.policy.load(name)
        trajectory_point_policy = []

        env = GridWorld(display=False,
                        is_onehot=False,
                        is_random=False,
                        rows=10,
                        cols=10,
                        seed=7,
                        obstacles=[np.asarray([5, 5])],
                        goal_state=np.asarray([1, 5]))

        heat_map = np.zeros((env.rows, env.cols))

        for i in range(run_iterations):
            trajectory_point_run = []
            state = env.reset()
            heat_map[state['agent_state'][0], state['agent_state'][1]] += 1
            trajectory_point_run.append(
                (state['agent_state'][0] * env.cellWidth,
                 state['agent_state'][1] * env.cellWidth))
            state = feature_extractor.extract_features(state)
            for t in range(max_ep_length):

                action = rl_method.select_action(state)
                state, reward, done, _ = env.step(action)
                heat_map[state['agent_state'][0], state['agent_state'][1]] += 1
                trajectory_point_run.append(
                    (state['agent_state'][0] * env.cellWidth,
                     state['agent_state'][1] * env.cellWidth))
                state = feature_extractor.extract_features(state)

            trajectory_point_policy.append(trajectory_point_run)

        trajectory_point_master_list.append(trajectory_point_policy)
        fig, ax = plt.subplots()

        im = ax.imshow(heat_map, vmin=0, vmax=40)
        ax.set_xticks(np.arange(10))
        ax.set_yticks(np.arange(10))

        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        ax.set_xlabel('Columns of the gridworld', fontsize='large')
        ax.set_ylabel('Rows of the gridworld', fontsize='large')

        for i in range(len(labels)):
            for j in range(len(labels)):

                text = ax.text(j,
                               i,
                               heat_map[i, j],
                               ha="center",
                               va="bottom",
                               color="black")
                #arrow = ax.arrow(j,i,.1,.1,shape='full',head_width= .2)
                #arrow = ax.annotate("",xy = (j,i) , arrowprops = arrow)
                pass
        ax.set_title("Grid location visitation frequency for an unbiased agent")

        #plt.colorbar()
        #plt.clim(0,70)
        plt.draw()
        if store:
            pickle_filename = 'FigureObject' + str(counter) + '.fig.pickle'
            pickle.dump(fig, open(pickle_filename, 'wb'))
        plt.pause(.001)

    #annotate_trajectory(policy_name_to_plot, env, rl_method,
    #                    max_ep_length, ax, feature_extractor=feature_extractor)

    plt.show()
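A hypothetical invocation of the function above; the checkpoint paths are placeholders and the feature extractor parameters are copied from Example #3, not from the original call site.

feat = LocalGlobal(window_size=3, grid_size=10, agent_width=10,
                   obs_width=10, step_size=10)
generate_agent_grid_visitation_map(
    ['./saved-models/policy_a.pt', './saved-models/policy_b.pt'],
    feature_extractor=feat,
    store=True)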
Example #7
def plot_reward_across_policy_models(foldername,
                                     expert=None,
                                     feature_extractor=None,
                                     seed_list=[],
                                     iterations_per_model=50,
                                     compare_expert=True):

    # Given a folder of policy networks, goes through them one by one,
    # plots the reward obtained by each policy network, and compares it
    # to that of an expert (if provided).
    color_list = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    counter = 0

    reward_across_seeds = []
    xaxis = None
    for seed in seed_list:

        env = GridWorld(display=False,
                        is_onehot=False,
                        is_random=True,
                        rows=10,
                        cols=10,
                        seed=seed,
                        obstacles=[
                            np.asarray([5, 1]),
                            np.array([5, 9]),
                            np.asarray([4, 1]),
                            np.array([6, 9]),
                            np.asarray([3, 1]),
                            np.array([7, 9])
                        ],
                        goal_state=np.asarray([1, 5]))

        max_ep_length = 20

        rl_method = ActorCritic(env,
                                feat_extractor=feature_extractor,
                                gamma=0.99,
                                max_ep_length=max_ep_length,
                                log_interval=50)

        model_names = glob.glob(os.path.join(foldername, '*.pt'))

        xaxis = np.arange(len(model_names))

        reward_exp = get_rewards_for_model(expert,
                                           env=env,
                                           feature_extractor=feature_extractor,
                                           rl_method=rl_method,
                                           max_ep_length=max_ep_length,
                                           iterations=iterations_per_model)

        reward_across_models = []
        reward_expert = []
        for policy_file in sorted(model_names, key=numericalSort):

            print('Evaluating policy file:', policy_file)

            reward_per_model = get_rewards_for_model(
                policy_file,
                env=env,
                feature_extractor=feature_extractor,
                rl_method=rl_method,
                max_ep_length=max_ep_length,
                iterations=iterations_per_model)

            print('Average reward for the model:', reward_per_model)
            reward_across_models.append(reward_per_model)
            reward_expert.append(reward_exp)

        reward_across_seeds.append(reward_across_models)

    np_reward_across_seeds = np.array(reward_across_seeds)

    print(np_reward_across_seeds.shape)
    means_rewards = np.mean(np_reward_across_seeds, axis=0)

    print("the mean rewards :", means_rewards)

    print("The mean across all runs and seeds : ", np.mean(means_rewards))

    std_rewards = np.std(np_reward_across_seeds, axis=0)

    print('the std :', std_rewards)
    plt.xlabel('IRL iteration no.')
    plt.ylabel('Reward obtained')
    plt.plot(xaxis,
             means_rewards,
             color=color_list[counter],
             label='IRL trained agent')
    plt.fill_between(xaxis,
                     means_rewards - std_rewards,
                     means_rewards + std_rewards,
                     alpha=0.5,
                     facecolor=color_list[counter])
    plt.plot(reward_expert, color='k', label='Expert agent')
    plt.legend()
    plt.draw()
    plt.pause(0.001)
    plt.show()
    return reward_across_models
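sorted(model_names, key=numericalSort) relies on a helper that this snippet does not define. A common implementation sorts file names by the integer chunks they contain; the sketch below is an assumption, and the original may differ.

import re

numbers = re.compile(r'(\d+)')

def numericalSort(value):
    # Split the path into text and digit runs, converting the digit runs to
    # ints so that 'policy_10.pt' sorts after 'policy_2.pt'.
    parts = numbers.split(value)
    parts[1::2] = map(int, parts[1::2])
    return parts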