Example #1
def train_and_test(experiments):
    df = pd.DataFrame(columns=[
        'model name', 'episode number', 'train mean score', 'train mean steps',
        'test accuracy', 'test mean score', 'test mean steps'
    ])
    for model_name, steps, train_agent in experiments:
        # Train
        train_res = experiment(steps, agent_config=train_agent)
        train_res["agent"].save_model(model_name)
        training_mean_steps = train_res["steps"].mean()
        training_mean_score = train_res["scores"].mean()

        np.savetxt("results/training/ddqn.csv",
                   train_res["steps"],
                   delimiter=',')

        # Test
        test_agent = DQNAgent(output_dim,
                              None,
                              use_ddqn=True,
                              default_policy=True,
                              model_filename=model_name,
                              epsilon=0.01,
                              epsilon_lower_bound=0.01,
                              learn_thresh=0)
        test_res = experiment(500,
                              default_policy=True,
                              policy=model_name,
                              agent_config=test_agent)
        testing_accuracy = accuracy(test_res["results"])
        testing_mean_steps = test_res["steps"].mean()
        testing_mean_score = test_res["scores"].mean()

        np.savetxt("results/testing/ddqn.csv",
                   test_res["steps"],
                   delimiter=',')

        df.loc[len(df)] = [
            model_name,
            len(train_res["steps"]), training_mean_score, training_mean_steps,
            testing_accuracy, testing_mean_score, testing_mean_steps
        ]

    df.to_csv('experiments.csv')
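
The accuracy helper used here (and in the later examples) is defined elsewhere in the project. A minimal sketch, assuming it simply turns the [losses, victories] accumulator returned by experiment into a win percentage:

import numpy as np

def accuracy(results):
    # Hypothetical sketch: percentage of victories out of all test episodes.
    results = np.asarray(results, dtype=float)
    return results[1] / results.sum() * 100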
Example #2
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """

    with tf.device('/gpu:0'):
        res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
        scores = []  # Cumulative rewards
        steps = []  # Steps per episode

        reward_list = RingBuffer(100)
        env = gym.make('PongDeterministic-v4')

        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
            layers = [
                Conv2D(32, (8, 8),
                       strides=(4, 4),
                       activation='relu',
                       input_shape=(84, 84, 4),
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (4, 4),
                       strides=(2, 2),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(output_dim)
            ]
            agent = DQNAgent(output_dim,
                             layers,
                             use_ddqn=True,
                             memory_size=700000,
                             gamma=0.99,
                             learn_thresh=50000,
                             epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e -
                             (0.98 / 950000),
                             update_rate=10000,
                             optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0

            has_lost_life = True

            t = 0
            while True:
                if has_lost_life:
                    next_action = 1  # FIRE action (alternative: [1, 4, 5][ran.randint(0, 2)])

                    stack = np.stack(
                        (empty_state, empty_state, empty_state, empty_state),
                        axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))

                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame),
                                               (1, 84, 84, 1))
                        new_stack = np.append(new_state,
                                              stack[:, :, :, :3],
                                              axis=3)
                        stack = new_stack

                        if (render):
                            env.render()

                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)

                if (render):
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)

                # A non-zero reward means a point was just scored, so the
                # serve (and the frame stack) is reset on the next iteration.
                if reward != 0:
                    has_lost_life = True

                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state),
                                       (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                agent.memoise(
                    (stack, next_action, reward, new_state, has_lost_life))

                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You Won!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You Lost!, steps:", t, "reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)
            if episode_number >= 50 and episode_number % 10 == 0:
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()
        return {
            "results": np.array(res),
            "steps": np.array(steps),
            "scores": np.array(scores),
            "agent": agent
        }
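
pre_processing is defined elsewhere in the project. A minimal sketch, assuming it reduces each raw RGB Atari frame to the 84x84 single-channel uint8 image expected by the convolutional stack above (the original cropping and scaling may differ):

import cv2
import numpy as np

def pre_processing(frame):
    # Hypothetical sketch: grayscale + resize to 84x84, kept as uint8.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    return cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA).astype(np.uint8)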
Example #3
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # steps per episode

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(10, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    if default_policy:
        agent = DQNAgent(output_dim,
                         None,
                         use_ddqn=True,
                         default_policy=True,
                         model_filename=policy,
                         epsilon=0,
                         epsilon_lower_bound=0,
                         learn_thresh=0)
    else:
        # Ten DDQN agents with identical hyper-parameters, built on the shared
        # layer pair defined above; each one is trained on a differently shaped
        # reward (see the memoise calls in the episode loop).
        agents = [
            DQNAgent(output_dim, [layer1, layer2],
                     use_ddqn=True,
                     learn_thresh=2000,
                     update_rate=100,
                     epsilon_decay_function=lambda e: e - 0.0001,
                     epsilon_lower_bound=0.1,
                     optimizer=keras.optimizers.RMSprop(0.001),
                     memory_size=2000,
                     tb_dir=None) for _ in range(10)
        ]

        agentE = EnsemblerAgent(output_dim, agents, EnsemblerType.TRUST_BASED)

    for i_ep in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        state = np.reshape(state, [1, 4])

        t = 0
        while True:
            if (render):
                env.render()
                time.sleep(0.1)

            next_action = agentE.act(state)

            new_state, reward, end, _ = env.step(next_action)

            x, x_dot, theta, theta_dot = new_state
            new_state = np.reshape(new_state, [1, 4])

            # Reward shaping
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r3 = -abs(theta_dot)

            # Alternate the shaped rewards across the ensemble: agents at even
            # indices (originally agent1, agent3, ...) learn from r2, the
            # others from r3.
            for i, agent in enumerate(agents):
                agent.memoise((state, next_action, r2 if i % 2 == 0 else r3,
                               new_state, end))

            if end or t > 199:
                if t < 195:
                    res[0] += 1
                else:
                    res[1] += 1
                    print("ENTRATO!,", t, "steps", "reward: ",
                          cumulative_reward)

                steps.append(t)
                if i_ep % 100 == 0:
                    if evaluate(env, agentE):
                        cumulative_reward += reward
                        scores.append(cumulative_reward)
                        env.close()
                        return {
                            "results": np.array(res),
                            "steps": np.array(steps),
                            "scores": np.array(scores)
                        }

                break
            else:
                state = new_state
                cumulative_reward += reward

            for agent in agentE.agents:
                agent.learn()
            t += 1

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores)
    }
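
The evaluate(env, agentE) call used for early stopping above is defined elsewhere in the project. A minimal sketch, assuming it replays a batch of episodes with the ensemble and reports whether the average episode length reaches the 195-step CartPole-v0 solving threshold (the real criterion may differ):

import numpy as np

def evaluate(env, agent, n_episodes=100, solved_steps=195):
    # Hypothetical sketch: run evaluation episodes and check the solve threshold.
    episode_steps = []
    for _ in range(n_episodes):
        state = np.reshape(env.reset(), [1, 4])
        for t in range(200):
            action = agent.act(state)
            new_state, _, done, _ = env.step(action)
            state = np.reshape(new_state, [1, 4])
            if done:
                break
        episode_steps.append(t)
    return np.mean(episode_steps) >= solved_steps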
Example #4
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(15, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    # Ten DDQN agents with identical hyper-parameters, built on the shared
    # layer pair defined above; each one is trained on a differently shaped
    # reward (see the memoise calls in the episode loop).
    agents = [
        DQNAgent(output_dim, [layer1, layer2],
                 use_ddqn=True,
                 learn_thresh=1000,
                 update_rate=300,
                 epsilon_decay_function=lambda e: e - 0.001,
                 epsilon_lower_bound=0.01,
                 optimizer=keras.optimizers.RMSprop(0.001),
                 tb_dir=None) for _ in range(10)
    ]
    agentE = EnsemblerAgent(env.action_space.n, agents,
                            EnsemblerType.MAJOR_VOTING_BASED)

    evaluate = False

    for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        state = np.reshape(state, [1, 2])

        if i_episode > 0 and i_episode % 120 == 0:
            evaluate = True

        if not evaluate:
            for t in range(env._max_episode_steps):
                if (render):
                    env.render()

                next_action = agentE.act(state)
                new_state, reward, end, _ = env.step(next_action)
                original_state = new_state

                # r1 = reward + 0.1 * original_state[0]
                # r2 = reward + 0.2 * np.sin(3 * original_state[0])
                # r3 = reward + 0.7 * (original_state[1] * original_state[1])

                # Reward shaping
                r1 = reward + original_state[0]
                r2 = reward + np.sin(3 * original_state[0])
                r3 = reward + (original_state[1] * original_state[1])
                r4 = abs(new_state[0] - (-0.5))  # r in [0, 1]

                new_state = np.reshape(new_state, [1, 2])

                # Alternate the shaped rewards across the ensemble: agents at
                # even indices (originally agent1, agent3, ...) learn from r1,
                # the others from r2.
                for i, agent in enumerate(agents):
                    agent.memoise((state, next_action, r1 if i % 2 == 0 else r2,
                                   new_state, end))

                if end:
                    if t == env._max_episode_steps - 1:
                        res[0] += 1
                    else:
                        res[1] += 1
                        print("ENTRATO!,", t, "steps", "reward: ",
                              cumulative_reward)

                    steps.append(t)
                    break
                else:
                    state = new_state
                    cumulative_reward += reward

                for agent in agentE.agents:
                    agent.learn()

            cumulative_reward += reward
            scores.append(cumulative_reward)
        else:
            # Model validation for early stopping
            evaluate = False
            eval_res = [0, 0]  # results accumulator: [0] losses, [1] victories
            eval_scores = []  # Cumulative rewards
            eval_steps = []  # Steps per episode

            for _ in range(100):
                state = env.reset()

                state = np.reshape(state, [1, 2])
                cumulative_reward = 0

                for t in range(env._max_episode_steps):
                    if (render):
                        env.render()

                    next_action = agentE.act(state)
                    new_state, reward, end, _ = env.step(next_action)

                    new_state = np.reshape(new_state, [1, 2])

                    if end:
                        if t == env._max_episode_steps - 1:
                            eval_res[0] += 1
                        else:
                            eval_res[1] += 1

                        eval_steps.append(t)
                        break
                    else:
                        state = new_state
                        cumulative_reward += reward

                cumulative_reward += reward
                eval_scores.append(cumulative_reward)

            testing_accuracy = accuracy(np.array(eval_res))
            testing_mean_steps = np.array(eval_steps).mean()
            testing_mean_score = np.array(eval_scores).mean()
            print("\nTraining episodes:", len(steps), "Training mean score:",
                  np.array(steps).mean(), "Training mean steps",
                  np.array(scores).mean(), "\nAccuracy:", testing_accuracy,
                  "Test mean score:", testing_mean_score, "Test mean steps:",
                  testing_mean_steps)

    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores)
    }
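
A quick worked check of the shaping terms above, using an illustrative state (position x = -0.3, velocity v = 0.04, environment reward -1 per step); the numbers are for intuition only:

import numpy as np

reward, x, v = -1.0, -0.3, 0.04     # illustrative values only
r1 = reward + x                     # -1.3    (larger as the car moves right)
r2 = reward + np.sin(3 * x)         # ~-1.783 (track height at position x)
r3 = reward + v * v                 # -0.9984 (rewards building up speed)
r4 = abs(x - (-0.5))                # 0.2     (distance from the valley floor)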
Example #5
def experiment(n_episodes,
               default_policy=False,
               policy=None,
               render=False,
               agent_config=None):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering
        agent_config: DQNAgent object

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if agent_config is None:
        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0,
                             epsilon_lower_bound=0,
                             learn_thresh=0)
        else:
            layer1 = Dense(15, input_dim=input_dim, activation='relu')
            layer2 = Dense(output_dim)
            agent = DQNAgent(output_dim, [layer1, layer2],
                             use_ddqn=True,
                             learn_thresh=1000,
                             update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.95,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.RMSprop(0.001))
    else:
        agent = agent_config

    for i_episode in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        # Model validation for early stopping
        if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
            agent.save_model("tmp_model")
            evaluation_result = experiment(500,
                                           default_policy=True,
                                           policy="tmp_model")
            acc = accuracy(evaluation_result["results"])
            if acc == 100:
                break
            else:
                print("Accuracy:", acc, "Episode:", i_episode)

        state = np.reshape(state, [1, 2])

        for t in range(env._max_episode_steps):
            if (render):
                env.render()

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)

            reward = abs(new_state[0] - (-0.5))  # r in [0, 1] (reward shaping)
            new_state = np.reshape(new_state, [1, 2])

            agent.memoise((state, next_action, reward, new_state, end))

            if end:
                if t == env._max_episode_steps - 1:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("ENTRATO!,", t, "steps")

                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()

        cumulative_reward += reward
        scores.append(cumulative_reward)
    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores),
        "agent": agent
    }
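
A minimal usage sketch for this version of experiment; the model name and episode counts below are illustrative:

train_res = experiment(1000)                         # train a fresh DDQN agent
train_res["agent"].save_model("mountaincar_ddqn")    # hypothetical filename
test_res = experiment(500, default_policy=True, policy="mountaincar_ddqn")
print("Test accuracy:", accuracy(test_res["results"]))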
Example #6
#experiment(10, render=True, default_policy=True, policy="model1")

input_dim = 2
output_dim = 3

experiments = []

layer1 = Dense(15, input_dim=input_dim, activation='relu')
layer2 = Dense(output_dim)
layers = [layer1, layer2]
experiments.append(("model23", 25000,
                    DQNAgent(output_dim,
                             layers,
                             use_ddqn=True,
                             learn_thresh=1000,
                             update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.995,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.Adam(0.001),
                             tb_dir=None)))


def train_and_test(experiments):
    df = pd.DataFrame(columns=[
        'model name', 'episode number', 'train mean score', 'train mean steps',
        'test accuracy', 'test mean score', 'test mean steps'
    ])
    for model_name, steps, train_agent in experiments:
        # Train
        train_res = experiment(steps, agent_config=train_agent)
        train_res["agent"].save_model(model_name)