def testMaze(n_train, n_nav):
    """
    Train a critic on a single open-field maze while a RandomAgent acts.

    INPUTS:
    -------
    n_train: Number of training trials handed to
        ValueLearning.learnValueFunction.
    n_nav: Accepted for interface compatibility; not used in this function.

    Nothing is returned; the result of learnValueFunction is discarded.
    """
    ValueLearning.DBG_LVL = 1
    step_size = 0.99

    # Grid dimensions and the number of place fields derived from them
    grid_w, grid_h = 6, 6
    field_count = round(1.0 * (grid_w + 3) * (grid_h + 3))

    # One cell per place field
    Hippocampus.N_CELLS_PER_FIELD = 1
    cell_count = field_count * Hippocampus.N_CELLS_PER_FIELD

    # Build the arena, then the place fields and the cells that sit on them
    arena = Environment.RandomGoalOpenField(grid_w, grid_h, step_size)
    fields = Hippocampus.setupPlaceFields(arena, field_count)
    cells = Hippocampus.assignPlaceCells(cell_count, fields)

    # The actor is a RandomAgent; the critic is a fresh Agents.Critic
    random_actor = Agents.RandomAgent(arena.getActions(), cell_count)
    value_critic = Agents.Critic(cell_count)

    ValueLearning.learnValueFunction(
        n_train, arena, cells, random_actor, value_critic, max_steps=1000)
# Example #2
# 0
def learnValueFunction(n_trials,
                       environment,
                       place_cells,
                       actor=None,
                       critic=None,
                       max_steps=np.inf):
    """
    Main function responsible for learning value function for a given environment

    INPUTS:
    -------
    n_trials: (INTEGER) Number of trials allowed on the task
    environment: (Maze) Physical space in which the task has to be learnt
    place_cells: (PlaceCell) Entity that encodes a particular location as a population

    <OPTIONAL INPUTS>
    actor: Pre-trained actor. When None, a new Agents.Actor is created.
    critic: Pre-trained critic. When None, a new Agents.Critic is created.
    max_steps: Step budget for a single trial. A trial is abandoned once its
        entry in n_steps exceeds max_steps * environment.MOVE_DISTANCE.
        Defaults to infinity, i.e. no budget.

    OUTPUTS:
    --------
    actor: (Actor Class) Entity that learns actions for a given state
    critic: (Critic Class) Entity that evaluates the value for a
        particular state. These values are used for taking actions.
    n_steps: (numpy array, one entry per trial) environment.MOVE_DISTANCE
        summed over the steps taken in the trial, minus the value returned
        by environment.getOptimalDistanceToGoal() for the trial's start.
    """

    # Set up the actor and critic based on the place fields
    if critic is None:
        critic = Agents.Critic(len(place_cells))
    else:
        assert (critic.getNFields() == len(place_cells))

    if actor is None:
        actor = Agents.Actor(environment.getActions(), len(place_cells))
        # actor = Agents.RandomAgent(environment.getActions(), len(place_cells))
        # actor = Agents.IdealActor(environment, critic, place_cells)
    else:
        assert (actor.getNFields() == len(place_cells))

    n_steps = np.zeros(n_trials, dtype=float)
    for trial in range(n_trials):
        # Path is visualized using a graphics object
        canvas = Graphics.WallMazeCanvas(environment)
        if DBG_LVL > 2:
            # Visualize place fields for a few cells and then the aggregate
            # activity
            n_cells_to_visualize = 4
            for _ in range(n_cells_to_visualize):
                # The upper bound is len - 1 so that the index stays valid
                # when randint includes its upper endpoint (as the stdlib
                # random.randint does).
                sample_cell = random.randint(0, len(place_cells) - 1)
                canvas.visualizePlaceField(place_cells[sample_cell])
            canvas.visualizeAggregatePlaceFields(place_cells)

        # Initialize a new location and adjust for the optimal number of steps
        # needed to get to the goal.
        environment.redrawInitLocation()
        optimal_steps_to_goal = environment.getOptimalDistanceToGoal()
        n_steps[trial] = -optimal_steps_to_goal

        initial_state = environment.getCurrentState()
        canvas.update(initial_state)
        terminate_trial = False
        while not terminate_trial:
            # NOTE: the goal test happens before the step, so the step below
            # is still executed (and counted) on the iteration in which the
            # goal is detected; the loop exits afterwards.
            terminate_trial = environment.reachedGoalState()
            if (n_steps[trial] > max_steps * environment.MOVE_DISTANCE):
                break

            n_steps[trial] += environment.MOVE_DISTANCE
            current_state = environment.getCurrentState()
            if DBG_LVL > 1:
                print('On state: (%.2f, %.2f)' %
                      (current_state[0], current_state[1]))

            # Get the place field activity based on the current location
            pf_activity = [pf.getActivity(current_state) for pf in place_cells]

            # Get an action based on the place field activity
            next_action = actor.getAction(pf_activity)
            if DBG_LVL > 1:
                print('Selected Action: %s' % next_action)

            # Apply this action onto the environment
            reward = environment.move(next_action)

            # Use the obtained reward to update the value
            new_environment_state = environment.getCurrentState()
            canvas.update(new_environment_state)

            new_pf_activity = [
                pf.getActivity(new_environment_state) for pf in place_cells
            ]
            prediction_error = critic.updateValue(pf_activity, new_pf_activity,
                                                  reward)
            actor.updateWeights(pf_activity, prediction_error)

        if (DBG_LVL > 0):
            print('Ended trial %d moving %.1f.' % (trial, n_steps[trial]))
            # At debug level 1, only the first and the last trajectories, and
            # corresponding value functions are shown. At higher debug levels,
            # the entire trajectory is shown for every iteration
            if (DBG_LVL > 1) or (trial == 0) or (trial == n_trials - 1):
                # Plot the trajectory taken for this trial
                canvas.plotTrajectory()

                # This takes extremely long when using a population of neurons
                canvas.plotValueFunction(place_cells,
                                         critic,
                                         limits=False,
                                         continuous=True)

                # Plot a histogram of the weights (disabled)
                # critic_weights = np.reshape(critic.getWeights(), -1)
                # Graphics.histogram(critic_weights)

    if (DBG_LVL > 0):
        Graphics.plot(n_steps)
    else:
        print('Step Statistics - Mean (%.2f), STD (%.2f)' %
              (np.mean(n_steps), np.std(n_steps)))

    return (actor, critic, n_steps)
def testMaze():
    """
    Compare critics trained on two environments separately with a single
    critic trained on both in alternation.

    Steps, in order:
      1. Train a critic only on an open field (E1) and only on a walled
         maze (E2), recording the critic weights after every episode.
      2. Show the decomposition of each weight history, and of each history
         on the other environment's components.
      3. Train one shared RandomAgent actor / critic pair on E1 then E2,
         recording the critic weights after each environment.
      4. Plot the value functions of all three critics on both environments
         next to the one produced by Agents.IdealValueAgent.

    The function blocks on input() between stages and returns nothing.
    """
    ValueLearning.DBG_LVL = 0

    grid_w = 6
    grid_h = 6

    # Cells per place field - shared by both environments
    Hippocampus.N_CELLS_PER_FIELD = 1

    field_count = round(1.0 * (grid_w + 3) * (grid_h + 3))
    cell_count = Hippocampus.N_CELLS_PER_FIELD * field_count
    step_size = 0.99

    trials_per_episode = 100
    solo_episodes = 2
    alternation_count = 1
    step_cap = 1000

    def _train_solo_critic(env, cells):
        # Train a critic on one environment only; returns the critic and a
        # (cell_count x solo_episodes) array with its weights per episode.
        solo_critic = None
        history = np.empty((cell_count, solo_episodes), dtype=float)
        for ep in range(solo_episodes):
            _, solo_critic, _ = ValueLearning.learnValueFunction(
                trials_per_episode,
                env,
                cells,
                critic=solo_critic,
                max_steps=step_cap)
            history[:, ep] = solo_critic.getWeights()
        return solo_critic, history

    def _show_ideal_values(env, cells, reference_critic):
        # Display the value function of an IdealValueAgent, with the display
        # range scaled by the reference critic's discount factor.
        ideal = Agents.IdealValueAgent(env, cells)
        best_values = ideal.getValueFunction()
        gain = 1.0 / (1 - reference_critic.getDiscountFactor())
        Graphics.showImage(
            best_values,
            xticks=range(1, grid_w),
            yticks=range(1, grid_h),
            range=(env.NON_GOAL_STATE_REWARD, gain * env.GOAL_STATE_REWARD))

    # ---- Environment 1: open field with its own place fields and cells ----
    arena_a = Environment.RandomGoalOpenField(grid_w, grid_h, step_size)
    canvas_a = Graphics.WallMazeCanvas(arena_a)
    fields_a = Hippocampus.setupPlaceFields(arena_a, field_count)
    cells_a = Hippocampus.assignPlaceCells(cell_count, fields_a)

    print('Training Critic solely on Env A')
    critic_a, history_a = _train_solo_critic(arena_a, cells_a)

    canvas_a.plotValueFunction(cells_a, critic_a, continuous=True)
    input('Press return to run next environment...')

    basis_a = Graphics.showDecomposition(history_a, title='Environment 01')

    # Fresh actor / critic pair used later for the alternation phase
    actor = Agents.RandomAgent(arena_a.getActions(), cell_count)
    critic = Agents.Critic(cell_count)

    # ---- Environment 2: same size, two walls, separate place fields ----
    grid_w = 6
    grid_h = 6
    left_wall = Environment.Wall((0, 3), (3, 3))
    right_wall = Environment.Wall((4, 3), (6, 3))
    arena_b = Environment.MazeWithWalls(
        grid_w, grid_h, [left_wall, right_wall], move_distance=step_size)
    canvas_b = Graphics.WallMazeCanvas(arena_b)
    fields_b = Hippocampus.setupPlaceFields(arena_b, field_count)
    cells_b = Hippocampus.assignPlaceCells(cell_count, fields_b)

    print()
    print('Training Critic solely on Env B')
    critic_b, history_b = _train_solo_critic(arena_b, cells_b)

    basis_b = Graphics.showDecomposition(history_b, title='Environment 02')
    canvas_b.plotValueFunction(cells_b, critic_b, continuous=True)

    # Each environment's weight history shown on the other's components
    Graphics.showDecomposition(
        history_a, components=basis_b, title='E2 on E1')
    Graphics.showDecomposition(
        history_b, components=basis_a, title='E1 on E2')
    input('Press any key to start Alternation.')

    # Mean step counts per alternation, one array per environment
    mean_steps_a = np.zeros((alternation_count, 1), dtype=float)
    mean_steps_b = np.zeros((alternation_count, 1), dtype=float)

    # Critic weights after each environment of each alternation
    alternation_weights = np.empty((cell_count, alternation_count * 2),
                                   dtype=float)
    for alt in range(alternation_count):
        trials_per_alternation = solo_episodes * trials_per_episode
        print('Alternation: %d' % alt)

        schedule = (
            ('Learning Environment A', arena_a, cells_a, mean_steps_a),
            ('Learning Environment B', arena_b, cells_b, mean_steps_b),
        )
        # The same actor / critic pair is carried from A into B
        for slot, (banner, env, cells, mean_steps) in enumerate(schedule):
            print(banner)
            actor, critic, steps = ValueLearning.learnValueFunction(
                trials_per_alternation, env, cells, actor, critic, step_cap)
            mean_steps[alt] = np.mean(steps)
            alternation_weights[:, 2 * alt + slot] = critic.getWeights()

    # Alternation weights expressed in each environment's components
    Graphics.showDecomposition(alternation_weights,
                               components=basis_a,
                               title='Alternation weights in E1')
    Graphics.showDecomposition(alternation_weights,
                               components=basis_b,
                               title='Alternation weights in E2')

    # Value functions on E1: shared critic, E1-only critic, E2-only critic
    input('Press return for Value Function of E1')
    for shown in (critic, critic_a, critic_b):
        canvas_a.plotValueFunction(cells_a, shown, continuous=True)
    _show_ideal_values(arena_a, cells_a, critic_a)

    # Value functions on E2: shared critic, E2-only critic, E1-only critic
    input('Press return for Value Function of E2')
    for shown in (critic, critic_b, critic_a):
        canvas_b.plotValueFunction(cells_b, shown, continuous=True)
    _show_ideal_values(arena_b, cells_b, critic_b)

    input('Press any key to exit!')
# Example #4
# 0
def testMaze(n_training_trials, n_navigation_trials):
    """
    Train one actor / critic pair on two environments in alternation, then
    measure navigation on both.

    INPUTS:
    -------
    n_training_trials: Trials passed to ValueLearning.learnValueFunction for
        each environment in each alternation.
    n_navigation_trials: Trials passed to ValueLearning.navigate for each
        environment after training.

    OUTPUTS:
    --------
    (learning_steps_E1, learning_steps_E2, navigation_steps_E1,
     navigation_steps_E2): the step records returned by learnValueFunction
        (from the last alternation) and by navigate, for the open field (E1)
        and the walled maze (E2) respectively.
    """
    ValueLearning.DBG_LVL = 0
    step_size = 0.29

    grid_w, grid_h = 6, 6

    field_count = round(1.0 * (grid_w + 3) * (grid_h + 3))
    Hippocampus.N_CELLS_PER_FIELD = 4
    cell_count = Hippocampus.N_CELLS_PER_FIELD * field_count

    alternation_count = 1
    nav_step_cap = 400
    train_step_cap = 4000

    # Environment 1: open field with its own place fields and cells.
    # The canvas is constructed but not used further in this function.
    arena_a = Environment.RandomGoalOpenField(grid_w, grid_h, step_size)
    canvas_a = Graphics.WallMazeCanvas(arena_a)
    fields_a = Hippocampus.setupPlaceFields(arena_a, field_count)
    cells_a = Hippocampus.assignPlaceCells(cell_count, fields_a)

    # Untrained actor and critic shared by both environments
    actor = Agents.Actor(arena_a.getActions(), cell_count)
    critic = Agents.Critic(cell_count)

    # Environment 2: same size, two walls, separate place fields and cells
    grid_w, grid_h = 6, 6
    left_wall = Environment.Wall((0, 3), (3, 3))
    right_wall = Environment.Wall((4, 3), (6, 3))
    arena_b = Environment.MazeWithWalls(
        grid_w, grid_h, [left_wall, right_wall], step_size)
    canvas_b = Graphics.WallMazeCanvas(arena_b)
    fields_b = Hippocampus.setupPlaceFields(arena_b, field_count)
    cells_b = Hippocampus.assignPlaceCells(cell_count, fields_b)

    # Placeholders; replaced by learnValueFunction's step record in the loop
    train_steps_a = np.zeros((n_training_trials, 1), dtype=float)
    train_steps_b = np.zeros((n_training_trials, 1), dtype=float)
    for alt in range(alternation_count):
        print('Alternation: %d' % alt)

        print('Learning Environment A')
        actor, critic, train_steps_a = ValueLearning.learnValueFunction(
            n_training_trials, arena_a, cells_a, actor, critic,
            train_step_cap)

        print('Learning Environment B')
        actor, critic, train_steps_b = ValueLearning.learnValueFunction(
            n_training_trials, arena_b, cells_b, actor, critic,
            train_step_cap)

    # Optional diagnostics, left disabled:
    # canvas_a.plotValueFunction(cells_a, critic)
    # canvas_b.plotValueFunction(cells_b, critic)
    # Graphics.histogram(np.reshape(critic.getWeights(), -1))
    # Graphics.histogram(np.reshape(actor.getWeights(), -1))

    # After alternation, check the behavior on both tasks
    ValueLearning.DBG_LVL = 0
    nav_runs = (
        ('Navigating Environment A', arena_a, cells_a),
        ('Navigating Environment B', arena_b, cells_b),
    )
    nav_steps = []
    for banner, env, cells in nav_runs:
        print(banner)
        nav_steps.append(
            ValueLearning.navigate(n_navigation_trials, env, cells, actor,
                                   critic, nav_step_cap))

    return (train_steps_a, train_steps_b, nav_steps[0], nav_steps[1])