def testMaze(n_train, n_nav):
    ValueLearning.DBG_LVL = 1
    move_distance = 0.99

    # Experiment parameters
    nx = 6
    ny = 6

    n_fields = round(1.0 * (nx + 3) * (ny + 3))
    Hippocampus.N_CELLS_PER_FIELD = 1
    n_cells = n_fields * Hippocampus.N_CELLS_PER_FIELD

    # Maze creation
    maze = Environment.RandomGoalOpenField(nx, ny, move_distance)

    # Generate place fields and place cells
    place_fields = Hippocampus.setupPlaceFields(maze, n_fields)
    place_cells = Hippocampus.assignPlaceCells(n_cells, place_fields)

    # Create Actor and Critic
    actor = Agents.RandomAgent(maze.getActions(), n_cells)
    critic = Agents.Critic(n_cells)

    ValueLearning.learnValueFunction(n_train, maze, place_cells, actor, critic, max_steps=1000)
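
# A minimal, hypothetical driver for the experiment above, assuming this file
# is run as a script. The trial counts below are illustrative values, not taken
# from the original code; note that n_nav is accepted but not used by testMaze.
if __name__ == '__main__':
    n_training_trials = 100   # assumed number of learning trials
    n_navigation_trials = 20  # assumed; currently unused by testMaze above
    testMaze(n_training_trials, n_navigation_trials)
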
def learnValueFunction(n_trials, environment, place_cells, actor=None, critic=None, max_steps=np.inf):
    """
    Main function responsible for learning the value function for a given
    environment.

    INPUTS:
    -------
    n_trials: (INTEGER) Number of trials allowed on the task
    environment: (Maze) Physical space in which the task has to be learnt
    place_cells: (PlaceCell) Entity that encodes a particular location as a
        population

    <OPTIONAL INPUTS>
    actor: Pre-trained actor
    critic: Pre-trained critic

    OUTPUTS:
    --------
    actor: (Actor Class) Entity that learns actions for a given state
    critic: (Critic Class) Entity that evaluates the value for a particular
        state. These values are used for taking actions.
    n_steps: (np.ndarray) Distance moved on each trial, relative to the
        optimal distance to the goal.
    """

    # Visualize place fields for a few cells and then the aggregate activity
    # Set up the actor and critic based on the place fields
    if critic is None:
        critic = Agents.Critic(len(place_cells))
    else:
        assert (critic.getNFields() == len(place_cells))

    if actor is None:
        actor = Agents.Actor(environment.getActions(), len(place_cells))
        # actor = Agents.RandomAgent(environment.getActions(), len(place_cells))
        # actor = Agents.IdealActor(environment, critic, place_cells)
    else:
        assert (actor.getNFields() == len(place_cells))

    n_steps = np.zeros(n_trials, dtype=float)
    for trial in range(n_trials):
        # Path is visualized using a graphics object
        canvas = Graphics.WallMazeCanvas(environment)

        if DBG_LVL > 2:
            n_cells_to_visualize = 4
            for _ in range(n_cells_to_visualize):
                # randint is inclusive of its upper bound, so exclude len(place_cells)
                sample_cell = random.randint(0, len(place_cells) - 1)
                canvas.visualizePlaceField(place_cells[sample_cell])
            canvas.visualizeAggregatePlaceFields(place_cells)

        # Initialize a new location and adjust for the optimal number of steps
        # needed to get to the goal.
        environment.redrawInitLocation()
        optimal_steps_to_goal = environment.getOptimalDistanceToGoal()
        n_steps[trial] = -optimal_steps_to_goal

        initial_state = environment.getCurrentState()
        canvas.update(initial_state)

        terminate_trial = False
        while not terminate_trial:
            terminate_trial = environment.reachedGoalState()
            if (n_steps[trial] > max_steps * environment.MOVE_DISTANCE):
                break

            n_steps[trial] += environment.MOVE_DISTANCE
            current_state = environment.getCurrentState()
            if DBG_LVL > 1:
                print('On state: (%.2f, %.2f)' % (current_state[0], current_state[1]))

            # Get the place field activity based on the current location
            pf_activity = [pf.getActivity(current_state) for pf in place_cells]

            # Get an action based on the place field activity
            next_action = actor.getAction(pf_activity)
            if DBG_LVL > 1:
                print('Selected Action: %s' % next_action)

            # Apply this action onto the environment
            reward = environment.move(next_action)
            # canvas.update(environment.getCurrentState())

            # Use the obtained reward to update the value
            new_environment_state = environment.getCurrentState()
            canvas.update(new_environment_state)
            new_pf_activity = [pf.getActivity(new_environment_state) for pf in place_cells]
            prediction_error = critic.updateValue(pf_activity, new_pf_activity, reward)
            actor.updateWeights(pf_activity, prediction_error)

        if (DBG_LVL > 0):
            print('Ended trial %d moving %.1f.' % (trial, n_steps[trial]))

        # At debug level 1, only the first and the last trajectories, and the
        # corresponding value functions, are shown. At higher debug levels,
        # the entire trajectory is shown for every iteration.
        if (DBG_LVL > 1) or (trial == 1) or (trial == n_trials - 1):
            # Plot the trajectory taken for this trial
            canvas.plotTrajectory()

            # This takes extremely long when using a population of neurons
            canvas.plotValueFunction(place_cells, critic, limits=False, continuous=True)

            # Plot a histogram of the weights
            """
            critic_weights = np.reshape(critic.getWeights(), -1)
            Graphics.histogram(critic_weights)
            """

    if (DBG_LVL > 0):
        Graphics.plot(n_steps)
    else:
        print('Step Statistics - Mean (%.2f), STD (%.2f)' % (np.mean(n_steps), np.std(n_steps)))

    return (actor, critic, n_steps)
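
# Agents.Critic and Agents.Actor are defined elsewhere in the repository; the
# calls critic.updateValue(...) and actor.updateWeights(...) above are assumed
# to implement a standard TD(0) actor-critic rule over the place-cell activity
# vector, with a linear value estimate V(s) = w . phi(s). A minimal NumPy
# sketch of that assumed update (names such as value_weights, actor_weights,
# learning_rate and discount are illustrative, not the repository's API):
def td_actor_critic_step(value_weights, actor_weights, pf_activity,
                         new_pf_activity, reward, action_idx,
                         learning_rate=0.05, discount=0.9):
    """Illustrative sketch only; the real update lives in the Agents module."""
    pf_activity = np.asarray(pf_activity, dtype=float)
    new_pf_activity = np.asarray(new_pf_activity, dtype=float)

    # TD prediction error: reward plus discounted value of the next state,
    # minus the current value estimate
    prediction_error = (reward
                        + discount * np.dot(value_weights, new_pf_activity)
                        - np.dot(value_weights, pf_activity))

    # Critic update: move the value of the visited state towards the TD target
    value_weights += learning_rate * prediction_error * pf_activity

    # Actor update: make the chosen action more (or less) likely in this state
    actor_weights[action_idx, :] += learning_rate * prediction_error * pf_activity
    return prediction_error
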
def testMaze():
    """
    No comments here. Look at single_maze_learning_agent.py for more details!
    """
    ValueLearning.DBG_LVL = 0

    nx = 6
    ny = 6

    # Set the number of cells to be used per "place field" - Same for all the environments
    Hippocampus.N_CELLS_PER_FIELD = 1
    n_fields = round(1.0 * (nx + 3) * (ny + 3))
    n_cells = Hippocampus.N_CELLS_PER_FIELD * n_fields

    move_distance = 0.99
    n_training_trials = 100
    n_single_env_episodes = 2
    n_alternations = 1
    max_train_steps = 1000

    # First Environment: Has its own place cells and place fields
    env_E1 = Environment.RandomGoalOpenField(nx, ny, move_distance)
    canvas_E1 = Graphics.WallMazeCanvas(env_E1)
    place_fields_E1 = Hippocampus.setupPlaceFields(env_E1, n_fields)
    place_cells_E1 = Hippocampus.assignPlaceCells(n_cells, place_fields_E1)

    # Train a critic on the first environment
    print('Training Critic solely on Env A')
    critic_E1 = None
    weights_E1 = np.empty((n_cells, n_single_env_episodes), dtype=float)
    for episode in range(n_single_env_episodes):
        (_, critic_E1, _) = ValueLearning.learnValueFunction(n_training_trials, env_E1, place_cells_E1,
                                                             critic=critic_E1, max_steps=max_train_steps)
        weights_E1[:, episode] = critic_E1.getWeights()

    # Get a trajectory in the environment and plot the value function
    canvas_E1.plotValueFunction(place_cells_E1, critic_E1, continuous=True)
    input('Press return to run next environment...')

    components_E1 = Graphics.showDecomposition(weights_E1, title='Environment 01')

    # Create empty actors and critics
    actor = Agents.RandomAgent(env_E1.getActions(), n_cells)
    critic = Agents.Critic(n_cells)

    # Second Environment: This has a different set (but the same number) of
    # place fields and place cells (also has a bunch of walls)
    nx = 6
    ny = 6
    lp_wall = Environment.Wall((0, 3), (3, 3))
    rp_wall = Environment.Wall((4, 3), (6, 3))
    env_E2 = Environment.MazeWithWalls(nx, ny, [lp_wall, rp_wall], move_distance=move_distance)
    canvas_E2 = Graphics.WallMazeCanvas(env_E2)
    place_fields_E2 = Hippocampus.setupPlaceFields(env_E2, n_fields)
    place_cells_E2 = Hippocampus.assignPlaceCells(n_cells, place_fields_E2)

    # Train another critic on the second environment
    print()
    print('Training Critic solely on Env B')
    critic_E2 = None
    weights_E2 = np.empty((n_cells, n_single_env_episodes), dtype=float)
    for episode in range(n_single_env_episodes):
        (_, critic_E2, _) = ValueLearning.learnValueFunction(n_training_trials, env_E2, place_cells_E2,
                                                             critic=critic_E2, max_steps=max_train_steps)
        weights_E2[:, episode] = critic_E2.getWeights()

    components_E2 = Graphics.showDecomposition(weights_E2, title='Environment 02')
    canvas_E2.plotValueFunction(place_cells_E2, critic_E2, continuous=True)

    # Look at the projection of one environment's weights on the other's principal components
    Graphics.showDecomposition(weights_E1, components=components_E2, title='E2 on E1')
    Graphics.showDecomposition(weights_E2, components=components_E1, title='E1 on E2')
    input('Press any key to start Alternation.')

    # This can be used to just reinforce the fact that the agent is indeed
    # random! The steps taken to goal would not change over time because of
    # the way the agent behaves.
    learning_steps_E1 = np.zeros((n_alternations, 1), dtype=float)
    learning_steps_E2 = np.zeros((n_alternations, 1), dtype=float)

    # Keep track of weights for PCA
    weights = np.empty((n_cells, n_alternations * 2), dtype=float)
    for alt in range(n_alternations):
        n_alternation_trials = n_single_env_episodes * n_training_trials
        # n_alternation_trials = n_training_trials
        print('Alternation: %d' % alt)

        # First look at the performance of the agent in the task before it is
        # allowed to learn anything. Then allow learning
        print('Learning Environment A')
        (actor, critic, steps_E1) = ValueLearning.learnValueFunction(
            n_alternation_trials, env_E1, place_cells_E1, actor, critic, max_train_steps)
        learning_steps_E1[alt] = np.mean(steps_E1)
        weights[:, 2 * alt] = critic.getWeights()

        # Repeat for environment 2
        print('Learning Environment B')
        (actor, critic, steps_E2) = ValueLearning.learnValueFunction(
            n_alternation_trials, env_E2, place_cells_E2, actor, critic, max_train_steps)
        learning_steps_E2[alt] = np.mean(steps_E2)
        weights[:, 2 * alt + 1] = critic.getWeights()

    # Show the alternation weights in the two bases
    Graphics.showDecomposition(weights, components=components_E1, title='Alternation weights in E1')
    Graphics.showDecomposition(weights, components=components_E2, title='Alternation weights in E2')

    # Show the value functions for both the environments
    input('Press return for Value Function of E1')
    canvas_E1.plotValueFunction(place_cells_E1, critic, continuous=True)
    canvas_E1.plotValueFunction(place_cells_E1, critic_E1, continuous=True)
    canvas_E1.plotValueFunction(place_cells_E1, critic_E2, continuous=True)

    # Plot the ideal value function
    ideal_critic = Agents.IdealValueAgent(env_E1, place_cells_E1)
    optimal_value_function = ideal_critic.getValueFunction()
    scaling_factor = 1.0 / (1 - critic_E1.getDiscountFactor())
    # Graphics.showImage(optimal_value_function, xticks=range(1, nx), yticks=range(1, ny),
    #                    range=(maze.NON_GOAL_STATE_REWARD, scaling_factor * maze.GOAL_STATE_REWARD))
    Graphics.showImage(optimal_value_function, xticks=range(1, nx), yticks=range(1, ny),
                       range=(env_E1.NON_GOAL_STATE_REWARD, scaling_factor * env_E1.GOAL_STATE_REWARD))

    input('Press return for Value Function of E2')
    canvas_E2.plotValueFunction(place_cells_E2, critic, continuous=True)
    canvas_E2.plotValueFunction(place_cells_E2, critic_E2, continuous=True)
    canvas_E2.plotValueFunction(place_cells_E2, critic_E1, continuous=True)

    # Plot the ideal value function
    ideal_critic = Agents.IdealValueAgent(env_E2, place_cells_E2)
    optimal_value_function = ideal_critic.getValueFunction()
    scaling_factor = 1.0 / (1 - critic_E2.getDiscountFactor())
    # Graphics.showImage(optimal_value_function, xticks=range(1, nx), yticks=range(1, ny),
    #                    range=(maze.NON_GOAL_STATE_REWARD, scaling_factor * maze.GOAL_STATE_REWARD))
    Graphics.showImage(optimal_value_function, xticks=range(1, nx), yticks=range(1, ny),
                       range=(env_E2.NON_GOAL_STATE_REWARD, scaling_factor * env_E2.GOAL_STATE_REWARD))

    input('Press any key to exit!')
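
# Graphics.showDecomposition is called above but not defined in this section.
# It is assumed to compute a principal-component decomposition of the critic
# weight snapshots (one column per episode/alternation) and, when `components`
# is supplied, to project the snapshots onto those previously computed
# components. A minimal NumPy sketch of that assumed computation (plotting
# omitted; names are illustrative, not the repository's API):
def decompose_weights(weight_snapshots, components=None):
    """weight_snapshots: (n_cells, n_snapshots) array of critic weights."""
    centered = weight_snapshots - weight_snapshots.mean(axis=1, keepdims=True)
    if components is None:
        # Columns of U are the principal directions of the snapshots
        components, _, _ = np.linalg.svd(centered, full_matrices=False)
    # Coordinates of every snapshot in the (given or computed) component basis
    projection = components.T @ centered
    return components, projection
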
def testMaze(n_training_trials, n_navigation_trials):
    """
    No comments here. Look at single_maze_learning_agent.py for more details!
    """
    ValueLearning.DBG_LVL = 0
    move_distance = 0.29

    nx = 6
    ny = 6

    n_fields = round(1.0 * (nx + 3) * (ny + 3))
    Hippocampus.N_CELLS_PER_FIELD = 4
    n_cells = Hippocampus.N_CELLS_PER_FIELD * n_fields

    n_alternations = 1
    max_nav_steps = 400
    max_train_steps = 4000

    # First Environment: Has its own place cells and place fields
    env_E1 = Environment.RandomGoalOpenField(nx, ny, move_distance)
    canvas_E1 = Graphics.WallMazeCanvas(env_E1)
    place_fields_E1 = Hippocampus.setupPlaceFields(env_E1, n_fields)
    place_cells_E1 = Hippocampus.assignPlaceCells(n_cells, place_fields_E1)

    # Create empty actors and critics
    actor = Agents.Actor(env_E1.getActions(), n_cells)
    critic = Agents.Critic(n_cells)

    # Second Environment: This has a different set (but the same number) of
    # place fields and place cells
    nx = 6
    ny = 6
    lp_wall = Environment.Wall((0, 3), (3, 3))
    rp_wall = Environment.Wall((4, 3), (6, 3))
    env_E2 = Environment.MazeWithWalls(nx, ny, [lp_wall, rp_wall], move_distance)
    canvas_E2 = Graphics.WallMazeCanvas(env_E2)
    place_fields_E2 = Hippocampus.setupPlaceFields(env_E2, n_fields)
    place_cells_E2 = Hippocampus.assignPlaceCells(n_cells, place_fields_E2)

    learning_steps_E1 = np.zeros((n_training_trials, 1), dtype=float)
    learning_steps_E2 = np.zeros((n_training_trials, 1), dtype=float)
    for alt in range(n_alternations):
        print('Alternation: %d' % alt)

        # First look at the performance of the agent in the task before it is
        # allowed to learn anything. Then allow learning
        print('Learning Environment A')
        (actor, critic, steps_E1) = ValueLearning.learnValueFunction(
            n_training_trials, env_E1, place_cells_E1, actor, critic, max_train_steps)
        learning_steps_E1 = steps_E1

        print('Learning Environment B')
        (actor, critic, steps_E2) = ValueLearning.learnValueFunction(
            n_training_trials, env_E2, place_cells_E2, actor, critic, max_train_steps)
        learning_steps_E2 = steps_E2

        # canvas_E1.plotValueFunction(place_cells_E1, critic)
        # canvas_E2.plotValueFunction(place_cells_E2, critic)

        # Plot a histogram of the weights
        # Critic
        # critic_weights = np.reshape(critic.getWeights(), -1)
        # Graphics.histogram(critic_weights)

        """
        # Actor
        actor_weights = np.reshape(actor.getWeights(), -1)
        Graphics.histogram(actor_weights)
        """

    # After alternation, check the behavior on both the tasks
    n_trials = n_navigation_trials
    ValueLearning.DBG_LVL = 0

    print('Navigating Environment A')
    navigation_steps_E1 = ValueLearning.navigate(n_trials, env_E1, place_cells_E1, actor, critic, max_nav_steps)

    print('Navigating Environment B')
    navigation_steps_E2 = ValueLearning.navigate(n_trials, env_E2, place_cells_E2, actor, critic, max_nav_steps)

    return (learning_steps_E1, learning_steps_E2, navigation_steps_E1, navigation_steps_E2)
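
# ValueLearning.navigate is used above but not defined in this section. It is
# assumed to mirror learnValueFunction's trial loop with the actor and critic
# weights frozen, i.e. pure evaluation of the learnt policy. A minimal sketch
# under that assumption (API calls match those used in learnValueFunction; the
# critic argument is kept only for interface parity):
def navigate_sketch(n_trials, environment, place_cells, actor, critic, max_steps):
    """Illustrative only; see ValueLearning.navigate for the real behaviour."""
    n_steps = np.zeros(n_trials, dtype=float)
    for trial in range(n_trials):
        environment.redrawInitLocation()
        n_steps[trial] = -environment.getOptimalDistanceToGoal()
        while not environment.reachedGoalState():
            if n_steps[trial] > max_steps * environment.MOVE_DISTANCE:
                break
            n_steps[trial] += environment.MOVE_DISTANCE
            pf_activity = [pf.getActivity(environment.getCurrentState()) for pf in place_cells]
            environment.move(actor.getAction(pf_activity))  # no weight updates
    return n_steps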