def test_wall_cross_x_left_to_right(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    state = (5, 0)
    action = (-1, 0)
    actual = mdp.runs_into_wall(state, action)
    expected = True
    self.assertEqual(actual, expected)
def test_wall_cross_y_down_to_up(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    state = (0, 4)
    action = (0, 1)
    actual = mdp.runs_into_wall(state, action)
    expected = True
    self.assertEqual(actual, expected)
def test_corner_movement_right(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
    state = (4, 4)
    action = (1, 0)
    actual = mdp.runs_into_wall(state, action)
    expected = True
    self.assertEqual(actual, expected)
def test_leave_maze_positive_y_false(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
    state = (0, 3)
    action = (0, 1)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_wall_cross_through_doorway_x_left_to_right_larger_room_size(self):
    mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
    state = (7, 3)
    action = (-1, 0)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_leave_maze_negative_y(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
    state = (0, 0)
    action = (0, -1)
    actual = mdp.runs_into_wall(state, action)
    expected = True
    self.assertEqual(actual, expected)
def test_wall_cross_y_down_to_up_false_larger_room_size(self):
    mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
    state = (0, 3)
    action = (0, 1)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_leave_maze_positive_x_false_larger(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=5)
    state = (3, 0)
    action = (1, 0)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_wall_cross_x_right_to_left_false_larger_room_size(self):
    mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
    state = (3, 0)
    action = (1, 0)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_wall_cross_through_doorway_y_up(self):
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    state = (2, 4)
    action = (0, 1)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
def test_run_with_standard_maze_mdp_q_learning_agent_correct_V(self):
    mdp = mdps.MazeMDP(5, 2)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.01
    num_actions = len(mdp.get_actions(None))
    discount = 1
    exploration_prob = .5
    step_size = .1
    a = agent.QLearningAgent(num_actions=num_actions, discount=discount,
        exploration_prob=exploration_prob, step_size=step_size, logging=False)
    num_epochs = 10
    epoch_length = 200
    test_epoch_length = 0
    max_steps = 300
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests)
    e.run()

    V = get_V(e)
    actual_total = 0
    for k, v in V.iteritems():
        actual_total += v
    expected_total_min = -110
    expected_total_max = -40
    self.assertTrue(actual_total < expected_total_max)
    self.assertTrue(actual_total > expected_total_min)
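# get_V is defined elsewhere in this module. For reference, a minimal sketch of
# the behavior these value tests rely on (hypothetical -- the real helper may
# differ): recover a state-value estimate from the trained agent by maximizing
# its Q-values over actions. This assumes the agent exposes a dict keyed by
# (state, action); adjust to the actual QLearningAgent interface.
def _get_V_sketch(e):
    V = {}
    for (state, action), q in e.agent.Q.iteritems():
        if state not in V or q > V[state]:
            V[state] = q
    return V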
def test_wall_cross_through_doorway_y_down_larger_room_size(self):
    mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
    state = (3, 7)
    action = (0, -1)
    actual = mdp.runs_into_wall(state, action)
    expected = False
    self.assertEqual(actual, expected)
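# The wall semantics the tests above pin down, as a standalone sketch
# (a hypothetical helper, not the MazeMDP implementation): the maze is a
# num_rooms x num_rooms grid of room_size x room_size rooms, and a move hits a
# wall if it steps outside the maze or crosses a room boundary anywhere except
# the doorway at the middle index (room_size // 2) of the shared wall. This
# sketch reproduces the expected value of every runs_into_wall test above.
def _runs_into_wall_sketch(state, action, room_size, num_rooms):
    size = room_size * num_rooms
    x, y = state
    nx, ny = x + action[0], y + action[1]
    if not (0 <= nx < size and 0 <= ny < size):
        return True  # stepping outside the maze counts as hitting a wall
    if x // room_size != nx // room_size and y % room_size != room_size // 2:
        return True  # crossing a vertical room boundary off the doorway row
    if y // room_size != ny // room_size and x % room_size != room_size // 2:
        return True  # crossing a horizontal room boundary off the doorway column
    return False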
def test_sequence_value_string(self):
    room_size = 3
    num_rooms = 3
    mdp = mdps.MazeMDP(room_size, num_rooms)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.1
    discount = 1
    sequence_length = 2
    batch_size = 100
    learning_rate = 1e-3
    freeze_interval = 10000
    num_hidden = 4
    eps = .5
    reg = 1e-8
    num_actions = len(mdp.get_actions(None))
    network = recurrent_qnetwork.RecurrentQNetwork(input_shape=2 * room_size,
        sequence_length=sequence_length, batch_size=batch_size, num_actions=4,
        num_hidden=num_hidden, discount=discount, learning_rate=learning_rate,
        regularization=reg, update_rule='adam', freeze_interval=freeze_interval,
        network_type='single_layer_lstm', rng=None)
    num_epochs = 5
    epoch_length = 10
    test_epoch_length = 0
    max_steps = (room_size * num_rooms) ** 2
    epsilon_decay = (num_epochs * epoch_length * max_steps) / 2
    adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay)
    rm = replay_memory.SequenceReplayMemory(input_shape=2 * room_size,
        sequence_length=sequence_length, batch_size=batch_size, capacity=50000)
    log = logger.NeuralLogger(agent_name='RecurrentQNetwork')
    a = agent.RecurrentNeuralAgent(network=network, policy=p, replay_memory=rm,
        log=log, state_adapter=adapter)
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests, value_logging=True)
    e.log_temporal_value_string()
def run():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=5)
    print 'online RL algorithm: '
    total_rewards, V = simulate_online_RL_algorithm(mdp)
    mdp.print_v(V)
    learning_utils.plot_rewards(total_rewards)
    print 'DP algorithm: '
    simulate_MDP_algorithm(mdp)
def test_run_with_small_maze_mdp_q_learning_agent_correct_V(self):
    mdp = mdps.MazeMDP(5, 1)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.1
    num_actions = len(mdp.get_actions(None))
    discount = 1
    exploration_prob = .7
    step_size = 5e-1
    a = agent.QLearningAgent(num_actions=num_actions, discount=discount,
        exploration_prob=exploration_prob, step_size=step_size, logging=False)
    num_epochs = 20
    epoch_length = 100
    test_epoch_length = 0
    max_steps = 100
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests)
    e.run()

    V = get_V(e)
    expected = {(0, 0): 0.3, (1, 0): 0.4, (2, 0): 0.5, (3, 0): 0.6, (4, 0): 0.7,
                (0, 1): 0.4, (1, 1): 0.5, (2, 1): 0.6, (3, 1): 0.7, (4, 1): 0.8,
                (0, 2): 0.5, (1, 2): 0.6, (2, 2): 0.7, (3, 2): 0.8, (4, 2): 0.9,
                (0, 3): 0.6, (1, 3): 0.7, (2, 3): 0.8, (3, 3): 0.9, (4, 3): 1.0,
                (0, 4): 0.7, (1, 4): 0.8, (2, 4): 0.9, (3, 4): 1.0, (4, 4): 0.0}
    max_diff = 1e-1
    for k in expected.keys():
        self.assertTrue(k in V)
        self.assertTrue(np.abs(V[k] - expected[k]) < max_diff)
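# The expected table above follows a simple pattern: with MOVE_REWARD = -0.1
# charged for every move except the final one into the exit at (4, 4), which
# pays EXIT_REWARD = 1, the optimal value is V = 1.0 - 0.1 * (steps - 1) where
# steps is the shortest-path length to the exit, and the exit state itself is
# 0. Assuming that reward convention holds, the whole dict can be regenerated:
#
#     expected = {(x, y): 0.0 if (x, y) == (4, 4)
#                 else 1.0 - 0.1 * ((4 - x) + (4 - y) - 1)
#                 for x in range(5) for y in range(5)}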
def run_keras_nnet():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    num_episodes = 200
    total_rewards, total_steps, trajectory, V = simulate_keras_online_RL_algorithm(
        mdp=mdp, num_episodes=num_episodes, max_iterations=100)
    # report averages over the second half of training, after learning settles
    print 'average_reward: {}'.format(np.mean(total_rewards[num_episodes / 2:]))
    print 'average_steps: {}'.format(np.mean(total_steps[num_episodes / 2:]))
    learning_utils.plot_rewards(total_rewards)
    learning_utils.plot_rewards(total_steps)
    print trajectory
    mdp.print_trajectory(trajectory)
    print V
    mdp.print_v(V)
def test_agent(self):
    room_size = 5
    mdp = mdps.MazeMDP(room_size, 1)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.1
    discount = mdp.get_discount()
    num_actions = len(mdp.get_actions(None))
    network = qnetwork.QNetwork(input_shape=2 * room_size, batch_size=1,
        num_actions=4, num_hidden=10, discount=discount, learning_rate=1e-3,
        update_rule='sgd', freeze_interval=10000, rng=None)
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
    rm = replay_memory.ReplayMemory(1)
    log = logger.NeuralLogger(agent_name='QNetwork')
    adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm,
        logger=log, state_adapter=adapter)
    num_epochs = 2
    epoch_length = 10
    test_epoch_length = 0
    max_steps = 10
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests, value_logging=False)
    e.run()
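# Note the deliberately tiny configuration above: ReplayMemory(1) with
# batch_size=1 presumably reduces the agent to purely online updates from the
# single most recent transition, which keeps this end-to-end smoke test small
# and fast; it passes as long as e.run() completes without error.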
def test_run_with_maze_mdp_and_working_agent_completes(self):
    mdp = mdps.MazeMDP(5, 1)
    num_actions = len(mdp.get_actions(None))
    discount = 1
    exploration_prob = .3
    step_size = 1e-2
    a = agent.QLearningAgent(num_actions=num_actions, discount=discount,
        exploration_prob=exploration_prob, step_size=step_size, logging=False)
    num_epochs = 1
    epoch_length = 1
    test_epoch_length = 0
    max_steps = 10000
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests)
    e.run()
    # a working agent should reach the exit and end episodes early, so fewer
    # actions than the absolute step budget should have been logged
    total_len = len(e.agent.logger.actions)
    self.assertTrue(total_len < max_steps * epoch_length * num_epochs)
def run(learning_rate, freeze_interval, num_hidden, reg, seq_len, eps, nt, update):
    room_size = 5
    num_rooms = 2
    input_shape = 2 * room_size
    print 'building mdp...'
    mdp = mdps.MazeMDP(room_size, num_rooms)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.01
    network_type = nt
    discount = 1
    sequence_length = seq_len
    num_actions = len(mdp.get_actions(None))
    batch_size = 100
    update_rule = update
    print 'building network...'
    network = recurrent_qnetwork.RecurrentQNetwork(input_shape=input_shape,
        sequence_length=sequence_length, batch_size=batch_size, num_actions=4,
        num_hidden=num_hidden, discount=discount, learning_rate=learning_rate,
        regularization=reg, update_rule=update_rule,
        freeze_interval=freeze_interval, network_type=network_type, rng=None)

    # Take this many steps per episode because (very loosely):
    #   let l be the step length,
    #   let d be the distance between start and end locations,
    #   let N be the number of steps for the agent to travel a distance d;
    #   then N ~ (d/l)^2, assuming the agent follows a random walk.
    # With l = 1 this gives N ~ d^2. The desired distance here is to walk along
    # both dimensions of the maze, which equals 2 * room_size * num_rooms, so
    # squaring that gives a loose approximation to the number of steps needed
    # (ignoring that this is actually a lattice, and ignoring the walls).
    # See: http://mathworld.wolfram.com/RandomWalk2-Dimensional.html
    max_steps = (2 * room_size * num_rooms) ** 2
    num_epochs = 500
    epoch_length = 1
    test_epoch_length = 0
    epsilon_decay = (num_epochs * epoch_length * max_steps) / 4
    print 'building adapter...'
    adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    print 'building policy...'
    p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay)
    print 'building replay memory...'
    # want to track at minimum the last 50 episodes
    capacity = max_steps * 50
    rm = replay_memory.SequenceReplayMemory(input_shape=input_shape,
        sequence_length=sequence_length, batch_size=batch_size, capacity=capacity)
    print 'building logger...'
    log = logger.NeuralLogger(agent_name=network_type)
    print 'building agent...'
    a = agent.RecurrentNeuralAgent(network=network, policy=p, replay_memory=rm,
        log=log, state_adapter=adapter)
    run_tests = False
    print 'building experiment...'
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests, value_logging=True)
    print 'running experiment...'
    e.run()

    ak = file_utils.load_key('../access_key.key')
    sk = file_utils.load_key('../secret_key.key')
    bucket = 'hierarchical9'
    try:
        aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
        aws_util.upload_directory(e.agent.logger.log_dir)
    except Exception as ex:
        print 'error uploading to s3: {}'.format(ex)
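# For the settings above (room_size=5, num_rooms=2) the arithmetic works out
# to max_steps = (2 * 5 * 2)**2 = 400 steps per episode, a replay capacity of
# 400 * 50 = 20000 transitions, and an epsilon decay horizon of
# 500 * 1 * 400 / 4 = 50000 steps, i.e. epsilon anneals from eps down to 0.05
# over the first quarter of the 200000-step training budget.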
def run_nnet():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    total_rewards, total_losses = simulate_symbolic_online_RL_algorithm(
        mdp=mdp, num_episodes=700, max_iterations=100)
    learning_utils.plot_rewards(total_rewards)
    learning_utils.plot_rewards(total_losses)
def run(learning_rate, freeze_interval, num_hidden, reg):
    room_size = 5
    num_rooms = 2
    mdp = mdps.MazeMDP(room_size, num_rooms)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.01
    discount = 1
    num_actions = len(mdp.get_actions(None))
    batch_size = 100
    print 'building network...'
    network = qnetwork.QNetwork(input_shape=2 * room_size + num_rooms ** 2,
        batch_size=batch_size, num_hidden_layers=2, num_actions=4,
        num_hidden=num_hidden, discount=discount, learning_rate=learning_rate,
        regularization=reg, update_rule='adam', freeze_interval=freeze_interval,
        rng=None)
    num_epochs = 50
    epoch_length = 2
    test_epoch_length = 0
    max_steps = 4 * (room_size * num_rooms) ** 2
    epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5
    print 'building policy...'
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
    print 'building memory...'
    rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
    print 'building logger...'
    log = logger.NeuralLogger(agent_name='QNetwork')
    print 'building state adapter...'
    adapter = state_adapters.CoordinatesToRowColRoomAdapter(
        room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    print 'building agent...'
    a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm,
        log=log, state_adapter=adapter)
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
        test_epoch_length, max_steps, run_tests, value_logging=True)
    e.run()

    ak = file_utils.load_key('../access_key.key')
    sk = file_utils.load_key('../secret_key.key')
    bucket = 'hierarchical'
    try:
        aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
        aws_util.upload_directory(e.agent.logger.log_dir)
    except Exception as ex:
        print 'error uploading to s3: {}'.format(ex)
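# A sketch of the encoding CoordinatesToRowColRoomAdapter plausibly produces
# (hypothetical -- the real adapter lives in state_adapters): a one-hot row
# within the room, a one-hot column within the room, and a one-hot room index,
# which would account for the input_shape = 2 * room_size + num_rooms**2 used
# above. Assumes numpy is imported as np, as elsewhere in this repo.
def _to_row_col_room_sketch(state, room_size, num_rooms):
    x, y = state
    vec = np.zeros(2 * room_size + num_rooms ** 2)
    vec[x % room_size] = 1.0                      # row within the room
    vec[room_size + y % room_size] = 1.0          # column within the room
    room = (x // room_size) * num_rooms + (y // room_size)
    vec[2 * room_size + room] = 1.0               # which room the agent is in
    return vec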