Example 1
 def test_wall_cross_x_left_to_right(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
     state = (5,0)
     action = (-1,0)
     actual = mdp.runs_into_wall(state, action)
     expected = True
     self.assertEquals(actual, expected)
Example 2
 def test_wall_cross_y_down_to_up(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
     state = (0,4)
     action = (0,1)
     actual = mdp.runs_into_wall(state, action)
     expected = True
     self.assertEquals(actual, expected)
Example 3
 def test_corner_movement_right(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
     state = (4,4)
     action = (1,0)
     actual = mdp.runs_into_wall(state, action)
     expected = True
     self.assertEquals(actual, expected)
Example 4
 def test_leave_maze_positive_y_false(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
     state = (0,3)
     action = (0,1)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 5
 def test_wall_cross_through_doorway_x_left_to_right_larger_room_size(self):
     mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
     state = (7,3)
     action = (-1,0)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 6
 def test_leave_maze_negative_y(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=1)
     state = (0,0)
     action = (0,-1)
     actual = mdp.runs_into_wall(state, action)
     expected = True
     self.assertEquals(actual, expected)
Example 7
 def test_wall_cross_y_down_to_up_false_larger_room_size(self):
     mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
     state = (0,3)
     action = (0,1)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 8
 def test_leave_maze_positive_x_false_larger(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=5)
     state = (3,0)
     action = (1,0)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 9
 def test_wall_cross_x_right_to_left_false_larger_room_size(self):
     mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
     state = (3,0)
     action = (1,0)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 10
 def test_wall_cross_through_doorway_y_up(self):
     mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
     state = (2,4)
     action = (0,1)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
Example 11
    def test_run_with_standard_maze_mdp_q_learning_agent_correct_V(self):
        mdp = mdps.MazeMDP(5, 2)
        mdp.compute_states()
        mdp.EXIT_REWARD = 1
        mdp.MOVE_REWARD = -0.01
        num_actions = len(mdp.get_actions(None))
        discount = 1
        exploration_prob = .5
        step_size = .1
        a = agent.QLearningAgent(num_actions=num_actions,
                                 discount=discount,
                                 exploration_prob=exploration_prob,
                                 step_size=step_size,
                                 logging=False)
        num_epochs = 10
        epoch_length = 200
        test_epoch_length = 0
        max_steps = 300
        run_tests = False
        e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
                                  test_epoch_length, max_steps, run_tests)
        e.run()

        V = get_V(e)
        actual_total = 0
        for k, v in V.iteritems():
            actual_total += v
        expected_total_min = -110
        expected_total_max = -40
        self.assertTrue(actual_total < expected_total_max)
        self.assertTrue(actual_total > expected_total_min)
Example 12
 def test_wall_cross_through_doorway_y_down_larger_room_size(self):
     mdp = mdps.MazeMDP(room_size=7, num_rooms=2)
     state = (3,7)
     action = (0,-1)
     actual = mdp.runs_into_wall(state, action)
     expected = False
     self.assertEquals(actual, expected)
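Taken together, the wall tests above pin down the behaviour of mdp.runs_into_wall(state, action): a move hits a wall when it would step outside the maze, or when it crosses the boundary between two rooms anywhere other than the doorway at the midpoint of the shared wall. The sketch below is a re-derivation from these test cases only, not the repository's actual MazeMDP implementation; in particular the doorway index room_size // 2 is an assumption.

def runs_into_wall_sketch(state, action, room_size, num_rooms):
    # Hypothetical re-implementation inferred from the test cases above.
    x, y = state
    dx, dy = action
    nx, ny = x + dx, y + dy
    side = room_size * num_rooms
    # Stepping outside the maze counts as running into a wall.
    if not (0 <= nx < side and 0 <= ny < side):
        return True
    doorway = room_size // 2  # assumed doorway position on each shared wall
    # Crossing a vertical wall: x moves across a multiple of room_size.
    if dx != 0 and max(x, nx) % room_size == 0 and min(x, nx) % room_size == room_size - 1:
        return y % room_size != doorway
    # Crossing a horizontal wall: y moves across a multiple of room_size.
    if dy != 0 and max(y, ny) % room_size == 0 and min(y, ny) % room_size == room_size - 1:
        return x % room_size != doorway
    return False

Checked against the wall-test cases above, this sketch reproduces every expected value, e.g. runs_into_wall_sketch((5, 0), (-1, 0), 5, 2) is True and runs_into_wall_sketch((2, 4), (0, 1), 5, 2) is False.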
Example 13
 def test_sequence_value_string(self):
     room_size = 3
     num_rooms = 3
     mdp = mdps.MazeMDP(room_size, num_rooms)
     mdp.compute_states()
     mdp.EXIT_REWARD = 1
     mdp.MOVE_REWARD = -0.1
     discount = 1
     sequence_length = 2
     learning_rate = 1e-3
     freeze_interval = 10000
     num_hidden = 4
     eps = .5
     reg = 1e-8
     num_actions = len(mdp.get_actions(None))
     batch_size = 100
     network = recurrent_qnetwork.RecurrentQNetwork(
         input_shape=2 * room_size,
         sequence_length=sequence_length,
         batch_size=batch_size,
         num_actions=4,
         num_hidden=num_hidden,
         discount=discount,
         learning_rate=learning_rate,
         regularization=reg,
         update_rule='adam',
         freeze_interval=freeze_interval,
         network_type='single_layer_lstm',
         rng=None)
     num_epochs = 5
     epoch_length = 10
     test_epoch_length = 0
     max_steps = (room_size * num_rooms)**2
     epsilon_decay = (num_epochs * epoch_length * max_steps) / 2
     adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
         room_size=room_size)
     p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay)
     rm = replay_memory.SequenceReplayMemory(
         input_shape=2 * room_size,
         sequence_length=sequence_length,
         batch_size=batch_size,
         capacity=50000)
     log = logger.NeuralLogger(agent_name='RecurrentQNetwork')
     a = agent.RecurrentNeuralAgent(network=network,
                                    policy=p,
                                    replay_memory=rm,
                                    log=log,
                                    state_adapter=adapter)
     run_tests = False
     e = experiment.Experiment(mdp,
                               a,
                               num_epochs,
                               epoch_length,
                               test_epoch_length,
                               max_steps,
                               run_tests,
                               value_logging=True)
     e.log_temporal_value_string()
Example 14
def run():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=5)
    print 'online RL algorithm: '
    total_rewards, V = simulate_online_RL_algorithm(mdp)
    mdp.print_v(V)
    learning_utils.plot_rewards(total_rewards)
    print 'DP algorithm: '
    simulate_MDP_algorithm(mdp)
Example 15
    def test_run_with_small_maze_mdp_q_learning_agent_correct_V(self):
        mdp = mdps.MazeMDP(5, 1)
        mdp.compute_states()
        mdp.EXIT_REWARD = 1
        mdp.MOVE_REWARD = -0.1
        num_actions = len(mdp.get_actions(None))
        discount = 1
        exploration_prob = .7
        step_size = 5e-1
        a = agent.QLearningAgent(num_actions=num_actions,
                                 discount=discount,
                                 exploration_prob=exploration_prob,
                                 step_size=step_size,
                                 logging=False)
        num_epochs = 20
        epoch_length = 100
        test_epoch_length = 0
        max_steps = 100
        run_tests = False
        e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
                                  test_epoch_length, max_steps, run_tests)
        e.run()

        V = get_V(e)
        expected = {
            (0, 0): 0.3,
            (1, 0): 0.4,
            (2, 0): 0.5,
            (3, 0): 0.6,
            (4, 0): 0.7,
            (0, 1): 0.4,
            (1, 1): 0.5,
            (2, 1): 0.6,
            (3, 1): 0.7,
            (4, 1): 0.8,
            (0, 2): 0.5,
            (1, 2): 0.6,
            (2, 2): 0.7,
            (3, 2): 0.8,
            (4, 2): 0.9,
            (0, 3): 0.6,
            (1, 3): 0.7,
            (2, 3): 0.8,
            (3, 3): 0.9,
            (4, 3): 1.0,
            (0, 4): 0.7,
            (1, 4): 0.8,
            (2, 4): 0.9,
            (3, 4): 1.0,
            (4, 4): 0.0
        }
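        # The expected values appear to follow a simple pattern (an observation,
        # not stated in the test): V(x, y) = EXIT_REWARD + MOVE_REWARD * (d - 1),
        # where d is the Manhattan distance from (x, y) to the exit at (4, 4);
        # e.g. for (0, 0), d = 8 and V = 1 + (-0.1) * 7 = 0.3. The exit state
        # itself is terminal, hence (4, 4): 0.0.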

        max_diff = 1e-1
        for k in expected.keys():
            self.assertTrue(k in V)
            self.assertTrue(np.abs(V[k] - expected[k]) < max_diff)
Example 16
def run_keras_nnet():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    num_episodes = 200
    total_rewards, total_steps, trajectory, V = simulate_keras_online_RL_algorithm(
        mdp=mdp, num_episodes=num_episodes, max_iterations=100)
    print 'average_reward: {}'.format(np.mean(total_rewards[num_episodes / 2:]))
    print 'average_steps: {}'.format(np.mean(total_steps[num_episodes / 2:]))
    learning_utils.plot_rewards(total_rewards)
    learning_utils.plot_rewards(total_steps)
    print trajectory
    mdp.print_trajectory(trajectory)
    print V
    mdp.print_v(V)
Example 17
 def test_agent(self):
     room_size = 5
     mdp = mdps.MazeMDP(room_size, 1)
     mdp.compute_states()
     mdp.EXIT_REWARD = 1
     mdp.MOVE_REWARD = -0.1
     discount = mdp.get_discount()
     num_actions = len(mdp.get_actions(None))
     network = qnetwork.QNetwork(input_shape=2 * room_size,
                                 batch_size=1,
                                 num_actions=4,
                                 num_hidden=10,
                                 discount=discount,
                                 learning_rate=1e-3,
                                 update_rule='sgd',
                                 freeze_interval=10000,
                                 rng=None)
     p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
     rm = replay_memory.ReplayMemory(1)
     log = logger.NeuralLogger(agent_name='QNetwork')
     adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
         room_size=room_size)
     a = agent.NeuralAgent(network=network,
                           policy=p,
                           replay_memory=rm,
                           logger=log,
                           state_adapter=adapter)
     num_epochs = 2
     epoch_length = 10
     test_epoch_length = 0
     max_steps = 10
     run_tests = False
     e = experiment.Experiment(mdp,
                               a,
                               num_epochs,
                               epoch_length,
                               test_epoch_length,
                               max_steps,
                               run_tests,
                               value_logging=False)
     e.run()
Example 18
 def test_run_with_maze_mdp_and_working_agent_completes(self):
     mdp = mdps.MazeMDP(5, 1)
     num_actions = len(mdp.get_actions(None))
     discount = 1
     exploration_prob = .3
     step_size = 1e-2
     a = agent.QLearningAgent(num_actions=num_actions,
                              discount=discount,
                              exploration_prob=exploration_prob,
                              step_size=step_size,
                              logging=False)
     num_epochs = 1
     epoch_length = 1
     test_epoch_length = 0
     max_steps = 10000
     run_tests = False
     e = experiment.Experiment(mdp, a, num_epochs, epoch_length,
                               test_epoch_length, max_steps, run_tests)
     e.run()
     total_len = len(e.agent.logger.actions)
     self.assertTrue(total_len < max_steps * epoch_length * num_epochs)
Example 19
        def run(learning_rate, freeze_interval, num_hidden, reg, seq_len, eps,
                nt, update):
            room_size = 5
            num_rooms = 2
            input_shape = 2 * room_size
            print 'building mdp...'
            mdp = mdps.MazeMDP(room_size, num_rooms)
            mdp.compute_states()
            mdp.EXIT_REWARD = 1
            mdp.MOVE_REWARD = -0.01
            network_type = nt
            discount = 1
            sequence_length = seq_len
            num_actions = len(mdp.get_actions(None))
            batch_size = 100
            update_rule = update
            print 'building network...'
            network = recurrent_qnetwork.RecurrentQNetwork(
                input_shape=input_shape,
                sequence_length=sequence_length,
                batch_size=batch_size,
                num_actions=4,
                num_hidden=num_hidden,
                discount=discount,
                learning_rate=learning_rate,
                regularization=reg,
                update_rule=update_rule,
                freeze_interval=freeze_interval,
                network_type=network_type,
                rng=None)

            # take this many steps because (very loosely):
            # let l be the step length,
            # let d be the distance between the start and end locations, and
            # let N be the number of steps needed to travel a distance d.
            # For a 2D random walk, N ~ (d/l)^2; with l = 1 this gives N ~ d^2.
            # The desired distance here is a walk along both dimensions of the
            # maze, i.e. 2 * room_size * num_rooms, so squaring that gives a
            # loose estimate of the number of steps needed (ignoring that this
            # is actually a lattice, and ignoring the walls).
            # see: http://mathworld.wolfram.com/RandomWalk2-Dimensional.html
            max_steps = (2 * room_size * num_rooms)**2
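            # For room_size = 5 and num_rooms = 2 (set above) this evaluates to
            # (2 * 5 * 2)**2 = 400 steps per episode.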
            num_epochs = 500
            epoch_length = 1
            test_epoch_length = 0
            epsilon_decay = (num_epochs * epoch_length * max_steps) / 4
            print 'building adapter...'
            adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
                room_size=room_size)
            print 'building policy...'
            p = policy.EpsilonGreedy(num_actions, eps, 0.05, epsilon_decay)
            print 'building replay memory...'
            # want to track at minimum the last 50 episodes
            capacity = max_steps * 50
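            # With max_steps = 400 for these settings, capacity = 400 * 50 = 20000
            # stored transitions.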
            rm = replay_memory.SequenceReplayMemory(
                input_shape=input_shape,
                sequence_length=sequence_length,
                batch_size=batch_size,
                capacity=capacity)
            print 'building logger...'
            log = logger.NeuralLogger(agent_name=network_type)
            print 'building agent...'
            a = agent.RecurrentNeuralAgent(network=network,
                                           policy=p,
                                           replay_memory=rm,
                                           log=log,
                                           state_adapter=adapter)
            run_tests = False
            print 'building experiment...'
            e = experiment.Experiment(mdp,
                                      a,
                                      num_epochs,
                                      epoch_length,
                                      test_epoch_length,
                                      max_steps,
                                      run_tests,
                                      value_logging=True)
            print 'running experiment...'
            e.run()

            ak = file_utils.load_key('../access_key.key')
            sk = file_utils.load_key('../secret_key.key')
            bucket = 'hierarchical9'
            try:
                aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
                aws_util.upload_directory(e.agent.logger.log_dir)
            except Exception as e:
                print 'error uploading to s3: {}'.format(e)
Example 20
def run_nnet():
    mdp = mdps.MazeMDP(room_size=5, num_rooms=2)
    total_rewards, total_losses = simulate_symbolic_online_RL_algorithm(
        mdp=mdp, num_episodes=700, max_iterations=100)
    learning_utils.plot_rewards(total_rewards)
    learning_utils.plot_rewards(total_losses)
Example 21
        def run(learning_rate, freeze_interval, num_hidden, reg):
            room_size = 5
            num_rooms = 2
            mdp = mdps.MazeMDP(room_size, num_rooms)
            mdp.compute_states()
            mdp.EXIT_REWARD = 1
            mdp.MOVE_REWARD = -0.01
            discount = 1
            num_actions = len(mdp.get_actions(None))
            batch_size = 100
            print 'building network...'
            network = qnetwork.QNetwork(input_shape=2 * room_size +
                                        num_rooms**2,
                                        batch_size=batch_size,
                                        num_hidden_layers=2,
                                        num_actions=4,
                                        num_hidden=num_hidden,
                                        discount=discount,
                                        learning_rate=learning_rate,
                                        regularization=reg,
                                        update_rule='adam',
                                        freeze_interval=freeze_interval,
                                        rng=None)
            num_epochs = 50
            epoch_length = 2
            test_epoch_length = 0
            max_steps = 4 * (room_size * num_rooms)**2
            epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5
            print 'building policy...'
            p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
            print 'building memory...'
            rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
            print 'building logger...'
            log = logger.NeuralLogger(agent_name='QNetwork')
            print 'building state adapter...'
            adapter = state_adapters.CoordinatesToRowColRoomAdapter(
                room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
            print 'building agent...'
            a = agent.NeuralAgent(network=network,
                                  policy=p,
                                  replay_memory=rm,
                                  log=log,
                                  state_adapter=adapter)
            run_tests = False
            e = experiment.Experiment(mdp,
                                      a,
                                      num_epochs,
                                      epoch_length,
                                      test_epoch_length,
                                      max_steps,
                                      run_tests,
                                      value_logging=True)
            e.run()

            ak = file_utils.load_key('../access_key.key')
            sk = file_utils.load_key('../secret_key.key')
            bucket = 'hierarchical'
            try:
                aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
                aws_util.upload_directory(e.agent.logger.log_dir)
            except Exception as e:
                print 'error uploading to s3: {}'.format(e)