Example #1
def main(argv):
    del argv
    gw = Gridworld(10, 10, 0, 99)
    #for i in range(7):
    #    gw.grid[7][i] = 1
    agent_module = importlib.import_module("agents." + FLAGS.agent)

    avg_num_steps = np.zeros(FLAGS.num_episodes)
    policy = getattr(policies.tabular_policies, FLAGS.policy)

    for _ in range(FLAGS.num_trials):
        agent = agent_module.Agent(FLAGS.agent, gw.width * gw.height,
                                   FLAGS.gamma, policy, FLAGS.alpha)
        steps_per_episode = []

        for _ in range(FLAGS.num_episodes):
            state = gw.start
            action = agent.select_action(state)
            step = 0
            terminate = False
            while step < FLAGS.max_steps and not terminate:
                next_state = gw.apply_action(state, action)
                terminate, reward = gw.is_goal(next_state)
                next_action = agent.select_action(next_state)
                agent.update(state, action, reward, next_state, next_action)
                state = next_state
                action = next_action
                step += 1
            steps_per_episode.append(step)

        avg_num_steps += np.array(steps_per_episode)

    avg_num_steps = avg_num_steps / FLAGS.num_trials
    plt.plot(avg_num_steps)
    plt.show()
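
The loop above is the standard on-policy SARSA control pattern: the next action is chosen before the update, and agent.update receives the full (state, action, reward, next state, next action) transition. A minimal sketch of the tabular agent such a loop assumes (the real agents.<FLAGS.agent>.Agent is not shown here, so the class and parameter names below are illustrative):

import numpy as np


class TabularSarsaAgent:
    """Illustrative sketch only; not the Agent class imported above."""

    def __init__(self, num_states, num_actions, gamma, alpha, epsilon=0.1):
        self.q = np.zeros((num_states, num_actions))
        self.gamma, self.alpha, self.epsilon = gamma, alpha, epsilon

    def select_action(self, state):
        # epsilon-greedy over the current Q estimates
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.q.shape[1])
        return int(np.argmax(self.q[state]))

    def update(self, state, action, reward, next_state, next_action):
        # on-policy TD target: bootstrap from the action actually selected next
        td_target = reward + self.gamma * self.q[next_state, next_action]
        self.q[state, action] += self.alpha * (td_target - self.q[state, action])

Because the target bootstraps from the action the policy actually takes next, the estimate tracks the behaviour policy itself rather than the greedy policy, which is what distinguishes SARSA from Q-learning.
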
Example #2
    def __init__(self, params_file):
        """ Initializes an experiment.

        """
        self.params = self.get_parameter(params_file)
        self.exp_dir = self.set_exp_dir()
        _logger = self.set_logger()
        _logger.info("Initializing new experiment of type %s" %
                     str(self.params['type']))
        _logger.info("Loading parameters from %s" % str(params_file))
        _logger.info("Saving logs in %s" % str(self.exp_dir))
        # self.set_status('Initializing')
        # copy parameter source
        helper.copy_file(params_file, os.path.join(self.exp_dir,
                                                   'params.yaml'))
        # Mersenne Twister pseudo-random number generator
        self.rng = np.random.RandomState(self.params['random_seed'])
        # set environment
        self.env = Gridworld(grid=os.path.join(os.getcwd(), 'maps',
                                               self.params['grid']),
                             max_steps=self.params['max_steps'],
                             visual=self.params['visual'],
                             rng=self.rng)
        self.current_task = 'None'
        self.current_run = 0
        self.current_episode = 0
        self.exp_steps = 0
Example #3
def main(argv):
    del argv
    gw = Gridworld(10, 10, 0, 80)
    for i in range(7):
        gw.grid[7][i] = 1
    agent_module = importlib.import_module("agents." + FLAGS.agent)

    avg_num_steps = np.zeros(FLAGS.num_episodes)
    policy = getattr(policies.tabular_policies, FLAGS.policy)

    for _ in range(FLAGS.num_trials):
        agent = agent_module.Agent(FLAGS.agent, gw.width * gw.height, FLAGS.n,
                                   FLAGS.gamma, policy, FLAGS.alpha)
        steps_per_episode = []

        for _ in range(FLAGS.num_episodes):
            T = np.inf
            state = gw.start
            agent.reset_agent()
            agent.stored_states.append(state)
            action = agent.select_action(state)
            agent.stored_actions.append(action)
            step = 0
            tau = 0
            while tau != T - 1:
                if step < T:
                    next_state = gw.apply_action(state, action)
                    terminate, reward = gw.is_goal(next_state)
                    agent.stored_states.append(next_state)
                    agent.stored_rewards.append(reward)
                    if terminate or step == FLAGS.max_steps - 1:
                        T = step + 1
                    else:
                        next_action = agent.select_action(next_state)
                        agent.stored_actions.append(next_action)
                        state = next_state
                        action = next_action
                tau = step - agent.n + 1
                if tau >= 0:
                    agent.update(tau, T)
                step += 1

            steps_per_episode.append(step)

        avg_num_steps += np.array(steps_per_episode)

    avg_num_steps = avg_num_steps / FLAGS.num_trials
    np.save(FLAGS.log_path + "/" + FLAGS.log_file, avg_num_steps)
    plt.plot(avg_num_steps)
    plt.savefig(FLAGS.log_path + "/" + FLAGS.log_file)
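
Example #3 stores states, actions and rewards in buffers and defers the learning step to agent.update(tau, T), which follows the n-step SARSA scheme (Sutton and Barto, chapter 7). A minimal sketch of the update that call presumably performs, assuming rewards[i] holds the reward received after taking actions[i] in states[i]:

def n_step_sarsa_update(q, states, actions, rewards, tau, T, n, gamma, alpha):
    """Illustrative n-step SARSA update for time step tau; the real
    agent.update above may index its buffers differently."""
    # n-step return: discounted rewards from tau+1 up to min(tau+n, T) ...
    G = sum(gamma ** (i - tau) * rewards[i]
            for i in range(tau, min(tau + n, T)))
    # ... plus the bootstrapped value of the state-action pair n steps ahead
    if tau + n < T:
        G += gamma ** n * q[states[tau + n], actions[tau + n]]
    q[states[tau], actions[tau]] += alpha * (G - q[states[tau], actions[tau]])
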
Example #4
def get_pi_eval():
    alpha = 0.1
    env = Gridworld()
    agent = QAgent(25, 4, 1, alpha)
    episode = 0
    while True:
        env.reset()
        agent.new_episode()
        G = 0
        s = env.get_state()

        t = 0
        while True:
            a = agent.get_action(s)
            _, r, _ = env.step(a)
            G += r

            if env.terminal():
                agent.train_inf(s, a, r)
                break

            s_prime = env.get_state()
            agent.train(s, a, r, s_prime)
            s = s_prime

            t += 1
        episode += 1
        print("episode=", episode, " G", G)
        if episode > 800:
            break
    return agent.get_pi()
Example #5
    def __init__(self, nrows=8, ncols=8, **kwargs):

        # we have to make some assertions to make sure that we can fit
        # a lake in the middle of the gridworld
        assert nrows > 2 and ncols > 2
        self.nstates = nrows * ncols

        self.lake = []
        for x in range(self.nstates):
            if (0 < (x % ncols) < ncols - 1) and (ncols < x < (nrows - 1) * ncols):
                self.lake.append(x)

        Gridworld.__init__(self, nrows=nrows, ncols=ncols, **kwargs)
Example #6
def getCliffGrid():
    grid = [[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
            [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
            ['S', -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
             'TERMINAL_STATE']]
    return Gridworld(grid)
def iteration_example():
    grid = Gridworld.negative_grid()
    print('grid:')
    print_values(grid.rewards, grid)
    values, policy = policy_iteration(grid, 0.9)
    print('values:')
    print_values(values, grid)
    print('policy:')
    print_policy(policy, grid)
def q_example():
    grid = Gridworld.negative_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    values, policy, deltas = gradient_q(grid)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
Example #9
def gridworld():
    height = 5
    width = 5
    state_A = (0, 1)
    state_B = (0, 3)
    from_A = (4, 1)
    from_B = (2, 3)
    reward_leaving_state_A = 10
    reward_leaving_state_B = 5
    gamma = 0.9
    return Gridworld(height, width, state_A, state_B, from_A, from_B,
                     reward_leaving_state_A, reward_leaving_state_B, gamma)
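
This is the classic 5x5 gridworld from Sutton and Barto (example 3.5): every action taken in state A yields +10 and teleports the agent to from_A, every action in B yields +5 and teleports to from_B, and moves that would leave the grid keep the agent in place. Assuming the Gridworld class above implements those dynamics with reward -1 for off-grid attempts and 0 for ordinary moves, a minimal value-iteration sketch that solves it looks like this (the function is illustrative and not part of the class):

import numpy as np


def value_iteration_sketch(height=5, width=5, gamma=0.9, tol=1e-6):
    A, B, from_A, from_B = (0, 1), (0, 3), (4, 1), (2, 3)
    moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]
    V = np.zeros((height, width))
    while True:
        V_new = np.empty_like(V)
        for r in range(height):
            for c in range(width):
                q_values = []
                for dr, dc in moves:
                    if (r, c) == A:
                        reward, nxt = 10, from_A
                    elif (r, c) == B:
                        reward, nxt = 5, from_B
                    elif 0 <= r + dr < height and 0 <= c + dc < width:
                        reward, nxt = 0, (r + dr, c + dc)
                    else:
                        reward, nxt = -1, (r, c)
                    # Bellman optimality backup for this action
                    q_values.append(reward + gamma * V[nxt])
                V_new[r, c] = max(q_values)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new
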
Example #10
def ex_4_5(size=None):
    """
  Testing policy evaluation and policy iteration on gridworld using Q values.
  """
    if size is None:
        size = DEF_EX_4_4_SIZE
    env = Gridworld(size)
    pi_rand = random_policy(env)
    pi_init = {(a, s): pi_rand(s, a) for s in env.states for a in env.moves}
    alg = DynamicProgramming(env, pi=pi_init, theta=1e-4, gamma=1)
    alg.policy_iteration_Q()
    alg.print_policy()
def example_1():
    """
	Example 1: Obains the solution for a given infinite gridworld problem using
	value iteration and policy iteration.

	"""
    # Initialises all required inputs for the Gridworld. In this example,
    # actions list contains up, down, left and right, and the reward dict is
    # stored in the form of key: (coord, action index), value: (reward, new coord).
    name = 'base_problem'
    size = (5, 5)
    actions_list = [[-1, 0], [1, 0], [0, -1], [0, 1]]
    reward_dict = {
        ((0, 1), 1): (10, (4, 1)),
        ((0, 2), 0): (3, (0, 2)),
        ((0, 2), 1): (3, (0, 2)),
        ((0, 2), 2): (3, (0, 2)),
        ((0, 2), 3): (3, (0, 2)),
        ((1, 1), 3): (4, (1, 4))
    }
    discount = 0.9

    # Initialises the Gridworld.
    gridworld = Gridworld(name, size, actions_list, reward_dict, discount)

    # Obtains the optimum value estimate and policy from the Gridworld using
    # value iteration.
    gridworld.obtain_optimum('value_iteration')

    # Obtains the optimum value estimate and policy from the Gridworld using
    # policy iteration.
    gridworld.obtain_optimum('policy_iteration')
Example #12
def ex_4_4(size=None):
    """
  Testing a policy iteration that stops when policy encountered twice on
  environment where all policies are equally bad (gridworld with cost of move
  equal to zero).
  """
    if size is None:
        size = DEF_EX_4_4_SIZE
    env = Gridworld(size, cost_move=0)
    det_pi = {s: env.moves[0] for s in env.states}
    alg = DynamicProgramming(env, det_pi=det_pi, theta=1e-7, gamma=1)
    # comment/uncomment to compare the improved and the plain policy iteration
    alg.policy_iteration_improved()
Example #13
def q_example():
    grid = Gridworld.negative_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    values, policy, deltas, count_updates = q_learning(grid)
    plt.plot(deltas)
    plt.show()
    print('Updates:')
    print_values(count_updates, grid)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
Example #14
def fig_4_1(size=None):
    if size is None:
        size = DEF_FIG_4_1_SIZE
    env = Gridworld(size)
    pi_rand = random_policy(env)
    pi_init = {(a, s): pi_rand(s, a) for s in env.states for a in env.moves}
    alg = DynamicProgramming(env, pi=pi_init, theta=1e-4,
                             gamma=1)  # undiscounted
    alg.policy_evaluation()
    alg.print_values()
    # show the optimal policy
    while not alg.policy_improvement():
        pass
    alg.print_policy()
Example #15
def simple_example():
    grid = Gridworld.negative_grid(-0.9)
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(POSSIBLE_ACTIONS)

    values, deltas, policy = policy_iteration(grid, 2000, policy)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
Example #16
def simple_example():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    values = first_visit_monte_carlo(grid, 100, policy)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
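
first_visit_monte_carlo above presumably estimates state values by averaging first-visit returns collected while following the fixed policy. A minimal sketch, written against a generic trajectory format rather than the Gridworld API (the (state, reward) layout is an assumption):

def first_visit_mc_sketch(episodes, gamma=0.9):
    """Illustrative only; episodes is an iterable of trajectories, each a list
    of (state, reward) pairs where reward follows the action taken in state."""
    returns = {}  # state -> list of first-visit returns
    values = {}   # state -> running average of those returns
    for episode in episodes:
        G = 0.0
        states = [s for s, _ in episode]
        # walk the episode backwards, accumulating the discounted return
        for t in range(len(episode) - 1, -1, -1):
            state, reward = episode[t]
            G = gamma * G + reward
            # record G only at the first visit of the state in this episode
            if state not in states[:t]:
                returns.setdefault(state, []).append(G)
                values[state] = sum(returns[state]) / len(returns[state])
    return values
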
Example #17
def simple_example():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    values = td_zero(grid, policy)
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
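
td_zero above presumably performs tabular TD(0) prediction for the same fixed policy. A minimal sketch of that update, again written against a generic trajectory format rather than the Gridworld API (the tuple layout is an assumption):

def td_zero_sketch(episodes, alpha=0.1, gamma=0.9):
    """Illustrative only; episodes is an iterable of trajectories, each a list
    of (state, reward, next_state, done) transitions under the fixed policy."""
    V = {}
    for episode in episodes:
        for state, reward, next_state, done in episode:
            v_s = V.get(state, 0.0)
            # terminal states have value 0 by definition
            v_next = 0.0 if done else V.get(next_state, 0.0)
            # TD(0): move V(s) toward the one-step bootstrapped target
            V[state] = v_s + alpha * (reward + gamma * v_next - v_s)
    return V
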
def simple_example():
    grid = Gridworld.default_grid()
    values_uniform = policy_evaluation(grid, 1)
    print('values for uniformly random actions:')
    print_values(values_uniform, grid)
    print('\n\n')

    fixed_policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'R',
        (2, 1): 'R',
        (2, 2): 'R',
        (2, 3): 'U',
    }
    print_policy(fixed_policy, grid)
    fixed_values = policy_evaluation(grid, 0.9, fixed_policy)
    print('Values for fixed policy:')
    print_values(fixed_values, grid)
Example #19
def mc_prediction():
    grid = Gridworld.default_grid()
    print('Rewards:')
    print_values(grid.rewards, grid)
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'L',
        (2, 2): 'U',
        (2, 3): 'L',
    }
    values, deltas = approx_monte_carlo(grid, policy)
    plt.plot(deltas)
    plt.show()
    print('Values:')
    print_values(values, grid)
    print('Policy:')
    print_policy(policy, grid)
Example #20
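            # soft update: blend the online-network weights into the target network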
            target_weights[i] = (self.tau * weights[i] +
                                 (1 - self.tau) * target_weights[i])

        self.target_model.set_weights(target_weights)

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            self.learn(state, action, reward, next_state, done)


if __name__ == '__main__':
    world = Gridworld(3, 3, goal_position=(3, 3), traps=[(2, 2)])

    agent = DQNAgent(2, 4)
    ANGLES = [math.pi / 2, 0, -math.pi / 2, math.pi]

    episodes = 500
    batch_size = 32
    memory_length = 20
    last_results = deque(maxlen=memory_length)

    for e in range(episodes):
        state = world.reset()
        state = np.expand_dims(state, 0)

        for time_step in range(500):
            action = agent.act(state)
Example #21
class Experiment(object):
    """ This is the base class for all experiment implementations.

    The experiment organizes all objects and directs the training in a given
    scenario.

    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, params_file):
        """ Initializes an experiment.

        """
        self.params = self.get_parameter(params_file)
        self.exp_dir = self.set_exp_dir()
        _logger = self.set_logger()
        _logger.info("Initializing new experiment of type %s" %
                     str(self.params['type']))
        _logger.info("Loading parameters from %s" % str(params_file))
        _logger.info("Saving logs in %s" % str(self.exp_dir))
        # self.set_status('Initializing')
        # copy parameter source
        helper.copy_file(params_file, os.path.join(self.exp_dir,
                                                   'params.yaml'))
        # Mersenne Twister pseudo-random number generator
        self.rng = np.random.RandomState(self.params['random_seed'])
        # set environment
        self.env = Gridworld(grid=os.path.join(os.getcwd(), 'maps',
                                               self.params['grid']),
                             max_steps=self.params['max_steps'],
                             visual=self.params['visual'],
                             rng=self.rng)
        self.current_task = 'None'
        self.current_run = 0
        self.current_episode = 0
        self.exp_steps = 0

    def get_parameter(self, file_name):
        path_to_file = os.path.join(os.getcwd(), file_name)
        with open(path_to_file, 'r') as ymlfile:
            params = yaml.load(ymlfile, Loader=yaml.Loader)
        return params

    def set_exp_dir(self):
        folder = "%s_%s_%s" % (time.strftime("%Y-%m-%d_%H-%M"),
                               str(self.params['type']).lower(),
                               str(self.params['grid']).lower())
        path_to_dir = os.path.join(os.getcwd(), 'logs', folder)
        return helper.create_dir(path_to_dir)

    def set_logger(self):
        # make sure no loggers are already active
        try:
            logging.root.handlers.pop()
        except IndexError:
            # if no logger exist the list will be empty and we need
            # to catch the resulting error
            pass
        if self.params['log_type'] == 'stdout':
            logging.basicConfig(level=getattr(logging,
                                              self.params['log_level'], None),
                                stream=sys.stdout,
                                format='[%(asctime)s][%(levelname)s]'
                                '[%(module)s][%(funcName)s] '
                                '%(message)s')
        else:
            logging.basicConfig(level=getattr(logging,
                                              self.params['log_level'], None),
                                format='[%(asctime)s][%(levelname)s]'
                                '[%(module)s][%(funcName)s] '
                                '%(message)s',
                                filename=os.path.join(self.exp_dir,
                                                      'experiment.log'),
                                filemode='w')
        return logging.getLogger(__name__)

    def set_status(self, status):
        self.status = status
        _logger.debug("[T:%s,R:%s,E:%s] %s" %
                      (str(self.current_task['name']), str(self.current_run),
                       str(self.current_episode), str(self.status)))

    def init_episode(self):
        self.steps_in_episode = 0
        self.reward_in_episode = 0
        self._init_episode()

    @abc.abstractmethod
    def _init_episode(self):
        pass

    def cleanup_episode(self):
        self._cleanup_episode()
        if self.status == 'training':
            if self.learner.epsilon > self.params['epsilon_limit']:
                self.learner.set_epsilon(self.learner.epsilon +
                                         self.learner.epsilon_change)

    @abc.abstractmethod
    def _cleanup_episode(self):
        pass

    def init_run(self):
        _logger.info("..... Starting run %s" % str(self.current_run))
        run_dir = os.path.join(self.task_dir, 'run_' + str(self.current_run))
        self.run_dir = helper.create_dir(run_dir)
        # Create run stats file: run_stats.csv
        self.run_stats_file = os.path.join(self.run_dir, 'stats_run.csv')
        self.run_steps = 0
        helper.write_stats_file(self.run_stats_file, 'episode', 'steps_total',
                                'steps_mean', 'reward_total', 'reward_mean',
                                'epsilon', 'step_count')
        self._init_run()

    @abc.abstractmethod
    def _init_run(self):
        pass

    def cleanup_run(self):
        self.save_best_episode()
        helper.delete_dirs(self.run_dir)
        helper.plot_run(self.run_dir)
        self._cleanup_run()
        _logger.info("..... Finished run %s" % str(self.current_run))

    @abc.abstractmethod
    def _cleanup_run(self):
        pass

    def init_task(self):
        _logger.info("##### Starting task %s" % str(self.current_task['name']))
        task_dir = os.path.join(self.exp_dir,
                                'task_' + self.current_task['name'])
        self.task_dir = helper.create_dir(task_dir)
        self._init_task()

    @abc.abstractmethod
    def _init_task(self):
        pass

    def cleanup_task(self):
        helper.plot_runs(self.task_dir)
        helper.summarize_runs_results(self.task_dir)
        helper.plot_task(self.task_dir)
        self.save_best_run()
        self._cleanup_task()
        # self.set_status('idle')
        _logger.info("##### Finished task %s" % str(self.current_task['name']))

    @abc.abstractmethod
    def _cleanup_task(self):
        pass

    # def evaluate_current_library(self):
    #    pass

    def get_action_id(self, state, policy_name):
        return self._get_action_id(state, policy_name)

    @abc.abstractmethod
    def _get_action_id(self, state, policy_name):
        pass

    @abc.abstractmethod
    def _specific_updates(self, policy_name):
        pass

    def write_test_results(self):
        helper.write_stats_file(
            self.run_stats_file, self.current_episode, sum(self.test_steps),
            np.mean(self.test_steps), sum(self.test_rewards),
            np.mean(self.test_rewards),
            float("{0:.5f}".format(self.learner.last_epsilon)), self.run_steps)
        self._write_test_results()

    @abc.abstractmethod
    def _write_test_results(self):
        pass

    def run_tests(self):
        self.learner.set_epsilon(0.0)
        self.episode_dir = os.path.join(self.run_dir,
                                        'episode_' + str(self.current_episode))
        self.episode_dir = helper.create_dir(self.episode_dir)
        self.test_steps = []
        self.test_rewards = []
        for test_pos in self.params['test_positions']:
            self.init_episode()
            self.run_episode(test_pos, tuple(self.current_task['goal_pos']),
                             self.current_task['name'])
            self.test_steps.append(self.steps_in_episode)
            self.test_rewards.append(self.reward_in_episode)
        self.write_test_results()
        self.learner.save_Qs(os.path.join(self.episode_dir, 'Qs.npy'))
        # Make video from random position
        if self.params['visual']:
            self.set_status('recording')
            self.init_episode()
            self.run_episode(
                self.env.get_random_state(tuple(
                    self.current_task['goal_pos'])),
                tuple(self.current_task['goal_pos']),
                self.current_task['name'])
        self.learner.set_epsilon(self.learner.last_epsilon)

    def run_episode(self, agent_pos, goal_pos, policy_name=None):
        """
            Function to run a single episode.
        """
        if self.status == 'training':
            _logger.debug("Start episode")
        self.env.reset_env()
        self.env.add_agent(agent_pos, self.agent_name)
        self.env.add_goal(goal_pos)
        if self.status == 'recording':
            self.env.draw_frame()
            self.env.save_current_frame(self.episode_dir)
        state = self.env.get_current_state(self.agent_name)
        action_id = self.get_action_id(state, policy_name)
        reward = self.env.step(self.env.actions[action_id], self.agent_name)
        state_prime = self.env.get_current_state(self.agent_name)
        if self.status in ['training', 'policy_eval']:
            self.run_steps += 1
        if self.status == 'recording':
            self.env.draw_frame()
            self.env.save_current_frame(self.episode_dir)
        self.steps_in_episode += 1
        self.reward_in_episode += reward
        if self.status == 'training' and not self.env.episode_ended:
            self.learner.update_Q(state[0:2], action_id, reward,
                                  state_prime[0:2])
        self._specific_updates(policy_name)
        while not self.env.episode_ended:
            state = state_prime
            action_id = self.get_action_id(state, policy_name)
            reward = self.env.step(self.env.actions[action_id],
                                   self.agent_name)
            state_prime = self.env.get_current_state(self.agent_name)
            if self.status in ['training', 'policy_eval']:
                self.run_steps += 1
            if self.status == 'recording':
                self.env.draw_frame()
                self.env.save_current_frame(self.episode_dir)
            # if self.status in ['testing', 'policy_eval']:
            self.steps_in_episode += 1
            self.reward_in_episode += reward
            if self.status == 'training':
                self.learner.update_Q(state[0:2], action_id, reward,
                                      state_prime[0:2])
            if self.env.step_count >= self.env.max_steps:
                self.env.episode_ended = True
            self._specific_updates(policy_name)
        if self.status == 'training':
            _logger.debug("End episode")
        if self.env.visual and self.status == 'recording':
            self.env.make_video(self.episode_dir)

    def save_best_episode(self):
        df = pd.read_csv(os.path.join(self.run_dir, 'stats_run.csv'))
        least_steps_row = df.loc[df['steps_mean'].idxmin()]
        run_best_file = os.path.join(self.run_dir, 'stats_run_best.csv')
        headers = ['run']
        content = [int(self.current_run)]
        for column in df:
            headers.append(str(column))
            content.append(least_steps_row[column])
        helper.write_stats_file(run_best_file, headers)
        helper.write_stats_file(run_best_file, content)
        helper.copy_file(
            os.path.join(self.run_dir,
                         'episode_' + str(int(least_steps_row['episode'])),
                         'Qs.npy'), os.path.join(self.run_dir, 'best_Qs.npy'))

    def save_best_run(self):
        # Save best Q-table for current task
        df = pd.read_csv(
            os.path.join(self.task_dir, 'run_' + str(1), 'stats_run_best.csv'))
        # gather the best-episode stats of every run into a single frame
        for i in range(2, self.params['runs'] + 1):
            run_df = pd.read_csv(
                os.path.join(self.task_dir, 'run_' + str(i),
                             'stats_run_best.csv'))
            df = pd.concat([df, run_df], ignore_index=True)
        least_steps_row = df.loc[df['steps_mean'].idxmin()]
        task_best_file = os.path.join(self.task_dir, 'stats_task_best.csv')
        headers = ['task']
        content = [str(self.current_task['name'])]
        for column in df:
            headers.append(str(column))
            content.append(least_steps_row[column])
        helper.write_stats_file(task_best_file, headers)
        helper.write_stats_file(task_best_file, content)
        helper.copy_file(
            os.path.join(self.task_dir,
                         'run_' + str(int(least_steps_row['run'])),
                         'best_Qs.npy'),
            os.path.join(self.task_dir, 'best_Qs.npy'))

    def main(self):
        for task in self.params['tasks']:
            self.current_task = task
            self.init_task()
            for run in range(1, self.params['runs'] + 1):
                self.current_run = run
                self.current_episode = 0
                self.current_policy = self.current_task['name']
                self.init_run()
                self.set_status('testing')
                self.run_tests()
                self.set_status('training')
                for episode in range(1, self.params['episodes'] + 1):
                    self.current_episode = episode
                    self.init_episode()
                    self.run_episode(
                        self.env.get_random_state(
                            tuple(self.current_task['goal_pos'])),
                        tuple(self.current_task['goal_pos']),
                        self.current_policy)
                    self.cleanup_episode()
                    if episode % self.params['test_interval'] == 0:
                        self.set_status('testing')
                        self.run_tests()
                        self.set_status('training')
                self.cleanup_run()
            self.cleanup_task()
        _logger.info("Done")
Example #22
def getBookGrid():
    grid = [[' ', ' ', ' ', +1], [' ', '#', ' ', -1], ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
Example #23
def getSimpleGrid():
    grid = [[' ', ' ', +1], [' ', ' ', -1], ['S', ' ', ' ']]
    return Gridworld(grid)
Example #24
def getBridgeGrid():
    grid = [['#', -100, -100, -100, -100, -100, '#'],
            [1, 'S', ' ', ' ', ' ', ' ', 10],
            ['#', -100, -100, -100, -100, -100, '#']]
    return Gridworld(grid)
Example #25
def getDiscountGrid():
    grid = [[' ', ' ', ' ', ' ', ' '], [' ', '#', ' ', ' ', ' '],
            [' ', '#', 1, '#', 10], ['S', ' ', ' ', ' ', ' '],
            [-10, -10, -10, -10, -10]]
    return Gridworld(grid)
Example #26
def getCliffGrid2():
    grid = [[' ', ' ', ' ', ' ', ' '], [8, 'S', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(grid)
Example #27
def getCliffGrid():
    grid = [[' ', ' ', ' ', ' ', ' '], ['S', ' ', ' ', ' ', 10],
            [-100, -100, -100, -100, -100]]
    return Gridworld(makeGrid(grid))
Example #28
def getMyGrid():
    "Add your own grid definition here."
    grid = [[' ', ' ', +1, ' ', ' ', ' '], [' ', ' ', 'S', ' ', ' ', ' '],
            [-1, -1, -1, '#', ' ', ' '], [+30, ' ', ' ', ' ', ' ', +20]]
    return Gridworld(grid)
Example #29
def getMazeGrid():
    grid = [[' ', ' ', ' ', +1], ['#', '#', ' ', '#'], [' ', '#', ' ', ' '],
            [' ', '#', '#', ' '], ['S', ' ', ' ', ' ']]
    return Gridworld(grid)
Example #30
    def learn(self, old_state, new_state, action, reward):
        old_val = self.q_table[old_state][action]
        next_value = np.max(self.q_table[new_state])
        # print(old_state, action, reward, new_state)
        new_q_value = self.compute_new_q_value(old_val, reward, next_value)

        self.q_table[old_state][action] = new_q_value

    def print_values(self):
        height, width, _ = self.q_table.shape

        for r in range(1, height - 1):
            for c in range(1, width - 1):
                for a_id, a in enumerate(self.actions):
                    print("q(s{}{}, {}) = {:.3f}".format(
                        r, c, a, self.q_table[r, c, a_id]))
                print()


if __name__ == '__main__':
    from gridworld import Gridworld

    env = Gridworld(5, 5, goal_position=(1, 3), traps=[(2, 1)])
    env.render()

    agent = SARSA(env)
    agent.train(episodes=1000)

    agent.print_values()
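
The learn method above takes the maximum Q value of the new state as next_value, so compute_new_q_value presumably applies the standard Q-learning update Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)). A one-line sketch (the alpha and gamma defaults are placeholders):

def compute_new_q_value_sketch(old_value, reward, next_value, alpha=0.1, gamma=0.99):
    # next_value is already the max over actions in the new state
    return old_value + alpha * (reward + gamma * next_value - old_value)
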
Example #31
from gridworld import Gridworld
import pygame as pg
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from vicero.algorithms.deepqlearning import DQN

scale = 24
env = Gridworld(scale, width=8, height=8)

pg.init()
screen = pg.display.set_mode(
    (scale * len(env.board[0]), scale * len(env.board)))
env.screen = screen
clock = pg.time.Clock()
"""
while True:
    env.step(env.action_space.sample())
    env.render()
"""


def plot(history):
    plt.figure(2)
    plt.clf()
    durations_t = torch.DoubleTensor(history)
    plt.title('Training...')