Example #1
    def __init__(
        self, 
        observation_space=500,
        action_space=6, 
        alpha=0.1,
        gamma=0.9,
        epsilon=1.0,
        epsilon_decay=0.9999,
        epsilon_min=0.01
    ):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        """
        self.nA               = action_space
        self.possible_actions = np.arange(self.nA)
        self.epsilon_decay    = epsilon_decay
        self.epsilon          = epsilon
        self.epsilon_min      = epsilon_min
        self.q_table          = QTable(
            observation_space=observation_space,
            action_space=action_space, 
            alpha=alpha, 
            gamma=gamma
        )
Example #2
class Agent:
    def __init__(self,
                 observation_space=500,
                 action_space=6,
                 alpha=0.1,
                 gamma=0.9,
                 epsilon=1.0,
                 epsilon_decay=0.9999,
                 epsilon_min=0.01):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        """
        self.nA = action_space
        self.possible_actions = np.arange(self.nA)
        self.epsilon_decay = epsilon_decay
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.q_table = QTable(observation_space=observation_space,
                              action_space=action_space,
                              alpha=alpha,
                              gamma=gamma)

    def select_action(self, state):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        action_probabilities = self.epsilon_greedy(state)
        return np.random.choice(self.possible_actions, p=action_probabilities)

    def update_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def epsilon_greedy(self, state):
        policy = np.ones(self.nA) * (self.epsilon / self.nA)
        best_action_idx = np.argmax(self.q_table.q(state))
        policy[best_action_idx] = (1 - self.epsilon) + (self.epsilon / self.nA)
        return policy

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        self.q_table.sarsa_max_update(state, action, reward, next_state)
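
A minimal usage sketch for the Agent above, assuming the classic Gym API used elsewhere on this page and a discrete environment such as Taxi-v3 (which matches the 500-state / 6-action defaults); the environment name and episode count are illustrative assumptions, not part of the original example.

import gym

env = gym.make("Taxi-v3")
agent = Agent(observation_space=env.observation_space.n,
              action_space=env.action_space.n)

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    agent.update_epsilon()  # decay exploration once per episode (one reasonable choice)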
Example #3
 def __init__(self, env):
     super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
     self.color = 'red'  # override color
     self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
     self.q_table = QTable(alpha=0.1, gamma=0.1)
     self.q_table_updater = QTableUpdater(self.q_table)
     self.total_actions = 0.0
     self.total_rewards = 0.0
Example #4
 def __init__(self, env, qmodel: QNetwork, amodel: PolicyNetwork, tasks, gamma: float = None, num_learn: int = 10,
              steps_per_episode: int = 1000, scheduler_period: int = 150, num_avg_gradient: int = 10,
              listeners=None, temperature=1):
     if gamma is None:
         gamma = qmodel.gamma
     super().__init__(env, qmodel, amodel, tasks, gamma, num_learn, steps_per_episode, scheduler_period,
                      num_avg_gradient, listeners)
     self.Q = QTable()
     self.M = defaultdict(lambda: 0)
     self.scheduler = self.Q.derive_policy(BoltzmannPolicy, lambda x: self.tasks, temperature=temperature)
Example #5
 def __init__(self, env: FiniteActionEnvironment, gamma: float = 1.0):
     """
     Create a new MonteCarlo Agent
     :param env: The environment the agent will learn from
     :param gamma: Reward discount factor
     """
     super().__init__(env)
     self.q_table = QTable()
     self.visit_count = defaultdict(int)
     self.policy = self.q_table.derive_policy(EpsilonGreedyPolicy,
                                              env.valid_actions_from,
                                              epsilon=self.epsilon)
     self.gamma = gamma
Example #6
class SACQ(SACU):
    def __init__(self, env, qmodel: QNetwork, amodel: PolicyNetwork, tasks, gamma: float = None, num_learn: int = 10,
                 steps_per_episode: int = 1000, scheduler_period: int = 150, num_avg_gradient: int = 10,
                 listeners=None, temperature=1):
        if gamma is None:
            gamma = qmodel.gamma
        super().__init__(env, qmodel, amodel, tasks, gamma, num_learn, steps_per_episode, scheduler_period,
                         num_avg_gradient, listeners)
        self.Q = QTable()
        self.M = defaultdict(lambda: 0)
        self.scheduler = self.Q.derive_policy(BoltzmannPolicy, lambda x: self.tasks, temperature=temperature)


    def train_scheduler(self, tau, Tau):
        main_task = self.tasks[0]
        xi = self.scheduler_period
        main_rewards = [r[main_task] for _, _, r, _ in tau]
        for h in range(len(Tau)):
            R = sum([r * self.gamma**k for k, r in enumerate(main_rewards[h*xi:])])
            self.M[Tau[h]] += 1
            #self.Q[tuple(Tau[:h]), Tau[h]] += (R - self.Q[tuple(Tau[:h]), Tau[h]])/self.M[Tau[h]]

            # We use a Q-table with a 0.1 learning rate to update the values in the table;
            # change 0.1 to the desired learning rate.
            self.Q[tuple(Tau[:h]), Tau[h]] += 0.1 * (R - self.Q[tuple(Tau[:h]), Tau[h]])

    def schedule_task(self, Tau):
        return self.scheduler.sample(tuple(Tau))
Example #7
def run_session(problem, param={}):
    '''run a session of qtable'''
    sys_vars = init_sys_vars(problem, param)  # rl system, see util.py
    env = gym.make(sys_vars['GYM_ENV_NAME'])
    env_spec = get_env_spec(env)
    replay_memory = ReplayMemory(env_spec)
    qtable = QTable(env_spec, **param)

    for epi in range(sys_vars['MAX_EPISODES']):
        sys_vars['epi'] = epi
        run_episode(sys_vars, env, qtable, replay_memory)
        # stop early once the environment reports the task as solved
        if sys_vars['solved']:
            break

    return sys_vars
Example #8
class MonteCarlo(Agent):
    """
        Monte Carlo Agent implementation
    """

    def __init__(self, env: FiniteActionEnvironment, gamma: float = 1.0):
        """
        Create a new MonteCarlo Agent
        :param env: The environment the agent will learn from
        :param gamma: Reward discount factor
        """
        super().__init__(env)
        self.q_table = QTable()
        self.visit_count = defaultdict(int)
        self.policy = self.q_table.derive_policy(EpsilonGreedyPolicy,
                                                 env.valid_actions_from,
                                                 epsilon=self.epsilon)
        self.gamma = gamma

    def learn(self, num_iter=100000) -> EpsilonGreedyPolicy:
        """
        Learn a policy from the environment
        :param num_iter: The number of iterations the algorithm should run
        :return: the derived policy
        """
        Q, N, pi = self.q_table, self.visit_count, self.policy
        for _ in range(num_iter):
            s = self.env.reset()
            e, r = [], 0
            while not s.is_terminal():                          # Execute an episode
                a = pi.sample(s)
                e += [[s, a]]
                s, r = self.env.step(a)
                e[-1] += [r]
            
            for i, (s, a, r) in enumerate(reversed(e)):         # iterate the episode in reverse so the return G accumulates incrementally
                g = r if i == 0 else g * self.gamma + r
                N[s, a] += 1
                N[s] += 1
                Q[s, a] += (1 / N[s, a]) * (g - Q[s, a])
        return pi

    def epsilon(self, s):
        N_0, N = 100, self.visit_count
        return N_0 / (N_0 + N[s])
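
The QTable and EpsilonGreedyPolicy classes used by this agent are not shown on this page. As a rough stand-in, a dict-backed table consistent with the (state, action) indexing and the derive_policy call above could look like the sketch below; the class name and every detail are assumptions for illustration, not the project's actual implementation.

from collections import defaultdict

class DictQTable:
    """Illustrative (state, action) -> value table."""
    def __init__(self):
        self._values = defaultdict(float)

    def __getitem__(self, key):            # key is a (state, action) pair
        return self._values[key]

    def __setitem__(self, key, value):
        self._values[key] = value

    def derive_policy(self, policy_cls, valid_actions_from, **kwargs):
        # hand this table to a policy class; in the example above the epsilon
        # keyword is the agent's visit-count-based epsilon(s) method, i.e. a callable
        return policy_cls(self, valid_actions_from, **kwargs)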
Example #9
class ValueIterator:
    def __init__(self, target_position):
        self.target_position = target_position
        self._tran = Transitions()
        self._rewards = Rewarder(target_position)
        self._q_tab = QTable()
        self._v_tab = VTable()

    def update(self, debug=False):
        for s1 in self.all_states():
            for a in range(len(Config.actions)):
                s2 = self._tran.run(s1, a)
                rew = self._rewards[s1, s2]
                if s2:
                    q = rew + Config.gamma * self._v_tab[s2]
                else:
                    q = rew
                self._q_tab[s1, a] = q

                if debug:
                    pprint_transition(s1, a, s2, rew)

        self._v_tab.update_from_q_table(self._q_tab)

    # noinspection PyMethodMayBeStatic
    def all_states(self):
        for i in range(len(Config.letters)):
            for j in range(len(Config.numbers)):
                if (i, j) == self.target_position:
                    continue
                for o in range(len(Config.orientations)):
                    yield i, j, o

    def path(self, s0):
        a, _ = self._q_tab.get_best_action(s0)
        s1 = self._tran.run(s0, a)
        if not s1:
            raise ValueError("Переход в запрещенное состояние: " + state_to_str(s0) + "-" + action_to_str(a) + "-> None")
        elif (s1[0], s1[1]) == self.target_position:
            return [s0, a, s1]
        return [s0, a] + self.path(s1)
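
The VTable used by this ValueIterator is not shown; the update it needs is the standard value-iteration backup V(s) = max_a Q(s, a). A minimal stand-in, assuming the Q-table behaves like a dict of (state, action) -> value entries, might look like this (names and layout are assumptions, not the project's code):

class SimpleVTable:
    """Illustrative state-value table, not the project's VTable."""
    def __init__(self):
        self._values = {}                   # state -> value

    def __getitem__(self, state):
        return self._values.get(state, 0.0)

    def update_from_q_table(self, q_table):
        # assumes the Q-table exposes dict-like (state, action) -> value items
        best = {}
        for (state, _action), q in q_table.items():
            if state not in best or q > best[state]:
                best[state] = q
        self._values.update(best)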
Example #10
        return 0

    def is_game_over(self, board):
        winner = self.get_who_wins(board)
        if winner == 0:
            if np.sum(np.abs(board)) == 9:
                return True
            else:
                return False
        else:
            return True


from q_table import QTable

table = QTable()
game = SelfPlay()
game.verbose = False
game.epsilon = 0.8
for k in range(50000):
    game.play_game(table)
    print(k)
table.save_q_table_to_file('q_table.npy')

game.verbose = True
game.epsilon = 1
# table.load_q_table_from_file('q_table.npy')
game.play_game(table)

board = np.array(
    [[-1, 1, 0],
Example #11
from appJar import gui
from random import randint
from tictactoe import TTT
from q_table import QTable
import numpy as np

app = gui("TTT", "400x400")
app.setSticky("news")
app.setStretch("both")
app.setFont(40)

game_ttt = TTT()
q_table = QTable()
q_table.load_q_table_from_file("q_table.npy")


def refresh_buttons():
    for x in range(0, 3):
        for y in range(0, 3):
            title = 3 * x + y
            if game_ttt.board[x][y] == 0:
                app.setButton(title, "")
            elif game_ttt.board[x][y] == 1:
                app.setButton(title, "X")
            elif game_ttt.board[x][y] == -1:
                app.setButton(title, "O")


def restart():
    game_ttt.restart()
    refresh_buttons()
Example #12
                break
            step += 1
    print('over')
    env.destroy()


def q_learning_run():
    env.after(100, q_learning_update)
    env.mainloop()


def nn_run():
    env1.after(100, nn_update)
    env1.mainloop()


if __name__ == '__main__':
    env = Maze()
    env1 = Maze_dqn()
    table = QTable(actions=list(range(env.n_actions)))
    nn = DQN(env1.n_actions,
             env1.n_features,
             alpha=0.01,
             gamma=0.9,
             epsilon=0.9,
             replace_target_iter=200,
             memory_size=2000,
             epsilon_increment=0.1)
    nn_run()
    # q_learning_run()
Example #13
EPSILON_DECAY = 25 * EPSILON_MIN / max_num_steps

train_params = TrainingParameters(MAX_NUM_EPISODES, STEPS_PER_EPISODE)
learn_params = LearningParameters(ALPHA, GAMMA)
agent_params = AgentParameters(EPSILON_MIN, EPSILON_DECAY, 1)

# %% [markdown]
# ### Agent

# %%
from agent import Agent
from q_table import QTable
import numpy as np

agent = Agent(agent_params, env.action_space.n)
q_table = QTable(env.observation_space.n, env.action_space.n, learn_params)
agent.set_q_table(q_table)

# %% [markdown]
# ### Training and test functions

# %%
def train(agent: Agent, env, params: TrainingParameters):
    best_reward = -float('inf')
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        done = False
        total_reward = 0.0        
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
Example #14
 def __init__(self, target_position):
     self.target_position = target_position
     self._tran = Transitions()
     self._rewards = Rewarder(target_position)
     self._q_tab = QTable()
     self._v_tab = VTable()
Example #15
 def set_q_table(self, alpha=0.0, gamma=0.0):
     self.q_table = QTable(alpha=alpha, gamma=gamma)
     self.q_table_updater = QTableUpdater(self.q_table)
Example #16
# Testing settings
flags.DEFINE_boolean('run_test', True, 'If the final model should be tested.')
flags.DEFINE_integer('test_runs', 100, 'Number of times to run the test.')
flags.DEFINE_float('test_epsilon', 0.1, 'Epsilon to use on test run.')
flags.DEFINE_integer(
    'test_step_limit', 1000,
    'Limits the number of steps in test to avoid badly performing agents running forever.'
)

settings = flags.FLAGS

# Set up GridWorld
env = GridWorld(settings.field_size, settings.random_seed)
# Set up Q-table
q_table = QTable(settings.field_size, settings.random_seed)

sess = tf.InteractiveSession()
np.random.seed(settings.random_seed)

summary_dir = '../../logs/q-gridworld-fieldsize{}-episodes{}-lr{}/'.format(
    settings.field_size, settings.episodes, settings.learning_rate)
summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
stats = Stats(sess, summary_writer, 3)

episode = 0
epsilon = settings.initial_epsilon

while settings.episodes > episode:
    # Prepare environment for playing
    env.reset()
Example #17
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_table = QTable(alpha=0.1, gamma=0.1)
        self.q_table_updater = QTableUpdater(self.q_table)
        self.total_actions = 0.0
        self.total_rewards = 0.0
        # self.last_occurence_of_punishment = 0.0

    def set_q_table(self, alpha=0.0, gamma=0.0):
        self.q_table = QTable(alpha=alpha, gamma=gamma)
        self.q_table_updater = QTableUpdater(self.q_table)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator

        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = 'light: {}, left: {}, oncoming: {}, next_waypoint: {}'.format(inputs['light'],
                inputs['left'],
                inputs['oncoming'],
                self.next_waypoint)

        # Select action according to your policy
        action = self.q_table.best_action(light=inputs['light'],
                next_waypoint=self.next_waypoint,
                left=inputs['left'],
                oncoming=inputs['oncoming'])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q_table_updater.update(light=inputs['light'],
                next_waypoint=self.next_waypoint,
                left=inputs['left'],
                oncoming=inputs['oncoming'],
                action=action,
                reward=reward)

        self.total_rewards += reward
        self.total_actions += 1.0

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, next_waypoint = {}".format(deadline, inputs, action, reward, self.next_waypoint)  # [debug]

    def __init_q_table(self):
        self.q_table = {}

    def __positions(self):
        positions_list = []
        for i in range(6):
            for j in range(8):
                positions_list.append((i+1,j+1))
        return positions_list
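
For reference, a minimal QTable / QTableUpdater pair consistent with the calls in the LearningAgent above might look like the sketch below. The smartcab project's real classes are not shown here, so the state key, action set, and update rule are assumptions for illustration only; in particular, update() receives no next state, so the sketch omits the discounted bootstrap term and keeps gamma only to mirror the constructor.

import random
from collections import defaultdict

SMARTCAB_ACTIONS = [None, 'forward', 'left', 'right']   # assumed action set

class SimpleQTable:
    def __init__(self, alpha=0.1, gamma=0.1):
        self.alpha = alpha
        self.gamma = gamma                               # unused in this sketch
        self.values = defaultdict(float)                 # (state, action) -> Q-value

    @staticmethod
    def state_key(light, next_waypoint, left, oncoming):
        return (light, next_waypoint, left, oncoming)

    def best_action(self, **state):
        key = self.state_key(**state)
        best_q = max(self.values[(key, a)] for a in SMARTCAB_ACTIONS)
        # break ties randomly so unexplored actions still get tried
        return random.choice([a for a in SMARTCAB_ACTIONS
                              if self.values[(key, a)] == best_q])

class SimpleQTableUpdater:
    def __init__(self, q_table):
        self.q_table = q_table

    def update(self, light, next_waypoint, left, oncoming, action, reward):
        key = self.q_table.state_key(light, next_waypoint, left, oncoming)
        old = self.q_table.values[(key, action)]
        # move the estimate toward the observed reward at rate alpha
        self.q_table.values[(key, action)] = old + self.q_table.alpha * (reward - old)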
Example #18
from q_state import next_state, random_state, actions
from q_learning import QLearning
from q_table import QTable

if __name__ == "__main__":
    episode = 100
    model_save_interval = 10

    table = QTable(actions)
    learning = QLearning(table)

    for step in range(episode):
        init_state = random_state()
        i = 0
        reward = 0
        while reward != 1:
            state = init_state
            while True:
                i += 1
                action = learning.choose_action(state)
                state2, reward, done = next_state(state, action, table)
                learning.learn(state, action, reward, state2, done)
                if done:
                    break
                state = state2
        print(init_state, i, len(table.q_table))
        if (step + 1) % model_save_interval == 0:
            table.save()
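
The QLearning class imported above is not shown on this page. For reference, the tabular update such a class typically performs is Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)); a minimal self-contained version is sketched below, with alpha, gamma, epsilon, and the dictionary layout all assumptions rather than the project's actual code.

import random
from collections import defaultdict

class SimpleQLearning:
    """Minimal tabular Q-learning, for illustration only."""
    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.actions = list(actions)
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q = defaultdict(float)            # (state, action) -> value

    def choose_action(self, state):
        if random.random() < self.epsilon:     # explore
            return random.choice(self.actions)
        # exploit: the action with the highest Q-value in this state
        return max(self.actions, key=lambda a: self.q[(state, a)])

    def learn(self, state, action, reward, next_state, done):
        best_next = 0.0 if done else max(self.q[(next_state, a)] for a in self.actions)
        target = reward + self.gamma * best_next
        self.q[(state, action)] += self.alpha * (target - self.q[(state, action)])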