def test_list_num_iters(self):
    lc = LearningCurve(self.fpath)
    lc.parse()
    x = lc.list('NumIters')
    dx = np.diff(x)
    assert_true(np.all(dx > 0))

def test_list_invalid_key(self):
    lc = LearningCurve(self.fpath)
    lc.parse()
    assert_raises(KeyError, lc.list, 'wrong-key', phase=Phase.TRAIN)
    assert_raises(KeyError, lc.list, 'wrong-key', phase=Phase.TEST)
    assert_raises(KeyError, lc.list, 'accuracy', phase=Phase.TRAIN)

def test_list_loss_acc(self):
    lc = LearningCurve(self.fpath)
    lc.parse()
    loss = lc.list('loss')
    acc = lc.list('accuracy')
    assert_equal(loss.shape, acc.shape)
    assert_false(np.all(loss == acc))

def test_keys_parsed(self):
    lc = LearningCurve(self.fpath)
    train_keys, test_keys = lc.parse()
    assert_list_equal(train_keys,
                      ['NumIters', 'Seconds', 'LearningRate', 'loss'])
    assert_list_equal(
        test_keys,
        ['NumIters', 'Seconds', 'LearningRate', 'accuracy', 'loss'])

def test_list(self):
    lc = LearningCurve(self.fpath)
    lc.parse()
    x = lc.list('NumIters')
    assert_greater(x.size, 0)
    loss = lc.list('loss')
    assert_equal(x.shape, loss.shape)
    acc = lc.list('accuracy')
    assert_equal(x.shape, acc.shape)

def test_name(self):
    lc = LearningCurve(self.fpath)
    assert_is_instance(lc.name(), str)
    assert_greater(len(lc.name()), 0, 'name is empty')
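# A minimal usage sketch of the LearningCurve log parser exercised by the
# tests above, inferred from the assertions only. 'train.log' is a
# hypothetical log path; parse() returning (train_keys, test_keys) and the
# list(key, phase=...) / name() calls are assumptions taken from the tests,
# not a confirmed public API.
lc = LearningCurve('train.log')
train_keys, test_keys = lc.parse()
iters = lc.list('NumIters', phase=Phase.TEST)
acc = lc.list('accuracy', phase=Phase.TEST)
print(f'{lc.name()}: final accuracy {acc[-1]:.4f} at iter {int(iters[-1])}')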
env.reset()
# Take one random step to get the pole and cart moving
obs, rew, done, _ = env.step(env.action_space.sample())
reward = reward_func(obs, rew)
state = state_func(obs)

memory = Memory(max_size=memory_size)

if gym_env_name == 'CartPole-v0':
    max_util = 200
else:
    max_util = 500
curve = LearningCurve(plots=[('utility', 'left', 'r'),
                             ('epsilon', 'right', 'b')],
                      episode_range=1000,
                      min_y_left=0,
                      max_y_left=max_util)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Make a random action
    action = env.action_space.sample()
    obs, rew, done, _ = env.step(action)
    reward = reward_func(obs, rew)
    next_state = state_func(obs)

    if done:
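# The Memory class used above is not shown in this snippet. A minimal sketch
# of what it is assumed to provide, based on the common deque-backed replay
# buffer pattern: a bounded FIFO of experience tuples with uniform random
# sampling. The add/sample method names are assumptions.
from collections import deque

import numpy as np

class Memory:
    def __init__(self, max_size=1000):
        # Oldest experiences are dropped automatically once max_size is reached
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is e.g. a (state, action, reward, next_state) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform random minibatch without replacement, for experience replay
        idx = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]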
def q_learning(self, gamma=0.9, alpha=0.3, episodes=100, max_steps=50,
               fps=30, epsilon_0=-1.0, plot=False):
    ''' Q learning
    q: change view (state values or q values)
    s: change speed (slow or fast)
    e: Explore or not
    '''
    if plot:
        l_curve = LearningCurve(min_y=-1.5, max_y=1.5)
    self.state_values, self.state_q_values = self.init_values()
    flag_q = False
    flag_fast = False
    flag_exit = False
    flag_explore = True
    episode = 0
    while True:  # for episode in range(episodes):
        if flag_explore:
            if epsilon_0 >= 0.0:
                epsilon = epsilon_0
            else:
                # Exponential epsilon decay over episodes
                epsilon = np.exp(-episode / (episodes / 5))
        else:
            epsilon = 0.0
        state = random.choice(self.states)
        done = False
        explore = False
        action = ''
        utility = 0.0
        reward = 0.0
        for step in range(max_steps):  # while(True):
            self.r_draw_background()
            if not flag_q:
                self.r_draw_values()
            else:
                self.r_draw_q_values()
            self.r_draw_agent(state)
            self.r_draw_reward(reward, utility, done)
            # self.r_draw_rl_metrics(f'{episode+1}/{episodes}', epsilon, action, explore)
            self.r_draw_rl_metrics(episode + 1, epsilon, action, explore)
            pygame.display.flip()

            if flag_fast:
                key = self.tick_key(fps)
            else:
                key = self.tick_key(1)
            if key == pygame.K_q:
                flag_q = not flag_q
            elif key == pygame.K_s:
                flag_fast = not flag_fast
            elif key == pygame.K_e:
                flag_explore = not flag_explore
            elif key == pygame.K_ESCAPE:
                flag_exit = True
                break

            if done:
                break

            # Epsilon-greedy action selection
            if np.random.uniform() < epsilon:
                explore = True
                action = random.choice(self.allowed_actions[state])
            else:
                explore = False
                action = self.policy[state]
            new_state, reward, _, done = self.step(state, action)

            # Q-learning update: Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * sample
            if done:
                sample = reward
            else:
                sample = reward + gamma * self.max_val(
                    self.state_q_values[new_state])
            self.state_q_values[state][action] = (
                (1 - alpha) * self.state_q_values[state][action]
                + alpha * sample)
            # Greedy policy and state value follow from the updated Q values
            self.policy[state], self.state_values[state] = self.key_max(
                self.state_q_values[state])
            utility += (gamma**step) * reward
            state = new_state
        if plot:
            l_curve.add_sample(episode, utility)
        if flag_exit:
            break
        episode += 1
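# The core rule inside q_learning above, stripped of the pygame rendering: a
# minimal sketch of the tabular Q-learning update it implements,
# Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')).
# q_values is a hypothetical stand-in for self.state_q_values, assumed to map
# each state to a dict of action -> value, as the code above suggests.
def q_update(q_values, state, action, reward, new_state, done,
             alpha=0.3, gamma=0.9):
    if done:
        sample = reward  # terminal transition: no bootstrapped future value
    else:
        sample = reward + gamma * max(q_values[new_state].values())
    q_values[state][action] = (
        (1 - alpha) * q_values[state][action] + alpha * sample)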