def run_q_agent(policy='ε–greedy', save=False):
    """Runs a Q-learning agent on the environment with the given action-selection policy."""
    agent = Q_Agent()
    all_iterations, all_rewards, step_count = agent.train(env, iter_n=1000, policy=policy)
    plot_reward(all_iterations, all_rewards)
    plot_steps(all_iterations, step_count)
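# For reference, Q_Agent presumably applies the standard tabular Q-learning update while it
# trains; the function below is a minimal sketch of that rule, not the project's own
# implementation.  It assumes the table is a NumPy array indexed as q_table[state, action];
# the default alpha/gamma mirror the settings used in the robot scripts further below.
def q_learning_update(q_table, state, action, reward, next_state, alpha=0.2, gamma=0.9):
    """One temporal-difference update: Q[s,a] += alpha * (r + gamma * max_a' Q[s',a'] - Q[s,a])."""
    td_target = reward + gamma * np.max(q_table[next_state])
    q_table[state, action] += alpha * (td_target - q_table[state, action])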
def random_search():
    """Random search to determine a starting point for the model and the best hyperparameters."""
    gamma = 0.7
    alpha = 0.3
    epsilon = 1
    exploration_rate_decay = 0.87
    max_tries = 10
    best_score = -1000
    best_gamma, best_alpha, best_epsilon, best_decay = gamma, alpha, epsilon, exploration_rate_decay
    scores = {}
    for attempt in range(max_tries):
        agent = Q_Agent(epsilon=1, alpha=alpha, gamma=gamma,
                        exploration_rate_decay=exploration_rate_decay)
        _, rewards, steps = agent.train(env, iter_n=300, policy='ε–greedy', print_results=False)
        print(np.mean(rewards))
        scores[attempt] = np.mean(rewards)
        print("Score: {}, gamma {}, alpha {}, epsilon {}, e_decay_rate {}".format(
            scores[attempt], gamma, alpha, epsilon, exploration_rate_decay))
        if scores[attempt] > best_score:
            best_score = scores[attempt]
            print(best_score)
            best_gamma = gamma
            best_alpha = alpha
            best_epsilon = epsilon
            best_decay = exploration_rate_decay
        # Perturb the best settings found so far and clip them back into range.
        gamma = best_gamma + (np.random.randint(-1, 2) / 10)
        gamma = min(1, gamma)
        gamma = max(0, gamma)
        alpha = best_alpha + (np.random.randint(-1, 2) / 10)
        alpha = min(1, alpha)
        alpha = max(0, alpha)
        epsilon = 1
        exploration_rate_decay = best_decay + np.random.randint(-1, 2) / 100
        exploration_rate_decay = min(0.99, exploration_rate_decay)
        exploration_rate_decay = max(0.7, exploration_rate_decay)
    print("Best score:", best_score)
    print("Best settings:")
    print("best gamma:", best_gamma)
    print("best alpha:", best_alpha)
    print("best epsilon:", best_epsilon)
    print("best decay:", best_decay)
def grid_search_param(environment, policy='ε–greedy', parameter='alpha'):
    """Grid search for alpha or gamma, selectable via the parameter field."""
    parameter_values = []
    avg_scores = []
    avg_steps = []
    count = 1
    for param_num in np.linspace(0.2, 1, 9):
        if parameter == 'alpha':
            agent = Q_Agent(alpha=param_num)
        elif parameter == 'gamma':
            agent = Q_Agent(gamma=param_num)
        all_iterations, all_rewards, step_count = agent.train(
            environment, print_results=True, iter_n=1000, policy=policy)
        avg_scores.append(np.mean(all_rewards))
        avg_steps.append(np.mean(step_count))
        parameter_values.append(param_num)
        rewards_data = np.array([all_iterations, all_rewards])
        step_data = np.array([all_iterations, step_count])
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' + parameter + '_inv/'
                   + parameter + '_rewards_' + str(param_num) + '.csv',
                   rewards_data.transpose(), delimiter=",")
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' + parameter + '_inv/'
                   + parameter + '_steps_' + str(param_num) + '.csv',
                   step_data.transpose(), delimiter=",")
        print('iteration {} of 9'.format(count))
        count += 1
    results = {
        'param_values': parameter_values,
        'avg_scores': avg_scores,
        'avg_steps': avg_steps,
    }
    print(results)
    return pd.DataFrame(results)
def grid_search_epsilon(environment, policy='ε–greedy', parameter='epsilon'):
    """Grid search over exploration-rate decay values (epsilon itself starts at 1)."""
    parameter_values = []
    avg_scores = []
    avg_steps = []
    count = 1
    decay_search = [0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.99]
    for param_num in decay_search:
        agent = Q_Agent(exploration_rate_decay=param_num, epsilon=1)
        all_iterations, all_rewards, step_count = agent.train(
            environment, print_results=True, iter_n=1000, policy=policy)
        avg_scores.append(np.mean(all_rewards))
        avg_steps.append(np.mean(step_count))
        parameter_values.append(param_num)
        rewards_data = np.array([all_iterations, all_rewards])
        step_data = np.array([all_iterations, step_count])
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' + parameter + '_inv/'
                   + parameter + '_rewards_' + str(param_num) + '.csv',
                   rewards_data.transpose(), delimiter=",")
        np.savetxt('/Users/matthewgalloway/Documents/RF/q_learning/' + parameter + '_inv/'
                   + parameter + '_steps_' + str(param_num) + '.csv',
                   step_data.transpose(), delimiter=",")
        print('iteration {} of {}'.format(count, len(decay_search)))
        count += 1
    results = {
        'param_values': parameter_values,
        'avg_scores': avg_scores,
        'avg_steps': avg_steps,
    }
    print(results)
    return pd.DataFrame(results)
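# A minimal usage sketch, not part of the original scripts: it assumes an initialised
# environment is in scope (as `env` is for run_q_agent above) and that pandas is imported
# as pd.  It runs the three searches and prints the setting with the highest average reward
# from each, using the 'param_values' / 'avg_scores' columns returned above.
def run_all_searches(environment):
    for name, frame in [('alpha', grid_search_param(environment, parameter='alpha')),
                        ('gamma', grid_search_param(environment, parameter='gamma')),
                        ('decay', grid_search_epsilon(environment))]:
        best = frame.sort_values('avg_scores', ascending=False).iloc[0]
        print(name, '- best value:', best['param_values'], 'avg reward:', best['avg_scores'])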
from pybricks.tools import print
# Assumes the pybricks v1.0 EV3 API, which provides brick.sound used below.
from pybricks import ev3brick as brick

import utils_motor
import random
import time

# Play a beep sound
brick.sound.beep()
print('Should display on VisualStudio')

seedling = int(round(time.time()))
random.seed(seedling)

# Initialize environment.
# If we invert the reward during training, the robot should change direction.
env = CrawlingRobotEnv(step_angle=45, invert_reward=True)
current_state = env.reset()
agent = Q_Agent(env, gamma=0.9, alpha=0.2)

# Do the right sequence (1,5,2,4,0)
# 0: LEG NEUTRAL
# 1: LEG UP
# 2: LEG DOWN
# 3: FEET NEUTRAL
# 4: FEET UP
# 5: FEET DOWN
print('Distance:', env.read_sensor())

# Backward
# list_actions = [1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5, 1, 4, 2, 5, 0, 3]

# Forward
list_actions = [
    1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4,
    1, 5, 2, 4, 1, 5, 2, 4, 1, 5, 2, 4,
    0, 3,
]
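# Hedged sketch, not from the original file: one way to replay the hard-coded gait is to
# step the environment through each action index in turn.  It assumes env.step() returns
# (next_state, reward, done, info) and that env.action_idx_to_str() exists, as both are
# used by the training loop later in this section.
for action in list_actions:
    next_state, reward, done, info = env.step(action)
    print('action:', env.action_idx_to_str(action), 'distance:', env.read_sensor())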
# Bigger values decay faster
e_greedy_decay = 1.0 / num_iterations_train

# Initial agent action probability (just try things at random)
initial_e_greedy_prob = 1.0

# Number of iterations before checking statistics of the reward
num_steps_eval = num_iterations_train // 10

if __name__ == '__main__':
    # Initialize environment
    env = CrawlingRobotEnv(invert_reward=True, run_on_lego=running_on_lego, step_angle=40)
    current_state = env.reset()
    agent = Q_Agent(env, gamma=0.9, alpha=0.2,
                    e_greedy_prob=initial_e_greedy_prob, e_greedy_decay=e_greedy_decay)
    print(agent.q_val_table)

    # Train
    sum_rewards = 0
    sum_rewards_vec = []
    for steps in range(num_iterations_train):
        action = agent.choose_action(current_state)
        current_state_str = str(env)
        next_state, reward, done, info = env.step(action)
        next_state_str = env.state_idx_to_str(next_state)
        action_str = env.action_idx_to_str(action)
        agent.update_q_table(current_state, action, reward, next_state)
        print('steps:', steps, '\n\tcurrent_state:', current_state_str,