decimal_state=parameters["decimal_state"], decimal_action=parameters["decimal_action"], step_state=parameters["step_state"], step_action=parameters["step_action"], episodes=parameters["episodes"], max_steps=parameters["max_steps"], epsilon=parameters["epsilon"], alpha=parameters["alpha"], gamma=parameters["gamma"]) lr_learner.train() rs = [] ss = [] for j in range(parameters["n_greedy_episodes"]): r, s = lr_learner.run_greedy(parameters["n_greedy_steps"]) rs.append(r) ss.append(s) r = np.mean(rs) s = np.mean(ss) steps.append(lr_learner.greedy_steps) rewards.append(lr_learner.greedy_r) final_mean_reward.append(r) saver.save_to_pickle("results/exp_1_lr_learning_steps.pickle", steps) saver.save_to_pickle("results/exp_1_lr_learning_rewards.pickle", rewards) saver.save_to_pickle("results/exp_1_lr_learning_final_reward.pickle", final_mean_reward)
# --- DQN baseline: train a two-layer MLP Q-network, then persist greedy stats.
alpha = 0.001  # optimizer learning rate (hard-coded; the rest of the config comes from `parameters`)

model = Sequential()
model.add(Dense(parameters["hidden_size"],
                input_dim=env.observation_space.shape[0],
                activation='tanh'))
model.add(Dense(env.action_space.n, activation='linear'))
# FIX: `lr` is deprecated (and removed in recent Keras releases); the
# supported keyword for Adam is `learning_rate`.
model.compile(loss='mse', optimizer=Adam(learning_rate=alpha))

dqn_learner = Dqn(model=model,
                  buffer=Buffer(parameters["buffer_size"]),
                  env=env,
                  gamma=parameters["gamma"],
                  epsilon=parameters["epsilon"],
                  decayment_rate=parameters["decayment_rate"],
                  episodes=parameters["episodes"],
                  max_steps=parameters["max_steps"],
                  batch_size=parameters["batch_size"])
dqn_learner.train()

# Collect greedy-policy rewards/steps and save them as numpy arrays.
# NOTE(review): the append-then-np.array sequence suggests the appends sat
# inside a loop in the original file; indentation was lost in this view.
rewards.append(dqn_learner.rewards_greedy)
steps.append(dqn_learner.steps_greedy)
rewards = np.array(rewards)
steps = np.array(steps)
saver.save_to_pickle("results/dqn_rewards.pck", rewards)
saver.save_to_pickle("results/dqn_steps.pck", steps)
# Trailing kwargs of the tabular Q-learning constructor started before this chunk.
max_steps=max_steps, epsilon=epsilon, alpha=alpha_q, gamma=gamma)

# Low-rank learner configured with the same discretization maps and training
# schedule as the Q-learner above, plus the factorization hyper-parameters
# (rank k and the regularization weights lambda_l / lambda_r).
lr_learner = LowRankLearning(env=env, state_map=state_map, action_map=action_map,
                             state_reverse_map=state_reverse_map,
                             action_reverse_map=action_reverse_map,
                             n_states=n_states, n_actions=n_actions,
                             decimal_state=decimal, decimal_action=decimal,
                             step_state=step, step_action=step,
                             episodes=episodes, max_steps=max_steps,
                             epsilon=epsilon, alpha=alpha_lr, gamma=gamma,
                             k=k, lambda_l=lambda_l, lambda_r=lambda_r)

# Train both agents and pickle the trained objects for later inspection.
q_learner.train()
lr_learner.train()
saver.save_to_pickle("results/q_learner_example.pickle", q_learner)
saver.save_to_pickle("results/low_rank_learner_example.pickle", lr_learner)
# One Monte-Carlo batch for the current `epsilon`: run `n_simulations`
# independent low-rank trainings, smooth each step curve, and aggregate.
medians_temp = []
standard_devs_temp = []
frob_errors_temp = []
for _ in range(parameters["n_simulations"]):
    lr_learner = LowRankLearning(env=env,
                                 episodes=parameters["episodes"],
                                 max_steps=parameters["max_steps"],
                                 epsilon=epsilon,
                                 gamma=parameters["gamma"],
                                 k=parameters["k"],
                                 tol_convergence=parameters["tol_convergence"],
                                 tol_approx=parameters["tol_approx"])
    lr_learner.train(check_optimality=True, reference=Q_optimal)
    median, standard_dev = test_utils.smooth_signal(signal=lr_learner.steps, w=100)
    medians_temp.append(median)
    standard_devs_temp.append(standard_dev)
    frob_errors_temp.append(lr_learner.frobenius_error)

# Aggregate across simulations with the median (robust to outlier runs),
# keyed by the epsilon value of this batch.
medians[str(epsilon)] = np.median(medians_temp, axis=0)
standard_devs[str(epsilon)] = np.median(standard_devs_temp, axis=0)
frob_errors[str(epsilon)] = np.median(frob_errors_temp, axis=0)

# NOTE(review): `epsilon` comes from an enclosing loop not visible in this
# chunk; the saves below likely sit after that loop — indentation was lost
# in this view, confirm against the full source.
saver.save_to_pickle("results/lr_learning_medians.pickle", medians)
saver.save_to_pickle("results/lr_learning_stds.pickle", standard_devs)
saver.save_to_pickle("results/lr_learning_frob_errors.pickle", frob_errors)
"""Train a tabular Q-learner and pickle its Q-table as the reference."""
from utils import QLearning, Saver, get_env

env = get_env()
saver = Saver()

# Long training run; the resulting Q-table is saved under
# results/Q_optimal.pickle for reuse by the other experiments.
q_learner = QLearning(env=env,
                      episodes=10000,
                      max_steps=100,
                      epsilon=0.4,
                      alpha=0.9,
                      gamma=0.9)
q_learner.train()

saver.save_to_pickle("results/Q_optimal.pickle", q_learner.Q)