Example #1
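    # Excerpt opens mid-call: these are the remaining keyword arguments of what
    # is apparently a LowRankLearning constructor bound to lr_learner
    # (see Example #3 for the full signature).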
                                 decimal_state=parameters["decimal_state"],
                                 decimal_action=parameters["decimal_action"],
                                 step_state=parameters["step_state"],
                                 step_action=parameters["step_action"],
                                 episodes=parameters["episodes"],
                                 max_steps=parameters["max_steps"],
                                 epsilon=parameters["epsilon"],
                                 alpha=parameters["alpha"],
                                 gamma=parameters["gamma"])

    lr_learner.train()

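    # Evaluate the trained policy: average reward and steps over several greedy rollouts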
    rs = []
    ss = []

    for j in range(parameters["n_greedy_episodes"]):
        r, s = lr_learner.run_greedy(parameters["n_greedy_steps"])
        rs.append(r)
        ss.append(s)
    r = np.mean(rs)
    s = np.mean(ss)

    steps.append(lr_learner.greedy_steps)
    rewards.append(lr_learner.greedy_r)
    final_mean_reward.append(r)

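# Persist the learning curves and the final mean greedy reward for experiment 1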
saver.save_to_pickle("results/exp_1_lr_learning_steps.pickle", steps)
saver.save_to_pickle("results/exp_1_lr_learning_rewards.pickle", rewards)
saver.save_to_pickle("results/exp_1_lr_learning_final_reward.pickle",
                     final_mean_reward)
Example #2
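    # Excerpt from inside a training script: alpha is the Adam learning rate
    # used to compile the Q-network below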
    alpha = 0.001

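    # Q-network: one tanh hidden layer, linear output with one unit per action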
    model = Sequential()
    model.add(
        Dense(parameters["hidden_size"],
              input_dim=env.observation_space.shape[0],
              activation='tanh'))
    model.add(Dense(env.action_space.n, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(lr=alpha))  # note: newer Keras versions expect learning_rate= instead of lr=

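    # Wrap the network in a DQN agent backed by an experience-replay Buffer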
    dqn_learner = Dqn(model=model,
                      buffer=Buffer(parameters["buffer_size"]),
                      env=env,
                      gamma=parameters["gamma"],
                      epsilon=parameters["epsilon"],
                      decayment_rate=parameters["decayment_rate"],
                      episodes=parameters["episodes"],
                      max_steps=parameters["max_steps"],
                      batch_size=parameters["batch_size"])

    dqn_learner.train()

    rewards.append(dqn_learner.rewards_greedy)
    steps.append(dqn_learner.steps_greedy)

rewards = np.array(rewards)
steps = np.array(steps)

saver.save_to_pickle("results/dqn_rewards.pck", rewards)
saver.save_to_pickle("results/dqn_steps.pck", steps)
Example #3
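# Excerpt opens mid-call: the closing keyword arguments of what is apparently
# a QLearning constructor bound to q_learner (see Example #5 for the full signature).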
                      max_steps=max_steps,
                      epsilon=epsilon,
                      alpha=alpha_q,
                      gamma=gamma)

lr_learner = LowRankLearning(env=env,
                             state_map=state_map,
                             action_map=action_map,
                             state_reverse_map=state_reverse_map,
                             action_reverse_map=action_reverse_map,
                             n_states=n_states,
                             n_actions=n_actions,
                             decimal_state=decimal,
                             decimal_action=decimal,
                             step_state=step,
                             step_action=step,
                             episodes=episodes,
                             max_steps=max_steps,
                             epsilon=epsilon,
                             alpha=alpha_lr,
                             gamma=gamma,
                             k=k,
                             lambda_l=lambda_l,
                             lambda_r=lambda_r)

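# Train both agents on the same environment and pickle the fitted learners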
q_learner.train()
lr_learner.train()

saver.save_to_pickle("results/q_learner_example.pickle", q_learner)
saver.save_to_pickle("results/low_rank_learner_example.pickle", lr_learner)
Example #4
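    # Excerpt from inside a loop over epsilon values: per-simulation statistics
    # are collected here and aggregated per epsilon below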
    medians_temp = []
    standard_devs_temp = []
    frob_errors_temp = []

    for i in range(parameters["n_simulations"]):
        lr_learner = LowRankLearning(
            env=env,
            episodes=parameters["episodes"],
            max_steps=parameters["max_steps"],
            epsilon=epsilon,
            gamma=parameters["gamma"],
            k=parameters["k"],
            tol_convergence=parameters["tol_convergence"],
            tol_approx=parameters["tol_approx"])

        lr_learner.train(check_optimality=True, reference=Q_optimal)

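        # Smooth the per-episode step counts (w is presumably the window length)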
        median, standard_dev = test_utils.smooth_signal(
            signal=lr_learner.steps, w=100)

        medians_temp.append(median)
        standard_devs_temp.append(standard_dev)
        frob_errors_temp.append(lr_learner.frobenius_error)

    medians[str(epsilon)] = np.median(medians_temp, axis=0)
    standard_devs[str(epsilon)] = np.median(standard_devs_temp, axis=0)
    frob_errors[str(epsilon)] = np.median(frob_errors_temp, axis=0)

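# Persist the per-epsilon medians, dispersions, and Frobenius errors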
saver.save_to_pickle("results/lr_learning_medians.pickle", medians)
saver.save_to_pickle("results/lr_learning_stds.pickle", standard_devs)
saver.save_to_pickle("results/lr_learning_frob_errors.pickle", frob_errors)
Example #5
from utils import QLearning, Saver, get_env

env = get_env()
saver = Saver()

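# Tabular Q-learning baseline: epsilon-greedy exploration (epsilon=0.4),
# learning rate 0.9, discount factor 0.9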
q_learner = QLearning(env=env,
                      episodes=10000,
                      max_steps=100,
                      epsilon=.4,
                      alpha=.9,
                      gamma=.9)

q_learner.train()
saver.save_to_pickle("results/Q_optimal.pickle", q_learner.Q)