Example 1
def evaluate(mdp, fqi, initial_states, args):
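    # NOTE: this excerpt relies on names defined at module level in the original
    # script: the `evaluation` module, matplotlib's `plt`, the iteration counter
    # `i`, and the persistent plot handles `fig1`, `ax` and `h`.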
    values = evaluation.evaluate_policy(mdp,
                                        fqi,
                                        initial_states=initial_states)
    iteration_values = list()
    results = list()
    print('J: %f' % values[0])
    iteration_values.append(values[0])
    results.append(values)

    if args.plot:
        if i == 1:
            fig1 = plt.figure(1)
            ax = fig1.add_subplot(1, 1, 1)
            h = ax.plot(range(i + 1), iteration_values, 'ro-')
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.ion()  # turns on interactive mode
            plt.show()
        elif i > 1:
            h[0].set_data(range(i + 1), iteration_values)
            ax.figure.canvas.draw()
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.show()

    return results
Example 2
def terminal_evaluation(old_theta, new_theta, tol_theta=1e-2):
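    # Stopping criterion: once the parameter increment is small enough, evaluate
    # the policy induced by `new_theta` and stop when its return exceeds -67.
    # `increment_base_termination`, `LQG_Q`, `Algorithm`, `state_dim`,
    # `action_dim`, `discrete_actions`, `mdp`, `np` and `evaluation` come from
    # the enclosing module.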
    if increment_base_termination(old_theta, new_theta, 2, tol_theta):
        estimator = LQG_Q()
        estimator.omega = new_theta[0]
        agent = Algorithm(estimator, state_dim, action_dim,
                          discrete_actions, mdp.gamma, mdp.horizon)
        agent._iteration = 1
        initial_states = np.array([[1, 2, 5, 7, 10]]).T
        values = evaluation.evaluate_policy(mdp, agent,
                                            initial_states=initial_states)
        stop = values[0] > -67.
        return stop
    else:
        return False
Example 3
def evaluate(mdp, fqi, initial_states, args):
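    # As in Example 1, `i`, `fig1`, `ax` and `h` are module-level plotting state
    # in the original script.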
    values = evaluation.evaluate_policy(mdp, fqi,
                                        initial_states=initial_states)
    iteration_values = list()
    iteration_values.append(values[0])

    if args.plot:
        if i == 1:
            fig1 = plt.figure(1)
            ax = fig1.add_subplot(1, 1, 1)
            h = ax.plot(range(i + 1), iteration_values, 'ro-')
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.ion()  # turns on interactive mode
            plt.show()
        elif i > 1:
            h[0].set_data(range(i + 1), iteration_values)
            ax.figure.canvas.draw()
            plt.ylim(min(iteration_values), max(iteration_values))
            plt.xlim(0, i + 1)
            plt.show()

    return values
Example 4
fit_params = {}
# fit_params = {
#     "n_epochs": 300,
#     "batch_size": 50,
#     "validation_split": 0.1,
#     "verbosity": False,
#     "criterion": "mse"
# }

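# The first call fits the initial FQI iteration on (sast, r); the loop below
# passes None to perform the remaining iterations.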
fqi.partial_fit(sast, r, **fit_params)

iterations = 20
iteration_values = []
for i in range(iterations - 1):
    fqi.partial_fit(None, None, **fit_params)
    values = evaluation.evaluate_policy(mdp, fqi, initial_states=mdp.initial_states)
    print(values)
    iteration_values.append(values[0])

    if i == 1:
        fig1 = plt.figure(1)
        ax = fig1.add_subplot(1, 1, 1)
        h = ax.plot(range(i + 1), iteration_values, 'ro-')
        plt.ylim(min(iteration_values), max(iteration_values))
        plt.xlim(0, i + 1)
        plt.ion()  # turns on interactive mode
        plt.show()
    elif i > 1:
        h[0].set_data(range(i + 1), iteration_values)
        ax.figure.canvas.draw()
        plt.ylim(min(iteration_values), max(iteration_values))
Example 5
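# NOTE: `split_dataset`, `dataset`, `pbo`, `EPOCH`, `tmetric`, `mdp` and the
# usual imports (`np`, `plt`, `evaluation`) are defined earlier in the
# original script.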
state, actions, reward, next_states, absorbing = split_dataset(
    dataset,
    state_dim=state_dim,
    action_dim=action_dim,
    reward_dim=reward_dim)

theta0 = np.array([6., 10.001], dtype='float32').reshape(1, -1)
# theta0 = np.array([16., 10.001], dtype='float32').reshape(1, -1)
history = pbo.fit(state, actions, next_states, reward, absorbing, theta0,
                  batch_size=10, nb_epoch=EPOCH,
                  theta_metrics={'k': tmetric})

##########################################
# Evaluate the final solution
initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pbo, initial_states=initial_states)
print('Learned theta: {}'.format(pbo.learned_theta_value))
print('Final performance of PBO: {}'.format(values))

##########################################
# Some plot
ks = np.array(history['k']).squeeze()
weights = np.array(history['theta']).squeeze()
print(weights.shape)

plt.figure()
plt.title('[train] evaluated weights')
plt.scatter(weights[:, 0], weights[:, 1], s=50, c=np.arange(weights.shape[0]),
            cmap='viridis', linewidths=0)
plt.xlabel('b')
plt.ylabel('k')
Example 6
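# NOTE: this excerpt begins partway through the call that constructs `pbo`;
# only its trailing keyword arguments are shown. `rho_regressor`, `q_regressor`,
# `state_dim`, `action_dim`, `discrete_actions`, `mdp`, `INCREMENTAL`, `sast`,
# `r`, `np` and `evaluation` come from earlier in the original script.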
          estimator_rho=rho_regressor,
          state_dim=state_dim,
          action_dim=action_dim,
          discrete_actions=discrete_actions,
          gamma=mdp.gamma,
          learning_steps=50,
          batch_size=10,
          learning_rate=1e-1,
          incremental=INCREMENTAL,
          verbose=True)

weights = pbo.fit(sast, r)
##########################################

initial_states = np.array([[1, 2, 5, 7, 10]]).T
values = evaluation.evaluate_policy(mdp, pbo, initial_states=initial_states)

print(values)

from matplotlib import pyplot as plt
weights = np.array(weights)
plt.subplot(1, 3, 1)
plt.title('[train] evaluated weights')
plt.xlabel('b')
plt.ylabel('k')
plt.scatter(weights[:, 1], weights[:, 0], s=50, c=np.arange(weights.shape[0]), cmap='inferno')
plt.colorbar()

best_rhos = pbo._rho_values[-1]
ks = q_regressor._regressor.get_k(np.array(pbo._q_weights_list))
plt.subplot(1, 3, 2)
Example 7
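# NOTE: this excerpt begins inside a Monte-Carlo helper (presumably
# `estimate_qvalue`, called below) that rolls out `policy` on `mdp` and returns
# the mean discounted return together with a ~95% confidence half-width.
# `mdp`, `K`, `S`, `tmp_policy`, `initial_state`, `evaluate` and `np` are
# defined earlier in the original script.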
            s_t = mdp.get_state()
            u_t = policy.draw_action(s_t, False, True)
            returns[j] += df * mdp.step(u_t)[1]
            df *= mdp.gamma
    return returns.mean(), 2 * returns.std() / np.sqrt(n_rep)


##############################################################
# Compute the discounted reward
n_rep = 1000
J = mdp.computeJ(K, S, n_random_x0=n_rep)
print("K", K)
pol = tmp_policy(K, S)
Jsample = []
for i in range(n_rep):
    Jsample.append(evaluate.evaluate_policy(mdp, pol, initial_states=initial_state)[0])
#Jsample /= n_rep
print("J", J, np.mean(Jsample), np.std(Jsample) / np.sqrt(n_rep) * 1.96)

##############################################################
# Compute the q-function
x = np.array([2])
u = np.array([0])
q_val, q_std = estimate_qvalue(mdp, x, u, policy=pol, ep_length=400, n_rep=n_rep)
v = mdp.computeQFunction(x, u, K, S, n_rep)

print("Q", q_val, q_std, v)

##############################################################
# Plot the q-function
xs = np.linspace(-mdp.max_pos, mdp.max_pos, 60)
Example 8
import ifqi.envs as env  # assumed import for the `env` alias used below
from ifqi.algorithms.lspi import LSPI
from ifqi.envs.utils import get_space_info
from ifqi.evaluation import evaluation
from ifqi.evaluation.utils import check_dataset, split_data_for_fqi
from ifqi.models.linear import Linear
from ifqi.models.regressor import Regressor

mdp = env.CarOnHill()
state_dim, action_dim, reward_dim = get_space_info(mdp)
nextstate_idx = state_dim + action_dim + reward_dim
reward_idx = action_dim + state_dim

# dataset: s, a, r, s'
dataset = evaluation.collect_episodes(mdp, n_episodes=500)
check_dataset(dataset, state_dim, action_dim, reward_dim)

regressor_params = dict(features=dict(name='poly', params=dict(degree=5)))
regressor = Regressor(Linear, **regressor_params)
lspi = LSPI(regressor, state_dim, action_dim, mdp.action_space.values,
            mdp.gamma)

sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)

lspi.fit(sast, r)

values = evaluation.evaluate_policy(mdp,
                                    lspi,
                                    initial_states=mdp.initial_states)

print(values)