def plot_actions(dataset_path, qw, index, task, n_actions, save_path):
    # Load the dataset of pre-computed (ETR) action values
    dataset = utils.load_object(dataset_path)
    dataset = np.array(dataset)
    actions_etr = np.zeros((n_actions, 3))
    for i in range(n_actions):
        for j in range(3):
            actions_etr[i, j] = dataset[i, 1][j]

    # Roll out the network's greedy policy and record its action values
    actions_nn = np.zeros((n_actions, 3))
    q = MLPQFunction(task.state_dim, task.action_space.n, layers=layers, initial_params=qw)
    task.starting_day_index = 0
    task.reset()
    actions_counter = 0
    for di in range(task.n_days):
        task.starting_day_index = di
        s = task.reset()
        done = False
        while not done:
            a_list = q.value_actions(s)
            actions_nn[actions_counter, :] = a_list
            a = np.argmax(a_list)
            s, r, done, _ = task.step([a])
            done = done[0]
            actions_counter += 1
            if actions_counter >= n_actions:
                break
            percentage = actions_counter * 100 / n_actions
            if percentage % 10 == 0:
                print("Actions evaluation: {0:3d}%".format(int(percentage)))
        if actions_counter >= n_actions:
            break

    # One panel per action (actions are labeled -1, 0, 1)
    fig, ax = plt.subplots(3, sharex=True, figsize=(16, 9))
    for i in range(3):
        ax[i].plot(actions_etr[:10000, i], label="ETR")
        ax[i].plot(actions_nn[:10000, i], label="NN")
        ax[i].set_title("Action " + str(i - 1))
        ax[i].legend()
    plt.savefig(save_path + '.pdf', format='pdf')
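# Hypothetical standalone call (all argument values are placeholders, not
# names from this file): compare the stored ETR action values against the
# network's over the first 10000 steps of the given task.
# plot_actions("data/dataset.pkl", Q._w, 0, mdp, 10000, "plots/actions-2017")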
def make_Q(weights, task):
    # task params
    state_dim = task.state_dim
    action_dim = 1
    n_actions = task.action_space.n
    return MLPQFunction(state_dim, n_actions, layers=layers, initial_params=weights)
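# Hypothetical usage (past_Qs and save_path are placeholders): rebuild a
# Q-function from a weight vector saved during an earlier run.
# past_Qs = utils.load_object(save_path)
# Q = make_Q(past_Qs[-1], task)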
tasks_data = utils.load_object(tasks_file)

n_eval_episodes = 5

state_dim = temp_mdp.observation_space.shape[0]
action_dim = 1
n_actions = temp_mdp.N_DISCRETE_ACTIONS

# Create BellmanOperator
operator = MellowBellmanOperator(kappa, tau, xi, temp_mdp.gamma, state_dim, action_dim)

# Create Q Function
layers = [l1]
if l2 > 0:
    layers.append(l2)
Q = MLPQFunction(state_dim, n_actions, layers=layers, activation=activation)


def run(seed=None):
    return learn(Q,
                 operator,
                 tasks_data,
                 demand,
                 min_env_flow,
                 max_iter=max_iter,
                 buffer_size=buffer_size,
                 batch_size=batch_size,
                 alpha_adam=alpha_adam,
                 alpha_sgd=alpha_sgd,
                 lambda_=lambda_,
                 n_weights=n_weights,
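# MellowBellmanOperator is repo-local; it presumably builds on the mellowmax
# operator of Asadi & Littman (2017), mm_omega(x) = log(mean(exp(omega * x))) / omega,
# a smooth, non-expansive alternative to the max in the Bellman backup. Below is
# a minimal, numerically stable sketch of mellowmax itself; how kappa, tau, and
# xi map onto omega is an assumption, not taken from the repo.
import numpy as np

def mellowmax(q_values, omega):
    """Smooth maximum over action values; tends to max(q_values) as omega -> inf."""
    q_values = np.asarray(q_values, dtype=float)
    c = q_values.max()  # shift values before exponentiating, for numerical stability
    return c + np.log(np.mean(np.exp(omega * (q_values - c)))) / omega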
def transfer(dataset_path, mdp, save_path, iterations, year, seed=0):
    np.random.seed(seed)
    data = utils.load_object(dataset_path)
    data = np.array(data)
    state_dim = mdp.state_dim
    n_actions = mdp.action_space.n
    mdp.starting_day_index = 0
    mdp.reset()
    day_length = len(mdp.prices[0])

    Q = MLPQFunction(state_dim, n_actions, layers=layers)
    Q.init_weights()

    # Adam optimizer state
    m_t = 0
    v_t = 0
    t = 0

    utils.save_object([], save_path)
    losses = [[], [], []]

    for i in range(iterations):
        # Sample a time of day and build a batch with one datapoint per day
        time = int(np.random.uniform(low=0, high=day_length))
        datapoints = np.arange(0, len(data) - day_length, day_length)
        datapoints += time
        datapoints = data[datapoints]
        np.random.shuffle(datapoints)
        datapoints = datapoints[:batch_size]

        # One gradient step per action
        for a in range(n_actions):
            with torch.autograd.set_detect_anomaly(True):
                train_loss, grad = compute_gradient_single_action(Q, datapoints, a)
            losses[a].append(train_loss)
            print("Y: {0}, I: {1:5d}, Time: {2:4d}, A: {3:1d}, Grad: {4:8.6f}, Train Loss: {5:8.6f}"
                  .format(year, i, time, a, np.linalg.norm(grad), train_loss))
            Q._w, t, m_t, v_t = utils.adam(Q._w, grad, t, m_t, v_t, alpha=alpha)

        # Periodically checkpoint the weights and plot the current policy
        if save_freq > 0 and i % save_freq == 0:
            past_Qs = utils.load_object(save_path)
            past_Qs.append(np.array(Q._w))
            utils.save_object(past_Qs, save_path)
            plot_actions(dataset_path, Q._w, i, mdp, n_actions_plot, path + "/plot-" + year + "-" + str(i))
            print("Model selected index: {0:4d}, Train Loss: [{1:8.6f}, {2:8.6f}, {3:8.6f}]"
                  .format(i, losses[0][i], losses[1][i], losses[2][i]))

    return [mdp.get_info(), np.array(Q._w), losses]
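# A minimal sketch of a stateless Adam step matching the call signature of
# utils.adam above; the repo's actual implementation may differ in details
# such as epsilon placement. The hyperparameter defaults below are the
# standard Adam values, assumed rather than taken from the repo.
import numpy as np

def adam_step(w, grad, t, m_t, v_t, alpha=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8):
    """One Adam update: returns new weights and updated optimizer state."""
    t += 1
    m_t = beta_1 * m_t + (1 - beta_1) * grad       # first-moment estimate
    v_t = beta_2 * v_t + (1 - beta_2) * grad ** 2  # second-moment estimate
    m_hat = m_t / (1 - beta_1 ** t)                # bias correction
    v_hat = v_t / (1 - beta_2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, t, m_t, v_t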
                  for (d1, d2) in zip(doors, doors2)])
# print([(d1, d2) for (d1, d2) in zip(doors, doors2)])

eval_states = [np.array([0., 0.]) for _ in range(10)]

state_dim = mdps[0][0].state_dim
action_dim = 1
n_actions = mdps[0][0].action_space.n
K = n_basis**2

# Create BellmanOperator
operator = MellowBellmanOperator(kappa, tau, xi, mdps[0][0].gamma, K, action_dim)

# Create Q Function
Q = MLPQFunction(K, n_actions, layers=None)

# Create RBFs
rbf = build_features_gw_state(gw_size, n_basis, state_dim)


def run(mdp, seed=None):
    return learn(mdp,
                 Q,
                 operator,
                 max_iter=max_iter,
                 buffer_size=buffer_size,
                 batch_size=batch_size,
                 alpha=alpha,
                 train_freq=train_freq,
                 eval_freq=eval_freq,
                 eps_start=eps_start,
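# build_features_gw_state is repo-local; a plausible sketch of what it could
# compute: K = n_basis**2 Gaussian RBFs with centers on a regular grid over a
# gw_size x gw_size gridworld. The centers, bandwidth, and lack of
# normalization here are assumptions, not taken from the repo.
import numpy as np

def gaussian_rbf_features(gw_size, n_basis):
    centers_1d = np.linspace(0, gw_size, n_basis)
    xx, yy = np.meshgrid(centers_1d, centers_1d)
    centers = np.stack([xx.ravel(), yy.ravel()], axis=1)  # (n_basis**2, 2)
    sigma = gw_size / n_basis                             # assumed bandwidth

    def phi(state):
        # Squared distance from the state to each center -> K features
        d2 = np.sum((state[np.newaxis, :] - centers) ** 2, axis=1)
        return np.exp(-d2 / (2 * sigma ** 2))
    return phi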
n_eval_episodes = 5

state_dim = mdps[0].state_dim
action_dim = 1
n_actions = mdps[0].action_space.n

layers = [l1]
if l2 > 0:
    layers.append(l2)

if not dqn:
    # Create BellmanOperator
    operator = MellowBellmanOperator(kappa, tau, xi, mdps[0].gamma, state_dim, action_dim)
    # Create Q Function
    Q = MLPQFunction(state_dim, n_actions, layers=layers)
else:
    Q, operator = DQN(state_dim, action_dim, n_actions, mdps[0].gamma, layers=layers)


def run(mdp, seed=None):
    return learn(mdp,
                 Q,
                 operator,
                 max_iter=max_iter,
                 buffer_size=buffer_size,
                 batch_size=batch_size,
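# Hypothetical driver (n_runs is a placeholder): repeat learning over several
# seeds for each task, collecting one result per (mdp, seed) pair.
# results = [run(mdp, seed=s) for mdp in mdps for s in range(n_runs)]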