def day_pass(k, v, d):
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = d
    s = task.reset()
    rewards = np.zeros((len(task.prices[0])))
    actions = np.zeros((len(task.prices[0])))
    state_value_list = []
    done = False
    while not done:
        a_list = Q._q_values(s)
        state_value_list.append([s[0], a_list])
        a = np.argmax(a_list)
        s, r, done, _ = task.step([a])
        r = r[0]
        done = done[0]
        actions[task.current_timestep] = a - 1  # [0, 2] -> [-1, 1]
        rewards[task.current_timestep] = r
    print("{0:s} - Day: {1:4d}, Cumulative reward: {2:8.6f}".format(
        k, d, np.sum(rewards)))
    return (d, rewards, actions, state_value_list)
def year_pass(k, v):
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = 0
    task.reset()
    num_days = task.n_days

    if n_jobs == 1:
        outputs = [day_pass(k, v, d) for d in range(num_days)]
    elif n_jobs > 1:
        outputs = Parallel(n_jobs=n_jobs, max_nbytes=None)(
            delayed(day_pass)(k, v, d) for d in range(num_days))

    days = []
    actions = np.zeros((num_days, len(task.prices[0])))
    rewards = np.zeros((num_days, len(task.prices[0])))
    state_value_list = []
    for (d, r, a, svl) in outputs:
        days.append(d)
        rewards[d, :] = r
        actions[d, :] = a
        state_value_list.extend(svl)

    print("Days:", len(days))
    print("Rewards sum:", np.sum(rewards))
    print("State values list length:", len(state_value_list))

    utils.save_object(state_value_list, save_dataset_path + k)
    utils.save_object([days, actions, rewards], save_actions_path + k)
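# Usage sketch (illustrative, not part of the original script): assuming an
# `etrs` dict mapping run names to {"policy": <pickle name>, "task": <gym env>}
# entries and the module globals used above (etr_path, n_jobs,
# save_dataset_path, save_actions_path), a full evaluation run would simply
# loop over its entries. The dict key below is a hypothetical placeholder.
#
# if __name__ == "__main__":
#     for k, v in etrs.items():
#         year_pass(k, v)  # runs day_pass on every day and saves the outputs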
def plot_actions(dataset_path, qw, index, task, n_actions, save_path):
    dataset = utils.load_object(dataset_path)
    dataset = np.array(dataset)

    actions_etr = np.zeros((n_actions, 3))
    for i in range(n_actions):
        for j in range(3):
            actions_etr[i, j] = dataset[i, 1][j]

    actions_nn = np.zeros((n_actions, 3))
    q = MLPQFunction(task.state_dim, task.action_space.n, layers=layers,
                     initial_params=qw)
    task.starting_day_index = 0
    task.reset()
    actions_counter = 0
    for di in range(task.n_days):
        task.starting_day_index = di
        s = task.reset()
        done = False
        while not done:
            a_list = q.value_actions(s)
            actions_nn[actions_counter, :] = a_list
            a = np.argmax(a_list)
            s, r, done, _ = task.step([a])
            done = done[0]
            actions_counter += 1
            if actions_counter >= n_actions:
                break
            percentage = actions_counter * 100 / n_actions
            if percentage % 10 == 0:
                print("Actions evaluation: {0:3d}%".format(int(percentage)))
        if actions_counter >= n_actions:
            break

    fig, ax = plt.subplots(3, sharex=True, figsize=(16, 9))
    for i in range(3):
        ax[i].plot(actions_etr[:10000, i], label="ETR")
        ax[i].plot(actions_nn[:10000, i], label="NN")
        ax[i].set_title("Action " + str(i - 1))
        ax[i].legend()
    plt.savefig(save_path + '.pdf', format='pdf')
def transfer(dataset_path, mdp, save_path, iterations, year, seed=0):
    np.random.seed(seed)

    data = utils.load_object(dataset_path)
    data = np.array(data)

    state_dim = mdp.state_dim
    n_actions = mdp.action_space.n
    mdp.starting_day_index = 0
    mdp.reset()
    day_length = len(mdp.prices[0])

    Q = MLPQFunction(state_dim, n_actions, layers=layers)
    Q.init_weights()

    m_t = 0
    v_t = 0
    t = 0

    utils.save_object([], save_path)
    losses = [[], [], []]
    for i in range(iterations):
        # Sample time of day
        time = int(np.random.uniform(low=0, high=day_length))
        datapoints = np.arange(0, len(data) - day_length, day_length)
        datapoints += time
        datapoints = data[datapoints]
        np.random.shuffle(datapoints)
        datapoints = datapoints[:batch_size]

        for a in range(n_actions):
            with torch.autograd.set_detect_anomaly(True):
                train_loss, grad = compute_gradient_single_action(
                    Q, datapoints, a)
            losses[a].append(train_loss)
            print("Y: {0}, I: {1:5d}, Time: {2:4d}, A: {3:1d}, "
                  "Grad: {4:8.6f}, Train Loss: {5:8.6f}".format(
                      year, i, time, a, np.linalg.norm(grad), train_loss))
            Q._w, t, m_t, v_t = utils.adam(Q._w, grad, t, m_t, v_t,
                                           alpha=alpha)

        if save_freq > 0 and i % save_freq == 0:
            past_Qs = utils.load_object(save_path)
            past_Qs.append(np.array(Q._w))
            utils.save_object(past_Qs, save_path)
            plot_actions(dataset_path, Q._w, i, mdp, n_actions_plot,
                         path + "/plot-" + year + "-" + str(i))
            print("Model selected index: {0:4d}, Train Loss: "
                  "[{1:8.6f}, {2:8.6f}, {3:8.6f}]".format(
                      i, losses[0][i], losses[1][i], losses[2][i]))

    return [mdp.get_info(), np.array(Q._w), losses]
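# Usage sketch (illustrative only): `transfer` expects the state/Q-value
# dataset saved by `year_pass` above and a vectorized trading task. The env id
# comes from the source dict further down; the save path, iteration count and
# year label below are hypothetical placeholders, and `batch_size`, `alpha`,
# `layers`, `save_freq`, `n_actions_plot` and `path` are assumed module globals.
#
# mdp = gym.make("VecTradingPrices-v3")
# info = transfer(save_dataset_path + "source_2017", mdp,
#                 save_path="weights-2017", iterations=10000, year="2017")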
def learn(
        mdp,
        Q,
        operator,
        max_iter=5000,
        buffer_size=10000,
        batch_size=50,
        alpha_adam=0.001,
        alpha_sgd=0.1,
        lambda_=0.001,
        n_weights=10,
        train_freq=1,
        eval_freq=50,
        random_episodes=0,
        eval_states=None,
        eval_episodes=1,
        mean_episodes=50,
        preprocess=lambda x: x,
        cholesky_clip=0.0001,
        bandwidth=0.00001,
        post_components=1,
        max_iter_ukl=60,
        eps=0.001,
        eta=1e-6,
        time_coherent=False,
        source_file=None,
        seed=None,
        render=False,
        verbose=True,
        ukl_tight_freq=1,
        sources=None,
        # Lambda function to calculate the weights
        weights_calculator=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The gaussian mixture weights are uniform if not provided
    c_bar = (np.ones(timesteps) / timesteps
             if weights_calculator is None else weights_calculator(ws))

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis],
                            (timesteps, 1, 1))

    # We initialize the parameters of the posterior to the best approximation
    # of the prior within the posterior family
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)
    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c, mu, Sigma, c_bar, mu_bar, Sigma_bar, phi, psi,
                         max_iter=max_iter_ukl, eps=eps)
    params, phi, psi = init_posterior(c, mu, Sigma, c_bar, mu_bar, Sigma_bar,
                                      phi, psi, C, K, cholesky_clip,
                                      max_iter_ukl,
                                      max_iter=max_iter_ukl * 10,
                                      precision=Sigma_bar_inv, eta=eta,
                                      eps=eps, verbose=verbose)

    # Add random episodes if needed
    init_samples = list()
    if random_episodes > 0:
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(
                utils.generate_episodes(mdp, pi_g, n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C), np.ones((C, K)) * alpha_adam,
                     np.zeros((C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)),
                    np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, c_bar,
                         mu_bar, Sigma_bar, operator, i + 1, phi, psi,
                         n_weights, lambda_, max_iter_ukl, C, K,
                         precision=Sigma_bar_inv, t_step=i,
                         ukl_tight_freq=ukl_tight_freq)
            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp, pi_g, render=render,
                                             initial_states=eval_states,
                                             n_episodes=eval_episodes,
                                             preprocess=preprocess)[0]
            rew /= C

            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) \
                if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             c_bar, mu_bar, Sigma_bar, operator, i + 1, phi,
                             psi, n_weights, lambda_, C, K,
                             precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} "
                      "L2 {} L_inf {} time {:.1f} s".format(
                          i, episodes[-1], rew, learning_rew, fval, l_2_err,
                          l_inf_err, elapsed_time))

        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " +
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [iterations, episodes, n_samples, learning_rewards,
                evaluation_rewards, l_2, l_inf, fvals,
                episode_rewards[:len(episode_t)], episode_t]

    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]
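# Usage sketch (illustrative, under the same conventions as the scripts below):
# the operator and Q-function construction mirrors the lake script further
# down; the layer sizes, kappa/tau/xi values and the source file name here are
# hypothetical placeholders, not values taken from the original code.
#
# operator = MellowBellmanOperator(kappa, tau, xi, mdp.gamma, mdp.state_dim, 1)
# Q = MLPQFunction(mdp.state_dim, mdp.action_space.n, layers=[32, 32])
# info = learn(mdp, Q, operator, max_iter=10000, post_components=3,
#              source_file="sources", seed=0)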
l1 = int(args.l1)
l2 = int(args.l2)
alpha = float(args.alpha)
env = str(args.env)
cart_mass = float(args.cart_mass)
pole_mass = float(args.pole_mass)
pole_length = float(args.pole_length)
n_jobs = int(args.n_jobs)
n_runs = int(args.n_runs)
file_name = str(args.file_name)
dqn = bool(args.dqn)
source_file = str(args.source_file)

# Load weights
weights = utils.load_object(source_file)
ws = np.array([w[1] for w in weights])
np.random.shuffle(ws)
params = np.array([w[0][1:] for w in weights])
n_runs = min((n_runs, len(weights)))

# Generate tasks
mc = [np.random.uniform(0.5, 1.5) if cart_mass < 0 else cart_mass
      for _ in range(n_runs)]
mp = [np.random.uniform(0.1, 0.3) if pole_mass < 0 else pole_mass
      for _ in range(n_runs)]
l = [
if just_one_timestep in range(0, len(tasks_data) - 1):
    # Learn optimal policies just for one timestep
    print("Timestep", just_one_timestep)
    if n_jobs == 1:
        timestep_results = [run(tasks_data[just_one_timestep], seeds[j])
                            for j in range(seeds_per_task)]
    elif n_jobs > 1:
        timestep_results = Parallel(n_jobs=n_jobs)(
            delayed(run)(tasks_data[just_one_timestep], seeds[j])
            for j in range(seeds_per_task))
    results = utils.load_object(sources_file_name)  # sources must already exist
    results[just_one_timestep] = timestep_results  # overwrite
    utils.save_object(results, sources_file_name)
else:
    # Learn optimal policies for all sources
    for i in range(len(tasks_data) - 1):
        print("Timestep", i)
        if n_jobs == 1:
            timestep_results = [run(tasks_data[i], seeds[j])
                                for j in range(seeds_per_task)]
        elif n_jobs > 1:
            timestep_results = Parallel(n_jobs=n_jobs)(
                delayed(run)(tasks_data[i], seeds[j])
                for j in range(seeds_per_task))
        c.set_color(0, 0, 0)
        if a == 3:
            c.set_color(0.8, 0.8, 0)
        c.add_attr(rendering.Transform(translation=(0.5, self.size[1] - 1)))

        goal = self.viewer.draw_circle(radius=self.goal_radius)
        goal.set_color(0, 0.8, 0)
        goal.add_attr(rendering.Transform(
            translation=(self.goal[0], self.goal[1])))

        agent = self.viewer.draw_circle(radius=0.1)
        orientation = self.viewer.draw_line(
            [0., 0.], [.1 * np.cos(self.current_state[2]),
                       .1 * np.sin(self.current_state[2])])
        agent.set_color(.8, 0, 0)
        transform = rendering.Transform(
            translation=(self.current_state[0], self.current_state[1]))
        agent.add_attr(transform)
        orientation.add_attr(transform)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')


if __name__ == '__main__':
    from misc import utils

    mazes = utils.load_object("../scripts/mazes10x10")
    for maze in mazes:
        m = Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2],
                 start_pos=maze[3], walls=maze[4])
        print(maze[4][3])
        for i in range(1000):
            a = np.random.randint(0, 3)
            s, _, _, _ = m.step(a)
            print("Iter {} State {} A {}".format(i, s, a))
            m._render(a=a)
            time.sleep(.1)
filenames = [
    # "mgvt_1c",
    # "t2vt_1c",
    # "source_2014",
    "source_2015",
    "source_2016",
    "source_2017",
    # "2015",
    # "2016",
    # "2017",
    # "2018"
]

for filename in filenames:
    results = utils.load_object("visualize-actions/" + filename)
    days = results[0]
    actions = results[1]
    rewards = results[2]

    # Transpose actions matrix
    # actions = np.transpose(actions)

    # Rewards cumulative sum
    rewards = np.sum(rewards, axis=1)
    rewards = np.cumsum(rewards)

    def format_time(value, tick_number):
        hours = str(int(2 + value // 60))
        minutes = int(value % 60)
        if minutes < 10:
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior with smaller regularization
    # (just to make sure Sigma_bar is pd)
    params = clip(
        pack(mu_bar,
             np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip ** 2)),
        cholesky_clip, K)

    # Add random episodes if needed
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            Q._w = sample_posterior(params, K)
            init_samples.append(
                utils.generate_episodes(mdp, pi_g, n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam,
                     np.zeros((K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K),
                    np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for variance
    v_t_var = 0.
    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar,
                         Sigma_bar_inv, operator, i + 1, lambda_, n_weights)
            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params, v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp, pi_g, render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) \
                if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             mu_bar, Sigma_bar_inv, operator, i + 1, lambda_,
                             n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} "
                      "L2 {} L_inf {} time {:.1f} s".format(
                          i, episodes[-1], rew, learning_rew, fval, l_2_err,
                          l_inf_err, elapsed_time))

    run_info = [iterations, episodes, n_samples, learning_rewards,
                evaluation_rewards, l_2, l_inf, fvals,
                episode_rewards[:len(episode_t)], episode_t]

    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]
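# Usage sketch (illustrative): this single-Gaussian variant is called the same
# way as the mixture version above, except that its prior is built from the
# first n_source source weights rather than one Gaussian per timestep. The
# source file name below is a hypothetical placeholder.
#
# info = learn(mdp, Q, operator, max_iter=10000, n_source=10,
#              source_file="sources", seed=0)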
              eps_start=eps_start,
              eps_end=eps_end,
              exploration_fraction=exploration_fraction,
              random_episodes=random_episodes,
              eval_episodes=n_eval_episodes,
              mean_episodes=mean_episodes,
              seed=seed,
              verbose=verbose)

last_rewards = 5

results = []

if just_one_timestep in envs.keys():
    results = utils.load_object(sources_file_name)
    index = list(envs.keys()).index(just_one_timestep)
    mdp = mdps[index]
    print(mdp.get_info())
    if index >= len(results):
        results.append([run(mdp, seed)])
    else:
        results[index] = [run(mdp, seed)]
    print("Last learning rewards:",
          np.around(results[index][0][2][3][-last_rewards:], decimals=5))
    utils.save_object(results, sources_file_name)
else:
    for mdp in mdps:
        print(mdp.get_info())
        results.append([run(mdp, seed)])
        print("Last learning rewards:",
"3c-lambda-0.9": "t2vt_3c_l=0.9", "3c-lambda-1.0": "t2vt_3c_l=1.0", "3c-likelihood": "t2vt_3c_l=likelihood" } data_index = 3 print(results_path) out = {"i": []} for name, file in experiments.items(): fs = glob.glob(results_path + file + "*.pkl") if len(fs) == 0: continue r = utils.load_object(fs[0][:-4]) if not out["i"]: out["i"] = r[0][2][0][1:] data = [r[i][2][data_index][1:] for i in range(len(r))] mean = np.mean(data, axis=0) std = 2 * np.std(data, axis=0, ddof=1) / np.sqrt(np.array(data).shape[0]) out["mean-" + name] = mean out["std-" + name] = std keys = list(out.keys()) with open(results_path + "results.csv", "w", newline='') as outfile:
names = [
    "NT", "GVT", "GVT (TC)", "1-MGVT", "1-MGVT (TC)", "2-MGVT", "2-MGVT (TC)"
]

x = []
y_mean = []
y_std = []
y2_mean = []
y2_std = []
y3_mean = []
y3_std = []
y4_mean = []
y4_std = []

for file in files:
    results = [r[2] for r in utils.load_object(file)]

    iterations = []
    episodes = []
    n_samples = []
    lear_rew = []
    eval_rew = []
    l_2 = []
    l_inf = []

    for result in results:
        iterations.append(result[0])
        episodes.append(result[1])
        n_samples.append(result[2])
        lear_rew.append(result[3])
        eval_rew.append(result[4])
        l_2.append(result[5])
        l_inf.append(result[6])
"task": gym.make("VecTradingPrices2016-v2"), "filepath": "additions/experiments/trading/", "filename": "sources", "source_index": 1 }, "source_2017": { "task": gym.make("VecTradingPrices-v3"), "filepath": "additions/experiments/trading/", "filename": "sources", "source_index": 2 } } for k, v in w_dict.items(): if "source_index" in v.keys(): weights = utils.load_object(v["filepath"] + v["filename"]) weights = weights[v["source_index"]][0][1] v["weights"] = weights else: weights = utils.load_object(v["filepath"] + v["filename"]) weights = weights[0][1] v["weights"] = weights def year_pass(Q, task): days = [] rewards = np.zeros((task.n_days, len(task.prices[0]))) actions = np.zeros((task.n_days, len(task.prices[0]))) state_value_list = []
    for di in range(task.n_days):
        task.starting_day_index = di
        s = task.reset()
        s = [s]
        print("Day index:", di)
        days.append(task.selected_day)
        done = False
        while not done:
            a = np.argmax(Q._q_values(s))
            s, r, done, _ = task.step(a)
            s = [s]
            actions[di, task.current_timestep] = a - 1  # [0, 2] -> [-1, 1]
            rewards[di, task.current_timestep] = r
        print("Cumulative reward:", np.sum(rewards))
    return [days, actions, rewards]


for k, v in etrs.items():
    print(k)
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = 0
    task.reset()
    output = year_pass(Q, task)
    utils.save_object(output, "visualize-actions/" + k)
eval_freq = int(args.eval_freq)
mean_episodes = int(args.mean_episodes)
alpha = float(args.alpha)
maze = int(args.maze)
l1 = int(args.l1)
l2 = int(args.l2)
n_jobs = int(args.n_jobs)
n_runs = int(args.n_runs)
file_name = str(args.file_name)
render = bool(args.render)
dqn = bool(args.dqn)
eval_episodes = 10
mazes_file = args.mazes_file

# Generate tasks
mazes = utils.load_object(mazes_file)
mdps = [Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2],
             start_pos=maze[3], walls=maze[4]) for maze in mazes]
if maze == -1:
    shuffle(mdps)
    mdps = [mdps[i] for i in range(min(n_runs, len(mdps)))]
else:
    mdps = [mdps[maze]]

state_dim = mdps[0].state_dim
action_dim = 1
n_actions = mdps[0].action_space.n

# Create Q Function
layers = [l1]
"%Y-%m-%d_%H-%M-%S") # Seed to get reproducible results seed = 1 np.random.seed(seed) como_data = pd.read_csv(path + '/../../lake/data/como_data.csv') demand = np.loadtxt(path + '/../../lake/data/comoDemand.txt') min_env_flow = np.loadtxt(path + '/../../lake/data/MEF_como.txt') temp_lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed) temp_inflow = list(como_data.loc[como_data['year'] == 1946, 'in']) temp_mdp = LakeEnv(temp_inflow, demand, temp_lake) # Load tasks tasks_data = utils.load_object(tasks_file) n_eval_episodes = 5 state_dim = temp_mdp.observation_space.shape[0] action_dim = 1 n_actions = temp_mdp.N_DISCRETE_ACTIONS # Create BellmanOperator operator = MellowBellmanOperator(kappa, tau, xi, temp_mdp.gamma, state_dim, action_dim) # Create Q Function layers = [l1] if l2 > 0: layers.append(l2) Q = MLPQFunction(state_dim, n_actions, layers=layers, activation=activation)
alpha_sgd = float(args.alpha_sgd)
lambda_ = float(args.lambda_)
n_weights = int(args.n_weights)
sigma_reg = float(args.sigma_reg)
cholesky_clip = float(args.cholesky_clip)
n_source = int(args.n_source)
source_file = str(args.source_file)
time_coherent = bool(args.time_coherent)
fixed_seed = int(args.fixed_seed)

# Generate tasks
np.random.seed(485)
mazes = utils.load_object(mazes_file)
weights = utils.load_object(source_file)
mdps = [Maze(size=maze[0], wall_dim=maze[1], goal_pos=maze[2],
             start_pos=maze[3], walls=maze[4]) for maze in mazes]

envs = list()
sources = list()
if maze == -1:
    for i in range(min(n_runs, len(mdps))):
        envs.append(mdps[i % len(mdps)])
        sources.append([w for w in weights
                        if not np.array_equal(w[0][-1], envs[-1].walls)
                        and not np.array_equal(w[0][-2], envs[-1].goal)])
else:
    envs = [mdps[maze] for i in range(n_runs)]
    sources = [w for w in weights
               if not np.array_equal(w[0][-1], envs[-1].walls)
               and not np.array_equal(w[0][-2], envs[-1].goal)]