def universal_approximation(f, x):
    """Fit a one-hidden-layer network to ``f`` and plot the approximation.

    ``x`` is split into a training and a test part with ``split_data``
    (ratio 0.75, random). Targets for both parts are obtained by calling
    ``f``. A ``Dense(50, relu) -> Dense(1)`` model is trained with MSE loss
    and the Adam optimizer; afterwards the training-loss curve and the
    prediction on the sorted test inputs are plotted.

    Args:
        f: Callable applied to the input arrays to produce the targets.
        x: Input samples handed to ``split_data``; the model is built with
            ``input_shape=(1,)``.

    Returns:
        None. The function only produces matplotlib figures.
    """
    [train_x, test_x] = split_data(x, ratio=0.75, random=True)
    # Train on the function that was passed in. The targets used to be
    # hard-coded as np.sin(train_x), so any other ``f`` was compared against
    # a model trained on the wrong data.
    train_y = f(train_x)
    # Sort the test inputs so the line plots below are drawn left to right.
    test_x = np.sort(test_x, axis=0)
    test_y = f(test_x)

    # build simple FNN
    model = Sequential()
    model.add(Dense(50, input_shape=(1, ), activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mse', optimizer='adam')

    # training process
    model.fit(train_x, train_y, batch_size=100, epochs=1000)
    plt.plot(model.history.history['loss'])
    plt.show()

    # predict
    y_hat = model.predict(test_x)
    plt.plot(test_x, test_y, 'b-', label='original')
    plt.plot(test_x, y_hat, 'r-', label='predicted')
    plt.legend()
    plt.show()
def linear_regression(a=1.0, b=0.0):
    """Train a single ``Dense(1)`` unit on the line ``a * x + b`` and plot it.

    200 evenly spaced samples in [-100, 100] are split 80/20 with
    ``split_data``. The model is built from an ``Input``, a ``Dense`` layer
    and a ``Trainer`` (MSE loss, Adam with learning rate 0.2, batch size 50,
    50 epochs). The true test targets are drawn in blue and the predictions
    in red.

    Args:
        a: Slope of the target line.
        b: Intercept of the target line.
    """
    def line(values):
        return a * values + b

    # prepare data
    samples = np.linspace(-100, 100, 200).reshape((-1, 1))
    train_x, test_x = split_data(samples, ratio=0.8, random=True)
    train_y = line(train_x)
    test_y = line(test_x)

    # build the network graph
    inputs = Input(1)
    outputs = Dense(1)(inputs)

    # define trainer
    optimizer = Adam(learning_rate=0.2)
    trainer = Trainer(loss='mse',
                      optimizer=optimizer,
                      batch_size=50,
                      epochs=50)

    # create model
    model = Sequential(inputs, outputs, trainer)
    model.summary()

    # training process
    model.fit(train_x, train_y)

    # predict and compare against the ground truth
    predicted = model.predict(test_x)
    for values, fmt in ((test_y, 'b'), (predicted, 'r')):
        plt.plot(test_x, values, fmt)
    plt.show()
def linear_classification(a=1.0, b=0.0):
    """Train a 2-way softmax layer to tell which side of a line a point is on.

    Points are generated on the line ``y = a * x + b`` and perturbed with
    Gaussian noise scaled by 100. A point is labelled 1 when
    ``a * x + b > y`` and 0 otherwise. The model is compiled with Adam
    (learning rate 0.1), ``'binary_crossentropy'`` loss and an accuracy
    metric, trained for 50 epochs, and its arg-max predictions on the test
    split are passed to ``simple_plot``.

    Args:
        a: Slope of the separating line.
        b: Intercept of the separating line.
    """
    def label(points):
        # 1 where the line value at x exceeds the point's y, else 0
        return np.where(a * points[:, 0] + b > points[:, 1], 1, 0)

    # prepare data
    xs = np.linspace(-100, 100, 200)
    on_line = np.column_stack((xs, a * xs + b))
    X = on_line + np.random.randn(200, 2) * 100
    Y = label(X)
    train_set, test_set = split_data(X, Y, ratio=0.8, random=True)
    train_x, train_y = train_set
    test_x, test_y = test_set
    train_y = to_one_hot(train_y)
    # test labels are recomputed from the test points (not used further below)
    test_y = label(test_x)

    # build simple FNN
    inputs = Input(2)
    outputs = Dense(2, activation='softmax')(inputs)

    # create model
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.1),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    # training process
    model.fit(train_x, train_y, batch_size=50, epochs=50)

    # predict: pick the class with the highest softmax output
    predicted_class = np.argmax(model.predict(test_x), axis=1)
    simple_plot(test_x, predicted_class, a, b)
def multi_classification(csv_file_path):
    """Load a labelled 2-D point set from CSV, plot it and split it.

    The CSV file is assumed to have the columns ``x``, ``y`` and ``class``.
    The points are shown as a scatter plot coloured by class and then split
    75/25 with ``split_data``.

    Args:
        csv_file_path: Path of the CSV file to read.
    """
    frame = pd.read_csv(csv_file_path)
    points = frame[['x', 'y']].to_numpy()
    # labels as a column vector
    labels = frame['class'].to_numpy().reshape((-1, 1))

    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, c=labels, s=100, marker='o')
    plt.show()

    train_set, test_set = split_data(points, labels, ratio=0.75, random=True)
    train_x, train_y = train_set
    test_x, test_y = test_set
def _bellman_residual_surrogate(self, Q, samples, weights=None):
    """Loss between ``Q`` and a target built from ``self._q_target``.

    The maximising next-state action is chosen with ``Q``; the value of that
    action is then read from ``self._q_target`` and detached, so no gradient
    flows through the target term.

    Args:
        Q: Q-function being trained.
        samples: Batch of transitions, decomposed by ``utils.split_data``.
        weights: Optional set of weight vectors. When given, the
            ``*_weights`` evaluation methods are used and reward/absorbing
            get an extra trailing axis.

    Returns:
        The result of ``smooth_l1_loss(qval, target, reduce=False)`` with
        ``target = r + gamma * maxQ * (1 - absorbing)``.
    """
    _, _, _, r, s_prime, absorbing, sa = utils.split_data(
        samples, self._state_dim, self._action_dim)

    if weights is not None:
        n_states = s_prime.shape[0]
        n_weights = weights.shape[0]

        # Best actions according to Q (detached: selection only)
        online_q = Q.value_actions_weights(s_prime,
                                           weights=weights,
                                           done=absorbing,
                                           grad_required=True).detach()
        best_actions = torch.argmax(online_q, dim=1).type("int64")

        # Values of those actions according to the target Q-function
        target_q = self._q_target.value_actions_weights(s_prime,
                                                        weights=weights,
                                                        done=absorbing,
                                                        grad_required=True)
        state_index = np.repeat(np.arange(n_states, dtype="int64"), n_weights)
        best_actions = best_actions.view(-1)  # flattens the tensor
        maxQ = target_q[state_index, best_actions].view(
            n_states, n_weights).detach()

        reward = torch.from_numpy(r).unsqueeze(1)
        terminal = torch.from_numpy(absorbing).unsqueeze(1)
        qval = Q.value_weights(sa, grad_required=True)
    else:
        online_q = Q.value_actions(s_prime, absorbing, grad_required=True)
        # converted to numpy so the selection is not part of the derivative
        best_actions = torch.argmax(online_q, dim=1).detach().numpy()
        next_sa = np.concatenate((s_prime, best_actions[:, np.newaxis]),
                                 axis=1)
        maxQ = self._q_target.value(next_sa, grad_required=True).detach()

        reward = torch.from_numpy(r)
        terminal = torch.from_numpy(absorbing)
        qval = Q.value(sa, grad_required=True)

    target = reward + self._gamma * maxQ * (1 - terminal)
    return smooth_l1_loss(qval, target, reduce=False)
def linear_classification(a=1.0, b=0.0, graph=False):
    """Classify noisy points by their side of ``y = a * x + b``.

    Builds a 2-unit softmax model driven by a ``Trainer`` (cross-entropy
    loss, Adam with learning rate 0.05, batch size 50, 50 epochs, accuracy
    metric), prints the evaluation on the test split and hands the arg-max
    predictions to ``simple_plot``.

    Args:
        a: Slope of the separating line.
        b: Intercept of the separating line.
        graph: When true, also plot the recorded training loss.
    """
    # prepare data: points on the line plus Gaussian noise scaled by 100
    xs = np.linspace(-100, 100, 200)
    ys = a * xs + b
    X = np.column_stack((xs, ys)) + np.random.randn(200, 2) * 100
    # label is 1 where the line value at x exceeds the point's y, else 0
    class_ids = np.where(a * X[:, 0] + b > X[:, 1], 1, 0)
    Y = to_one_hot(class_ids)
    train_set, test_set = split_data(X, Y, ratio=0.8, random=True)
    train_x, train_y = train_set
    test_x, test_y = test_set

    # build simple FNN
    inputs = Input(2)
    outputs = Dense(2, activation='softmax')(inputs)

    # define trainer
    optimizer = Adam(learning_rate=0.05)
    trainer = Trainer(loss='cross_entropy',
                      optimizer=optimizer,
                      batch_size=50,
                      epochs=50,
                      metrics=['accuracy'])

    # create model
    model = Sequential(inputs, outputs, trainer)
    model.summary()

    # training process
    model.fit(train_x, train_y)
    test_metrics = model.evaluate(test_x, test_y)
    print(test_metrics)

    if graph:
        plt.plot(model.history['loss'])
        plt.show()

    # predict: pick the class with the highest softmax output
    predicted_class = np.argmax(model.predict(test_x), axis=1)
    simple_plot(test_x, predicted_class, a, b)
def universal_approximation(f, x):
    """Approximate ``f`` with a 50-unit ReLU network and plot the outcome.

    The data is split 80/20 with ``split_data``. The model is trained through
    a ``Trainer`` using MSE loss and Adam with an ``ExponentialDecay``
    learning-rate schedule (initial rate 0.01, decay rate 0.75) for 750
    epochs with batch size 50. The wall-clock training time is printed, the
    loss history is plotted, and the prediction on the sorted test inputs is
    drawn against the true values.

    Args:
        f: Callable applied to the input arrays to produce the targets.
        x: Input samples handed to ``split_data``.
    """
    train_x, test_x = split_data(x, ratio=0.8, random=True)
    train_y = f(train_x)
    # sorted so the line plots are drawn in increasing x order
    test_x = np.sort(test_x, axis=0)
    test_y = f(test_x)

    # build simple FNN
    inputs = Input(1)
    hidden = Dense(50, activation='relu')(inputs)
    outputs = Dense(1)(hidden)

    # define trainer
    schedule = ExponentialDecay(initial_learning_rate=0.01, decay_rate=0.75)
    optimizer = Adam(learning_rate=schedule)
    trainer = Trainer(loss='mse',
                      optimizer=optimizer,
                      batch_size=50,
                      epochs=750)

    # create model
    model = Sequential(inputs, outputs, trainer)
    model.summary()

    # training process (timed)
    started_at = time.time()
    model.fit(train_x, train_y)
    print(time.time() - started_at)

    losses = model.history['loss']
    plt.plot(range(len(losses)), losses)
    plt.show()

    # predict
    predicted = model.predict(test_x)
    plt.plot(test_x, test_y, 'b-', label='original')
    plt.plot(test_x, predicted, 'r-', label='predicted')
    plt.legend()
    plt.show()
def _bellman_residual_surrogate(self, Q, samples, weights=None):
    """Surrogate of the Bellman residual based on ``mellow_max``.

    The residual ``r + gamma * mm(Q(s')) * (1 - absorbing) - Q(s, a)`` is
    detached and used as a constant factor multiplying
    ``r + xi * gamma * mm(Q(s')) - Q(s, a)``, where ``mm`` is ``mellow_max``
    with parameter ``self._kappa`` taken along axis 1.

    Args:
        Q: Q-function being trained.
        samples: Batch of transitions, decomposed by ``utils.split_data``.
        weights: Optional set of weight vectors. When given, the
            ``*_weights`` evaluation methods are used and reward/absorbing
            get an extra trailing axis.

    Returns:
        ``2 * residual.detach() * (r + xi * gamma * mm - qval)``.
    """
    _, _, _, r, s_prime, absorbing, sa = utils.split_data(
        samples, self._state_dim, self._action_dim)
    multi = weights is not None

    # Q-values of every action in the next states
    if multi:
        next_q = Q.value_actions_weights(s_prime,
                                         weights=weights,
                                         done=absorbing,
                                         grad_required=True)
    else:
        next_q = Q.value_actions(s_prime, absorbing, grad_required=True)
    mm_next = mellow_max(next_q, self._kappa, axis=1)

    reward = torch.from_numpy(r)
    terminal = torch.from_numpy(absorbing)
    if multi:
        reward = reward.unsqueeze(1)
        terminal = terminal.unsqueeze(1)

    if multi:
        qval = Q.value_weights(sa, grad_required=True)
    else:
        qval = Q.value(sa, grad_required=True)

    residual = (reward + self._gamma * mm_next * (1 - terminal) - qval).detach()
    # TODO does the (1-done) goes in the derivative?
    surrogate = reward + self._xi * self._gamma * mm_next - qval
    return 2 * residual * surrogate
def linear_regression(a=1.0, b=0.0):
    """Fit ``Dense(1)`` to the line ``a * x + b`` with the compile/fit API.

    200 evenly spaced samples in [-100, 100] are split 80/20 with
    ``split_data``. The model is compiled with Adam (learning rate 0.1) and
    MSE loss and trained for 50 epochs with batch size 50. The true test
    targets are plotted in blue and the predictions in red.

    Args:
        a: Slope of the target line.
        b: Intercept of the target line.
    """
    def line(values):
        return a * values + b

    # prepare data
    samples = np.linspace(-100, 100, 200).reshape((-1, 1))
    train_x, test_x = split_data(samples, ratio=0.8, random=True)
    train_y = line(train_x)
    test_y = line(test_x)

    # build simple FNN
    inputs = Input(1)
    outputs = Dense(1)(inputs)

    # create model
    model = Model(inputs, outputs)

    # training process
    model.compile(optimizer=Adam(learning_rate=0.1), loss='mse')
    model.fit(train_x, train_y, batch_size=50, epochs=50)

    # predict and compare against the ground truth
    predicted = model.predict(test_x)
    for values, fmt in ((test_y, 'b'), (predicted, 'r')):
        plt.plot(test_x, values, fmt)
    plt.show()
def _sample_lake_env(data, years, demand, leap_year_demand, lake):
    """Build a ``LakeEnv`` for one year drawn uniformly from ``years``."""
    sampled_year = np.random.choice(years)
    inflow = list(data.loc[data['year'] == sampled_year, 'in'])
    # Leap years between 1946 and 2011 satisfy this condition even though it
    # is not the complete leap-year rule.
    if sampled_year % 4 == 0:
        return LakeEnv(inflow, leap_year_demand, lake)
    return LakeEnv(inflow, demand, lake)


def learn(Q,
          operator,
          data,
          demand,
          min_env_flow,
          actions_report_file="",
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):
    """Train ``Q`` on lake environments built from randomly sampled years.

    Each episode runs on a ``LakeEnv`` created from the inflow of one year
    drawn from ``data``. Actions come from a ``ScheduledGibbs`` policy,
    transitions are stored in a replay ``Buffer``, and ``Q._w`` is updated
    with ``utils.adam`` using ``operator.gradient_be`` on sampled batches.
    After every ``eval_freq`` completed episodes the ``pi_g`` policy is
    scored with ``_single_year_eval`` on freshly sampled years.

    Args:
        Q: Q-function whose weights ``Q._w`` are trained.
        operator: Provides ``gradient_be`` and ``bellman_residual``.
        data: Table with a ``year`` and an ``in`` column.
        demand: Demand sequence. A copy with element 59 duplicated at
            position 60 is used for years with ``year % 4 == 0``.
        min_env_flow: Forwarded to ``Lakecomo``.
        actions_report_file: If non-empty, a CSV of per-episode action
            frequencies is rewritten at this path after every episode.
        max_iter: Number of environment steps.
        buffer_size: Capacity passed to ``Buffer``.
        batch_size: Batch size for gradient and residual estimates.
        alpha: Step size passed to ``utils.adam``.
        train_freq: A gradient step is taken when ``i % train_freq == 0``.
        eval_freq: Number of completed episodes between evaluations.
        eps_start, eps_end, exploration_fraction: Define the linear schedule
            handed to ``ScheduledGibbs``.
        random_episodes: Episodes generated with ``pi_u`` to pre-fill the
            buffer.
        eval_states: Unused in this variant.
        eval_episodes: Number of sampled years per evaluation.
        mean_episodes: Window of past episodes averaged for ``Rew(L)``.
        preprocess: State preprocessing function.
        seed: Optional seed for ``np.random`` and ``Lakecomo``.
        render: Unused in this variant.
        verbose: Print a line after each evaluation.

    Returns:
        ``[[], weights, run_info]`` where ``weights`` is a copy of ``Q._w``
        and ``run_info`` is ``[iterations, episodes, n_samples,
        learning_rewards, evaluation_rewards, l_2, l_inf, episode_rewards,
        episode_t]``.
    """
    leap_year_demand = np.insert(demand, 60, demand[59])

    if seed is not None:
        np.random.seed(seed)

    # mdp creation
    lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed)
    years = data.year.unique()
    description = str(int(years[0])) + "-" + str(int(years[-1]))
    mdp = _sample_lake_env(data, years, demand, leap_year_demand, lake)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledGibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), schedule)
    pi_u = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=0)
    pi_g = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=np.inf)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.observation_space.shape[0], mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.observation_space.shape[0])).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    if actions_report_file:
        actions_executed = []
        columns = list(range(mdp.N_DISCRETE_ACTIONS))
        # One dict per finished episode. The report is rebuilt from this list
        # because DataFrame.append no longer exists in pandas >= 2.0.
        actions_report_rows = []
        pd.DataFrame(columns=columns).to_csv(actions_report_file, index=False)

    done_counter = 0

    # Learning
    for i in range(max_iter):

        # Sample an action from the scheduled policy wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        if actions_report_file:
            actions_executed.append(a)

        # Step
        s_prime, r, done, _ = mdp.step(a)

        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            if actions_report_file:
                # minlength keeps one entry per action, so actions with the
                # highest indices that were never taken are reported as 0.0
                # rather than left empty.
                actions_counts = np.bincount(actions_executed,
                                             minlength=len(columns))
                actions_freqs = list(actions_counts / actions_counts.sum())
                actions_report_rows.append(dict(zip(columns, actions_freqs)))
                pd.DataFrame(actions_report_rows, columns=columns).to_csv(
                    actions_report_file, index=False)
                actions_executed = []

            episode_rewards.append(0.0)

            # Every episode runs on a freshly sampled year
            mdp = _sample_lake_env(data, years, demand, leap_year_demand,
                                   lake)
            s = mdp.reset()
            h = 0
            episode_t.append(i)
            done_counter += 1

        # Evaluate model
        if done_counter == eval_freq:

            # Evaluate pi_g on freshly sampled years
            scores = []
            for _ in range(eval_episodes):
                mdp = _sample_lake_env(data, years, demand, leap_year_demand,
                                       lake)
                scores.append(_single_year_eval(mdp, pi_g))
            rew = np.mean(scores)

            learning_rew = np.mean(
                episode_rewards[-mean_episodes - 1:-1]
            ) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Evaluation replaced the environment: start learning again on a
            # new sampled year.
            mdp = _sample_lake_env(data, years, demand, leap_year_demand,
                                   lake)
            s = mdp.reset()

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))

            done_counter = 0

        if (i * 100 / max_iter) % 10 == 0:
            print("years:", description, "- Progress:",
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    last_rewards = 5
    print("years:", description, "- Last evaluation rewards:",
          np.around(evaluation_rewards[-last_rewards:], decimals=3))

    return [[], weights, run_info]
def learn(
        mdp,
        Q,
        operator,
        max_iter=5000,
        buffer_size=10000,
        batch_size=50,
        alpha_adam=0.001,
        alpha_sgd=0.1,
        lambda_=0.001,
        n_weights=10,
        train_freq=1,
        eval_freq=50,
        random_episodes=0,
        eval_states=None,
        eval_episodes=1,
        mean_episodes=50,
        preprocess=lambda x: x,
        cholesky_clip=0.0001,
        bandwidth=0.00001,
        post_components=1,
        max_iter_ukl=60,
        eps=0.001,
        eta=1e-6,
        time_coherent=False,
        source_file=None,
        seed=None,
        render=False,
        verbose=True,
        ukl_tight_freq=1,
        sources=None,
        # Lambda function to calculate the weights
        weights_calculator=None):
    """Learn a mixture posterior over Q-function weights on ``mdp``.

    A mixture prior is built from source weights (one sample per source
    entry, each component with covariance ``bandwidth * I``). A posterior
    with ``post_components`` components is initialised via ``tight_ukl`` and
    ``init_posterior`` and then trained online: at each step an action is
    chosen greedily w.r.t. weights drawn with ``sample_posterior``, the
    transition is stored in a replay ``Buffer``, and ``params`` is updated
    with ``utils.adam`` followed by ``utils.sgd`` using per-parameter step
    sizes (``adam_mask`` / ``sgd_mask``).

    Args:
        mdp: Environment providing ``reset``, ``step``, ``gamma``,
            ``horizon``, ``state_dim``, ``action_dim``, ``action_space.n``
            and ``get_info``.
        Q: Q-function; its ``_w`` attribute is overwritten repeatedly.
        operator: Passed to ``gradient``/``objective``; also provides
            ``bellman_residual``.
        max_iter: Number of environment steps.
        buffer_size: Capacity passed to ``Buffer``.
        batch_size: Batch size for gradient, residual and objective.
        alpha_adam: Step size placed in the mean entries of ``adam_mask``.
        alpha_sgd: Step size placed in the ``(C, K, K)`` entries of
            ``sgd_mask``.
        lambda_, n_weights: Forwarded to ``gradient`` and ``objective``.
        train_freq: A gradient step is taken when ``i % train_freq == 0``.
        eval_freq: Evaluation runs when ``i % eval_freq == 0``.
        random_episodes: Episodes generated from prior samples to pre-fill
            the buffer.
        eval_states, eval_episodes, render: Forwarded to
            ``utils.evaluate_policy``.
        mean_episodes: Window of past episodes averaged for ``Rew(L)``.
        preprocess: State preprocessing function.
        cholesky_clip: Forwarded to ``init_posterior`` and ``clip``.
        bandwidth: Diagonal value of every prior covariance.
        post_components: Number of posterior components ``C``.
        max_iter_ukl, eps, eta: Forwarded to ``tight_ukl`` /
            ``init_posterior`` / ``gradient``.
        time_coherent: If false, weights are resampled at every step;
            otherwise only at episode boundaries.
        source_file: Loaded with ``utils.load_object`` when ``sources`` is
            None.
        seed: Optional seed for ``np.random``.
        verbose: Print a line after each evaluation.
        ukl_tight_freq: Forwarded to ``gradient``.
        sources: Pre-loaded source weights; takes precedence over
            ``source_file``.
        weights_calculator: Optional callable mapping the selected source
            weights to mixture coefficients; uniform when None.

    Returns:
        ``[mdp.get_info(), weights, run_info]``. ``weights`` is
        ``np.array(mu)`` with ``mu`` as last assigned (by the most recent
        evaluation). ``run_info`` is ``[iterations, episodes, n_samples,
        learning_rewards, evaluation_rewards, l_2, l_inf, fvals,
        episode_rewards, episode_t]``.
    """

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables
    # NOTE(review): presumably caches used by helpers elsewhere in the module
    # - confirm.
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        # shuffles the source entry in place
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The gaussian mixture weights are uniform if not provided.
    c_bar = np.ones(
        timesteps
    ) / timesteps if weights_calculator is None else weights_calculator(ws)

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis],
                            (timesteps, 1, 1))

    # We initialize the parameters of the posterior to the best approximation
    # of the posterior family to the prior
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)

    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c, mu, Sigma, c_bar, mu_bar, Sigma_bar, phi, psi,
                         max_iter=max_iter_ukl, eps=eps)
    params, phi, psi = init_posterior(c, mu, Sigma, c_bar, mu_bar, Sigma_bar,
                                      phi, psi, C, K, cholesky_clip,
                                      max_iter_ukl,
                                      max_iter=max_iter_ukl * 10,
                                      precision=Sigma_bar_inv,
                                      eta=eta, eps=eps, verbose=verbose)

    # Add random episodes if needed
    init_samples = list()
    if random_episodes > 0:
        # One weight vector per random episode, drawn from the prior mixture
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(
                utils.generate_episodes(mdp, pi_g, n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        # Rebuild the sample matrix with preprocessed states
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C), np.ones((C, K)) * alpha_adam, np.zeros(
        (C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)),
                    np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, c_bar,
                         mu_bar, Sigma_bar, operator, i + 1, phi, psi,
                         n_weights, lambda_, max_iter_ukl, C, K,
                         precision=Sigma_bar_inv, t_step=i,
                         ukl_tight_freq=ukl_tight_freq)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            # Episode finished: start a new one with freshly sampled weights
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function: average the score of each component
            # mean
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp, pi_g, render=render,
                                             initial_states=eval_states,
                                             n_episodes=eval_episodes,
                                             preprocess=preprocess)[0]
            rew /= C

            learning_rew = np.mean(
                episode_rewards[-mean_episodes - 1:-1]
            ) if len(episode_rewards) > 1 else 0.0
            # Residual is computed with Q._w still set to the last component
            # mean
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             c_bar, mu_bar, Sigma_bar, operator, i + 1, phi,
                             psi, n_weights, lambda_, C, K,
                             precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " +
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    # NOTE(review): mu comes from the most recent evaluation, not from the
    # final params - confirm this is intended.
    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):
    """Train ``Q`` on ``mdp`` with scheduled epsilon-greedy exploration.

    At every step an action is sampled from a ``ScheduledEpsilonGreedy``
    policy, the transition is stored in a replay ``Buffer``, and ``Q._w`` is
    updated with ``utils.adam`` using ``operator.gradient_be`` on a sampled
    batch. Every ``eval_freq`` steps the greedy policy is evaluated with
    ``utils.evaluate_policy`` and statistics are recorded.

    Args:
        mdp: Environment providing ``reset``, ``step``, ``gamma``,
            ``horizon``, ``state_dim``, ``action_dim``, ``action_space.n``
            and ``get_info``.
        Q: Q-function whose weights ``Q._w`` are trained.
        operator: Provides ``gradient_be`` and ``bellman_residual``.
        max_iter: Number of environment steps.
        buffer_size: Capacity passed to ``Buffer``.
        batch_size: Batch size for gradient and residual estimates.
        alpha: Step size passed to ``utils.adam``.
        train_freq: A gradient step is taken when ``i % train_freq == 0``.
        eval_freq: Evaluation runs when ``i % eval_freq == 0``.
        eps_start, eps_end: End points of the linear epsilon schedule.
        exploration_fraction: The schedule has
            ``int(exploration_fraction * max_iter)`` entries.
        random_episodes: Episodes generated with ``epsilon=1`` to pre-fill
            the buffer.
        eval_states, eval_episodes, render: Forwarded to
            ``utils.evaluate_policy``.
        mean_episodes: Window of past episodes averaged for ``Rew(L)``.
        preprocess: State preprocessing function.
        seed: Optional seed for ``np.random``.
        verbose: Print a line after each evaluation.

    Returns:
        ``[mdp.get_info(), weights, run_info]`` where ``weights`` is a copy
        of ``Q._w`` and ``run_info`` is ``[iterations, episodes, n_samples,
        learning_rewards, evaluation_rewards, l_2, l_inf, episode_rewards,
        episode_t]``.
    """
    if seed is not None:
        np.random.seed(seed)

    if isinstance(Q, MLPQFunction):
        # NOTE: the random re-initialisation (Q.init_weights()) is disabled
        # in this variant; only the target network is synchronised.
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    # np.linspace needs an integer sample count; the float product
    # exploration_fraction * max_iter raises TypeError on current NumPy.
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledEpsilonGreedy(Q, np.arange(mdp.action_space.n), schedule)
    pi_u = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=1)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Evaluate greedy policy
            rew = utils.evaluate_policy(mdp, pi_g, render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = np.mean(
                episode_rewards[-mean_episodes - 1:-1]
            ) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Make sure we restart from s
            mdp.reset(s)

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    return [mdp.get_info(), weights, run_info]
def bellman_residual(self, Q, samples, weights=None):
    """Compute Bellman residuals for a batch of samples.

    Dispatches to ``_bellman_residual_single`` when ``weights`` is None and
    to ``_bellman_residual_multi`` otherwise.

    Args:
        Q: Q-function to evaluate.
        samples: Batch of transitions, decomposed by ``utils.split_data``.
        weights: Optional weights forwarded to the multi-weight variant.

    Returns:
        Whatever the selected ``_bellman_residual_*`` method returns.
    """
    _, _, _, r, s_prime, absorbing, sa = utils.split_data(
        samples, self._state_dim, self._action_dim)

    if weights is not None:
        return self._bellman_residual_multi(Q, r, s_prime, absorbing, sa,
                                            weights)
    return self._bellman_residual_single(Q, r, s_prime, absorbing, sa)
def gradient_be(self, Q, samples, weights=None):
    """Compute the gradient of the Bellman error for a batch of samples.

    Dispatches to ``_gradient_be_single`` when ``weights`` is None and to
    ``_gradient_be_multi`` otherwise.

    Args:
        Q: Q-function to differentiate.
        samples: Batch of transitions; only the last of the seven parts
            returned by ``utils.split_data`` is extracted here, and the full
            batch is forwarded unchanged.
        weights: Optional weights forwarded to the multi-weight variant.

    Returns:
        Whatever the selected ``_gradient_be_*`` method returns.
    """
    _, _, _, _, _, _, sa = utils.split_data(samples, self._state_dim,
                                            self._action_dim)

    if weights is not None:
        return self._gradient_be_multi(Q, samples, sa, weights)
    return self._gradient_be_single(Q, samples, sa)
def gradient_mm(self, Q, samples, weights=None):
    """Compute mellow-max gradients for a batch of samples.

    Dispatches to ``_gradient_mm_single`` when ``weights`` is None and to
    ``_gradient_mm_multi`` otherwise.

    Args:
        Q: Q-function to differentiate.
        samples: Batch of transitions, decomposed by ``utils.split_data``.
        weights: Optional weights forwarded to the multi-weight variant.

    Returns:
        Whatever the selected ``_gradient_mm_*`` method returns.
    """
    _, _, _, _, s_prime, absorbing, _ = utils.split_data(
        samples, self._state_dim, self._action_dim)

    if weights is not None:
        return self._gradient_mm_multi(Q, s_prime, absorbing, weights)
    return self._gradient_mm_single(Q, s_prime, absorbing)
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):
    """Learn a single-Gaussian posterior over Q-function weights on ``mdp``.

    The prior mean and covariance are estimated from ``n_source`` shuffled
    source weight vectors. ``params`` packs the mean and a Cholesky factor,
    initialised at the prior. At each step an action is chosen greedily
    w.r.t. weights drawn with ``sample_posterior``, the transition is stored
    in a replay ``Buffer``, and ``params`` is updated with ``utils.adam``
    followed by ``utils.sgd`` using per-parameter step sizes (``adam_mask`` /
    ``sgd_mask``), then clipped.

    Args:
        mdp: Environment providing ``reset``, ``step``, ``gamma``,
            ``horizon``, ``state_dim``, ``action_dim``, ``action_space.n``
            and ``get_info``.
        Q: Q-function; its ``_w`` attribute is overwritten repeatedly.
        operator: Passed to ``gradient``/``objective``; also provides
            ``bellman_residual``.
        max_iter: Number of environment steps.
        buffer_size: Capacity passed to ``Buffer``.
        batch_size: Batch size for gradient, residual and objective.
        alpha_adam: Step size placed in the mean entries of ``adam_mask``.
        alpha_sgd: Step size placed in the ``(K, K)`` entries of
            ``sgd_mask``.
        lambda_, n_weights: Forwarded to ``gradient`` and ``objective``.
        train_freq: A gradient step is taken when ``i % train_freq == 0``.
        eval_freq: Evaluation runs when ``i % eval_freq == 0``.
        random_episodes: Episodes generated from posterior samples to
            pre-fill the buffer.
        eval_states, eval_episodes, render: Forwarded to
            ``utils.evaluate_policy``.
        mean_episodes: Window of past episodes averaged for ``Rew(L)``.
        preprocess: State preprocessing function.
        sigma_reg: Diagonal term added to the prior covariance before
            inversion.
        cholesky_clip: Squared and added to the covariance diagonal before
            the Cholesky factorisation; also forwarded to ``clip``.
        time_coherent: If false, weights are resampled at every step;
            otherwise only at episode boundaries.
        n_source: Number of source weight vectors kept after shuffling.
        source_file: Loaded with ``utils.load_object`` when ``sources`` is
            None.
        seed: Optional seed for ``np.random``.
        verbose: Print a line after each evaluation.
        sources: Pre-loaded source weights; takes precedence over
            ``source_file``.

    Returns:
        ``[mdp.get_info(), weights, run_info]``. ``weights`` is
        ``np.array(mu)`` with ``mu`` as assigned by the most recent
        evaluation. ``run_info`` is ``[iterations, episodes, n_samples,
        learning_rewards, evaluation_rewards, l_2, l_inf, fvals,
        episode_rewards, episode_t]``.
    """

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset the module-level variable
    # NOTE(review): presumably a cache used by helpers elsewhere in the
    # module - confirm.
    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    # Element 1 of every source entry is taken as the weight vector
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from
    # diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior with smaller regularization
    # (just to make sure Sigma_bar is pd)
    params = clip(
        pack(mu_bar,
             np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip**2)),
        cholesky_clip, K)

    # Add random episodes if needed
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            # Fresh weights for every random episode
            Q._w = sample_posterior(params, K)
            init_samples.append(
                utils.generate_episodes(mdp, pi_g, n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        # Rebuild the sample matrix with preprocessed states
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam, np.zeros(
        (K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K),
                    np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for Variance (only referenced by the disabled rmsprop step)
    v_t_var = 0.

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar,
                         Sigma_bar_inv, operator, i + 1, lambda_, n_weights)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params,v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            # Episode finished: start a new one with freshly sampled weights
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function: use the posterior mean as weights
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp, pi_g, render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = np.mean(
                episode_rewards[-mean_episodes - 1:-1]
            ) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(
                Q, buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             mu_bar, Sigma_bar_inv, operator, i + 1, lambda_,
                             n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    # NOTE(review): mu is only assigned inside the evaluation branch, so it
    # reflects the most recent evaluation rather than the final params, and
    # is undefined if the loop never runs (max_iter == 0) - confirm this is
    # intended.
    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]