def run_iters(self):
    results = {
        'lnr_costs': [],
        'opt_costs': [],
        'variations': [],
        'opt_variations': [],
        'param_norms': [],
        'opt_param_norms': [],
        'lambdas': [],
        'lnr_batch_costs': [],
        'opt_batch_costs': [],
        'static_regret': [],
        'rewards': [],
        'betas': [],
        'alphas': [],
    }
    d = self.env.observation_space.shape[0]
    # self.data_states = [np.zeros(d), np.zeros(d)]
    # self.data_actions = [1, 0]
    self.data_states = []
    self.data_actions = []

    for iteration in range(self.iters):
        print("\tIteration: " + str(iteration))
        print("\tData states: " + str(len(self.data_states)))
        print("\tParameters: " + str(self.lnr.est.coef_))

        self.compute_statistics(iteration, results)

        # Roll out the learner, relabel the visited states with the
        # supervisor's intended actions, and aggregate the data.
        states, tmp_actions, _, _ = statistics.collect_traj(
            self.env, self.lnr, self.params['T'])
        i_actions = [self.sup.intended_action(s) for s in states]
        self.data_states += states
        self.data_actions += i_actions

        self.lnr.set_update(states, i_actions)
        self.lnr.update(iteration)

        # Adaptive regularization:
        if self.reg and (iteration + 1) % 20 == 0:
            mean_lambda = np.mean(results['lambdas'][-10:] + self.lambda_prior)
            next_alpha = mean_lambda * self.lnr.est.alpha
            self.lnr.est.alpha = self.t * next_alpha + (1 - self.t) * self.lnr.est.alpha
            self.lnr.est.eta = np.min([.0001, 1 / self.lnr.est.alpha])
            print("\n\n\t\t Updated alpha: " + str(self.lnr.est.alpha))
            print("\t\t Lambda was: " + str(mean_lambda))

    for key in results.keys():
        results[key] = np.array(results[key])

    self.compute_results(results)
    return results
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []
    beta = self.params['beta']
    snapshots = []

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i in self.params['update']:
            self.lnr.train(verbose=True)
        if i == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
        else:
            states, _, _, _ = statistics.collect_traj_beta(
                self.env, self.sup, self.lnr, T, beta, False)
            i_actions = [self.sup.intended_action(s) for s in states]
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            beta = beta * self.params['beta']

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])

    return results
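# statistics.collect_traj_beta is defined elsewhere in the repo. Based on its
# call signature and the geometric beta decay above, it presumably rolls out a
# beta-mixed policy in the usual DAgger sense: at each step the supervisor acts
# with probability beta, otherwise the learner acts. The sketch below is an
# assumed stand-in, not the repo's implementation; `env.reset`/`env.step` follow
# the gym convention and `lnr.intended_action` is a placeholder method name.
import numpy as np


def collect_traj_beta_sketch(env, sup, lnr, T, beta, visualize=False):
    states, actions = [], []
    total_reward = 0.0
    s = env.reset()
    for _ in range(T):
        states.append(s)
        # With probability beta the supervisor controls the step, otherwise the learner.
        if np.random.random() < beta:
            a = sup.intended_action(s)  # the real helper may instead sample a noisy action
        else:
            a = lnr.intended_action(s)
        actions.append(a)
        s, r, done, _ = env.step(a)
        total_reward += r
        if done:
            break
    # Four return values, mirroring how the call sites above unpack collect_traj_beta.
    return states, actions, None, total_reward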
def compute_results(self, results):
    _, _, _, sup_reward = statistics.collect_traj(
        self.env, self.sup, self.params['T'], False)
    results['sup_rewards'] = [sup_reward] * len(results['rewards'])

    # DYNAMIC REGRET
    plt.subplot(211)
    plt.title("Actual loss")
    plt.plot(results['lnr_costs'], label='lnr costs')
    plt.plot(results['opt_costs'], label='opt costs')
    plt.legend()

    difference = results['lnr_costs'] - results['opt_costs']
    plt.subplot(212)
    plt.title("Difference")
    plt.plot(difference)

    plt.tight_layout()
    filepath = self.path + '.pdf'
    plt.savefig(filepath)
    plt.close()
    plt.cla()
    plt.clf()

    # STATIC REGRET
    plt.subplot(211)
    plt.title("Batch costs")
    plt.plot(results['lnr_batch_costs'], label='lnr costs')
    plt.plot(results['opt_batch_costs'], label='opt costs')
    plt.legend()

    plt.subplot(212)
    plt.title("Static regret (lnr batch - opt batch)")
    plt.plot(results['static_regret'])

    plt.tight_layout()
    filepath = self.path + '_batch.pdf'
    plt.savefig(filepath)
    plt.close()
    plt.cla()
    plt.clf()

    # REWARDS
    plt.subplot(111)
    plt.title("Rewards")
    plt.plot(results['rewards'], label='Learner rewards')
    plt.plot(results['sup_rewards'], label='Supervisor Rewards')
    plt.legend()
    filepath = self.path + '_reward.pdf'
    plt.savefig(filepath)
    plt.close()
    plt.cla()
    plt.clf()

    filepath = self.path + '.p'
    with open(filepath, 'wb') as f:
        pickle.dump(results, f)
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    snapshots = []

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(it_results['variances_learner_mean'])
        results['covariate_shifts'].append(it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])

    return results
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []

    # Draw a random positive semidefinite covariance, rescaled to the requested
    # trace, for the noise-injected supervisor.
    d = self.params['d']
    new_cov = np.random.normal(0, 1, (d, d))
    new_cov = new_cov.T.dot(new_cov)
    new_cov = new_cov / np.trace(new_cov) * self.params['trace']
    self.sup = GaussianSupervisor(self.net_sup, new_cov)

    snapshots = []
    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        states, i_actions, _, _ = statistics.collect_traj(self.env, self.sup, T, False)
        trajs.append((states, i_actions))
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])

    return results
def compute_statistics(self, iteration, results):
    states, tmp_actions, _, reward = statistics.collect_traj(
        self.env, self.lnr, self.params['T'], False)
    actions = [self.sup.intended_action(s) for s in states]
    d = self.env.observation_space.shape[0]
    # states += [np.zeros(d), np.zeros(d)]
    # actions += [1, 0]

    # Fit a fresh estimator on this iteration's data: the per-round optimal
    # comparator for dynamic regret.
    est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
    lh, ph = est.fit(states, actions)

    lnr_cost = self.lnr.est.loss(states, actions)
    opt_cost = est.loss(states, actions)
    print("\tlnr_cost: " + str(lnr_cost))
    print("\topt_cost: " + str(opt_cost))

    results['lnr_costs'].append(lnr_cost)
    results['opt_costs'].append(opt_cost)
    results['rewards'].append(reward)
    results['alphas'].append(self.lnr.est.alpha)

    curr_coef_ = self.lnr.est.coef_.copy()
    curr_opt_coef_ = est.coef_.copy()
    results['param_norms'].append(np.linalg.norm(curr_coef_))
    results['opt_param_norms'].append(np.linalg.norm(curr_opt_coef_))

    if iteration != 0:
        variation = np.linalg.norm(self.last_coef_ - curr_coef_)
        opt_variation = np.linalg.norm(self.last_opt_coef_ - curr_opt_coef_)
        last_gradient = est.gradient(self.last_states, self.last_actions, curr_coef_)
        curr_gradient = est.gradient(states, actions, curr_coef_)
        # Empirical smoothness estimate: gradient change per unit parameter change.
        beta = np.linalg.norm(last_gradient - curr_gradient) / variation

        results['variations'].append(variation)
        results['opt_variations'].append(opt_variation)
        results['lambdas'].append(opt_variation / variation)
        results['betas'].append(beta)

    self.last_coef_ = curr_coef_.copy()
    self.last_opt_coef_ = curr_opt_coef_.copy()
    self.last_states = states
    self.last_actions = actions

    # Static-regret comparator: the best single estimator fit on all data so far.
    static_est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
    batch_states = self.data_states + states
    batch_actions = self.data_actions + actions
    lh_batch, ph_batch = static_est.fit(batch_states, batch_actions)

    opt_batch_cost = static_est.loss(batch_states, batch_actions)
    lnr_batch_cost = np.mean(results['lnr_costs'])
    static_regret = lnr_batch_cost - opt_batch_cost
    print("\tlnr_batch_cost: " + str(lnr_batch_cost))
    print("\topt_batch_cost: " + str(opt_batch_cost))
    print()

    results['lnr_batch_costs'].append(lnr_batch_cost)
    results['opt_batch_costs'].append(opt_batch_cost)
    results['static_regret'].append(static_regret)
    return results
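# LRC is the repo's regularized linear estimator and is defined elsewhere; the
# code above only relies on the interface sketched below (alpha and eta
# hyperparameters, fit returning two histories, loss, gradient, and coef_).
# The class that follows is a minimal assumed stand-in (a ridge-regularized
# linear model trained by gradient steps), not the repo's actual implementation.
import numpy as np


class LRCSketch(object):
    def __init__(self, alpha, eta, intercept=False):
        self.alpha = alpha          # l2 regularization strength
        self.eta = eta              # gradient step size
        self.intercept = intercept  # ignored in this sketch
        self.coef_ = None

    def _to_arrays(self, states, actions):
        X = np.array(states)
        y = np.array(actions).reshape(len(actions), -1)
        return X, y

    def fit(self, states, actions, iters=500):
        X, y = self._to_arrays(states, actions)
        self.coef_ = np.zeros((X.shape[1], y.shape[1]))
        loss_hist, param_hist = [], []
        for _ in range(iters):
            self.coef_ = self.coef_ - self.eta * self.gradient(states, actions, self.coef_)
            loss_hist.append(self.loss(states, actions))
            param_hist.append(self.coef_.copy())
        # Two return values, mirroring the `lh, ph = est.fit(...)` unpacking above.
        return loss_hist, param_hist

    def loss(self, states, actions):
        # Mean squared error plus l2 penalty on the parameters.
        X, y = self._to_arrays(states, actions)
        resid = X.dot(self.coef_) - y
        return np.mean(resid ** 2) + self.alpha * np.sum(self.coef_ ** 2)

    def gradient(self, states, actions, coef):
        # Gradient of the loss above, evaluated at an arbitrary coef.
        X, y = self._to_arrays(states, actions)
        resid = X.dot(coef) - y
        return 2.0 * X.T.dot(resid) / len(y) + 2.0 * self.alpha * coef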
def run_iters(self):
    T = self.params['t']
    partition = self.params['partition']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()

    trajs = []
    traj_snapshots = []
    self.optimized_data = 0
    data_states = []
    data_actions = []
    train_states = []
    train_i_actions = []
    supervisors = []
    iteration = 0
    last_data_update = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        data_states += states
        data_actions += i_actions
        supervisors += [self.sup] * len(states)

        # Split this trajectory into a noise-optimization partition and a
        # training partition.
        rang = np.arange(0, len(states))
        np.random.shuffle(rang)
        partition_cutoff = int(partition * len(states))
        noise_states = [states[k] for k in rang[:partition_cutoff]]
        noise_actions = [i_actions[k] for k in rang[:partition_cutoff]]
        states = [states[k] for k in rang[partition_cutoff:]]
        i_actions = [i_actions[k] for k in rang[partition_cutoff:]]

        train_states += states
        train_i_actions += i_actions
        self.lnr.set_data(train_states, train_i_actions)
        trajs.append((noise_states, noise_actions))

        # Re-optimize the injected noise once per update period of collected data.
        if iteration == 0 or len(data_states) >= (last_data_update + self.params['update_period']):
            self.sup = self.update_noise(iteration, trajs)
            difference = (len(data_states) - last_data_update) // self.params['update_period']
            last_data_update += difference * self.params['update_period']

        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        self.sup = supervisors[sr - 1]
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    log("\tTrain data: " + str(len(train_i_actions)))
    log("\tNoise opt data: " + str(self.count_states(trajs)))

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    trajs = []
    snapshots = []
    switch_idxs = []
    switch_idx = None  # only set after the first mixed rollout

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
        else:
            # Mixed rollout: collect_traj_mixed returns the states before and
            # after a switch point, with the post-switch actions supplied by
            # the supervisor.
            post_switch_states, post_switch_sup_actions, pre_switch_states, switch_idx, _ = statistics.collect_traj_mixed(
                self.env, self.sup, self.lnr, T, i, self.params['iters'][-1], False)
            if self.params['dagger_mixed']:
                # Also relabel the pre-switch states with the supervisor's
                # intended actions, DAgger-style.
                i_actions_dagger = [
                    self.sup.intended_action(s) for s in pre_switch_states
                ]
                states = pre_switch_states + post_switch_states
                i_actions = i_actions_dagger + post_switch_sup_actions
            else:
                states = post_switch_states
                i_actions = post_switch_sup_actions
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train(verbose=True)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            switch_idxs.append(switch_idx)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation(mixed_switch_idx=switch_idxs[j])

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(it_results['variances_learner_mean'])
        results['covariate_shifts'].append(it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])

    return results
def run_iters(self):
    T = self.params['t']
    partition = self.params['partition']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []
    snapshots = []
    traj_snapshots = []
    self.optimized_data = 0

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        self.sup = self.update_noise(i, trajs)
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, (held_out_states, held_out_actions) = utils.filter_data(
            self.params, states, i_actions)

        # Sample `partition` held-out state-action pairs for noise optimization.
        rang = np.arange(0, len(held_out_states))
        np.random.shuffle(rang)
        noise_states = [held_out_states[k] for k in rang[:partition]]
        noise_actions = [held_out_actions[k] for k in rang[:partition]]
        trajs.append((noise_states, noise_actions))

        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            traj_snapshots.append(self.optimized_data)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        optimized_data = traj_snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y) + optimized_data)
        print "\nTrain data: " + str(len(y))
        print "\n Optimize data: " + str(optimized_data)

    for key in results.keys():
        results[key] = np.array(results[key])

    return results
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()

    trajs = []
    beta = self.params['beta']
    data_states = []
    data_actions = []
    iteration = 0
    last_data_update = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        if iteration == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
        else:
            states, tmp_actions, _, _ = statistics.collect_traj_beta(
                self.env, self.sup, self.lnr, T, beta, False)
            states, _, _ = utils.filter_data(self.params, states, tmp_actions)
            i_actions = [self.sup.intended_action(s) for s in states]
            beta = beta * self.params['beta']

        data_states += states
        data_actions += i_actions
        self.lnr.set_data(data_states, data_actions)

        # Retrain only once per update period worth of new data.
        if iteration == 0 or len(data_states) >= (last_data_update + self.params['update_period']):
            self.lnr.train(verbose=True)
            difference = (len(data_states) - last_data_update) // self.params['update_period']
            last_data_update += difference * self.params['update_period']

        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()

    data_states = []
    data_actions = []
    iteration = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        data_states += states
        data_actions += i_actions
        self.lnr.set_data(data_states, data_actions)
        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
alpha = 0.1
eta = 1.0
t = .01
regret = True

sup = FluidsSupervisor()
lnr = FluidsLearner(LRC(alpha, eta, intercept=False), sup)
env = FluidsEnv(fluids.OBS_GRID)

data_states = []
data_actions = []
sup_reward_arr = []
reward_arr = []

for iteration in range(iterations):  # `iterations` is assumed to be defined earlier in this script
    # One supervisor rollout for the reward baseline.
    states, intended_actions, taken_actions, reward, infos = statistics.collect_traj(
        env, sup, 10, True)
    sup_reward_arr.append(reward)

    # One learner rollout, relabeled with the supervisor's intended actions.
    states, intended_actions, taken_actions, reward, infos = statistics.collect_traj(
        env, lnr, 10, True)
    reward_arr.append(reward)

    i_actions = []
    for i in range(len(states)):
        i_actions += [sup.intended_action(states[i], infos[i])]

    data_states += states
    data_actions += i_actions
    lnr.set_data(data_states, data_actions)
    lnr.train()

plt.subplot(111)
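# The original script is cut off after plt.subplot(111). A plausible
# continuation, mirroring the reward plot in compute_results above, is sketched
# here as an assumption; the output filename is a placeholder.
plt.title("Rewards")
plt.plot(reward_arr, label='Learner rewards')
plt.plot(sup_reward_arr, label='Supervisor rewards')
plt.legend()
plt.savefig('fluids_rewards.pdf')
plt.close()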
def run_iters(self):
    results = {
        'lnr_costs': [],
        'opt_costs': [],
        'variations': [],
        'opt_variations': [],
        'param_norms': [],
        'opt_param_norms': [],
        'lambdas': [],
        'lnr_batch_costs': [],
        'opt_batch_costs': [],
        'static_regret': [],
        'rewards': [],
        'betas': [],
        'alphas': [],
    }
    d = self.env.observation_space.shape[0]
    # self.data_states = [np.zeros(d), np.zeros(d)]
    # self.data_actions = [1, 0]
    self.data_states = []
    self.data_actions = []

    for iteration in range(self.iters):
        print("\tIteration: " + str(iteration))
        print("\tData states: " + str(len(self.data_states)))
        if len(self.data_states) > 0:
            # `states` and `tmp_actions` still hold the previous iteration's rollout here.
            X = np.array(states)
            y = np.array(tmp_actions)
            print("\t Coef norm: " + str(
                np.linalg.norm(self.lnr.est.coef_) / (X.shape[1] * y.shape[1])))
        # if iteration == 0 or iteration % 25 == 0:
        #     IPython.embed()

        self.compute_statistics(iteration, results)

        states, tmp_actions, _, _ = statistics.collect_traj(
            self.env, self.lnr, self.params['T'])
        i_actions = [self.sup.intended_action(s) for s in states]
        self.data_states += states
        self.data_actions += i_actions

        self.lnr.set_update(states, i_actions)
        self.lnr.multiple_update(iteration)

        # Adaptive regularization:
        if self.reg and (iteration + 1) % 10 == 0:
            mean_lambda = np.mean(results['lambdas'][-10:] + self.lambda_prior)
            mean_ratio = np.mean(
                np.array(results['opt_costs'][-10:]) /
                np.array(results['lnr_costs'][-10:]))
            if mean_ratio < .998:
                next_alpha = mean_lambda * self.lnr.est.alpha
                # self.lnr.est.alpha = (1 - mean_ratio) * next_alpha + mean_ratio * self.lnr.est.alpha
                self.lnr.est.alpha = self.t * next_alpha + (1 - self.t) * self.lnr.est.alpha
                self.lnr.est.eta = np.min([.01, 1.0 / self.lnr.est.alpha / 10.0])
                print("\n\n\t\t Updated alpha: " + str(self.lnr.est.alpha))
                print("\t\t Mean ratio: " + str(mean_ratio))
                print("\t\t Lambda was: " + str(mean_lambda))
                print("\t\t Eta: " + str(self.lnr.est.eta))

    for key in results.keys():
        results[key] = np.array(results[key])

    self.compute_results(results)
    return results
import argparse
import fluids
import IPython

from fluids_env import FluidsEnv, FluidsVelEnv
# `statistics` is the repo's trajectory-statistics module; this import (and the
# one for FluidsSupervisor) is assumed, since neither appears in the original.
import statistics

# Available fluids observation modes: fluids.OBS_GRID, fluids.OBS_BIRDSEYE, fluids.OBS_NONE

if __name__ == '__main__':
    sup = FluidsSupervisor()
    env = FluidsEnv(fluids.OBS_GRID)
    states, int_actions, _, reward, infos = statistics.collect_traj(
        env, sup, 100, False)
    IPython.embed()

    # lnr_rewards = []
    # for i in range(iterations):
    #     env = gym.make("fluids-v2")
    #     sup = FluidsSupervisor(gym_fluids.agents.fluids_supervisor)
    #     states, tmp_actions, _, reward = statistics.collect_traj(env, sup, 100, True)
    #     # train model
    #     #
    #     # states, tmp_actions, _, lnr_reward = statistics.collect_traj(env, lnr, 100, True)
    #     lnr_rewards.append(lnr_reward)
    # IPython.embed()
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    trajs = []
    snapshots = []
    dist_gen_agents = []
    learner_bias, learner_variance = None, None

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
            learner_last = False
            dist_gen_agent = self.sup
        else:
            # If the learner generated the last batch and its variance dominates
            # its bias, fall back to the supervisor for this iteration.
            if learner_last and float(learner_variance) / (
                    float(learner_bias) + float(learner_variance)) > 0.5:
                # TODO: can modify this threshold in various ways as see fit...
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()
                learner_last = False
                dist_gen_agent = self.sup
            else:
                states, _, _, _ = statistics.collect_traj(
                    self.env, self.lnr, T, False)
                i_actions = [self.sup.intended_action(s) for s in states]
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train(verbose=True)
                learner_last = True
                learner_bias, learner_variance = statistics.evaluate_bias_variance_learner_cont(
                    self.env, self.lnr, self.sup, T, num_samples=20)
                dist_gen_agent = self.lnr

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            dist_gen_agents.append(dist_gen_agent)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation(dist_gen_agent=dist_gen_agents[j])

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(it_results['variances_learner_mean'])
        results['covariate_shifts'].append(it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])

    return results