def run_iters(self):
    """DAgger-style variant: beta-mixed rollouts relabeled with the
    supervisor's intended actions; beta decays geometrically."""
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []
    beta = self.params['beta']
    snapshots = []

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i in self.params['update']:
            self.lnr.train(verbose=True)
        if i == 0:
            # Seed the dataset with a pure supervisor rollout.
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
        else:
            # Beta-mixed rollout: the supervisor acts with probability
            # beta, the learner otherwise; every visited state is
            # relabeled with the supervisor's intended action.
            states, _, _, _ = statistics.collect_traj_beta(
                self.env, self.sup, self.lnr, T, beta, False)
            i_actions = [self.sup.intended_action(s) for s in states]
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            beta = beta * self.params['beta']

        if (i + 1) in self.params['iters']:
            # Shallow-copy the aggregated dataset for later evaluation.
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    # Retrain from each snapshot so every evaluation reflects a model
    # trained on exactly that much data.
    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
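
# A minimal sketch of the beta-mixed rollout that collect_traj_beta is
# assumed to perform above: the supervisor controls each step with
# probability beta, the learner otherwise, and every visited state is
# labeled with the supervisor's intended action. All names here
# (env.reset/env.step, sample_action, intended_action) mirror how the
# helpers are used in this file but are otherwise assumptions, not the
# confirmed API.
def collect_traj_beta_sketch(env, sup, lnr, T, beta):
    states, sup_actions = [], []
    s = env.reset()
    for _ in range(T):
        if np.random.uniform() < beta:
            a = sup.sample_action(s)  # supervisor drives this step
        else:
            a = lnr.sample_action(s)  # learner drives this step
        states.append(s)
        sup_actions.append(sup.intended_action(s))  # noise-free label
        s, _, done, _ = env.step(a)
        if done:
            break
    return states, sup_actions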
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    snapshots = []

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states,
                                                 i_actions)
        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(
            it_results['variances_learner_mean'])
        results['covariate_shifts'].append(
            it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
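
# The snapshot-and-retrain pattern shared by all of these run_iters
# variants, in isolation: copy the aggregated dataset at chosen
# iteration counts, then retrain from each copy afterwards so every
# evaluation reflects a model trained on exactly that much data.
# `collect_and_label` and `evaluate` are hypothetical stand-ins for the
# per-variant collection and evaluation steps.
def snapshot_pattern_sketch(lnr, iters, collect_and_label, evaluate):
    snapshots = []
    for i in range(iters[-1]):
        X_new, y_new = collect_and_label()
        lnr.add_data(X_new, y_new)
        if (i + 1) in iters:
            snapshots.append((lnr.X[:], lnr.y[:]))  # shallow list copies
    for X, y in snapshots:
        lnr.X, lnr.y = X, y
        lnr.train()
        evaluate(lnr)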
def run_iters(self):
    """Behavior cloning under a noise-injecting supervisor: a random
    positive semidefinite covariance, rescaled to a fixed trace, is
    drawn once and used to perturb the supervisor's actions."""
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []

    # Draw a random PSD covariance (A^T A is always PSD) and rescale it
    # so its trace equals the configured total injected variance.
    d = self.params['d']
    new_cov = np.random.normal(0, 1, (d, d))
    new_cov = new_cov.T.dot(new_cov)
    new_cov = new_cov / np.trace(new_cov) * self.params['trace']
    self.sup = GaussianSupervisor(self.net_sup, new_cov)

    snapshots = []
    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        trajs.append((states, i_actions))
        states, i_actions, _ = utils.filter_data(self.params, states,
                                                 i_actions)
        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
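
# A minimal sketch of what GaussianSupervisor is assumed to do: wrap a
# base supervisor and add zero-mean Gaussian noise with the supplied
# covariance to its executed actions, while intended_action stays
# noise-free so training labels are clean. Method names mirror how the
# class is used in this file but are otherwise assumptions.
class GaussianSupervisorSketch(object):
    def __init__(self, sup, cov):
        self.sup = sup
        self.cov = cov
        self.mean = np.zeros(cov.shape[0])

    def intended_action(self, s):
        return self.sup.intended_action(s)  # clean label

    def sample_action(self, s):
        # Executed action = intended action + Gaussian perturbation.
        noise = np.random.multivariate_normal(self.mean, self.cov)
        return self.sup.intended_action(s) + noise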
def run_iters(self):
    """Noise-optimizing variant with a data budget: each rollout is
    split into a training partition and a held-out partition used to
    refit the injected noise once per update_period of new states."""
    T = self.params['t']
    partition = self.params['partition']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()
    trajs = []
    traj_snapshots = []
    self.optimized_data = 0
    data_states = []
    data_actions = []
    train_states = []
    train_i_actions = []
    supervisors = []
    iteration = 0
    last_data_update = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states,
                                                 i_actions)
        data_states += states
        data_actions += i_actions
        # Remember which supervisor generated each state so snapshot
        # evaluations can restore it.
        supervisors += [self.sup] * len(states)

        # Randomly split the rollout: a `partition` fraction is
        # reserved for noise optimization, the rest goes to training.
        rang = np.arange(0, len(states))
        np.random.shuffle(rang)
        partition_cutoff = int(partition * len(states))
        noise_states = [states[k] for k in rang[:partition_cutoff]]
        noise_actions = [i_actions[k] for k in rang[:partition_cutoff]]
        states = [states[k] for k in rang[partition_cutoff:]]
        i_actions = [i_actions[k] for k in rang[partition_cutoff:]]

        train_states += states
        train_i_actions += i_actions
        self.lnr.set_data(train_states, train_i_actions)
        trajs.append((noise_states, noise_actions))

        # Refit the injected noise once per update_period of new states.
        if iteration == 0 or len(data_states) >= (
                last_data_update + self.params['update_period']):
            self.sup = self.update_noise(iteration, trajs)
            difference = (len(data_states) -
                          last_data_update) / self.params['update_period']
            last_data_update += difference * self.params['update_period']
        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        self.sup = supervisors[sr - 1]
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    log("\tTrain data: " + str(len(train_i_actions)))
    log("\tNoise opt data: " + str(self.count_states(trajs)))

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
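
# update_noise is defined elsewhere in the repo. One plausible shape
# for it, assuming DART-style noise fitting: estimate the learner's
# error covariance on the held-out noise partition and wrap the base
# supervisor with it. The body below is an assumption, not the repo's
# implementation; lnr.intended_action in particular is an assumed call.
def update_noise_sketch(self, iteration, trajs):
    errors = []
    for noise_states, noise_actions in trajs:
        for s, a in zip(noise_states, noise_actions):
            # Learner error against the supervisor label.
            errors.append(np.array(a) - np.array(self.lnr.intended_action(s)))
    if not errors:
        return self.sup  # nothing to fit yet (first call may see no data)
    E = np.array(errors)
    cov = E.T.dot(E) / len(E)  # empirical error covariance
    return GaussianSupervisor(self.net_sup, cov)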
def run_iters(self):
    """Mixed-rollout variant: the learner drives each trajectory up to
    a switch point, the supervisor drives the rest; with dagger_mixed
    the learner-driven prefix is also relabeled and kept."""
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    trajs = []
    snapshots = []
    switch_idxs = []

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
        else:
            (post_switch_states, post_switch_sup_actions,
             pre_switch_states, switch_idx,
             _) = statistics.collect_traj_mixed(
                 self.env, self.sup, self.lnr, T, i,
                 self.params['iters'][-1], False)
            if self.params['dagger_mixed']:
                # Relabel the learner-driven prefix with supervisor
                # actions and keep it alongside the supervisor suffix.
                i_actions_dagger = [
                    self.sup.intended_action(s) for s in pre_switch_states
                ]
                states = pre_switch_states + post_switch_states
                i_actions = i_actions_dagger + post_switch_sup_actions
            else:
                # Train only on the supervisor-driven suffix.
                states = post_switch_states
                i_actions = post_switch_sup_actions
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train(verbose=True)

        if (i + 1) in self.params['iters']:
            # Note: switch_idx is only defined after a mixed rollout, so
            # this assumes the first snapshot comes after iteration 0.
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            switch_idxs.append(switch_idx)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation(
            mixed_switch_idx=switch_idxs[j])

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(
            it_results['variances_learner_mean'])
        results['covariate_shifts'].append(
            it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
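
# One plausible reading of collect_traj_mixed, under the assumption
# that the switch point scales with the iteration count: the learner
# drives the first switch_idx steps, the supervisor drives the rest,
# and only supervisor-driven states come back with labels. Every name
# here is an assumption, not the repo's confirmed API.
def collect_traj_mixed_sketch(env, sup, lnr, T, i, total_iters):
    switch_idx = int(T * float(i) / total_iters)  # assumed schedule
    pre_states, post_states, post_actions = [], [], []
    s = env.reset()
    for t in range(T):
        if t < switch_idx:
            a = lnr.sample_action(s)  # learner-driven prefix
            pre_states.append(s)
        else:
            a = sup.sample_action(s)  # supervisor-driven suffix
            post_states.append(s)
            post_actions.append(sup.intended_action(s))
        s, _, done, _ = env.step(a)
        if done:
            break
    return post_states, post_actions, pre_states, switch_idx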
def run_iters(self):
    """Noise-updating variant: the injected noise is refit every
    iteration from held-out states, which never enter the training
    set."""
    T = self.params['t']
    partition = self.params['partition']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []
    snapshots = []
    traj_snapshots = []
    self.optimized_data = 0

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        self.sup = self.update_noise(i, trajs)
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, (held_out_states,
                            held_out_actions) = utils.filter_data(
                                self.params, states, i_actions)

        # Divert up to `partition` held-out states to noise
        # optimization. (Note: unlike the budgeted variant, `partition`
        # appears to be used here as an absolute count, not a fraction.)
        rang = np.arange(0, len(held_out_states))
        np.random.shuffle(rang)
        noise_states = [held_out_states[k] for k in rang[:partition]]
        noise_actions = [held_out_actions[k] for k in rang[:partition]]
        trajs.append((noise_states, noise_actions))
        self.lnr.add_data(states, i_actions)

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            traj_snapshots.append(self.optimized_data)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        optimized_data = traj_snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y) + optimized_data)
        print "\nTrain data: " + str(len(y))
        print "\nOptimize data: " + str(optimized_data)

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
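
# filter_data's assumed contract, inferred from its call sites in this
# file: subsample or clean the rollout for training and return the
# remainder as a held-out third value (ignored as `_` in the other
# variants). A hedged sketch; `keep_frac` stands in for whatever field
# of `params` actually controls the split.
def filter_data_sketch(params, states, actions, keep_frac=0.8):
    idx = np.random.permutation(len(states))
    n_keep = int(keep_frac * len(states))
    kept, held = idx[:n_keep], idx[n_keep:]
    return ([states[k] for k in kept],
            [actions[k] for k in kept],
            ([states[k] for k in held], [actions[k] for k in held]))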
def run_iters(self):
    """DAgger-style variant with a data budget: beta-mixed rollouts are
    relabeled by the supervisor until max_data states are collected,
    retraining once per update_period of new states."""
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()
    trajs = []
    beta = self.params['beta']
    data_states = []
    data_actions = []
    iteration = 0
    last_data_update = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        if iteration == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
        else:
            states, tmp_actions, _, _ = statistics.collect_traj_beta(
                self.env, self.sup, self.lnr, T, beta, False)
            # Filter first so the supervisor is only queried on states
            # that survive filtering.
            states, _, _ = utils.filter_data(self.params, states,
                                             tmp_actions)
            i_actions = [self.sup.intended_action(s) for s in states]
            beta = beta * self.params['beta']

        data_states += states
        data_actions += i_actions
        self.lnr.set_data(data_states, data_actions)

        # Retrain once per update_period of newly collected states.
        if iteration == 0 or len(data_states) >= (
                last_data_update + self.params['update_period']):
            self.lnr.train(verbose=True)
            difference = (len(data_states) -
                          last_data_update) / self.params['update_period']
            last_data_update += difference * self.params['update_period']
        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
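
# Tiny worked example of the update_period bookkeeping above. Crossing
# more than one period in a single rollout advances last_data_update by
# whole periods only, so retraining frequency stays tied to data volume.
# The source relies on Python 2 integer division; `//` makes that
# explicit here.
def update_period_example():
    last_data_update, update_period = 0, 100
    for n_states in [40, 130, 310]:  # cumulative dataset sizes
        if n_states >= last_data_update + update_period:
            difference = (n_states - last_data_update) // update_period
            last_data_update += difference * update_period
        print(last_data_update)  # 0, then 100, then 300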
def run_iters(self):
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    start_time = timer.time()
    data_states = []
    data_actions = []
    iteration = 0

    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert len(data_states) == len(data_actions)

        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states,
                                                 i_actions)
        data_states += states
        data_actions += i_actions
        self.lnr.set_data(data_states, data_actions)
        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # Uncomment for actual evaluations
        snapshot_states = data_states[:sr]
        snapshot_actions = data_actions[:sr]
        self.lnr.set_data(snapshot_states, snapshot_actions)
        self.lnr.train(verbose=True)
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results
def run_iters(self):
    """Adaptive variant: roll out the learner until variance dominates
    its error estimate, then fall back to supervisor rollouts."""
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
        'biases': [],
        'variances': [],
        'biases_learner': [],
        'variances_learner': [],
        'covariate_shifts': []
    }
    trajs = []
    snapshots = []
    dist_gen_agents = []
    learner_bias, learner_variance = None, None

    for i in range(self.params['iters'][-1]):
        print "\tIteration: " + str(i)
        if i == 0:
            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(
                self.params, states, i_actions)
            self.lnr.add_data(states, i_actions)
            self.lnr.train()
            learner_last = False
            dist_gen_agent = self.sup
        else:
            # If the learner rolled out last time and variance accounts
            # for more than half of bias + variance, switch back to the
            # supervisor. TODO: this threshold can be tuned as needed.
            if learner_last and float(learner_variance) / (
                    float(learner_bias) + float(learner_variance)) > 0.5:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()
                learner_last = False
                dist_gen_agent = self.sup
            else:
                # Learner rollout relabeled with supervisor actions,
                # followed by a fresh bias/variance estimate.
                states, _, _, _ = statistics.collect_traj(
                    self.env, self.lnr, T, False)
                i_actions = [self.sup.intended_action(s) for s in states]
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train(verbose=True)
                learner_last = True
                learner_bias, learner_variance = \
                    statistics.evaluate_bias_variance_learner_cont(
                        self.env, self.lnr, self.sup, T, num_samples=20)
                dist_gen_agent = self.lnr

        if (i + 1) in self.params['iters']:
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))
            dist_gen_agents.append(dist_gen_agent)

    for j in range(len(snapshots)):
        X, y = snapshots[j]
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print "\nData from snapshot: " + str(self.params['iters'][j])
        it_results = self.iteration_evaluation(
            dist_gen_agent=dist_gen_agents[j])

        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['biases'].append(it_results['biases_mean'])
        results['variances'].append(it_results['variances_mean'])
        results['biases_learner'].append(it_results['biases_learner_mean'])
        results['variances_learner'].append(
            it_results['variances_learner_mean'])
        results['covariate_shifts'].append(
            it_results['covariate_shifts_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
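
# The switching rule above in isolation: keep rolling out the learner
# while bias dominates its error; once variance exceeds half of
# bias + variance, fall back to a supervisor rollout. The 0.5 threshold
# is the code's own TODO-marked knob.
def use_supervisor_next(learner_last, bias, variance, threshold=0.5):
    if not learner_last:
        # The supervisor rolled out last time; give the learner a turn.
        return False
    return float(variance) / (float(bias) + float(variance)) > threshold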