def update_noise(self, i, trajs):
    """Retrain the learner and refresh the supervisor's noise model.

    Trains self.lnr on its accumulated data, estimates an action-noise
    covariance from the given trajectories, and rebuilds self.sup as a
    GaussianSupervisor around self.net_sup using that covariance.

    Returns the new supervisor (also stored on self.sup).
    """
    self.lnr.train()
    # Estimate the injected-noise covariance from the collected trajectories.
    cov_estimate = noise.sample_covariance_trajs(
        self.env, self.lnr, trajs, self.params['t'])
    log("Estimated covariance matrix: ")
    log(cov_estimate)
    log("Trace: " + str(np.trace(cov_estimate)))
    self.sup = GaussianSupervisor(self.net_sup, cov_estimate)
    return self.sup
def run_iters(self):
    """Collect supervisor rollouts, snapshot the learner's dataset at the
    requested iteration counts, then train/evaluate on each snapshot.

    Returns a dict of numpy arrays keyed by metric name
    ('rewards', 'sup_rewards', 'surr_losses', 'sup_losses',
    'sim_errs', 'data_used').
    """
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }
    trajs = []

    # Draw a random positive semi-definite covariance and rescale it to
    # the requested trace for the initial noisy supervisor.
    d = self.params['d']
    new_cov = np.random.normal(0, 1, (d, d))
    new_cov = new_cov.T.dot(new_cov)
    new_cov = new_cov / np.trace(new_cov) * self.params['trace']
    self.sup = GaussianSupervisor(self.net_sup, new_cov)

    snapshots = []
    for i in range(self.params['iters'][-1]):
        # print() with a single pre-joined string behaves identically
        # under Python 2 and 3 (the Py2-only `print x` statement did not).
        print("\tIteration: " + str(i))
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        trajs.append((states, i_actions))
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        self.lnr.add_data(states, i_actions)
        if (i + 1) in self.params['iters']:
            # Shallow-copy the learner's dataset so later retraining on a
            # snapshot does not alias the live arrays.
            snapshots.append((self.lnr.X[:], self.lnr.y[:]))

    for j, (X, y) in enumerate(snapshots):
        self.lnr.X, self.lnr.y = X, y
        self.lnr.train(verbose=True)
        print("\nData from snapshot: " + str(self.params['iters'][j]))
        it_results = self.iteration_evaluation()
        results['sup_rewards'].append(it_results['sup_reward_mean'])
        results['rewards'].append(it_results['reward_mean'])
        results['surr_losses'].append(it_results['surr_loss_mean'])
        results['sup_losses'].append(it_results['sup_loss_mean'])
        results['sim_errs'].append(it_results['sim_err_mean'])
        results['data_used'].append(len(y))

    for key in results.keys():
        results[key] = np.array(results[key])
    return results
def update_noise(self, i, trajs, reg_penalty):
    """On designated update iterations, retrain the learner and refresh
    the supervisor's noise covariance; otherwise keep the current one.

    The freshly estimated covariance is scaled by reg_penalty before
    being wrapped into a new GaussianSupervisor.

    Returns the (possibly unchanged) supervisor, also stored on self.sup.
    """
    # Guard clause: only refresh on iterations listed in params['update'].
    if i not in self.params['update']:
        return self.sup

    self.optimized_data = self.count_states(trajs)
    self.lnr.train()
    new_cov = noise.sample_covariance_trajs(
        self.env, self.lnr, trajs, 5, self.params['t'])
    new_cov = new_cov * reg_penalty
    # print() with a single argument is identical under Python 2 and 3;
    # the original Py2-only `print x` statements broke Py3 compatibility.
    print("Estimated covariance matrix: ")
    print(new_cov)
    print(np.trace(new_cov))
    self.sup = GaussianSupervisor(self.net_sup, new_cov)
    return self.sup
def prologue(self):
    """Preprocess hyperparameters and initialize learner and supervisor.

    Resolves the expert policy file from the env name, builds the gym
    environment, wraps the loaded expert in a zero-noise
    GaussianSupervisor, and resets the learner. Returns self.params.
    """
    self.params['filename'] = './experts/' + self.params['envname'] + '.pkl'
    self.env = gym.envs.make(self.params['envname'])
    self.params['d'] = self.env.action_space.shape[0]

    session = tf.Session()
    expert_policy = load_policy.load_policy(self.params['filename'])
    supervisor_net = Supervisor(expert_policy, session)

    # Start with zero injected noise; the covariance is re-estimated later.
    zero_cov = np.zeros((self.params['d'], self.params['d']))
    noisy_sup = GaussianSupervisor(supervisor_net, zero_cov)

    _, learner = self.reset_learner(self.params)
    self.lnr, self.sup, self.net_sup = learner, noisy_sup, supervisor_net
    return self.params
def run_iters(self):
    """Collect supervisor rollouts until max_data states are gathered,
    then train and evaluate the learner on growing data prefixes.

    Prefix sizes come from self.snapshot_ranges. Returns a dict of
    numpy metric arrays plus 'total_time', the wall-clock seconds spent
    on data collection.
    """
    T = self.params['t']
    results = {
        'rewards': [],
        'sup_rewards': [],
        'surr_losses': [],
        'sup_losses': [],
        'sim_errs': [],
        'data_used': [],
    }

    start_time = timer.time()

    # Random positive semi-definite covariance rescaled to the configured
    # trace for the noisy supervisor.
    d = self.params['d']
    cov = np.random.normal(0, 1, (d, d))
    cov = cov.T.dot(cov)
    cov = cov / np.trace(cov) * self.params['trace']
    self.sup = GaussianSupervisor(self.net_sup, cov)

    data_states, data_actions = [], []
    iteration = 0
    while len(data_states) < self.params['max_data']:
        log("\tIteration: " + str(iteration))
        log("\tData states: " + str(len(data_states)))
        assert (len(data_states) == len(data_actions))
        states, i_actions, _, _ = statistics.collect_traj(
            self.env, self.sup, T, False)
        states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
        # In-place extend: the learner may hold a reference to these lists.
        data_states += states
        data_actions += i_actions
        self.lnr.set_data(data_states, data_actions)
        iteration += 1

    end_time = timer.time()

    for sr in self.snapshot_ranges:
        # # Uncomment for actual evaluations
        prefix_states = data_states[:sr]
        prefix_actions = data_actions[:sr]
        self.lnr.set_data(prefix_states, prefix_actions)
        self.lnr.train(verbose=True)
        log("\nData from snapshot: " + str(sr))
        it_results = self.iteration_evaluation()
        for metric, stat in (('sup_rewards', 'sup_reward_mean'),
                             ('rewards', 'reward_mean'),
                             ('surr_losses', 'surr_loss_mean'),
                             ('sup_losses', 'sup_loss_mean'),
                             ('sim_errs', 'sim_err_mean')):
            results[metric].append(it_results[stat])
        results['data_used'].append(sr)

        # Uncomment for time trials
        # results['sup_rewards'].append(0)
        # results['rewards'].append(0)
        # results['surr_losses'].append(0)
        # results['sup_losses'].append(0)
        # results['sim_errs'].append(0)
        # results['data_used'].append(0)

    for key in results.keys():
        results[key] = np.array(results[key])
    results['total_time'] = end_time - start_time
    return results