Example #1
    def update_noise(self, i, trajs):
        # Retrain the learner on all data collected so far, re-estimate the
        # injected-noise covariance from the trajectories, and wrap the
        # supervisor with the new noise model.
        self.lnr.train()
        new_cov = noise.sample_covariance_trajs(self.env, self.lnr, trajs,
                                                self.params['t'])
        log("Estimated covariance matrix: ")
        log(new_cov)
        log("Trace: " + str(np.trace(new_cov)))
        self.sup = GaussianSupervisor(self.net_sup, new_cov)
        return self.sup
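All five examples build a GaussianSupervisor from a base supervisor and a covariance matrix. For readers without the codebase at hand, here is a minimal sketch of what such a wrapper plausibly looks like; the interface (intended_action/sample_action) is inferred from the usage above, not taken from the actual source:

    import numpy as np

    class GaussianSupervisor(object):
        """Sketch only: wraps a supervisor and injects Gaussian action noise."""

        def __init__(self, sup, cov):
            self.sup = sup                      # base (noise-free) supervisor
            self.cov = cov                      # injected-noise covariance
            self.mean = np.zeros(cov.shape[0])

        def intended_action(self, s):
            # The action the underlying supervisor would take at state s.
            return self.sup.intended_action(s)

        def sample_action(self, s):
            # Noisy action actually executed while collecting data.
            noise = np.random.multivariate_normal(self.mean, self.cov)
            return self.intended_action(s) + noise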
Example #2
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []

        d = self.params['d']
        new_cov = np.random.normal(0, 1, (d, d))
        new_cov = new_cov.T.dot(new_cov)
        new_cov = new_cov / np.trace(new_cov) * self.params['trace']
        self.sup = GaussianSupervisor(self.net_sup, new_cov)

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print("\tIteration: " + str(i))

            states, i_actions, _, _ = statistics.collect_traj(self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
            
            self.lnr.add_data(states, i_actions)

            if (i + 1) in self.params['iters']:
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print("\nData from snapshot: " + str(self.params['iters'][j]))
            it_results = self.iteration_evaluation()
            
            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
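The initialization at the top of run_iters deserves a note: for any real matrix A, A.T.dot(A) is positive semi-definite, so drawing A with i.i.d. standard-normal entries and rescaling by params['trace'] / trace(A.T A) yields a valid random covariance whose trace (the total injected variance) equals the requested budget. The same pattern as a standalone helper (the name random_trace_cov is ours, for illustration):

    import numpy as np

    def random_trace_cov(d, trace):
        """Random d x d PSD covariance with a prescribed trace (illustrative)."""
        A = np.random.normal(0, 1, (d, d))
        cov = A.T.dot(A)                    # PSD for any real A
        return cov / np.trace(cov) * trace  # rescale the total variance

    cov = random_trace_cov(3, 0.5)
    assert np.isclose(np.trace(cov), 0.5)
    assert np.all(np.linalg.eigvalsh(cov) >= -1e-10)  # PSD up to round-off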
Example #3
    def update_noise(self, i, trajs, reg_penalty):

        if i in self.params['update']:
            self.optimized_data = self.count_states(trajs)
            self.lnr.train()
            new_cov = noise.sample_covariance_trajs(self.env, self.lnr, trajs, 5, self.params['t'])
            new_cov = new_cov * reg_penalty
            print("Estimated covariance matrix: ")
            print(new_cov)
            print("Trace: " + str(np.trace(new_cov)))
            self.sup = GaussianSupervisor(self.net_sup, new_cov)
            return self.sup
        else:
            return self.sup
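Unlike Example #1, this variant only re-estimates the covariance on iterations listed in params['update'] and shrinks the estimate by reg_penalty; every other iteration reuses the current noisy supervisor. A schematic driver loop, hedged because the surrounding experiment object (here called exp) is assumed to be configured as in the other examples:

    # Schematic usage; `exp` stands for an experiment object set up as above.
    exp.params['update'] = [0, 5, 10, 15]   # re-estimate noise on these iterations
    trajs = []
    for i in range(20):
        states, i_actions, _, _ = statistics.collect_traj(exp.env, exp.sup,
                                                          exp.params['t'], False)
        trajs.append((states, i_actions))
        exp.lnr.add_data(states, i_actions)
        # Returns a refreshed supervisor on update iterations, the old one otherwise.
        exp.sup = exp.update_noise(i, trajs, reg_penalty=0.9)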
Example #4
    def prologue(self):
        """
            Preprocess hyperparameters and initialize learner and supervisor
        """
        self.params['filename'] = './experts/' + self.params['envname'] + '.pkl'
        self.env = gym.envs.make(self.params['envname'])

        self.params['d'] = self.env.action_space.shape[0]

        sess = tf.Session()
        policy = load_policy.load_policy(self.params['filename'])
        net_sup = Supervisor(policy, sess)
        init_cov = np.zeros((self.params['d'], self.params['d']))
        sup = GaussianSupervisor(net_sup, init_cov)
        est, lnr = self.reset_learner(self.params)

        self.lnr, self.sup, self.net_sup = lnr, sup, net_sup
        return self.params
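Read together with Examples #2 and #5, prologue implies a call order: construct the experiment object, let prologue build the gym environment, expert policy, and learner, then hand control to run_iters. A plausible entry point, hedged because the experiment class itself (here called Experiment) and the exact parameter values are not shown in these examples:

    import numpy as np

    # Hypothetical driver; Experiment and the parameter values are assumptions.
    params = {'envname': 'Hopper-v1', 't': 500, 'trace': 0.5,
              'iters': [5, 10, 20], 'max_data': 10000}

    exp = Experiment(params)
    exp.prologue()             # loads './experts/Hopper-v1.pkl', builds sup and lnr
    results = exp.run_iters()  # dict of numpy arrays, keyed as in Example #2
    np.save('results.npy', results)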
Example #5
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        start_time = timer.time()

        d = self.params['d']
        new_cov = np.random.normal(0, 1, (d, d))
        new_cov = new_cov.T.dot(new_cov)
        new_cov = new_cov / np.trace(new_cov) * self.params['trace']
        self.sup = GaussianSupervisor(self.net_sup, new_cov)

        data_states = []
        data_actions = []

        iteration = 0
        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)

            data_states += states
            data_actions += i_actions

            self.lnr.set_data(data_states, data_actions)

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:

            # Evaluate on the first sr state-action pairs collected (comment
            # this block out and use the zero-filled one below for time trials).
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time
        return results
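The snapshot loop above produces one entry per data budget in self.snapshot_ranges, which is exactly what a sample-efficiency curve needs. A short, assumed plotting snippet over the returned dict (matplotlib is our choice; the examples themselves never plot):

    import matplotlib.pyplot as plt

    # `results` is the dict returned by run_iters above.
    plt.plot(results['data_used'], results['rewards'], label='learner')
    plt.plot(results['data_used'], results['sup_rewards'], label='supervisor')
    plt.xlabel('state-action pairs used')
    plt.ylabel('mean reward')
    plt.legend()
    plt.savefig('sample_efficiency.png')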