Example #1
    def run_iters(self):

        results = {
            'lnr_costs': [],
            'opt_costs': [],
            'variations': [],
            'opt_variations': [],
            'param_norms': [],
            'opt_param_norms': [],
            'lambdas': [],
            'lnr_batch_costs': [],
            'opt_batch_costs': [],
            'static_regret': [],
            'rewards': [],
            'betas': [],
            'alphas': [],

        }

        d = self.env.observation_space.shape[0]
        # self.data_states = [np.zeros(d), np.zeros(d)]
        # self.data_actions = [1, 0]
        self.data_states = []
        self.data_actions = []

        for iteration in range(self.iters):
            print("\tIteration: " + str(iteration))
            print("\tData states: " + str(len(self.data_states)))
            print("\tParameters: " + str(self.lnr.est.coef_))
            self.compute_statistics(iteration, results)

            states, tmp_actions, _, _ = statistics.collect_traj(self.env, self.lnr, self.params['T'])
            i_actions = [self.sup.intended_action(s) for s in states]


            self.data_states += states
            self.data_actions += i_actions

            self.lnr.set_update(states, i_actions)
            self.lnr.update(iteration)
            
            # Adaptive regularization:
            if self.reg and (iteration + 1) % 20  == 0:
                mean_lambda = np.mean(results['lambdas'][-10:] + self.lambda_prior)
                next_alpha = mean_lambda * self.lnr.est.alpha
                self.lnr.est.alpha = self.t * next_alpha + (1 - self.t) * self.lnr.est.alpha
                self.lnr.est.eta = np.min([.0001, 1/self.lnr.est.alpha])

                print("\n\n\t\t Updated alpha: " + str(self.lnr.est.alpha))
                print("\t\t Lambda was: " + str(mean_lambda))




        for key in results.keys():
            results[key] = np.array(results[key])

        self.compute_results(results)

        return results
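
The adaptive-regularization branch above nudges the regularizer toward mean_lambda * alpha and re-derives the step size from it. A minimal standalone sketch of that update, assuming lambdas and lambda_prior are lists of ratios (the helper name adapt_alpha and the eta cap argument are illustrative, not part of the original class):

import numpy as np

def adapt_alpha(alpha, lambdas, lambda_prior, t, eta_cap=1e-4):
    # Average the recent variation ratios together with the prior, then move
    # alpha a fraction t of the way toward mean_lambda * alpha.
    mean_lambda = np.mean(list(lambdas[-10:]) + list(lambda_prior))
    next_alpha = mean_lambda * alpha
    alpha = t * next_alpha + (1 - t) * alpha
    # Step size stays tied to 1/alpha, capped from above as in the loop above.
    eta = min(eta_cap, 1.0 / alpha)
    return alpha, eta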
Example #2
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []

        beta = self.params['beta']

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i in self.params['update']:
                self.lnr.train(verbose=True)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()

            else:
                states, _, _, _ = statistics.collect_traj_beta(
                    self.env, self.sup, self.lnr, T, beta, False)
                i_actions = [self.sup.intended_action(s) for s in states]
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                beta = beta * self.params['beta']

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
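
collect_traj_beta itself is not shown on this page; from its use above (supervisor-mixed rollouts at rate beta, decayed geometrically each iteration), a plausible sketch, assuming a Gym-style env and agents that expose intended_action, is:

import numpy as np

def collect_traj_beta_sketch(env, sup, lnr, T, beta):
    # At each step act with the supervisor with probability beta and with the
    # learner otherwise; record visited states so the supervisor can relabel them.
    states = []
    s = env.reset()
    for _ in range(T):
        agent = sup if np.random.rand() < beta else lnr
        s, _, done, _ = env.step(agent.intended_action(s))
        states.append(s)
        if done:
            break
    return states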
Example #3
    def compute_results(self, results):
        _, _, _, sup_reward = statistics.collect_traj(self.env, self.sup, self.params['T'], False)
        results['sup_rewards'] = [sup_reward] * len(results['rewards'])

        # DYNAMIC REGRET
        plt.subplot(211)
        plt.title("Actual loss")
        plt.plot(results['lnr_costs'], label='lnr costs')
        plt.plot(results['opt_costs'], label='opt costs')
        plt.legend()


        difference = results['lnr_costs'] - results['opt_costs']
        plt.subplot(212)
        plt.title("Difference")
        plt.plot(difference)
        plt.tight_layout()
        
        filepath = self.path + '.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()

        # STATIC REGRET 
        plt.subplot(211)
        plt.title("Batch costs")
        plt.plot(results['lnr_batch_costs'], label='lnr costs')
        plt.plot(results['opt_batch_costs'], label='opt costs')
        plt.legend()


        plt.subplot(212)
        plt.title("Static regret (lnr batch - opt batch)")
        plt.plot(results['static_regret'])
        plt.tight_layout()
        
        filepath = self.path + '_batch.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()


        plt.subplot(111)
        plt.title("Rewards")
        plt.plot(results['rewards'], label='Learner rewards')
        plt.plot(results['sup_rewards'], label='Supervisor Rewards')
        plt.legend()
        filepath = self.path + '_reward.pdf'
        plt.savefig(filepath)
        plt.close()
        plt.cla()
        plt.clf()


        filepath = self.path + '.p'
        with open(filepath, 'wb') as f:
            pickle.dump(results, f)
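
The pickled results can be reloaded later for further analysis, e.g.:

import pickle

with open(filepath, 'rb') as f:   # the '.p' file written above
    results = pickle.load(f)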
Example #4
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #5
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []

        d = self.params['d']
        new_cov = np.random.normal(0, 1, (d, d))
        new_cov = new_cov.T.dot(new_cov)
        new_cov = new_cov / np.trace(new_cov) * self.params['trace']
        self.sup = GaussianSupervisor(self.net_sup, new_cov)

        snapshots = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)


            states, i_actions, _, _ = statistics.collect_traj(self.env, self.sup, T, False)
            trajs.append((states, i_actions))
            states, i_actions, _ = utils.filter_data(self.params, states, i_actions)
            
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()
            
            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y))


        for key in results.keys():
            results[key] = np.array(results[key])
        return results
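
The covariance handed to GaussianSupervisor above is a random positive semidefinite matrix rescaled to a target trace. As a standalone sketch (the name random_cov is illustrative):

import numpy as np

def random_cov(d, trace):
    A = np.random.normal(0, 1, (d, d))
    cov = A.T.dot(A)                      # symmetric PSD by construction
    return cov / np.trace(cov) * trace    # rescale so the trace equals `trace`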
Example #6
    def compute_statistics(self, iteration, results):

        states, tmp_actions, _, reward = statistics.collect_traj(
            self.env, self.lnr, self.params['T'], False)
        actions = [self.sup.intended_action(s) for s in states]
        d = self.env.observation_space.shape[0]
        # states += [np.zeros(d), np.zeros(d)]
        # actions += [1, 0]

        est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
        lh, ph = est.fit(states, actions)

        lnr_cost = self.lnr.est.loss(states, actions)
        opt_cost = est.loss(states, actions)

        print("\tlnr_cost: " + str(lnr_cost))
        print("\topt_cost: " + str(opt_cost))

        results['lnr_costs'].append(lnr_cost)
        results['opt_costs'].append(opt_cost)
        results['rewards'].append(reward)
        results['alphas'].append(self.lnr.est.alpha)

        curr_coef_ = self.lnr.est.coef_.copy()
        curr_opt_coef_ = est.coef_.copy()

        results['param_norms'].append(np.linalg.norm(curr_coef_))
        results['opt_param_norms'].append(np.linalg.norm(curr_opt_coef_))

        if iteration != 0:

            variation = np.linalg.norm(self.last_coef_ - curr_coef_)
            opt_variation = np.linalg.norm(self.last_opt_coef_ -
                                           curr_opt_coef_)

            last_gradient = est.gradient(self.last_states, self.last_actions,
                                         curr_coef_)
            curr_gradient = est.gradient(states, actions, curr_coef_)
            beta = np.linalg.norm(last_gradient - curr_gradient) / variation

            results['variations'].append(variation)
            results['opt_variations'].append(opt_variation)
            results['lambdas'].append(opt_variation / variation)
            results['betas'].append(beta)

        self.last_coef_ = curr_coef_.copy()
        self.last_opt_coef_ = curr_opt_coef_.copy()
        self.last_states = states
        self.last_actions = actions

        static_est = LRC(self.lnr.est.alpha, self.inner_eta, intercept=False)
        batch_states = self.data_states + states
        batch_actions = self.data_actions + actions

        lh_batch, ph_batch = static_est.fit(batch_states, batch_actions)
        opt_batch_cost = static_est.loss(batch_states, batch_actions)
        lnr_batch_cost = np.mean(results['lnr_costs'])
        static_regret = lnr_batch_cost - opt_batch_cost

        print("\tlnr_batch_cost: " + str(lnr_batch_cost))
        print("\topt_batch_cost: " + str(opt_batch_cost))
        print()

        results['lnr_batch_costs'].append(lnr_batch_cost)
        results['opt_batch_costs'].append(opt_batch_cost)
        results['static_regret'].append(static_regret)

        return results
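
Two quantities tracked above can be written apart from the class: static regret compares the running average of the learner's per-round costs with a single model refit on all pooled data, and lambda is the ratio of the comparator's parameter movement to the learner's. A minimal sketch (function names are illustrative):

import numpy as np

def static_regret(lnr_costs, opt_batch_cost):
    # Mean of the learner's per-round costs minus the loss of the best single
    # model fit in hindsight on the pooled data.
    return np.mean(lnr_costs) - opt_batch_cost

def lambda_estimate(curr_coef, last_coef, curr_opt_coef, last_opt_coef):
    # Ratio used above to drive the adaptive regularization of alpha.
    variation = np.linalg.norm(last_coef - curr_coef)
    opt_variation = np.linalg.norm(last_opt_coef - curr_opt_coef)
    return opt_variation / variation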
Example #7
    def run_iters(self):
        T = self.params['t']
        partition = self.params['partition']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        start_time = timer.time()

        trajs = []
        traj_snapshots = []
        self.optimized_data = 0

        data_states = []
        data_actions = []

        train_states = []
        train_i_actions = []

        supervisors = []

        iteration = 0
        last_data_update = 0

        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)

            data_states += states
            data_actions += i_actions
            supervisors += [self.sup] * len(states)

            rang = np.arange(0, len(states))
            np.random.shuffle(rang)

            partition_cutoff = int(partition * len(states))
            noise_states = [states[k] for k in rang[:partition_cutoff]]
            noise_actions = [i_actions[k] for k in rang[:partition_cutoff]]
            states = [states[k] for k in rang[partition_cutoff:]]
            i_actions = [i_actions[k] for k in rang[partition_cutoff:]]

            train_states += states
            train_i_actions += i_actions

            self.lnr.set_data(train_states, train_i_actions)
            trajs.append((noise_states, noise_actions))

            if iteration == 0 or len(data_states) >= (
                    last_data_update + self.params['update_period']):
                self.sup = self.update_noise(iteration, trajs)

                difference = (len(data_states) -
                              last_data_update) / self.params['update_period']
                last_data_update += difference * self.params['update_period']

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:
            # # Uncomment for actual evaluations
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            self.sup = supervisors[sr - 1]
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        log("\tTrain data: " + str(len(train_i_actions)))
        log("\tNoise opt data: " + str(self.count_states(trajs)))

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time
        return results
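
The shuffle-and-partition step above routes a fixed fraction of each new batch to the noise-optimization pool and keeps the rest for training. As a standalone sketch (the name split_partition is illustrative):

import numpy as np

def split_partition(states, actions, partition):
    # Shuffle indices, send the first `partition` fraction to the noise pool,
    # and keep the remainder as training data.
    idx = np.random.permutation(len(states))
    cut = int(partition * len(states))
    noise = ([states[k] for k in idx[:cut]], [actions[k] for k in idx[:cut]])
    train = ([states[k] for k in idx[cut:]], [actions[k] for k in idx[cut:]])
    return noise, train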
Example #8
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        trajs = []

        snapshots = []
        switch_idxs = []
        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()

            else:
                post_switch_states, post_switch_sup_actions, pre_switch_states, switch_idx, _ = statistics.collect_traj_mixed(
                    self.env, self.sup, self.lnr, T, i,
                    self.params['iters'][-1], False)

                if self.params['dagger_mixed']:
                    i_actions_dagger = [
                        self.sup.intended_action(s) for s in pre_switch_states
                    ]
                    states = pre_switch_states + post_switch_states
                    i_actions = i_actions_dagger + post_switch_sup_actions
                else:
                    states = post_switch_states
                    i_actions = post_switch_sup_actions

                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train(verbose=True)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                switch_idxs.append(switch_idx)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation(
                mixed_switch_idx=switch_idxs[j])

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #9
    def run_iters(self):
        T = self.params['t']
        partition = self.params['partition']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        trajs = []
        snapshots = []
        traj_snapshots = []
        self.optimized_data = 0

        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            self.sup = self.update_noise(i, trajs)

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, (held_out_states, held_out_actions) = \
                utils.filter_data(self.params, states, i_actions)

            rang = np.arange(0, len(held_out_states))
            np.random.shuffle(rang)
            noise_states = [held_out_states[k] for k in rang[:partition]]
            noise_actions = [held_out_actions[k] for k in rang[:partition]]

            trajs.append((noise_states, noise_actions))
            self.lnr.add_data(states, i_actions)

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                traj_snapshots.append(self.optimized_data)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            optimized_data = traj_snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(len(y) + optimized_data)
            print "\nTrain data: " + str(len(y))
            print "\n Optimize data: " + str(optimized_data)

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
Example #10
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }
        start_time = timer.time()
        trajs = []

        beta = self.params['beta']

        data_states = []
        data_actions = []

        iteration = 0
        last_data_update = 0

        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            if iteration == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
            else:
                states, tmp_actions, _, _ = statistics.collect_traj_beta(
                    self.env, self.sup, self.lnr, T, beta, False)
                states, _, _ = utils.filter_data(self.params, states,
                                                 tmp_actions)
                i_actions = [self.sup.intended_action(s) for s in states]
                beta = beta * self.params['beta']

            data_states += states
            data_actions += i_actions

            self.lnr.set_data(data_states, data_actions)

            if iteration == 0 or len(data_states) >= (
                    last_data_update + self.params['update_period']):
                self.lnr.train(verbose=True)

                difference = (len(data_states) -
                              last_data_update) / self.params['update_period']
                last_data_update += difference * self.params['update_period']

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:
            # # Uncomment for actual evaluations
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time
        return results
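
The update_period bookkeeping above retrains only when the pooled data crosses the next period boundary and then advances the marker by whole periods. A compact sketch of that arithmetic (integer division made explicit; the helper name is illustrative):

def advance_marker(n_data, last_update, period):
    # Move last_update forward by as many whole periods as the data count
    # has crossed since the previous training update.
    steps = (n_data - last_update) // period
    return last_update + steps * period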
Example #11
    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
        }

        start_time = timer.time()
        data_states = []
        data_actions = []

        iteration = 0
        while len(data_states) < self.params['max_data']:
            log("\tIteration: " + str(iteration))
            log("\tData states: " + str(len(data_states)))
            assert (len(data_states) == len(data_actions))

            states, i_actions, _, _ = statistics.collect_traj(
                self.env, self.sup, T, False)
            states, i_actions, _ = utils.filter_data(self.params, states,
                                                     i_actions)

            data_states += states
            data_actions += i_actions

            self.lnr.set_data(data_states, data_actions)

            iteration += 1

        end_time = timer.time()

        for sr in self.snapshot_ranges:

            # # Uncomment for actual evaluations
            snapshot_states = data_states[:sr]
            snapshot_actions = data_actions[:sr]

            self.lnr.set_data(snapshot_states, snapshot_actions)
            self.lnr.train(verbose=True)
            log("\nData from snapshot: " + str(sr))
            it_results = self.iteration_evaluation()

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['data_used'].append(sr)

            # Uncomment for time trials
            # results['sup_rewards'].append(0)
            # results['rewards'].append(0)
            # results['surr_losses'].append(0)
            # results['sup_losses'].append(0)
            # results['sim_errs'].append(0)
            # results['data_used'].append(0)

        for key in results.keys():
            results[key] = np.array(results[key])
        results['total_time'] = end_time - start_time

        return results
Example #12
alpha = 0.1
eta = 1.0
t = .01
regret = True

sup = FluidsSupervisor()
lnr = FluidsLearner(LRC(alpha, eta, intercept=False), sup)
env = FluidsEnv(fluids.OBS_GRID)

data_states = []
data_actions = []
sup_reward_arr = []
reward_arr = []

for iteration in range(iterations):
    states, intended_actions, taken_actions, reward, infos = statistics.collect_traj(
        env, sup, 10, True)
    sup_reward_arr.append(reward)
    states, intended_actions, taken_actions, reward, infos = statistics.collect_traj(
        env, lnr, 10, True)
    reward_arr.append(reward)
    i_actions = [sup.intended_action(s, info) for s, info in zip(states, infos)]

    data_states += states
    data_actions += i_actions

    lnr.set_data(data_states, data_actions)
    lnr.train()

plt.subplot(111)
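
The snippet stops right after plt.subplot(111); a plausible continuation, mirroring the reward plot in Example #3 and assuming the usual matplotlib.pyplot import elided from the snippet (the output filename is a placeholder, not from the original script), would be:

plt.title("Rewards")
plt.plot(reward_arr, label='Learner rewards')
plt.plot(sup_reward_arr, label='Supervisor rewards')
plt.legend()
plt.savefig('fluids_rewards.pdf')   # placeholder output path
plt.close()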
Example #13
    def run_iters(self):

        results = {
            'lnr_costs': [],
            'opt_costs': [],
            'variations': [],
            'opt_variations': [],
            'param_norms': [],
            'opt_param_norms': [],
            'lambdas': [],
            'lnr_batch_costs': [],
            'opt_batch_costs': [],
            'static_regret': [],
            'rewards': [],
            'betas': [],
            'alphas': [],
        }

        d = self.env.observation_space.shape[0]
        # self.data_states = [np.zeros(d), np.zeros(d)]
        # self.data_actions = [1, 0]
        self.data_states = []
        self.data_actions = []

        for iteration in range(self.iters):
            print("\tIteration: " + str(iteration))
            print("\tData states: " + str(len(self.data_states)))
            if len(self.data_states) > 0:
                # states/tmp_actions still hold the previous iteration's rollout
                X = np.array(states)
                y = np.array(tmp_actions)
                print("\t Coef norm: " + str(
                    np.linalg.norm(self.lnr.est.coef_) /
                    (X.shape[1] * y.shape[1])))

            # if iteration == 0 or iteration % 25 == 0:
            #     IPython.embed()

            self.compute_statistics(iteration, results)

            states, tmp_actions, _, _ = statistics.collect_traj(
                self.env, self.lnr, self.params['T'])
            i_actions = [self.sup.intended_action(s) for s in states]

            self.data_states += states
            self.data_actions += i_actions

            self.lnr.set_update(states, i_actions)
            self.lnr.multiple_update(iteration)

            # Adaptive regularization:
            if self.reg and (iteration + 1) % 10 == 0:
                mean_lambda = np.mean(results['lambdas'][-10:] +
                                      self.lambda_prior)

                mean_ratio = np.mean(
                    np.array(results['opt_costs'][-10:]) /
                    np.array(results['lnr_costs'][-10:]))

                if mean_ratio < .998:
                    next_alpha = mean_lambda * self.lnr.est.alpha
                    # self.lnr.est.alpha = (1 - mean_ratio) * next_alpha + mean_ratio * self.lnr.est.alpha
                    self.lnr.est.alpha = self.t * next_alpha + (
                        1 - self.t) * self.lnr.est.alpha
                    self.lnr.est.eta = np.min(
                        [.01, 1.0 / self.lnr.est.alpha / 10.0])

                print("\n\n\t\t Updated alpha: " + str(self.lnr.est.alpha))
                print("\t\t Mean ratio: " + str(mean_ratio))
                print("\t\t Lambda was: " + str(mean_lambda))
                print("\t\t Eta: " + str(self.lnr.est.eta))

        for key in results.keys():
            results[key] = np.array(results[key])

        self.compute_results(results)

        return results
Example #14
import argparse
import fluids
from fluids_env import FluidsEnv, FluidsVelEnv
import IPython

fluids.OBS_GRID
fluids.OBS_BIRDSEYE
fluids.OBS_GRID
fluids.OBS_NONE

if __name__ == '__main__':

    sup = FluidsSupervisor()
    env = FluidsEnv(fluids.OBS_GRID)

    states, int_actions, _, reward, infos = statistics.collect_traj(
        env, sup, 100, False)
    IPython.embed()

    # lnr_rewards = []
    # for i in range(iterations):
    #     env = gym.make("fluids-v2")
    #     sup = FluidsSupervisor(gym_fluids.agents.fluids_supervisor)
    #     states, tmp_actions, _, reward = statistics.collect_traj(env, sup, 100, True)

    #     # train model
    #     #

    #     states, tmp_actions, _, lnr_reward = statistics.collect_traj(env, lnr, 100, True)
    #     lnr_rewards.append(lnr_reward)

    # IPython.embed()
Example #15

    def run_iters(self):
        T = self.params['t']

        results = {
            'rewards': [],
            'sup_rewards': [],
            'surr_losses': [],
            'sup_losses': [],
            'sim_errs': [],
            'data_used': [],
            'biases': [],
            'variances': [],
            'biases_learner': [],
            'variances_learner': [],
            'covariate_shifts': []
        }

        trajs = []
        snapshots = []
        dist_gen_agents = []
        learner_bias, learner_variance = None, None

        for i in range(self.params['iters'][-1]):
            print "\tIteration: " + str(i)

            if i == 0:
                states, i_actions, _, _ = statistics.collect_traj(
                    self.env, self.sup, T, False)
                trajs.append((states, i_actions))
                states, i_actions, _ = utils.filter_data(
                    self.params, states, i_actions)
                self.lnr.add_data(states, i_actions)
                self.lnr.train()
                learner_last = False
                dist_gen_agent = self.sup
            else:
                # if was learner last time and variance > some quantity switch to supervisor
                if learner_last and float(learner_variance) / (
                        float(learner_bias) + float(learner_variance)
                ) > 0.5:  # TODO: can modify this threshold in various ways as see fit...
                    states, i_actions, _, _ = statistics.collect_traj(
                        self.env, self.sup, T, False)
                    trajs.append((states, i_actions))
                    states, i_actions, _ = utils.filter_data(
                        self.params, states, i_actions)
                    self.lnr.add_data(states, i_actions)
                    self.lnr.train()
                    learner_last = False
                    dist_gen_agent = self.sup
                else:
                    states, _, _, _ = statistics.collect_traj(
                        self.env, self.lnr, T, False)
                    i_actions = [self.sup.intended_action(s) for s in states]
                    states, i_actions, _ = utils.filter_data(
                        self.params, states, i_actions)
                    self.lnr.add_data(states, i_actions)
                    self.lnr.train(verbose=True)
                    learner_last = True
                    learner_bias, learner_variance = statistics.evaluate_bias_variance_learner_cont(
                        self.env, self.lnr, self.sup, T, num_samples=20)
                    dist_gen_agent = self.lnr

            if ((i + 1) in self.params['iters']):
                snapshots.append((self.lnr.X[:], self.lnr.y[:]))
                dist_gen_agents.append(dist_gen_agent)

        for j in range(len(snapshots)):
            X, y = snapshots[j]
            self.lnr.X, self.lnr.y = X, y
            self.lnr.train(verbose=True)
            print "\nData from snapshot: " + str(self.params['iters'][j])
            it_results = self.iteration_evaluation(
                dist_gen_agent=dist_gen_agents[j])

            results['sup_rewards'].append(it_results['sup_reward_mean'])
            results['rewards'].append(it_results['reward_mean'])
            results['surr_losses'].append(it_results['surr_loss_mean'])
            results['sup_losses'].append(it_results['sup_loss_mean'])
            results['sim_errs'].append(it_results['sim_err_mean'])
            results['biases'].append(it_results['biases_mean'])
            results['variances'].append(it_results['variances_mean'])
            results['biases_learner'].append(it_results['biases_learner_mean'])
            results['variances_learner'].append(
                it_results['variances_learner_mean'])
            results['covariate_shifts'].append(
                it_results['covariate_shifts_mean'])
            results['data_used'].append(len(y))

        for key in results.keys():
            results[key] = np.array(results[key])
        return results
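
The switching rule in the loop above falls back to supervisor rollouts whenever the learner's estimated variance dominates its bias. Isolated as a predicate (the helper name is illustrative; 0.5 is the threshold used above, which the original comment notes can be tuned):

def should_query_supervisor(learner_bias, learner_variance, threshold=0.5):
    # True when variance accounts for more than `threshold` of bias + variance,
    # which triggers a supervisor rollout on the next iteration.
    frac = float(learner_variance) / (float(learner_bias) + float(learner_variance))
    return frac > threshold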