Example no. 1
    def run(self, controls: np.ndarray) -> traj.Trajectory:
        c = np.zeros((self.time_steps, self.control_size))
        num_intervals = len(controls)//self.control_size
        interval_length = self.time_steps//num_intervals

        assert interval_length * num_intervals == self.time_steps, "The number of time steps must be evenly divisible by the number of control intervals."

        j = 0
        for i in range(num_intervals):
            # Hold each chunk of `control_size` control values constant across its interval.
            c[i * interval_length: (i + 1) * interval_length] = [controls[j + k] for k in range(self.control_size)]
            j += self.control_size

        obser = self.reset()
        s = [obser]
        for i in range(self.time_steps):
            try:
                results = self.env.step(c[i])
            except Exception:
                # The simulation can become unstable and raise inside env.step; return a null trajectory.
                print("Caught unstable simulation; skipping.")
                return traj.Trajectory(None, None, null=True)
            if isinstance(self, Fetch):
                obser = self.state
            else:
                obser = results[0]
            s.append(obser)
            if results[2]:  # done flag from the environment
                break
        # Match the length of the control array to the number of recorded states:
        # trim on early termination, pad with a zero control on full-length runs.
        if len(s) <= self.time_steps:
            c = c[:len(s), :]
        else:
            c = np.append(c, [np.zeros(self.control_size)], axis=0)
        return traj.Trajectory(np.array([s]), np.array([c]))
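
A minimal usage sketch (not from the source): it assumes `env` is an instance of the class above with control_size == 2 and time_steps == 100; the control values are illustrative.

# Hypothetical usage of run(): two piecewise-constant control intervals of 50 steps each.
flat_controls = np.array([0.5, -0.5,   # held over steps 0-49
                          0.0,  1.0])  # held over steps 50-99
t = env.run(flat_controls)             # returns a traj.Trajectory (or a null trajectory if the simulation is unstable)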
Example no. 2
    def build_traj(self, query: int) -> traj.Trajectory:
        # states[agent][time][state]
        states = np.array([
            np.array([
                self.xs[query][t][i].eval() for t in range(self.query_length)
            ]) for i in range(self.domain.num_agents)
        ])

        # controls[agent][time][control]
        controls = np.array([
            np.array([
                self.us[query][t][i].eval() for t in range(self.query_length)
            ]) for i in range(self.domain.num_agents)
        ])

        return traj.Trajectory(states, controls)
Example no. 3
    def watch(self, t: traj.Trajectory, seed: int = None):
        # Demonstrations may store a single discrete action per step; convert each discrete action
        # index to the two-dimensional control format expected by the parent class's watch().
        if len(t.controls[0][0]) == 1:
            mapping = {1: [0, -1], 2: [1, 0], 3: [0, 1], 0: [0, 0]}
            controls = [mapping[a[0]] for a in t.controls[0]]
            t = traj.Trajectory(t.states, np.array([controls]))
        super(LunarLander, self).watch(t, seed)
Example no. 4
    def fetch_to_mujoco(self, t: traj.Trajectory) -> traj.Trajectory:
        # Replay a Fetch-format trajectory through the simulator, extending each state with the
        # first three elements returned by env.step.
        self.reset()
        new_states = []
        # Start from the second state so that t.controls[0][i - 1] refers to the preceding control.
        for i in range(1, len(t.states[0])):
            self.state = t.states[0][i]
            results = self.env.step(t.controls[0][i - 1])
            self.state.extend(results[:3])
            new_states.append(self.state)
        self.reset()
        return traj.Trajectory(np.array([new_states]), np.array([t.controls[0][1:]]))
Example no. 5
    def simulate(self, weights: np.ndarray, w: world.World = None, fixed_ctrl: typing.List = None, iter_count: int = 0, random_start: bool = False) -> traj.Trajectory:
        """
        Simulates the behavior generated by a simpleOptimizerCar behaving according to w_true in the given world. If no
        world or fixed_ctrl is provided, will run with self.w and self.fixed_ctrl.

        :param weights: the true weight of the reward function according to which the car will behave.
        :param w: the world on which to simulate the car's behavior.
        :param fixed_ctrl: the fixed behavior of the other car in the world.
        :return:
        """
        if not w:
            w = self.w
        if not fixed_ctrl:
            # Fall back to the stored default, as documented in the docstring above.
            fixed_ctrl = self.fixed_ctrl
        if not random_start:
            w.cars[0] = car.SimpleOptimizerCar(self.dynamics_function, w.initial_state[0], color='orange')
        else:
            state = w.initial_state[0]
            state[0] = np.random.uniform(-1, 1)
            state[1] = np.random.uniform(-0.5, 0.5)
            w.cars[0] = car.SimpleOptimizerCar(self.dynamics_function, state, color='orange')
        w.cars[1] = car.UserControlledCar(self.dynamics_function, w.initial_state[1], color='white')
        w.cars[1].fix_control(fixed_ctrl)

        def reward(weights):
            # Reward is a linear combination of the Theano features minus a small quadratic control penalty.
            def f(t, x, u):
                return tt.dot(self.tt_features(x, [w.cars[1].x]), weights) - 0.01 * (u[0] ** 2 + u[1] ** 2)
            return f

        weights = weights/np.linalg.norm(weights)
        w.cars[0].reward = reward(weights)
        states = [[] for _ in w.cars]
        controls = [[] for _ in w.cars]

        for _ in range(self.time_steps):
            # Compute controls for both cars, record them, advance the simulation, then record the new states.
            w.cars[0].control(0, 0)
            w.cars[1].control(0, 0)
            for c, hist in zip(w.cars, controls):
                hist.append(c.u)
            for c in w.cars:
                c.move()
            for c, hist in zip(w.cars, states):
                hist.append(c.x)

        t = traj.Trajectory(np.array(states), np.array(controls))
        return t
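
A hedged usage sketch (illustrative names, not from the source): `car_domain` stands in for an instance of the enclosing domain class, and the weight vector and fixed controls are made up.

# Hypothetical call: simulate the optimizer car under a candidate reward weight vector,
# with the other car following a constant fixed control and a randomized start position.
true_w = np.array([0.2, -0.6, 0.7, 0.1])
fixed = [[0.0, 0.5]] * car_domain.time_steps
t = car_domain.simulate(true_w, fixed_ctrl=fixed, random_start=True)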
Example no. 6
    def mujoco_to_fetch(self, t: traj.Trajectory) -> traj.Trajectory:
        # Keep only the first seven components of each MuJoCo state; controls are passed through unchanged.
        new_states = [x[:7] for x in t.states[0]]
        return traj.Trajectory(np.array([new_states]), t.controls)
Example no. 7
    def run(self, n_iters: int = 1) -> Tuple[pd.DataFrame, List]:
        """
        Runs the algorithm n_iters times and returns a data frame with all the data from the experiment.

        :param n_iters: Number of times to run the algorithm.
        :param verbose: Prints status messages about the progress of the algorithm if true.
        :return: (self.config, df); config contains the parameters of the run and df is a data frame containing all the
            data from the run.
        """
        ### Creating data frame to store data in
        # run # corresponds to the iteration of the whole experiment
        # pref_iter corresponds to the iteration of the preference loop within a particular run
        # type is the kind of data being stored; options are "mean", "var", "m"
        # value is the actual value being stored
        df = pd.DataFrame(columns=["run #", "pref_iter", "type", "value"])

        ### Creating query generator
        if isinstance(self.domain,
                      domain.Car):  # using exact QG when dynamics is available
            if self.update_func == "pick_best":
                obj_fn = query_generation.pick_best
            elif self.update_func == "approx":
                obj_fn = query_generation.approx
            elif self.update_func == "rank":
                obj_fn = query_generation.rank
            else:
                raise ValueError("Unknown update_func: " + str(self.update_func))
            qg = query_generation.QueryGenerator(
                dom=self.domain,
                num_queries=self.n_query,
                query_length=self.query_length,
                num_expectation_samples=self.n_samples_exp,
                include_previous_query=self.inc_prev_query,
                generate_scenario=self.gen_scenario,
                objective_fn=obj_fn,
                beta_pref=self.beta_pref)
        else:  # using approx QG when dynamics is not available
            qg = query_generation.ApproxQueryGenerator(
                dom=self.domain,
                num_queries=self.n_query,
                query_length=self.query_length,
                num_expectation_samples=self.n_samples_exp,
                include_previous_query=self.inc_prev_query,
                generate_scenario=self.gen_scenario,
                update_func=self.update_func,
                beta_pref=self.beta_pref)

        ### Creating human
        humans = {
            "opt":
            human.OptimalHuman(self.domain, self.update_func,
                               self.true_weight),
            "btz":
            human.BoltzmannHuman(self.domain, self.update_func,
                                 self.true_weight, self.beta_human),
            "term":
            human.TerminalHuman(self.domain, self.update_func)
        }
        H = humans[self.human_type]

        ### Iterating to build confidence intervals
        for i in range(n_iters):
            ### Processing demonstrations
            sampler = sampling.Sampler(n_query=self.n_query,
                                       dim_features=self.domain.feature_size,
                                       update_func=self.update_func,
                                       beta_demo=self.beta_demo,
                                       beta_pref=self.beta_pref)
            if self.n_demos > 0:
                if self.gen_demos:
                    self.demos = [
                        self.domain.simulate(self.true_weight,
                                             iter_count=self.sim_iter_count)
                        for _ in range(self.n_demos)
                    ]
                phi_demos = [self.domain.np_features(x) for x in self.demos]
                sampler.load_demo(np.array(phi_demos))
                if self.inc_prev_query and isinstance(self.domain, domain.Car):
                    cleaned_demos = [
                        d.trim(self.query_length, self.trim_start)
                        for d in self.demos
                    ]
                else:
                    cleaned_demos = self.demos
                if self.inc_prev_query:
                    last_query_picked = [d for d in cleaned_demos]
            else:
                last_query_picked = [
                    traj.Trajectory(states=None, controls=None, null=True)
                ]

            ## Computing initial estimates
            samples = sampler.sample(N=self.n_samples_summ)
            mean_w = np.mean(samples, axis=0)
            mean_w = mean_w / np.linalg.norm(mean_w)
            var_w = np.var(samples, axis=0)
            data = [[i + 1, 0, "mean", mean_w], [i + 1, 0, "var", var_w]]
            print("Estimate of w: " +
                  str(mean_w))  # TODO: Add different levels of verbose mode
            print("Estimate of variance: " + str(sum(var_w)))
            # computing convergence measure if we are in simulation
            if self.human_type != "term":
                m = np.mean([
                    np.dot(w, self.true_weight) / np.linalg.norm(w) /
                    np.linalg.norm(self.true_weight) for w in samples
                ])
                data.append([i + 1, 0, "m", m])
                print("Estimate of m: " + str(m) + "\n\n")
            df = pd.concat(
                [df, pd.DataFrame(data, columns=["run #", "pref_iter", "type", "value"])],
                ignore_index=True)

            ### Preferences loop
            for j in range(self.n_pref_iters):
                print("\n\n*** Preferences Loop %d\n" % (j))

                ## Get last_query
                if self.inc_prev_query:
                    if len(self.demos) > 0:
                        random_scenario_index = np.random.randint(
                            len(self.demos))
                    else:
                        random_scenario_index = 0
                    last_query = last_query_picked[random_scenario_index]

                ## Generate queries while ensuring that features of queries are epsilon apart
                query_diff = 0
                print("Generating queries")
                while query_diff <= self.epsilon:
                    if self.inc_prev_query:
                        if last_query.null:
                            queries = qg.queries(samples, blank_traj=True)
                        else:
                            queries = qg.queries(samples, last_query)
                    else:
                        queries = qg.queries(samples)
                    query_diffs = []
                    for m in range(len(queries)):
                        for n in range(m):
                            query_diffs.append(
                                np.linalg.norm(
                                    self.domain.np_features(queries[m]) -
                                    self.domain.np_features(queries[n])))
                    query_diff = max(query_diffs)

                ## Querying human
                if self.human_type == "term":
                    print('\a')
                rank = H.input(queries)
                if self.update_func == "rank":
                    best = rank[0]
                else:
                    if rank == -1:
                        return df, self.config
                    best = rank

                if self.inc_prev_query:
                    last_query_picked[random_scenario_index] = queries[best]

                ## Creating dictionary mapping rankings to features of queries and loading into sampler
                features = [self.domain.np_features(x) for x in queries]
                phi = {k: features[k] for k in range(len(queries))}
                sampler.load_prefs(phi, rank)

                ## Recording data from this run
                samples = sampler.sample(N=self.n_samples_summ)
                mean_w = np.mean(samples, axis=0)
                mean_w = mean_w / np.linalg.norm(mean_w)
                var_w = np.var(samples, axis=0)
                data = [[i + 1, j + 1, "mean", mean_w],
                        [i + 1, j + 1, "var", var_w]]
                print("Estimate of w: " + str(mean_w))
                print("Estimate of variance: " + str(sum(var_w)))
                if self.human_type != "term":
                    m = np.mean([
                        np.dot(w, self.true_weight) / np.linalg.norm(w) /
                        np.linalg.norm(self.true_weight) for w in samples
                    ])
                    data.append([i + 1, j + 1, "m", m])
                    print("Estimate of m: " + str(m) + "\n\n")
                df = pd.concat(
                    [df, pd.DataFrame(data, columns=["run #", "pref_iter", "type", "value"])],
                    ignore_index=True)
            ## Resetting for next run
            sampler.clear_pref()
            if self.inc_prev_query and self.n_demos > 0:
                last_query_picked = [d for d in cleaned_demos]

        return df, self.config
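
A short driver sketch (assumptions: `experiment` is an already-constructed instance of the enclosing class; the output path is illustrative).

# Hypothetical driver: run the preference-learning experiment twice and persist the results.
df, config = experiment.run(n_iters=2)
df.to_csv("results.csv", index=False)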
Example no. 8
def play(name: str):
    # Load a recorded demonstration and replay it on the physical robot.
    with open(f'generated_demos/{name}.pickle', 'rb') as f:
        states = pickle.load(f)
    # Note: the states array is also passed in the controls slot of the Trajectory.
    t = traj.Trajectory(np.array([states]), np.array([states]))
    dom.watch(t, on_real_robot=True)
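
A usage sketch; the demonstration name below is hypothetical and must match an existing file under generated_demos/.

# Hypothetical call: replay a previously saved demonstration.
play("reach_demo")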