    def update_parameter_estimate(self,
                                  randomized_env,
                                  policy=None,
                                  reference_env=None):
        random_params = np.random.uniform(size=(self.nenvs, self.nparams))
        randomized_env.randomize(randomized_values=random_params)

        randomized_trajectory = evaluate_actions(randomized_env,
                                                 self.reference_actions)

        # Repeat the parameters along the time axis so each state in a
        # trajectory is paired with the parameters that generated it, then
        # flatten the (env, time) dimensions into a single batch dimension
        random_params = np.repeat(random_params[:, :, np.newaxis],
                                  len(self.reference_actions),
                                  axis=2)
        random_params = np.transpose(random_params, (0, 2, 1))
        flattened_params = [random_params[i] for i in range(self.nenvs)]
        flattened_params = np.concatenate(flattened_params)

        flattened_randomized = [
            randomized_trajectory[i] for i in range(self.nenvs)
        ]
        flattened_randomized = np.concatenate(flattened_randomized)

        self.model.train(x_data=flattened_randomized,
                         y_data=flattened_params,
                         nepoch=10,
                         batch_size=100)
Example #2
    def update_parameter_estimate(self,
                                  randomized_env,
                                  policy=None,
                                  reference_env=None):
        random_params = np.random.uniform(size=(self.nenvs,
                                                self.randomization_dim))
        randomized_env.randomize(randomized_values=random_params)

        if policy is not None:
            randomized_trajectory, _ = run_training_episode(
                randomized_env, policy, self.agent_timesteps)
        else:
            randomized_trajectory = evaluate_actions(randomized_env,
                                                     self.reference_actions)

        random_params = np.repeat(random_params[:, :, np.newaxis],
                                  len(self.reference_actions),
                                  axis=2)
        random_params = np.transpose(random_params, (0, 2, 1))
        flattened_params = [random_params[i] for i in range(self.nenvs)]
        flattened_params = np.concatenate(flattened_params)

        flattened_randomized = [
            randomized_trajectory[i] for i in range(self.nenvs)
        ]
        flattened_randomized = np.concatenate(flattened_randomized)

        self.regressor.train_regressor(flattened_randomized,
                                       flattened_params,
                                       iterations=10)
        self.agent_timesteps += len(flattened_randomized)
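run_training_episode is likewise not shown. From its call sites it rolls the agent's policy out closed-loop in the randomized environments and returns per-environment states and actions; the sketch below illustrates that contract under assumed details (the policy.act interface, the fixed horizon, and treating agent_timesteps as log-only are assumptions).

import numpy as np


def run_training_episode(vec_env, policy, agent_timesteps, horizon=200):
    """Roll out `policy` in every sub-environment and return the visited
    states and the actions taken, each shaped (nenvs, horizon, dim).

    `agent_timesteps` is only threaded through for logging/schedules here.
    """
    obs = vec_env.reset()                     # (nenvs, obs_dim)
    states, actions = [], []
    for _ in range(horizon):
        action = policy.act(obs)              # assumed policy interface
        obs, _, _, _ = vec_env.step(action)
        states.append(obs)
        actions.append(action)
    # (horizon, nenvs, dim) -> (nenvs, horizon, dim)
    return (np.transpose(np.asarray(states), (1, 0, 2)),
            np.transpose(np.asarray(actions), (1, 0, 2)))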
    def update_parameter_estimate(self,
                                  randomized_env,
                                  policy=None,
                                  reference_env=None):
        for _ in range(self.reps_updates):
            mean, cov = self._get_current_reps_state()
            parameter_estimates = np.random.multivariate_normal(
                mean, cov, self.nenvs)
            parameter_estimates = np.clip(parameter_estimates, 0, 1)

            randomized_env.randomize(randomized_values=parameter_estimates)
            if policy is not None:
                randomized_trajectory, randomized_actions = run_training_episode(
                    randomized_env, policy, self.agent_timesteps)
                self.reference_trajectory = evaluate_actions(
                    reference_env, np.transpose(randomized_actions, (1, 0, 2)))
            else:
                randomized_trajectory = evaluate_actions(
                    randomized_env, self.reference_actions)

            costs = self.rewarder(randomized_trajectory,
                                  self.reference_trajectory)

            try:
                self.reps.learn(parameter_estimates, costs)
            except Exception:
                # the REPS update can fail (e.g. numerically); reset the
                # covariance and retry once
                self.reps.current_cov = self.cov_init
                self.reps.learn(parameter_estimates, costs)

            # flatten and combine all randomized and reference trajectories for discriminator
            flattened_reference = [
                self.reference_trajectory[i] for i in range(self.nenvs)
            ]
            flattened_reference = np.concatenate(flattened_reference)

            flattened_randomized = [
                randomized_trajectory[i] for i in range(self.nenvs)
            ]
            flattened_randomized = np.concatenate(flattened_randomized)

            self.agent_timesteps += len(flattened_randomized)

            if self.learned_reward:
                # Train discriminator based on state action pairs for agent env. steps
                self.rewarder.train_discriminator(flattened_reference,
                                                  flattened_randomized,
                                                  iterations=150)
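When learned_reward is disabled, the rewarder above presumably reduces to a direct state-difference cost between randomized and reference rollouts. A minimal stand-in under that assumption (the actual rewarder class is not part of these snippets):

import numpy as np


class StateDifferenceRewarder:
    """Illustrative cost function: mean squared state difference between each
    randomized rollout and the corresponding reference rollout."""

    def __call__(self, randomized_trajectory, reference_trajectory):
        costs = []
        for rand, ref in zip(randomized_trajectory, reference_trajectory):
            horizon = min(len(rand), len(ref))   # guard against length mismatch
            diff = np.asarray(rand[:horizon]) - np.asarray(ref[:horizon])
            costs.append(np.mean(np.square(diff)))
        return np.asarray(costs)                 # one cost per environment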
Example #4
    def load_trajectory(self, reference_env, reference_action_fp):
        actions = np.load(reference_action_fp)
        self.reference_actions = np.repeat(actions[:, np.newaxis],
                                           self.nenvs,
                                           axis=1)
        self.reference_trajectory = evaluate_actions(reference_env,
                                                     self.reference_actions)
        self.flattened_reference = np.squeeze(self.reference_trajectory)
Example #5
    def update_parameter_estimate(self, randomized_env):
        random_params = np.random.uniform(size=(randomized_env.nenvs,
                                                self.randomization_dim))
        # randomize() takes one parameter vector per environment, so randomize
        # and roll out before repeating the parameters along the time axis
        randomized_env.randomize(randomized_values=random_params)
        randomized_trajectory = evaluate_actions(randomized_env,
                                                 self.reference_actions)

        random_params = np.repeat(random_params[:, :, np.newaxis],
                                  len(self.reference_actions),
                                  axis=2)

        self.regressor.train_regressor(randomized_trajectory,
                                       np.transpose(random_params, (0, 2, 1)),
                                       iterations=10)
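The regressor shared by the examples above is also external to these snippets. A minimal PyTorch module satisfying the train_regressor(x, y, iterations=...) call could look like the following; the architecture, learning rate, and the assumption that parameters are normalized to [0, 1] are illustrative choices, not the original implementation.

import torch
import torch.nn as nn


class ParameterRegressor(nn.Module):
    """Illustrative stand-in for `self.regressor`: maps a (flattened) state
    vector to an estimate of the randomization parameters in [0, 1]."""

    def __init__(self, state_dim, randomization_dim, hidden=128, lr=1e-3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, randomization_dim), nn.Sigmoid())
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        return self.net(x)

    def train_regressor(self, x_data, y_data, iterations=10):
        x = torch.as_tensor(x_data, dtype=torch.float32)
        y = torch.as_tensor(y_data, dtype=torch.float32)
        for _ in range(iterations):
            self.optimizer.zero_grad()
            loss = nn.functional.mse_loss(self(x), y)
            loss.backward()
            self.optimizer.step()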
    def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
        parameter_estimate = self.bayesianoptimizer.suggest(self.utility)
        randomized_env.randomize(randomized_values=[list(parameter_estimate.values())])

        if policy is not None:
            randomized_trajectory, randomized_actions = run_training_episode(randomized_env, policy, self.agent_timesteps)
            self.reference_trajectory = evaluate_actions(reference_env, np.transpose(randomized_actions, (1, 0, 2)))
        else:
            randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

        cost = self.statedifference_rewarder(randomized_trajectory, self.reference_trajectory)

        registered_cost = self.registered_points.get(tuple(parameter_estimate.values()), cost[0])
        try:
            self.bayesianoptimizer.register(params=parameter_estimate, target=registered_cost)
        except Exception:
            # registering a duplicate point raises; keep the stored cost
            pass

        self.registered_points[tuple(parameter_estimate.values())] = registered_cost

        flattened_randomized = [randomized_trajectory[i] for i in range(self.nenvs)]
        flattened_randomized = np.concatenate(flattened_randomized)
        self.agent_timesteps += len(flattened_randomized)
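The construction of bayesianoptimizer, utility, and registered_points is not shown. With the classic bayes_opt 1.x ask/tell API, which matches the suggest/register calls above, the setup could look roughly like this (the bounds, acquisition settings, and the helper name are assumptions):

from bayes_opt import BayesianOptimization, UtilityFunction


def make_bayesian_optimizer(randomization_dim, seed=0):
    # One [0, 1] bound per randomized simulator parameter (hypothetical naming)
    pbounds = {"p{}".format(i): (0.0, 1.0) for i in range(randomization_dim)}
    optimizer = BayesianOptimization(f=None, pbounds=pbounds,
                                     verbose=0, random_state=seed)
    # UCB acquisition; suggest(utility) then register(params, target) mirrors
    # the ask/tell pattern used in the example above
    utility = UtilityFunction(kind="ucb", kappa=2.5, xi=0.0)
    return optimizer, utility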
Example #7
def _get_batch(randomized_env, actions):
    # Select amplitude and phase for the task
    nenvs = randomized_env.nenvs

    random_params = onp.random.uniform(
        size=(nenvs, randomized_env.randomization_space.shape[0]))
    randomized_env.randomize(randomized_values=random_params)
    randomized_trajectory = evaluate_actions(randomized_env, actions)

    random_params = onp.repeat(random_params[:, :, np.newaxis],
                               len(actions),
                               axis=2)
    random_params = onp.transpose(random_params, (0, 2, 1))
    flattened_params = [random_params[i] for i in range(nenvs)]
    flattened_params = np.concatenate(flattened_params)

    flattened_randomized = [randomized_trajectory[i] for i in range(nenvs)]
    flattened_randomized = np.concatenate(flattened_randomized)

    return flattened_randomized, flattened_params
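This example mixes onp (plain NumPy, used on the host for sampling and reshaping) with np, which in JAX code conventionally aliases jax.numpy. Under that assumption, the batches returned by _get_batch could feed a JAX regressor along these lines; the network shape, loss, and update rule are illustrative only.

import jax
import jax.numpy as np   # matches the snippet's convention: np = jax.numpy


def init_params(key, state_dim, randomization_dim, hidden=64):
    k1, k2 = jax.random.split(key)
    return {
        "w1": jax.random.normal(k1, (state_dim, hidden)) * 0.1,
        "b1": np.zeros(hidden),
        "w2": jax.random.normal(k2, (hidden, randomization_dim)) * 0.1,
        "b2": np.zeros(randomization_dim),
    }


def predict(params, x):
    h = np.tanh(x @ params["w1"] + params["b1"])
    return jax.nn.sigmoid(h @ params["w2"] + params["b2"])


@jax.jit
def mse_loss(params, x, y):
    return np.mean((predict(params, x) - y) ** 2)


# One supervised step on a batch from _get_batch (hypothetical wiring):
# x_batch, y_batch = _get_batch(randomized_env, actions)
# grads = jax.grad(mse_loss)(model_params, x_batch, y_batch)
# model_params = jax.tree_util.tree_map(lambda p, g: p - 1e-2 * g,
#                                       model_params, grads)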
Example #8
    def load_trajectory(self, reference_env, reference_action_fp):
        self.reference_actions = np.load(reference_action_fp)
        self.reference_trajectory = evaluate_actions(reference_env,
                                                     self.reference_actions)
        self.flattened_reference = np.squeeze(self.reference_trajectory)
    def update_parameter_estimate(self,
                                  randomized_env,
                                  policy=None,
                                  reference_env=None):
        """Select an action based on SVPG policy, where an action is the delta in each dimension.
        Update the counts and statistics after training agent,
        rolling out policies, and calculating simulator reward.
        """
        if self.svpg_timesteps >= self.initial_svpg_steps:
            # Get sim instances from SVPG policy
            simulation_instances = self.svpg.step()

            index = self.svpg_timesteps % self.svpg_horizon
            self.simulation_instances_full_horizon[:, index, :, :] = (
                simulation_instances)

        else:
            # Create completely randomized environments (encoded as -1)
            simulation_instances = np.ones(
                (self.nagents, self.svpg.svpg_rollout_length,
                 self.svpg.nparams)) * -1

        assert (self.nagents, self.svpg.svpg_rollout_length,
                self.svpg.nparams) == simulation_instances.shape

        # Create placeholders for trajectories
        randomized_trajectories = [[] for _ in range(self.nagents)]

        # Create placeholder for rewards
        rewards = np.zeros(simulation_instances.shape[:2])

        # Reshape to work with vectorized environments
        simulation_instances = np.transpose(simulation_instances, (1, 0, 2))

        # Create environment instances with vectorized env, and rollout agent_policy in both
        for t in range(self.svpg.svpg_rollout_length):
            agent_timesteps_current_iteration = 0

            # TODO: Double check shape here
            randomized_env.randomize(randomized_values=simulation_instances[t])
            if policy is not None:
                randomized_trajectory, randomized_actions = run_training_episode(
                    randomized_env, policy, self.agent_timesteps)
                self.reference_trajectory = evaluate_actions(
                    reference_env, np.transpose(randomized_actions, (1, 0, 2)))
            else:
                randomized_trajectory = evaluate_actions(
                    randomized_env, self.reference_actions)

            for i in range(self.nagents):
                agent_timesteps_current_iteration += len(
                    randomized_trajectory[i])
                self.agent_timesteps += len(randomized_trajectory[i])
                randomized_trajectories[i].append(randomized_trajectory[i])
                simulator_reward = self.rewarder(
                    randomized_trajectories[i][t][np.newaxis, :],
                    self.reference_trajectory[0][np.newaxis, :])
                rewards[i][t] = simulator_reward

            if self.learned_reward:
                # flatten and combine all randomized and reference trajectories for discriminator
                flattened_reference = [
                    self.reference_trajectory[i] for i in range(self.nagents)
                ]
                flattened_reference = np.concatenate(flattened_reference)
                flattened_randomized = [
                    randomized_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_randomized = np.concatenate(flattened_randomized)

                # Train discriminator based on state action pairs for agent env. steps
                self.rewarder.train_discriminator(flattened_reference,
                                                  flattened_randomized,
                                                  iterations=150)

        # Calculate discriminator based reward, pass it back to SVPG policy
        if self.svpg_timesteps >= self.initial_svpg_steps:
            self.svpg.train(rewards)

        self.svpg_timesteps += 1
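Finally, the learned rewarder/discriminator used by the REPS and SVPG examples is not included here. The class below matches the two call sites seen above (train_discriminator(reference, randomized, iterations=...) and a call on a randomized rollout plus a reference rollout that returns a scalar reward), but its network, reward definition, and hyperparameters are assumptions rather than the original design.

import numpy as np
import torch
import torch.nn as nn


class DiscriminatorRewarder:
    """Illustrative stand-in for the learned rewarder: a binary classifier
    separates reference states from randomized states, and its confidence is
    converted into a simulator reward."""

    def __init__(self, state_dim, hidden=128, lr=1e-3):
        self.discriminator = nn.Sequential(
            nn.Linear(state_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))
        self.optimizer = torch.optim.Adam(self.discriminator.parameters(), lr=lr)
        self.loss_fn = nn.BCEWithLogitsLoss()

    def train_discriminator(self, flattened_reference, flattened_randomized,
                            iterations=150):
        ref = torch.as_tensor(np.asarray(flattened_reference),
                              dtype=torch.float32)
        rand = torch.as_tensor(np.asarray(flattened_randomized),
                               dtype=torch.float32)
        for _ in range(iterations):
            self.optimizer.zero_grad()
            loss = (self.loss_fn(self.discriminator(ref),
                                 torch.ones(len(ref), 1)) +
                    self.loss_fn(self.discriminator(rand),
                                 torch.zeros(len(rand), 1)))
            loss.backward()
            self.optimizer.step()

    def __call__(self, randomized_rollout, reference_rollout):
        # Reward a rollout by how confidently the discriminator labels its
        # states as "randomized"; the reference rollout is unused in this
        # simplified scoring rule
        arr = np.asarray(randomized_rollout, dtype=np.float32)
        states = torch.as_tensor(arr).reshape(-1, arr.shape[-1])
        with torch.no_grad():
            p_reference = torch.sigmoid(self.discriminator(states))
        return float((1.0 - p_reference).mean())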