def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
    # Sample random parameters, roll out the reference actions in the randomized
    # envs, and train the model to regress the parameters from the trajectories.
    random_params = np.random.uniform(size=(self.nenvs, self.nparams))
    randomized_env.randomize(randomized_values=random_params)
    randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

    # Tile the parameters so every timestep is labeled with its environment's parameters
    random_params = np.repeat(random_params[:, :, np.newaxis], len(self.reference_actions), axis=2)
    random_params = np.transpose(random_params, (0, 2, 1))

    flattened_params = np.concatenate([random_params[i] for i in range(self.nenvs)])
    flattened_randomized = np.concatenate([randomized_trajectory[i] for i in range(self.nenvs)])

    self.model.train(x_data=flattened_randomized,
                     y_data=flattened_params,
                     nepoch=10,
                     batch_size=100)
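# Minimal sketch (illustrative sizes only, not from the source): what the
# repeat/transpose above does. Per-environment parameters of shape
# (nenvs, nparams) become per-timestep labels aligned with the flattened
# trajectories, i.e. shape (nenvs * T, nparams).
import numpy as np

nenvs, nparams, T = 3, 2, 5
params = np.random.uniform(size=(nenvs, nparams))
labels = np.repeat(params[:, :, np.newaxis], T, axis=2)      # (nenvs, nparams, T)
labels = np.transpose(labels, (0, 2, 1))                     # (nenvs, T, nparams)
labels = np.concatenate([labels[i] for i in range(nenvs)])   # (nenvs * T, nparams)
assert labels.shape == (nenvs * T, nparams)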
def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
    # Sample random parameters and train the regressor to predict them from the
    # resulting trajectories, rolling out either the current policy or the
    # fixed reference actions.
    random_params = np.random.uniform(size=(self.nenvs, self.randomization_dim))
    randomized_env.randomize(randomized_values=random_params)

    if policy is not None:
        randomized_trajectory, _ = run_training_episode(randomized_env, policy, self.agent_timesteps)
    else:
        randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

    # Tile the parameters so every timestep is labeled with its environment's parameters
    random_params = np.repeat(random_params[:, :, np.newaxis], len(self.reference_actions), axis=2)
    random_params = np.transpose(random_params, (0, 2, 1))

    flattened_params = np.concatenate([random_params[i] for i in range(self.nenvs)])
    flattened_randomized = np.concatenate([randomized_trajectory[i] for i in range(self.nenvs)])

    self.regressor.train_regressor(flattened_randomized, flattened_params, iterations=10)
    self.agent_timesteps += len(flattened_randomized)
def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
    # Sample parameter estimates from the current REPS search distribution, score them
    # against the reference trajectory, and update the distribution with the costs.
    for _ in range(self.reps_updates):
        mean, cov = self._get_current_reps_state()
        parameter_estimates = np.random.multivariate_normal(mean, cov, self.nenvs)
        parameter_estimates = np.clip(parameter_estimates, 0, 1)

        randomized_env.randomize(randomized_values=parameter_estimates)

        if policy is not None:
            randomized_trajectory, randomized_actions = run_training_episode(
                randomized_env, policy, self.agent_timesteps)
            self.reference_trajectory = evaluate_actions(
                reference_env, np.transpose(randomized_actions, (1, 0, 2)))
        else:
            randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

        costs = self.rewarder(randomized_trajectory, self.reference_trajectory)

        try:
            self.reps.learn(parameter_estimates, costs)
        except Exception:
            # If the REPS update fails, fall back to the initial covariance and retry
            self.reps.current_cov = self.cov_init
            self.reps.learn(parameter_estimates, costs)

        # Flatten and combine all randomized and reference trajectories for the discriminator
        flattened_reference = np.concatenate(
            [self.reference_trajectory[i] for i in range(self.nenvs)])
        flattened_randomized = np.concatenate(
            [randomized_trajectory[i] for i in range(self.nenvs)])

        self.agent_timesteps += len(flattened_randomized)

        if self.learned_reward:
            # Train discriminator based on state-action pairs for agent env. steps
            self.rewarder.train_discriminator(flattened_reference, flattened_randomized,
                                              iterations=150)
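# Hypothetical sketch (not the project's implementation): a simple state-difference
# score of the kind self.rewarder above could compute when no discriminator is used,
# returning one value per environment. The sign convention (cost vs. reward) and the
# function name are assumptions.
import numpy as np

def state_difference_score(randomized_trajectory, reference_trajectory):
    scores = []
    for rand, ref in zip(randomized_trajectory, reference_trajectory):
        horizon = min(len(rand), len(ref))   # compare only the overlapping timesteps
        diff = np.asarray(rand[:horizon]) - np.asarray(ref[:horizon])
        scores.append(np.mean(diff ** 2))    # mean squared state difference
    return np.asarray(scores)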
def load_trajectory(self, reference_env, reference_action_fp):
    actions = np.load(reference_action_fp)
    self.reference_actions = np.repeat(actions[:, np.newaxis], self.nenvs, axis=1)
    self.reference_trajectory = evaluate_actions(reference_env, self.reference_actions)
    self.flattened_reference = np.squeeze(self.reference_trajectory)
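# Hypothetical usage sketch (not from the source): how load_trajectory and
# update_parameter_estimate might be combined. ParameterEstimator, make_vec_env,
# the file path, and the iteration count are assumed names/values for illustration.
estimator = ParameterEstimator(nenvs=10)
reference_env = make_vec_env(nenvs=10, randomized=False)
randomized_env = make_vec_env(nenvs=10, randomized=True)

estimator.load_trajectory(reference_env, 'reference_actions.npy')
for _ in range(100):
    estimator.update_parameter_estimate(randomized_env)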
def update_parameter_estimate(self, randomized_env):
    # Sample random parameters, roll out the reference actions, and train the
    # regressor to predict the parameters from the resulting trajectories.
    random_params = np.random.uniform(size=(randomized_env.nenvs, self.randomization_dim))
    randomized_env.randomize(randomized_values=random_params)
    randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

    # Tile the parameters so every timestep is labeled with its environment's parameters
    random_params = np.repeat(random_params[:, :, np.newaxis], len(self.reference_actions), axis=2)

    self.regressor.train_regressor(randomized_trajectory,
                                   np.transpose(random_params, (0, 2, 1)),
                                   iterations=10)
def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
    # Ask the Bayesian optimizer for the next parameter estimate, evaluate it against
    # the reference trajectory, and register the resulting cost with the optimizer.
    parameter_estimate = self.bayesianoptimizer.suggest(self.utility)
    randomized_env.randomize(randomized_values=[list(parameter_estimate.values())])

    if policy is not None:
        randomized_trajectory, randomized_actions = run_training_episode(
            randomized_env, policy, self.agent_timesteps)
        self.reference_trajectory = evaluate_actions(
            reference_env, np.transpose(randomized_actions, (1, 0, 2)))
    else:
        randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

    cost = self.statedifference_rewarder(randomized_trajectory, self.reference_trajectory)

    # Keep the cost from the first evaluation if this point was suggested before
    registered_cost = self.registered_points.get(tuple(parameter_estimate.values()), cost[0])
    try:
        self.bayesianoptimizer.register(params=parameter_estimate, target=registered_cost)
    except Exception:
        # The optimizer may refuse to register a duplicate point; ignore it
        pass
    self.registered_points[tuple(parameter_estimate.values())] = registered_cost

    flattened_randomized = np.concatenate(
        [randomized_trajectory[i] for i in range(self.nenvs)])
    self.agent_timesteps += len(flattened_randomized)
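# Hypothetical setup sketch, assuming self.bayesianoptimizer and self.utility come
# from the bayes_opt package (the snippet does not name the library); the
# suggest/register calls above match its ask-and-tell interface. Bounds are
# placeholders.
from bayes_opt import BayesianOptimization, UtilityFunction

pbounds = {'param0': (0, 1), 'param1': (0, 1)}
bayesianoptimizer = BayesianOptimization(f=None, pbounds=pbounds, verbose=0)
utility = UtilityFunction(kind='ucb', kappa=2.5, xi=0.0)

suggestion = bayesianoptimizer.suggest(utility)            # dict of parameter values
# ... evaluate the suggestion, then feed the observed target back:
bayesianoptimizer.register(params=suggestion, target=0.0)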
def _get_batch(randomized_env, actions):
    # Select amplitude and phase for the task
    nenvs = randomized_env.nenvs
    random_params = onp.random.uniform(
        size=(nenvs, randomized_env.randomization_space.shape[0]))
    randomized_env.randomize(randomized_values=random_params)
    randomized_trajectory = evaluate_actions(randomized_env, actions)

    random_params = onp.repeat(random_params[:, :, np.newaxis], len(actions), axis=2)
    random_params = onp.transpose(random_params, (0, 2, 1))

    flattened_params = np.concatenate([random_params[i] for i in range(nenvs)])
    flattened_randomized = np.concatenate([randomized_trajectory[i] for i in range(nenvs)])

    return flattened_randomized, flattened_params
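# Hypothetical sketch (not from the source): one way _get_batch could drive a
# supervised training loop for a trajectory-to-parameter regressor. train_step,
# params, and opt_state are assumed placeholders for the model being trained.
def fit_regressor(randomized_env, actions, params, opt_state, train_step, nbatches=100):
    for _ in range(nbatches):
        # x: flattened trajectory states, y: the parameters that generated them
        x, y = _get_batch(randomized_env, actions)
        params, opt_state, loss = train_step(params, opt_state, x, y)
    return params, opt_state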
def load_trajectory(self, reference_env, reference_action_fp):
    self.reference_actions = np.load(reference_action_fp)
    self.reference_trajectory = evaluate_actions(reference_env, self.reference_actions)
    self.flattened_reference = np.squeeze(self.reference_trajectory)
def update_parameter_estimate(self, randomized_env, policy=None, reference_env=None):
    """Select an action based on SVPG policy, where an action is the delta in each dimension.

    Update the counts and statistics after training agent, rolling out policies,
    and calculating simulator reward.
    """
    if self.svpg_timesteps >= self.initial_svpg_steps:
        # Get sim instances from SVPG policy
        simulation_instances = self.svpg.step()
        index = self.svpg_timesteps % self.svpg_horizon
        self.simulation_instances_full_horizon[:, index, :, :] = simulation_instances
    else:
        # Creates completely randomized environment
        simulation_instances = np.ones(
            (self.nagents, self.svpg.svpg_rollout_length, self.svpg.nparams)) * -1

    assert (self.nagents, self.svpg.svpg_rollout_length, self.svpg.nparams) == simulation_instances.shape

    # Create placeholders for trajectories
    randomized_trajectories = [[] for _ in range(self.nagents)]

    # Create placeholder for rewards
    rewards = np.zeros(simulation_instances.shape[:2])

    # Reshape to work with vectorized environments
    simulation_instances = np.transpose(simulation_instances, (1, 0, 2))

    # Create environment instances with vectorized env, and rollout agent_policy in both
    for t in range(self.svpg.svpg_rollout_length):
        agent_timesteps_current_iteration = 0

        # TODO: Double check shape here
        randomized_env.randomize(randomized_values=simulation_instances[t])

        if policy is not None:
            randomized_trajectory, randomized_actions = run_training_episode(
                randomized_env, policy, self.agent_timesteps)
            self.reference_trajectory = evaluate_actions(
                reference_env, np.transpose(randomized_actions, (1, 0, 2)))
        else:
            randomized_trajectory = evaluate_actions(randomized_env, self.reference_actions)

        for i in range(self.nagents):
            agent_timesteps_current_iteration += len(randomized_trajectory[i])
            self.agent_timesteps += len(randomized_trajectory[i])

            randomized_trajectories[i].append(randomized_trajectory[i])

            simulator_reward = self.rewarder(
                randomized_trajectories[i][t][np.newaxis, :],
                self.reference_trajectory[0][np.newaxis, :])
            rewards[i][t] = simulator_reward

        if self.learned_reward:
            # flatten and combine all randomized and reference trajectories for discriminator
            flattened_reference = np.concatenate(
                [self.reference_trajectory[i] for i in range(self.nagents)])
            flattened_randomized = np.concatenate(
                [randomized_trajectories[i][t] for i in range(self.nagents)])

            # Train discriminator based on state action pairs for agent env. steps
            self.rewarder.train_discriminator(flattened_reference, flattened_randomized,
                                              iterations=150)

    # Calculate discriminator based reward, pass it back to SVPG policy
    if self.svpg_timesteps >= self.initial_svpg_steps:
        self.svpg.train(rewards)

    self.svpg_timesteps += 1