Example #1
class ADR:
    def __init__(self, nparticles, nparams, state_dim, action_dim, temperature,
                 svpg_rollout_length, svpg_horizon, max_step_length,
                 reward_scale, initial_svpg_steps, seed,
                 discriminator_batchsz):
        assert nparticles > 2

        self.nparticles = nparticles
        self.nparams = nparams

        self.svpg_rollout_length = svpg_rollout_length
        self.svpg_horizon = svpg_horizon
        self.initial_svpg_steps = initial_svpg_steps

        self.seed = seed
        self.svpg_timesteps = 0

        self.discriminator_rewarder = DiscriminatorRewarder(
            state_dim=state_dim,
            action_dim=action_dim,
            discriminator_batchsz=discriminator_batchsz,
            reward_scale=reward_scale,
        )

        self.svpg = SVPG(
            nparticles=nparticles,
            nparams=self.nparams,
            max_step_length=max_step_length,
            svpg_rollout_length=svpg_rollout_length,
            svpg_horizon=svpg_horizon,
            temperature=temperature,
        )

        self.parameter_settings = np.ones(
            (self.nparticles, self.svpg_horizon, self.svpg.svpg_rollout_length,
             self.svpg.nparams)) * -1
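        # Note (an assumption based on comments elsewhere in this file): the
        # -1 fill marks parameter settings that SVPG has not produced yet,
        # i.e. fully randomized environments. An equivalent, arguably clearer
        # initialization would be:
        #   self.parameter_settings = np.full(
        #       (self.nparticles, self.svpg_horizon,
        #        self.svpg.svpg_rollout_length, self.svpg.nparams), -1.0)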
class SVPGSimulatorAgent(object):
    """Simulation object.

    Create randomized environments based on specified params, handles
    SVPG-based policy search to create envs, and evaluates controller policies
    in those environments.
    """
    def __init__(self,
                 video_size_file_dir,
                 reference_agent_policy,
                 randomized_env_id,
                 randomized_eval_env_id,
                 agent_name,
                 nagents,
                 nparams,
                 temperature,
                 svpg_rollout_length,
                 svpg_horizon,
                 max_step_length,
                 reward_scale,
                 initial_svpg_steps,
                 max_env_timesteps,
                 episodes_per_instance,
                 discrete_svpg,
                 load_discriminator,
                 freeze_discriminator,
                 freeze_agent,
                 seed,
                 train_svpg=True,
                 particle_path="",
                 discriminator_batchsz=320,
                 randomized_eval_episodes=3):
        """

        Args
            video_size_file_dir(str): path to all video size files.
            reference_agent_policy: a reference ABR algorithm.
        """

        # TODO: Weird bug
        assert nagents > 2
        self.reference_agent_policy = reference_agent_policy

        self.randomized_env_id = randomized_env_id
        self.randomized_eval_env_id = randomized_eval_env_id
        self.agent_name = agent_name

        # TODO verify whether we need to log distances
        self.log_distances = False

        self.randomized_eval_episodes = randomized_eval_episodes

        # Vectorized environments - step with nagents in parallel
        self.randomized_env = MultiEnv([
            Environment(video_size_file_dir,
                        self.randomized_env_id,
                        seed,
                        trace_video_same_duration_flag=True)
            for _ in range(nagents)
        ])

        # fix the observation/state shape and action shape
        self.state_dim = S_LEN
        self.action_dim = A_DIM

        self.hard_env = MultiEnv([
            Environment(video_size_file_dir,
                        self.randomized_env_id,
                        seed,
                        trace_video_same_duration_flag=True)
            for _ in range(nagents)
        ])

        self.sampled_regions = [[] for _ in range(nparams)]

        self.nagents = nagents
        # TODO: set to 1 here because only one variable is randomized; the
        # full randomization space needs to be figured out in the future.
        self.nparams = 1  # self.randomized_env.randomization_space.shape[0]
        assert self.nparams == nparams, "Double check number of parameters: " \
            "Args: {}, Env: {}".format(nparams, self.nparams)

        # variables for agent policy
        self.freeze_agent = freeze_agent
        self.agent_eval_frequency = max_env_timesteps * nagents
        self.agent_timesteps = 0
        self.agent_timesteps_since_eval = 0
        self.seed = seed

        # variables for discriminator
        self.freeze_discriminator = freeze_discriminator
        self.discriminator_rewarder = DiscriminatorRewarder(
            state_dim=self.state_dim,
            action_dim=self.action_dim,
            discriminator_batchsz=discriminator_batchsz,
            reward_scale=reward_scale,
            load_discriminator=load_discriminator,
        )

        # variables for SVPG
        self.svpg_horizon = svpg_horizon
        self.initial_svpg_steps = initial_svpg_steps
        self.max_env_timesteps = max_env_timesteps
        self.episodes_per_instance = episodes_per_instance
        self.discrete_svpg = discrete_svpg
        self.train_svpg = train_svpg
        self.svpg_timesteps = 0
        self.svpg = SVPG(nagents=nagents,
                         nparams=self.nparams,
                         max_step_length=max_step_length,
                         svpg_rollout_length=svpg_rollout_length,
                         svpg_horizon=svpg_horizon,
                         temperature=temperature,
                         discrete=self.discrete_svpg,
                         kld_coefficient=0.0)

        if particle_path != "":
            logger.info("Loading particles from: {}".format(particle_path))
            self.svpg.load(directory=particle_path)

    def select_action(self, agent_policy):
        """Select an action based on SVPG policy.

        An action is the delta in each dimension.  Update the counts and
        statistics after training agent, rolling out policies, and calculating
        simulator reward.
        """
        if self.svpg_timesteps >= self.initial_svpg_steps:
            # Get sim instances from SVPG policy
            simulation_instances = self.svpg.step()

        else:
            # Creates completely randomized environment
            simulation_instances = np.ones(
                (self.nagents, self.svpg.svpg_rollout_length,
                 self.svpg.nparams)) * -1

        assert (self.nagents, self.svpg.svpg_rollout_length,
                self.svpg.nparams) == simulation_instances.shape

        # Create placeholders for trajectories
        randomized_trajectories = [[] for _ in range(self.nagents)]
        reference_trajectories = [[] for _ in range(self.nagents)]

        # Create placeholder for rewards
        rewards = np.zeros(simulation_instances.shape[:2])

        # Discriminator debugging
        randomized_discrim_score_mean = 0
        reference_discrim_score_mean = 0
        randomized_discrim_score_median = 0
        reference_discrim_score_median = 0

        # Reshape to work with vectorized environments
        simulation_instances = np.transpose(simulation_instances, (1, 0, 2))

        # Create environment instances with vectorized env, and rollout
        # agent_policy in both
        for t in range(self.svpg.svpg_rollout_length):
            agent_timesteps_current_iteration = 0
            logger.info('Iteration t: {}/{}'.format(
                t, self.svpg.svpg_rollout_length))

            reference_trajectory = self.rollout_agent(agent_policy)

            self.randomized_env.randomize(
                randomized_values=simulation_instances[t])

            randomized_trajectory = self.rollout_agent(agent_policy,
                                                       reference=False)

            for i in range(self.nagents):
                agent_timesteps_current_iteration += len(
                    randomized_trajectory[i])

                reference_trajectories[i].append(reference_trajectory[i])
                randomized_trajectories[i].append(randomized_trajectory[i])

                self.agent_timesteps += len(randomized_trajectory[i])
                self.agent_timesteps_since_eval += len(
                    randomized_trajectory[i])

                simulator_reward = \
                    self.discriminator_rewarder.calculate_rewards(
                        randomized_trajectories[i][t])
                rewards[i][t] = simulator_reward

                logger.info('Setting: {}, Score: {}'.format(
                    simulation_instances[t][i], simulator_reward))

            if not self.freeze_discriminator:
                # flatten and combine all randomized and reference trajectories
                # for discriminator
                flattened_randomized = [
                    randomized_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_randomized = np.concatenate(flattened_randomized)

                flattened_reference = [
                    reference_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_reference = np.concatenate(flattened_reference)

                randomized_discrim_score_mean, \
                    randomized_discrim_score_median, \
                    randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, \
                    reference_discrim_score_median, \
                    reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

                # Train the discriminator on state-action pairs from the agent
                # env steps.
                # TODO: Train more?
                logger.info('start training discriminator')
                self.discriminator_rewarder.train_discriminator(
                    flattened_reference,
                    flattened_randomized,
                    iterations=agent_timesteps_current_iteration)
                logger.info('finished training discriminator')

                randomized_discrim_score_mean, \
                    randomized_discrim_score_median, \
                    randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, \
                    reference_discrim_score_median, \
                    reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

        # Calculate discriminator based reward, pass it back to SVPG policy
        if self.svpg_timesteps >= self.initial_svpg_steps:
            if self.train_svpg:
                logger.info('start training SVPG')
                self.svpg.train(rewards)
                logger.info('finished training SVPG')

            for dimension in range(self.nparams):
                self.sampled_regions[dimension] = np.concatenate([
                    self.sampled_regions[dimension],
                    simulation_instances[:, :, dimension].flatten()
                ])

        solved_reference = info = None
        # if self.agent_timesteps_since_eval > self.agent_eval_frequency:
        #     self.agent_timesteps_since_eval %= self.agent_eval_frequency
        #     logger.info("Evaluating for {} episodes afer timesteps: {} (SVPG), {} (Agent)".format(
        #         self.randomized_eval_episodes * self.nagents, self.svpg_timesteps, self.agent_timesteps))
        #
        #     agent_reference_eval_rewards = []
        #     agent_randomized_eval_rewards = []
        #
        #     final_dist_ref = []
        #     final_dist_rand = []
        #
        #     for _ in range(self.randomized_eval_episodes):
        #         rewards_ref, dist_ref = evaluate_policy(
        #             nagents=self.nagents, net_envs=self.reference_env,
        #             agent_policy=agent_policy,  # replay_buffer=None,
        #             eval_episodes=1, max_steps=self.max_env_timesteps,
        #             return_rewards=True, add_noise=False,
        #             log_distances=self.log_distances)
        #
        #         full_random_settings = np.ones(
        #             (self.nagents, self.nparams)) * -1
        #         self.randomized_env.randomize(
        #             randomized_values=full_random_settings)
        #
        #         rewards_rand, dist_rand = evaluate_policy(
        #             nagents=self.nagents, net_envs=self.randomized_env,
        #             agent_policy=agent_policy,  # replay_buffer=None,
        #             eval_episodes=1, max_steps=self.max_env_timesteps,
        #             return_rewards=True, add_noise=False,
        #             log_distances=self.log_distances)
        #
        #         agent_reference_eval_rewards += list(rewards_ref)
        #         agent_randomized_eval_rewards += list(rewards_rand)
        #         final_dist_ref += [dist_ref]
        #         final_dist_rand += [dist_rand]
        #
        #     evaluation_criteria_reference = agent_reference_eval_rewards
        #     evaluation_criteria_randomized = agent_randomized_eval_rewards
        #
        #     if self.log_distances:
        #         evaluation_criteria_reference = final_dist_ref
        #         evaluation_criteria_randomized = final_dist_rand

        # solved_reference = check_solved(
        #     self.reference_env_id, evaluation_criteria_reference)
        # solved_randomized = check_solved(
        #     self.randomized_eval_env_id, evaluation_criteria_randomized)
        #
        # info = {
        #     'solved': str(solved_reference),
        #     'solved_randomized': str(solved_randomized),
        #     'svpg_steps': self.svpg_timesteps,
        #     'agent_timesteps': self.agent_timesteps,
        #     'final_dist_ref_mean': np.mean(final_dist_ref),
        #     'final_dist_ref_std': np.std(final_dist_ref),
        #     'final_dist_ref_median': np.median(final_dist_ref),
        #     'final_dist_rand_mean': np.mean(final_dist_rand),
        #     'final_dist_rand_std': np.std(final_dist_rand),
        #     'final_dist_rand_median': np.median(final_dist_rand),
        #     'agent_reference_eval_rewards_mean': np.mean(agent_reference_eval_rewards),
        #     'agent_reference_eval_rewards_std': np.std(agent_reference_eval_rewards),
        #     'agent_reference_eval_rewards_median': np.median(agent_reference_eval_rewards),
        #     'agent_reference_eval_rewards_min': np.min(agent_reference_eval_rewards),
        #     'agent_reference_eval_rewards_max': np.max(agent_reference_eval_rewards),
        #     'agent_randomized_eval_rewards_mean': np.mean(agent_randomized_eval_rewards),
        #     'agent_randomized_eval_rewards_std': np.std(agent_randomized_eval_rewards),
        #     'agent_randomized_eval_rewards_median': np.median(agent_randomized_eval_rewards),
        #     'agent_randomized_eval_rewards_min': np.min(agent_randomized_eval_rewards),
        #     'agent_randomized_eval_rewards_max': np.max(agent_randomized_eval_rewards),
        #     'randomized_discrim_score_mean': str(randomized_discrim_score_mean),
        #     'reference_discrim_score_mean': str(reference_discrim_score_mean),
        #     'randomized_discrim_score_median': str(randomized_discrim_score_median),
        #     'reference_discrim_score_median': str(reference_discrim_score_median),
        #
        # }
        #
        # agent_hard_eval_rewards, final_dist_hard = evaluate_policy(
        #     nagents=self.nagents, env=self.hard_env,
        #     agent_policy=agent_policy,  # replay_buffer=None,
        #     eval_episodes=1, max_steps=self.max_env_timesteps,
        #     return_rewards=True, add_noise=False,
        #     log_distances=self.log_distances)
        # info_hard = {
        #     'final_dist_hard_mean': np.mean(final_dist_hard),
        #     'final_dist_hard_std': np.std(final_dist_hard),
        #     'final_dist_hard_median': np.median(final_dist_hard),
        #     'agent_hard_eval_rewards_median': np.median(agent_hard_eval_rewards),
        #     'agent_hard_eval_rewards_mean': np.mean(agent_hard_eval_rewards),
        #     'agent_hard_eval_rewards_std': np.std(agent_hard_eval_rewards),
        # }
        #
        # info.update(info_hard)

        self.svpg_timesteps += 1
        return solved_reference, info

    # def rollout_agent(self, agent_policy, reference=True, eval_episodes=None):
    #     """Roll out agent_policy in the specified environment."""
    #     if reference:
    #         if eval_episodes is None:
    #             eval_episodes = self.episodes_per_instance
    #         trajectory = evaluate_policy(
    #             nagents=self.nagents, net_envs=self.reference_env,
    #             agent_policy=agent_policy,  # replay_buffer=None,
    #             eval_episodes=eval_episodes, max_steps=self.max_env_timesteps,
    #             freeze_agent=True, add_noise=False,
    #             log_distances=self.log_distances)
    #     else:
    #         trajectory = evaluate_policy(
    #             nagents=self.nagents, net_envs=self.randomized_env,
    #             agent_policy=agent_policy,  # replay_buffer=self.replay_buffer,
    #             eval_episodes=self.episodes_per_instance,
    #             max_steps=self.max_env_timesteps,
    #             freeze_agent=self.freeze_agent, add_noise=True,
    #             log_distances=self.log_distances)
    #
    #     return trajectory

    def sample_trajectories(self, batch_size):
        """Randomly sample batch_size (state, action, next_state) trajectories."""
        indices = np.random.randint(0,
                                    len(self.extracted_trajectories['states']),
                                    batch_size)

        states = self.extracted_trajectories['states']
        actions = self.extracted_trajectories['actions']
        next_states = self.extracted_trajectories['next_states']

        trajectories = []
        for i in indices:
            trajectories.append(
                np.concatenate(
                    [np.array(states[i]),
                     np.array(actions[i]),
                     np.array(next_states[i])],
                    axis=-1))
        return trajectories
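
# A minimal, hypothetical usage sketch of the ABR variant above. Every
# constructor value below is an assumption for illustration (only nagents > 2
# is required by the assert), and agent_policy stands in for a trained ABR
# policy network.
#
#   agent = SVPGSimulatorAgent(
#       video_size_file_dir='./video_sizes', reference_agent_policy=ref_policy,
#       randomized_env_id='ABRSimRandomized-v0',
#       randomized_eval_env_id='ABRSimRandomizedEval-v0', agent_name='abr',
#       nagents=4, nparams=1, temperature=10.0, svpg_rollout_length=5,
#       svpg_horizon=25, max_step_length=0.05, reward_scale=-1.0,
#       initial_svpg_steps=0, max_env_timesteps=100, episodes_per_instance=1,
#       discrete_svpg=False, load_discriminator=False,
#       freeze_discriminator=False, freeze_agent=True, seed=42)
#   for _ in range(num_updates):
#       solved, info = agent.select_action(agent_policy)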
Example #4
class SVPGSimulatorAgent(object):
    def __init__(
        self,
        reference_env_id,
        randomized_env_id,
        randomized_eval_env_id,
        agent_name,
        nagents,
        nparams,
        temperature,
        svpg_rollout_length,
        svpg_horizon,
        max_step_length,
        reward_scale,
        initial_svpg_steps,
        max_env_timesteps,
        episodes_per_instance,
        discrete_svpg,
        load_discriminator,
        freeze_discriminator,
        freeze_agent,
        seed,
        train_svpg=True,
        particle_path="",
        discriminator_batchsz=320,
        randomized_eval_episodes=3,
    ):

        # TODO: Weird bug
        assert nagents > 2

        self.reference_env_id = reference_env_id
        self.randomized_env_id = randomized_env_id
        self.randomized_eval_env_id = randomized_eval_env_id
        self.agent_name = agent_name

        self.log_distances = reference_env_id.find('Lunar') == -1

        self.randomized_eval_episodes = randomized_eval_episodes

        # Vectorized environments - step with nagents in parallel
        self.reference_env = make_vec_envs(reference_env_id, seed, nagents)
        self.randomized_env = make_vec_envs(randomized_env_id, seed, nagents)

        self.state_dim = self.reference_env.observation_space.shape[0]
        self.action_dim = self.reference_env.action_space.shape[0]

        if reference_env_id.find('Pusher') != -1:
            self.hard_env = make_vec_envs('Pusher3DOFHard-v0', seed, nagents)
        elif reference_env_id.find('Lunar') != -1:
            self.hard_env = make_vec_envs('LunarLander10-v0', seed, nagents)
        elif reference_env_id.find('Backlash') != -1:
            self.hard_env = make_vec_envs(
                'ErgoReacherRandomizedBacklashHard-v0', seed, nagents)
        else:
            self.hard_env = make_vec_envs('ErgoReacher4DOFRandomizedHard-v0',
                                          seed, nagents)

        self.sampled_regions = [[] for _ in range(nparams)]

        self.nagents = nagents
        self.nparams = self.randomized_env.randomization_space.shape[0]
        assert self.nparams == nparams, "Double check number of parameters: Args: {}, Env: {}".format(
            nparams, self.nparams)

        self.svpg_horizon = svpg_horizon
        self.initial_svpg_steps = initial_svpg_steps
        self.max_env_timesteps = max_env_timesteps
        self.episodes_per_instance = episodes_per_instance
        self.discrete_svpg = discrete_svpg

        self.freeze_discriminator = freeze_discriminator
        self.freeze_agent = freeze_agent

        self.train_svpg = train_svpg

        self.agent_eval_frequency = max_env_timesteps * nagents

        self.seed = seed
        self.svpg_timesteps = 0
        self.agent_timesteps = 0
        self.agent_timesteps_since_eval = 0

        self.discriminator_rewarder = DiscriminatorRewarder(
            reference_env=self.reference_env,
            randomized_env_id=randomized_env_id,
            discriminator_batchsz=discriminator_batchsz,
            reward_scale=reward_scale,
            load_discriminator=load_discriminator,
        )

        if not self.freeze_agent:
            self.replay_buffer = ReplayBuffer()
        else:
            self.replay_buffer = None

        self.svpg = SVPG(nagents=nagents,
                         nparams=self.nparams,
                         max_step_length=max_step_length,
                         svpg_rollout_length=svpg_rollout_length,
                         svpg_horizon=svpg_horizon,
                         temperature=temperature,
                         discrete=self.discrete_svpg,
                         kld_coefficient=0.0)

        if particle_path != "":
            logger.info("Loading particles from: {}".format(particle_path))
            self.svpg.load(directory=particle_path)

        self.simulation_instances_full_horizon = np.ones(
            (self.nagents, self.svpg_horizon, self.svpg.svpg_rollout_length,
             self.svpg.nparams)) * -1
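
# Sketch (mirroring select_action in the full class in Example #5 below): each
# SVPG step writes its sampled settings into one slot of the horizon buffer,
# e.g.
#   index = self.svpg_timesteps % self.svpg_horizon
#   self.simulation_instances_full_horizon[:, index, :, :] = simulation_instances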
Example #5
class SVPGSimulatorAgent(object):
    """Simulation object which creates randomized environments based on specified params, 
    handles SVPG-based policy search to create envs, 
    and evaluates controller policies in those environments
    """
    def __init__(
        self,
        reference_env_id,
        randomized_env_id,
        randomized_eval_env_id,
        agent_name,
        nagents,
        nparams,
        temperature,
        svpg_rollout_length,
        svpg_horizon,
        max_step_length,
        reward_scale,
        initial_svpg_steps,
        max_env_timesteps,
        episodes_per_instance,
        discrete_svpg,
        load_discriminator,
        freeze_discriminator,
        freeze_agent,
        seed,
        train_svpg=True,
        particle_path="",
        discriminator_batchsz=320,
        randomized_eval_episodes=3,
    ):

        # TODO: Weird bug
        assert nagents > 2

        self.reference_env_id = reference_env_id
        self.randomized_env_id = randomized_env_id
        self.randomized_eval_env_id = randomized_eval_env_id
        self.agent_name = agent_name

        self.log_distances = reference_env_id.find('Lunar') == -1

        self.randomized_eval_episodes = randomized_eval_episodes

        # Vectorized environments - step with nagents in parallel
        self.reference_env = make_vec_envs(reference_env_id, seed, nagents)
        self.randomized_env = make_vec_envs(randomized_env_id, seed, nagents)

        self.state_dim = self.reference_env.observation_space.shape[0]
        self.action_dim = self.reference_env.action_space.shape[0]

        if reference_env_id.find('Pusher') != -1:
            self.hard_env = make_vec_envs('Pusher3DOFHard-v0', seed, nagents)
        elif reference_env_id.find('Lunar') != -1:
            self.hard_env = make_vec_envs('LunarLander10-v0', seed, nagents)
        elif reference_env_id.find('Backlash') != -1:
            self.hard_env = make_vec_envs(
                'ErgoReacherRandomizedBacklashHard-v0', seed, nagents)
        else:
            self.hard_env = make_vec_envs('ErgoReacher4DOFRandomizedHard-v0',
                                          seed, nagents)

        self.sampled_regions = [[] for _ in range(nparams)]

        self.nagents = nagents
        self.nparams = self.randomized_env.randomization_space.shape[0]
        assert self.nparams == nparams, "Double check number of parameters: Args: {}, Env: {}".format(
            nparams, self.nparams)

        self.svpg_horizon = svpg_horizon
        self.initial_svpg_steps = initial_svpg_steps
        self.max_env_timesteps = max_env_timesteps
        self.episodes_per_instance = episodes_per_instance
        self.discrete_svpg = discrete_svpg

        self.freeze_discriminator = freeze_discriminator
        self.freeze_agent = freeze_agent

        self.train_svpg = train_svpg

        self.agent_eval_frequency = max_env_timesteps * nagents

        self.seed = seed
        self.svpg_timesteps = 0
        self.agent_timesteps = 0
        self.agent_timesteps_since_eval = 0

        self.discriminator_rewarder = DiscriminatorRewarder(
            reference_env=self.reference_env,
            randomized_env_id=randomized_env_id,
            discriminator_batchsz=discriminator_batchsz,
            reward_scale=reward_scale,
            load_discriminator=load_discriminator,
        )

        if not self.freeze_agent:
            self.replay_buffer = ReplayBuffer()
        else:
            self.replay_buffer = None

        self.svpg = SVPG(nagents=nagents,
                         nparams=self.nparams,
                         max_step_length=max_step_length,
                         svpg_rollout_length=svpg_rollout_length,
                         svpg_horizon=svpg_horizon,
                         temperature=temperature,
                         discrete=self.discrete_svpg,
                         kld_coefficient=0.0)

        if particle_path != "":
            logger.info("Loading particles from: {}".format(particle_path))
            self.svpg.load(directory=particle_path)

        self.simulation_instances_full_horizon = np.ones(
            (self.nagents, self.svpg_horizon, self.svpg.svpg_rollout_length,
             self.svpg.nparams)) * -1

    def select_action(self, agent_policy):
        """Select an action based on SVPG policy, where an action is the delta in each dimension.
        Update the counts and statistics after training agent,
        rolling out policies, and calculating simulator reward.
        """
        if self.svpg_timesteps >= self.initial_svpg_steps:
            # Get sim instances from SVPG policy
            simulation_instances = self.svpg.step()

            index = self.svpg_timesteps % self.svpg_horizon
            self.simulation_instances_full_horizon[:, index, :, :] = \
                simulation_instances

        else:
            # Creates completely randomized environment
            simulation_instances = np.ones(
                (self.nagents, self.svpg.svpg_rollout_length,
                 self.svpg.nparams)) * -1
            small_ranges = np.linspace(0, 1, self.nagents + 1)
            for i in range(self.nagents):
                # Sample each agent's settings from a Gaussian centred on its
                # sub-range: mu is the midpoint and sigma is one sixth of the
                # sub-range width, so +/-3 sigma spans the sub-range before
                # clipping to [0, 1].
                mu = (small_ranges[i] + small_ranges[i + 1]) / 2
                sigma = (small_ranges[i + 1] - small_ranges[i]) / 6
                row = np.random.normal(
                    mu, sigma, (self.svpg.svpg_rollout_length, self.nparams))
                row[row < 0] = 0
                row[row > 1] = 1
                simulation_instances[i] = row

        assert (self.nagents, self.svpg.svpg_rollout_length,
                self.svpg.nparams) == simulation_instances.shape

        # Create placeholders for trajectories
        randomized_trajectories = [[] for _ in range(self.nagents)]
        reference_trajectories = [[] for _ in range(self.nagents)]

        # Create placeholder for rewards
        rewards = np.zeros(simulation_instances.shape[:2])

        # Discriminator debugging
        randomized_discrim_score_mean = 0
        reference_discrim_score_mean = 0
        randomized_discrim_score_median = 0
        reference_discrim_score_median = 0

        # Reshape to work with vectorized environments
        simulation_instances = np.transpose(simulation_instances, (1, 0, 2))
        # Log the environment parameters sampled at this SVPG step; the file
        # is opened line-buffered so each write is flushed immediately.
        log_path = os.path.join(PARA_LOG,
                                'parameter_log_{}'.format(self.svpg_timesteps))
        log_file = open(log_path, 'w', 1)

        # Create environment instances with vectorized env, and rollout agent_policy in both
        for t in range(self.svpg.svpg_rollout_length):
            agent_timesteps_current_iteration = 0
            logger.info('Iteration t: {}/{}'.format(
                t, self.svpg.svpg_rollout_length))

            reference_trajectory = self.rollout_agent(agent_policy)

            self.randomized_env.randomize(
                randomized_values=simulation_instances[t])
            env_params = self.randomized_env.get_current_params()
            log_file.write(' '.join([str(val)
                                     for val in env_params[:, 0]]) + '\n')

            randomized_trajectory = self.rollout_agent(agent_policy,
                                                       reference=False)

            for i in range(self.nagents):
                agent_timesteps_current_iteration += len(
                    randomized_trajectory[i])

                reference_trajectories[i].append(reference_trajectory[i])
                randomized_trajectories[i].append(randomized_trajectory[i])

                self.agent_timesteps += len(randomized_trajectory[i])
                self.agent_timesteps_since_eval += len(
                    randomized_trajectory[i])

                simulator_reward = self.discriminator_rewarder.calculate_rewards(
                    randomized_trajectories[i][t])
                rewards[i][t] = simulator_reward

                logger.info('Setting: {}, Score: {}'.format(
                    simulation_instances[t][i], simulator_reward))

            if not self.freeze_discriminator:
                # flatten and combine all randomized and reference trajectories for discriminator
                flattened_randomized = [
                    randomized_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_randomized = np.concatenate(flattened_randomized)

                flattened_reference = [
                    reference_trajectories[i][t] for i in range(self.nagents)
                ]
                flattened_reference = np.concatenate(flattened_reference)

                randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

                # Train the discriminator on state-action pairs from the agent
                # env steps.
                # TODO: Train more?
                self.discriminator_rewarder.train_discriminator(
                    flattened_reference,
                    flattened_randomized,
                    iterations=agent_timesteps_current_iteration)

                randomized_discrim_score_mean, randomized_discrim_score_median, randomized_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_randomized)
                reference_discrim_score_mean, reference_discrim_score_median, reference_discrim_score_sum = \
                    self.discriminator_rewarder.get_score(flattened_reference)

        log_file.close()

        # Calculate discriminator-based reward and pass it back to the SVPG
        # policy.
        if self.svpg_timesteps >= self.initial_svpg_steps:
            if self.train_svpg:
                self.svpg.train(rewards)

            for dimension in range(self.nparams):
                self.sampled_regions[dimension] = np.concatenate([
                    self.sampled_regions[dimension],
                    simulation_instances[:, :, dimension].flatten()
                ])

        solved_reference = info = None
        if self.agent_timesteps_since_eval > self.agent_eval_frequency:
            self.agent_timesteps_since_eval %= self.agent_eval_frequency
            logger.info(
                "Evaluating for {} episodes afer timesteps: {} (SVPG), {} (Agent)"
                .format(self.randomized_eval_episodes * self.nagents,
                        self.svpg_timesteps, self.agent_timesteps))

            agent_reference_eval_rewards = []
            agent_randomized_eval_rewards = []

            final_dist_ref = []
            final_dist_rand = []

            for _ in range(self.randomized_eval_episodes):
                rewards_ref, dist_ref = evaluate_policy(
                    nagents=self.nagents,
                    env=self.reference_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_env_timesteps,
                    return_rewards=True,
                    add_noise=False,
                    log_distances=self.log_distances)

                full_random_settings = np.ones(
                    (self.nagents, self.nparams)) * -1
                self.randomized_env.randomize(
                    randomized_values=full_random_settings)

                rewards_rand, dist_rand = evaluate_policy(
                    nagents=self.nagents,
                    env=self.randomized_env,
                    agent_policy=agent_policy,
                    replay_buffer=None,
                    eval_episodes=1,
                    max_steps=self.max_env_timesteps,
                    return_rewards=True,
                    add_noise=False,
                    log_distances=self.log_distances)

                agent_reference_eval_rewards += list(rewards_ref)
                agent_randomized_eval_rewards += list(rewards_rand)
                final_dist_ref += [dist_ref]
                final_dist_rand += [dist_rand]

            evaluation_criteria_reference = agent_reference_eval_rewards
            evaluation_criteria_randomized = agent_randomized_eval_rewards

            if self.log_distances:
                evaluation_criteria_reference = final_dist_ref
                evaluation_criteria_randomized = final_dist_rand

            solved_reference = check_solved(self.reference_env_id,
                                            evaluation_criteria_reference)
            solved_randomized = check_solved(self.randomized_eval_env_id,
                                             evaluation_criteria_randomized)

            info = {
                'solved': str(solved_reference),
                'solved_randomized': str(solved_randomized),
                'svpg_steps': self.svpg_timesteps,
                'agent_timesteps': self.agent_timesteps,
                'final_dist_ref_mean': np.mean(final_dist_ref),
                'final_dist_ref_std': np.std(final_dist_ref),
                'final_dist_ref_median': np.median(final_dist_ref),
                'final_dist_rand_mean': np.mean(final_dist_rand),
                'final_dist_rand_std': np.std(final_dist_rand),
                'final_dist_rand_median': np.median(final_dist_rand),
                'agent_reference_eval_rewards_mean': np.mean(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_std': np.std(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_median': np.median(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_min': np.min(agent_reference_eval_rewards),
                'agent_reference_eval_rewards_max': np.max(agent_reference_eval_rewards),
                'agent_randomized_eval_rewards_mean': np.mean(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_std': np.std(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_median': np.median(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_min': np.min(agent_randomized_eval_rewards),
                'agent_randomized_eval_rewards_max': np.max(agent_randomized_eval_rewards),
                'randomized_discrim_score_mean': str(randomized_discrim_score_mean),
                'reference_discrim_score_mean': str(reference_discrim_score_mean),
                'randomized_discrim_score_median': str(randomized_discrim_score_median),
                'reference_discrim_score_median': str(reference_discrim_score_median),
            }

            agent_hard_eval_rewards, final_dist_hard = evaluate_policy(
                nagents=self.nagents,
                env=self.hard_env,
                agent_policy=agent_policy,
                replay_buffer=None,
                eval_episodes=1,
                max_steps=self.max_env_timesteps,
                return_rewards=True,
                add_noise=False,
                log_distances=self.log_distances)
            info_hard = {
                'final_dist_hard_mean': np.mean(final_dist_hard),
                'final_dist_hard_std': np.std(final_dist_hard),
                'final_dist_hard_median': np.median(final_dist_hard),
                'agent_hard_eval_rewards_median': np.median(agent_hard_eval_rewards),
                'agent_hard_eval_rewards_mean': np.mean(agent_hard_eval_rewards),
                'agent_hard_eval_rewards_std': np.std(agent_hard_eval_rewards),
            }

            info.update(info_hard)

        self.svpg_timesteps += 1
        return solved_reference, info

    def rollout_agent(self, agent_policy, reference=True, eval_episodes=None):
        """Rolls out agent_policy in the specified environment
        """
        if reference:
            if eval_episodes is None:
                eval_episodes = self.episodes_per_instance
            trajectory = evaluate_policy(nagents=self.nagents,
                                         env=self.reference_env,
                                         agent_policy=agent_policy,
                                         replay_buffer=None,
                                         eval_episodes=eval_episodes,
                                         max_steps=self.max_env_timesteps,
                                         freeze_agent=True,
                                         add_noise=False,
                                         log_distances=self.log_distances)
        else:
            trajectory = evaluate_policy(
                nagents=self.nagents,
                env=self.randomized_env,
                agent_policy=agent_policy,
                replay_buffer=self.replay_buffer,
                eval_episodes=self.episodes_per_instance,
                max_steps=self.max_env_timesteps,
                freeze_agent=self.freeze_agent,
                add_noise=True,
                log_distances=self.log_distances)

        return trajectory

    def sample_trajectories(self, batch_size):
        """Randomly sample batch_size (state, action, next_state) trajectories."""
        indices = np.random.randint(0,
                                    len(self.extracted_trajectories['states']),
                                    batch_size)

        states = self.extracted_trajectories['states']
        actions = self.extracted_trajectories['actions']
        next_states = self.extracted_trajectories['next_states']

        trajectories = []
        for i in indices:
            trajectories.append(
                np.concatenate(
                    [np.array(states[i]),
                     np.array(actions[i]),
                     np.array(next_states[i])],
                    axis=-1))
        return trajectories
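
# Sketch of how randomization settings are consumed (based on the class above;
# the concrete env id below is an assumption): settings are expected in [0, 1]
# (see the clipping in select_action), while -1 requests a fully randomized
# draw (see full_random_settings during evaluation).
#
#   env = make_vec_envs('LunarLanderRandomized-v0', seed=123, nagents=3)
#   ndims = env.randomization_space.shape[0]
#   env.randomize(randomized_values=np.full((3, ndims), 0.5))   # fixed point
#   env.randomize(randomized_values=np.ones((3, ndims)) * -1)   # fully random
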
import numpy as np
from common.svpg.svpg import SVPG
from common.envs.randomized_vecenv import make_vec_envs

def _create_envs(seed, nagents, reference_env_id='LunarLanderDefault-v0',
                 randomized_env_id='LunarLanderRandomized-v0'):
    reference_env = make_vec_envs(reference_env_id, seed, nagents)
    randomized_env = make_vec_envs(randomized_env_id, seed, nagents)
    return reference_env, randomized_env

nagents = 3
svpg = SVPG(nagents)
reference_env, randomized_env = _create_envs(seed=123, nagents=nagents)

simulation_settings = svpg.step()
assert (nagents, svpg.svpg_rollout_length, svpg.nparams) == simulation_settings.shape

simulation_settings = np.transpose(simulation_settings, (1, 0, 2))

for t in range(svpg.svpg_rollout_length):
    print("Current Timestep: {}".format(t))
    print([simulation_settings[t]])
    randomized_env.randomize(randomized_values=simulation_settings[t])
    print(randomized_env.get_current_params())
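
# Hypothetical continuation (a sketch, not part of the original script): after
# scoring each rollout with a discriminator, rewards of shape
# (nagents, svpg_rollout_length) would be fed back to the particles, mirroring
# SVPGSimulatorAgent.select_action above. Zeros are only a placeholder here.
rewards = np.zeros((nagents, svpg.svpg_rollout_length))
svpg.train(rewards)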


Example #7
class ADR:
    def __init__(self, nparticles, nparams, state_dim, action_dim, temperature,
                 svpg_rollout_length, svpg_horizon, max_step_length,
                 reward_scale, initial_svpg_steps, seed,
                 discriminator_batchsz):
        assert nparticles > 2

        self.nparticles = nparticles
        self.nparams = nparams

        self.svpg_rollout_length = svpg_rollout_length
        self.svpg_horizon = svpg_horizon
        self.initial_svpg_steps = initial_svpg_steps

        self.seed = seed
        self.svpg_timesteps = 0

        self.discriminator_rewarder = DiscriminatorRewarder(
            state_dim=state_dim,
            action_dim=action_dim,
            discriminator_batchsz=discriminator_batchsz,
            reward_scale=reward_scale,
        )

        self.svpg = SVPG(
            nparticles=nparticles,
            nparams=self.nparams,
            max_step_length=max_step_length,
            svpg_rollout_length=svpg_rollout_length,
            svpg_horizon=svpg_horizon,
            temperature=temperature,
        )

        self.parameter_settings = np.ones(
            (self.nparticles, self.svpg_horizon, self.svpg.svpg_rollout_length,
             self.svpg.nparams)) * -1

    def score_trajectories(self, randomized_trajectories):
        rewards = np.zeros((self.nparticles, self.svpg.svpg_rollout_length))

        for i in range(self.nparticles):
            for t in range(self.svpg.svpg_rollout_length):
                # Score particle i's randomized trajectory at rollout step t
                # with the discriminator; its mean score is the SVPG reward.
                randomized_discrim_score_mean, _, _ = \
                    self.discriminator_rewarder.get_score(randomized_trajectories[i][t])

                rewards[i][t] = randomized_discrim_score_mean

        return rewards

    def step_particles(self):
        if self.svpg_timesteps >= self.initial_svpg_steps:
            # Get sim instances from SVPG policy
            simulation_instances = self.svpg.step()

            index = self.svpg_timesteps % self.svpg_horizon
            self.parameter_settings[:, index, :, :] = simulation_instances

        else:
            # Creates completely randomized environment
            simulation_instances = np.ones(
                (self.nparticles, self.svpg.svpg_rollout_length,
                 self.svpg.nparams)) * -1

        assert (self.nparticles, self.svpg.svpg_rollout_length, self.svpg.nparams) \
            == simulation_instances.shape

        # Reshape to work with vectorized environments
        simulation_instances = np.transpose(simulation_instances, (1, 0, 2))

        self.svpg_timesteps += 1
        return simulation_instances

    def train(self, reference_trajectories, randomized_trajectories):
        rewards = self.score_trajectories(randomized_trajectories)
        self._train_particles(rewards)
        self._train_discriminator(reference_trajectories,
                                  randomized_trajectories)

    def _train_discriminator(self, reference_trajectories,
                             randomized_trajectories):
        # Flatten and combine all randomized and reference trajectories across
        # particles and rollout steps for the discriminator.
        flattened_randomized = np.concatenate([
            randomized_trajectories[i][t]
            for i in range(self.nparticles)
            for t in range(self.svpg.svpg_rollout_length)
        ])

        flattened_reference = np.concatenate([
            reference_trajectories[i][t]
            for i in range(self.nparticles)
            for t in range(self.svpg.svpg_rollout_length)
        ])

        self.discriminator_rewarder.train_discriminator(
            flattened_reference,
            flattened_randomized,
            iterations=len(flattened_randomized))

    def _train_particles(self, rewards):
        self.svpg.train(rewards)
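
# A minimal, hypothetical driving loop for the ADR helper above. The
# collect_trajectories() argument is a placeholder for whatever rolls out the
# agent in the reference and randomized environments; it is not part of the
# original code.
def adr_update(adr, collect_trajectories):
    # Ask the SVPG particles for the next batch of simulation settings,
    # shaped (svpg_rollout_length, nparticles, nparams).
    simulation_instances = adr.step_particles()

    # Roll out the agent; trajectories are indexed [particle][rollout_step].
    reference_trajectories, randomized_trajectories = collect_trajectories(
        simulation_instances)

    # Reward the particles with discriminator scores, then update the
    # discriminator itself on the collected state-action data.
    adr.train(reference_trajectories, randomized_trajectories)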