Example 1
 def action_space(self):
     lat_dim = self.low_policy_latent_dim
     if self.discrete_actions:
         return spaces.Discrete(lat_dim)  # the action is now just a selection
     else:
         ub = 1e6 * np.ones(lat_dim)
         return spaces.Box(-1 * ub, ub)
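A minimal usage sketch of the property above, assuming gym/rllab-style spaces where both Discrete and Box expose sample(); the environment construction below is hypothetical, and action_space is treated as a property (the decorator is not shown in the excerpt):

env = SomeHierarchicalEnv(discrete_actions=True)  # hypothetical constructor
space = env.action_space
if isinstance(space, spaces.Discrete):
    action = space.sample()  # an integer index selecting one latent skill
else:
    action = space.sample()  # a real-valued latent vector inside the +/-1e6 box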
Example 2
    def __init__(self):
        EzPickle.__init__(self)
        self.seed()
        self.viewer = None

        self.world = Box2D.b2World()
        self.moon = None
        self.lander = None
        self.particles = []

        self.prev_reward = None

        # useful range is -1 .. +1, but spikes can be higher
        self.observation_space = spaces.Box(-np.inf, np.inf, shape=(8, ))

        if self.continuous:
            # Action is two floats [main engine, left-right engines].
            # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
            # Left-right:  -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
            self.action_space = spaces.Box(-1, +1, (2, ))
        else:
            # Nop, fire left engine, main engine, right engine
            self.action_space = spaces.Discrete(4)

        self.reset()
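In Gym's LunarLander, the continuous flag checked above is a class-level attribute rather than something set in __init__; the continuous-control variant is obtained by subclassing. A rough sketch of that pattern (illustrative only, not the full class bodies):

class LunarLander(gym.Env, EzPickle):
    continuous = False  # discrete actions: nop / fire left / fire main / fire right

class LunarLanderContinuous(LunarLander):
    continuous = True   # two floats: [main engine, left-right engines]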
Example 3
 def __init__(self, env, base_policy, num_skills, steps_per_option=100):
     Serializable.quick_init(self, locals())
     self._base_policy = base_policy
     self._env = env
     self._steps_per_option = steps_per_option
     self._num_skills = num_skills
     self.observation_space = self._env.observation_space
     self.action_space = spaces.Discrete(num_skills)
     self.spec = EnvSpec(self.observation_space, self.action_space)
     self._obs = self.reset()
Example 4
    def __init__(
        self,
        env_spec,
        env,  # the inner (wrapped) environment
        pkl_path=None,  # for the entire hierarchical policy
        snn_pkl_path=None,
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        period=2,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        hidden_sizes_snn=(64, 64),
        hidden_sizes_selector=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.period = period
        self.latent_dim = latent_dim  # dimension of the latent skill code (number of skills when discrete)
        self.bilinear_integration = bilinear_integration  # whether the SNN integrates latents bilinearly with the observation
        self.count = 0  # steps elapsed since the manager last sampled a latent skill
        self.curr_latent = None  # latent skill currently being executed
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager

        if pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy

            #following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=env.spec,
                env=env,
                pkl_path=snn_pkl_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector)

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(
                    os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
                outer_env_spec = EnvSpec(
                    observation_space=self.env.observation_space,
                    action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(
                    env_spec=outer_env_spec,
                    latent_dim=latent_dim,
                )
        Serializable.quick_init(self,
                                locals())  # todo: is this where this belongs?
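The intended control flow appears to be that the manager re-samples a latent skill every period steps and the SNN low-level policy acts conditioned on that latent in between. A hedged sketch of what the corresponding action selection might look like; the call signatures below are assumptions, not the repository's actual method:

    def get_action(self, observation):
        # Every `period` steps the manager picks which latent skill to execute next.
        if self.count % self.period == 0:
            self.curr_latent, _ = self.manager.get_action(observation)
        self.count += 1
        # The SNN was built with external_latent=True, so the chosen latent is
        # supplied alongside the observation (exact interface assumed here).
        return self.low_policy.get_action((observation, self.curr_latent))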
Example 5
 def action_space(self):
     init = np.zeros([0]).shape  # unused
     ub = np.array([3])  # unused
     lb = np.zeros_like(ub)  # unused
     return spaces.Discrete(4)  # four discrete actions
Example 6
 def observation_space(self):
     return spaces.Product([
         spaces.Discrete(self.numrow),
         spaces.Discrete(self.numcol),
         spaces.Discrete(2)
     ])
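Assuming rllab-style spaces, a Product space flattens to the concatenation of its components' one-hot encodings, so the observation above flattens to a vector of length numrow + numcol + 2. A small sketch under that assumption (the concrete sizes are made up):

obs_space = spaces.Product([
    spaces.Discrete(5),  # row index
    spaces.Discrete(7),  # column index
    spaces.Discrete(2),  # binary flag
])
assert obs_space.flat_dim == 5 + 7 + 2  # one one-hot block per component
flat_obs = obs_space.flatten((3, 2, 1))  # row 3, column 2, flag set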
Example 7
 def action_space(self):
     return spaces.Discrete(4)
    def __init__(
        self,
        env_spec,
        env,  # the inner (wrapped) environment
        pkl_path=None,  # for the entire hierarchical policy, can take in npz too!
        snn_pkl_path=None,  # can actually be either pkl or npz
        snn_json_path=None,
        manager_pkl_path=None,  # default is to initialize a new manager from scratch
        period=10,  # how often the manager chooses latent skill
        latent_dim=6,
        bilinear_integration=True,
        trainable_snn=True,
        trainable_manager=True,
        continuous_latent=False,  # if True, the manager acts in a continuous (Box) latent space
        hidden_sizes_snn=(64, 64),
        hidden_sizes_manager=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.period = period
        self.latent_dim = latent_dim  # dimension of the latent skill code (number of skills when discrete)
        self.bilinear_integration = bilinear_integration  # whether the SNN integrates latents bilinearly with the observation
        self.count = 0  # steps elapsed since the manager last sampled a latent skill
        self.curr_latent = None
        self.curr_manager_obs = None
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager
        self.trainable_snn = trainable_snn
        self.continuous_latent = continuous_latent

        if pkl_path and '.npz' not in pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy
            #todo: the above is wrong, need to figure out how to warm start the params

            #following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            if snn_pkl_path is not None and '.npz' in snn_pkl_path:
                npz_path = snn_pkl_path
                snn_pkl_path = None
            else:
                npz_path = None

            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=env.spec,
                env=env,
                pkl_path=snn_pkl_path,
                npz_path=npz_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_manager)

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(
                    os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)

                if self.continuous_latent:
                    outer_env_spec = EnvSpec(
                        observation_space=self.env.observation_space,
                        action_space=spaces.Box(-1.0,
                                                1.0,
                                                shape=(latent_dim, )))
                    self.manager = GaussianMLPPolicy(env_spec=outer_env_spec)
                else:
                    outer_env_spec = EnvSpec(
                        observation_space=self.env.observation_space,
                        action_space=self.outer_action_space)
                    self.manager = CategoricalMLPPolicy(
                        env_spec=outer_env_spec,
                        latent_dim=latent_dim,
                    )
                # import ipdb; ipdb.set_trace()
                if pkl_path is not None and '.npz' in pkl_path:
                    param_dict = dict(
                        np.load(os.path.join(config.PROJECT_PATH, pkl_path)))
                    param_values = param_dict['params']
                    self.set_param_values(param_values)

        Serializable.quick_init(self,
                                locals())  # todo: is this where this belongs?
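The .npz branch above expects an archive whose 'params' entry holds a flat parameter vector compatible with set_param_values. A hedged sketch of producing such a file from an already-trained policy, assuming the rllab-style get_param_values accessor; the file name and the trained_policy variable are made up:

np.savez(os.path.join(config.PROJECT_PATH, 'data/hier_warm_start.npz'),
         params=trained_policy.get_param_values())

The constructor then reads the same 'params' key back via np.load, exactly as in the snippet.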
    def __init__(
            self,
            env_spec,
            env,  # the inner (wrapped) environment
            pkl_path=None,  # for the entire hierarchical policy
            snn_pkl_path=None,
            snn_json_path=None,
            manager_pkl_path=None,  # default is to initialize a new manager from scratch
            max_period=10,  # possible periods
            latent_dim=6,
            bilinear_integration=True,
            trainable_snn=True,
            trainable_manager=True,
            hidden_sizes_snn=(64, 64),
            hidden_sizes_selector=(32, 32)):
        StochasticPolicy.__init__(self, env_spec)
        self.env = env
        self.periods = np.arange(1, max_period + 1)
        assert len(self.periods) > 0
        self.curr_period = self.periods[0]
        self.max_period = max(self.periods)
        self.latent_dim = latent_dim  # dimension of the latent skill code (number of skills when discrete)
        self.bilinear_integration = bilinear_integration  # whether the SNN integrates latents bilinearly with the observation
        self.count = 0  # steps elapsed since the manager last sampled a latent skill
        self.curr_latent = None  # latent skill currently being executed
        self.outer_action_space = spaces.Discrete(latent_dim)
        self.trainable_manager = trainable_manager
        self.random_period = True
        self.fake_env = PeriodVaryingEnv(env)

        if pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
            policy = data['policy']
            self.manager = policy.manager
            self.low_policy = policy.low_policy

            # following two lines used for random manager
            # outer_env_spec = EnvSpec(observation_space=self.env.observation_space, action_space=self.outer_action_space)
            # self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )
        else:
            # env spec that includes the extra parameter for time
            self.low_policy = GaussianMLPPolicy_snn_hier(
                env_spec=self.fake_env.spec,
                env=self.fake_env,
                pkl_path=snn_pkl_path,
                json_path=snn_json_path,
                trainable_snn=trainable_snn,
                latent_dim=latent_dim,
                bilinear_integration=bilinear_integration,
                external_latent=True,
                hidden_sizes_snn=hidden_sizes_snn,
                hidden_sizes_selector=hidden_sizes_selector
            )

            # loading manager from pkl file
            if manager_pkl_path:
                manager_data = joblib.load(os.path.join(config.PROJECT_PATH, manager_pkl_path))
                self.manager = manager_data['policy']
                print("loaded manager")
            else:
                # self.outer_env = hierarchize_snn(self.env, time_steps_agg=10, pkl_path=snn_pkl_path)
                outer_env_spec = EnvSpec(observation_space=self.fake_env.observation_space,
                                         action_space=self.outer_action_space)
                self.manager = CategoricalMLPPolicy(env_spec=outer_env_spec, latent_dim=latent_dim, )

        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        Serializable.quick_init(self, locals())  # todo: ask if this fixes my problem
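The robot/maze dimension bookkeeping at the end presumably lets the policy split a flat observation into the proprioceptive part (fed to the low-level SNN skills) and the maze or gather sensor readings. A minimal sketch of that slicing; the helper name is hypothetical:

    def split_observation(self, obs):
        # First obs_robot_dim entries: robot state; the rest: maze/gather sensors.
        robot_obs = obs[:self.obs_robot_dim]
        maze_obs = obs[self.obs_robot_dim:self.obs_robot_dim + self.obs_maze_dim]
        return robot_obs, maze_obs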