def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             logger_dir=None):
    wrapper_kwargs = wrapper_kwargs or {}
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE,
                                        state=gamestate)
    else:
        env = gym.make(env_id)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = FlattenObservation(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)

    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
class Reacher:
    def __init__(self):
        self.mean = 0
        self.std = 1
        self.dims = 52
        self.lb = -1 * np.ones(self.dims)
        self.ub = 1 * np.ones(self.dims)
        self.counter = 0
        self.env = FlattenObservation(
            FilterObservation(gym.make('FetchReach-v1'), ['observation', 'desired_goal']))
        self.num_rollouts = 3
        self.render = False
        self.policy_shape = (4, 13)

        # tunable hyper-parameters in LA-MCTS
        self.Cp = 10
        self.leaf_size = 100
        self.kernel_type = "linear"
        self.gamma_type = "auto"
        self.ninits = 30

        print("===========initialization===========")
        print("mean:", self.mean)
        print("std:", self.std)
        print("dims:", self.dims)
        print("policy:", self.policy_shape)

    def __call__(self, x):
        self.counter += 1
        assert len(x) == self.dims
        assert x.ndim == 1
        assert np.all(x <= self.ub) and np.all(x >= self.lb)

        M = x.reshape(self.policy_shape)

        returns = []
        observations = []
        actions = []
        for i in range(self.num_rollouts):
            obs = self.env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                # M = self.policy
                inputs = (obs - self.mean) / self.std
                action = np.dot(M, inputs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = self.env.step(action)
                totalr += r
                steps += 1
                if self.render:
                    self.env.render()
            returns.append(totalr)

        return np.mean(returns) * -1
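# Usage sketch for the Reacher objective above (an assumption, not part of the
# original snippet): LA-MCTS or any other minimizer would evaluate the callable
# on points inside [lb, ub]; the random point below is purely illustrative.
import numpy as np

f = Reacher()
x0 = np.random.uniform(f.lb, f.ub)   # one candidate linear policy in [-1, 1]^52
neg_return = f(x0)                   # negated mean return over 3 rollouts (lower is better)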
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    assert len(obs.shape) == 3
    assert len(wrapped_obs.shape) == 1
    assert wrapped_obs.shape[0] == obs.shape[0] * obs.shape[1] * obs.shape[2]
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    space = spaces.Tuple(
        (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2)))
    wrapped_space = spaces.Box(0, 1, [32 + 11 + 2], dtype=np.int64)

    assert space.contains(obs)
    assert wrapped_space.contains(wrapped_obs)
def test_flattened_environment(self, observation_space, ordered_values):
    """
    make sure that flattened observations occur in the order expected
    """
    env = FakeEnvironment(observation_space=observation_space)
    wrapped_env = FlattenObservation(env)
    flattened = wrapped_env.reset()

    unflattened = unflatten(env.observation_space, flattened)
    original = env.observation

    self._check_observations(original, flattened, unflattened, ordered_values)
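# Standalone sketch of the flatten/unflatten round trip that the test above
# relies on, using gym.spaces.flatten and gym.spaces.unflatten directly.
# The example space is an assumption chosen only for illustration.
import numpy as np
from gym.spaces import Box, Dict, flatten, unflatten

space = Dict({
    'position': Box(-1.0, 1.0, (2,), dtype=np.float32),
    'velocity': Box(-1.0, 1.0, (3,), dtype=np.float32),
})
sample = space.sample()
flat = flatten(space, sample)        # 1-D array of length 5, keys in sorted order
restored = unflatten(space, flat)    # back to the original nested layout
assert np.allclose(restored['position'], sample['position'])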
def make_env(with_monitor=False, folder_name='results'):
    env = gym.make("FetchReach-v1")
    env.env.reward_type = 'dense'
    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
    if with_monitor:
        env = gym.wrappers.Monitor(env, folder_name, force=True)
    return env
def make_env(env_id, env_type, args, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    env = gym.make(env_id, **env_kwargs)

    # Adding RM wrappers if needed
    if args.alg.endswith("hrm") or args.alg.endswith("dhrm"):
        env = HierarchicalRMWrapper(env, args.r_min, args.r_max, args.use_self_loops,
                                    args.use_rs, args.gamma, args.rs_gamma)
    if args.use_rs or args.use_crm:
        env = RewardMachineWrapper(env, args.use_crm, args.use_rs, args.gamma, args.rs_gamma)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)

    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
def test_nested_dicts_size(self, observation_space, flat_shape):
    env = FakeEnvironment(observation_space=observation_space)

    # Make sure we are testing the right environment for the test.
    observation_space = env.observation_space
    assert isinstance(observation_space, Dict)

    wrapped_env = FlattenObservation(FilterObservation(env, env.obs_keys))
    assert wrapped_env.observation_space.shape == flat_shape
    assert wrapped_env.observation_space.dtype == np.float32
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import importlib
        import re
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    env = gym.make(env_id, **env_kwargs)
    # if env_id.startswith('Sawyer'):
    #     from mher.algos.multi_world_wrapper import SawyerGoalWrapper
    #     env = SawyerGoalWrapper(env)
    # if (env_id.startswith('Sawyer') or env_id.startswith('Point2D')) and not hasattr(env, '_max_episode_steps'):
    #     env = gym.wrappers.TimeLimit(env, max_episode_steps=100)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)

    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']

    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)

    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
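# Usage sketch for make_robotics_env above (an assumption, not part of the
# original snippet): build the monitored FetchReach env and take a few random steps.
env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()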
def setup_wrappers(env):
    obs_shape = env.observation_space.shape
    is_image = len(obs_shape) == 3
    if is_image:
        from gym.wrappers import GrayScaleObservation
        from gym.wrappers import FlattenObservation
        from gym.wrappers import ResizeObservation
        env = GrayScaleObservation(env)
        # env = ResizeObservation(env, (obs_shape[0]//3, obs_shape[0]//3))
        env = FlattenObservation(env)
    return env
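# Usage sketch for setup_wrappers above (an assumption, not part of the original
# snippet); 'CarRacing-v0' is only an illustrative choice of an RGB-image env.
import gym

env = setup_wrappers(gym.make('CarRacing-v0'))
obs = env.reset()
print(obs.shape)  # RGB frame converted to grayscale and flattened to a 1-D vector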
def create_goal_gym_env(**kwargs):
    frames = kwargs.pop('frames', 1)
    name = kwargs.pop('name')
    limit_steps = kwargs.pop('limit_steps', False)

    env = gym.make(name, **kwargs)
    env = FlattenObservation(
        FilterObservation(env, ['observation', 'desired_goal']))

    if frames > 1:
        env = wrappers.FrameStack(env, frames, False)
    if limit_steps:
        env = wrappers.LimitStepsWrapper(env)
    return env
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    if env_id == 'Blackjack-v0':
        space = spaces.Tuple(
            (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2)))
        wrapped_space = spaces.Box(-np.inf, np.inf, [32 + 11 + 2], dtype=np.float32)
    elif env_id == 'KellyCoinflip-v0':
        space = spaces.Tuple(
            (spaces.Box(0, 250.0, [1], dtype=np.float32), spaces.Discrete(300 + 1)))
        wrapped_space = spaces.Box(-np.inf, np.inf, [1 + (300 + 1)], dtype=np.float32)

    assert space.contains(obs)
    assert wrapped_space.contains(wrapped_obs)
def env_fn():
    env = gym.make(env_name)
    if constraint != None:
        if use_aug:
            augmentation_type = 'constraint_state_concat'
        else:
            augmentation_type = 'None'
        use_dense = dense_coeff > 0.
        env = ConstraintEnv(
            env,
            [get_constraint(constraint)(False, use_dense, dense_coeff)],
            augmentation_type=augmentation_type,
            log_dir='../tests/' + exp_name)
    fcenv = FlattenObservation(env)
    return fcenv
def _thunk():
    env = gym.make(env_id)
    if env_id.find('Fetch') == -1:
        env = FlattenObservation(env)
    else:
        env = FlattenDictWrapper(env, ['achieved_goal', 'desired_goal'])
    env = RandomizedEnvWrapper(env, seed + rank)
    env.seed(seed + rank)
    return env
def train(env, type, timesteps):
    env.reset()
    print(check_env(env))
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)

    if type == "DQN":
        model = DQN('MlpPolicy', exploration_fraction=0.999, env=env, verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
def make_env():
    # wrapped_env -> flatten_observation -> monitor -> clip_action -> scale_reward
    env = make_wrapped_env()
    env.seed(seed + subrank if seed is not None else None)
    if flatten_dict_observations:  # and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)
    env = Monitor(env,
                  osp.join(monitor_log_dir, str(mpi_rank) + '.' + str(subrank)),  # training and eval write to same file?
                  allow_early_resets=True)
    env = ClipActionsWrapper(env)
    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)
    return env
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)

    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))

    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env, model=flat_env, ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000,
                                        use_restricted_actions=retro.Actions.DISCRETE,
                                        state=gamestate)
    else:
        env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)

    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
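# Usage sketch for make_env above (an assumption, not part of the original
# snippet): env_type='robotics' routes to the plain gym.make branch, the Dict
# observation is flattened, and logger_dir=None disables Monitor file output.
env = make_env('FetchReach-v1', env_type='robotics', seed=0)
obs = env.reset()
print(obs.shape)  # flat vector: observation + achieved_goal + desired_goal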
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    # env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    keys = ['observation', 'desired_goal']

    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)

    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success',))
    env.seed(seed)
    return env
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0,
             gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None,
             env_kwargs=None, logger_dir=None, initializer=None):
    """
    Make environment

    Args:
        env_id: (str) environment id e.g. 'Reacher-v2'
        env_type: (str) environment type e.g. 'atari'
        mpi_rank: (int) rank for mpi; default=0 (disabled on windows for lack of MPI support from pytorch)
        subrank: (int) subrank; default=0 (disabled on windows for lack of MPI support from pytorch)
        seed: (int) random seed
        reward_scale: (float) scale factor for reward; default=1.0
        gamestate: (??) game state to load (for retro games only)
        flatten_dict_observations: (bool) whether to flatten Dict observation spaces into a single array
        wrapper_kwargs: (dict) dictionary of parameter settings for wrapper
        env_kwargs: (dict) dictionary of parameter settings for environment
        logger_dir: (str) logger path
        initializer: (callable) optional initializer called with mpi_rank and subrank

    Returns:
        env: (Env) the set-up environment
    """
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}

    if ':' in env_id:
        raise ValueError(
            "env_id {} does not conform to accepted format!".format(env_id))

    if env_type == 'atari':
        # make atari environments with a wrapper function
        env = make_atari(env_id)
    elif env_type == 'retro':
        raise ValueError("retro environments not supported yet!")
    else:
        # make a gym environment with parameter settings
        env = gym.make(env_id, **env_kwargs)

    # flatten the observation space
    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    # add seed to env
    env.seed(seed + subrank if seed is not None else None)

    # set up Monitor (TBD)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        # wrap retro games
        env = wrappers_retro.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        # if action_space is Box type, clip the action values to be within the box's boundaries
        env = wrappers.ClipActionsWrapper(env)

    if reward_scale != 1:
        # if reward scaling factor is used, scale the rewards accordingly
        # very important feature for PPO
        env = wrappers.RewardScalerWrapper(env, reward_scale)

    return env
def test_nested_dicts_ravel(self, observation_space, flat_shape):
    env = FakeEnvironment(observation_space=observation_space)
    wrapped_env = FlattenObservation(FilterObservation(env, env.obs_keys))
    obs = wrapped_env.reset()
    assert obs.shape == wrapped_env.observation_space.shape
    solver_kwargs={},
    # Define and parameterize the reference generator for the current reference
    reference_generator=WienerProcessReferenceGenerator(
        reference_state='i', sigma_range=(3e-3, 3e-2)),
    # Defines which variables to plot via the builtin dashboard monitor
    visualization=MotorDashboard(state_plots=['i', 'omega']),
)

# Now, the environment will output states and references separately
state, ref = env.reset()

# For data processing we sometimes want to flatten the env output,
# which means that the env will only output one array that contains states and references consecutively
env = FlattenObservation(env)
obs = env.reset()

# Read the number of possible actions for the given env;
# this allows us to define a proper learning agent for this task
nb_actions = env.action_space.n

window_length = 1

# Define an artificial neural network to be used within the agent
model = Sequential()
# The network's input fits the observation space of the env
model.add(
    Flatten(input_shape=(window_length,) + env.observation_space.shape))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
    visualization=MotorDashboard(visu_period=0.5,
                                 plotted_variables=['omega', 'i', 'u']),
    converter='Disc-4QC',
    # Take standard class and pass parameters (Load)
    a=0, b=.1, c=1.1, j_load=0.4,
    # Pass a string (with extra parameters)
    ode_solver='euler', solver_kwargs={},
    # Pass a Class with extra parameters
    reference_generator=WienerProcessReferenceGenerator(
        reference_state='i', sigma_range=(5e-3, 5e-1)))

nb_actions = env.action_space.n
env = FlattenObservation(env)

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(4))
model.add(LeakyReLU(alpha=0.05))
model.add(Dense(4))
model.add(LeakyReLU(alpha=0.05))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=15000, window_length=1)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=0.5), 'eps', 0.5, 0.01, 0, 20000)
dqn = DQNAgent(model=model, policy=policy, nb_actions=nb_actions,
def env_fn():
    env = gym.make(ENV)
    print(env.observation_space)
    env = FlattenObservation(env)
    return env
import multiprocessing
import os
import pickle
import random
import time

import gym
import neat
import numpy as np
from gym.wrappers import FlattenObservation, FilterObservation

import visualize

NUM_CORES = 1

env = gym.make('FetchReach-v1')
env.env.reward_type = 'dense'
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))

print("action space: {0!r}".format(env.action_space))
print("observation space: {0!r}".format(env.observation_space))

env = gym.wrappers.Monitor(env, 'results', force=True)


class RoboGenome(neat.DefaultGenome):
    def __init__(self, key):
        super().__init__(key)
        self.discount = None

    def configure_new(self, config):
        super().configure_new(config)
        self.discount = 0.01 + 0.98 * random.random()
def load(environment_name,
         env_id=None,
         concat_desired_goal=True,
         discount=1.0,
         max_episode_steps=None,
         sparse_reward=False,
         use_success_wrapper=True,
         gym_env_wrappers=(),
         alf_env_wrappers=(),
         wrap_with_process=False):
    """Loads the selected environment and wraps it with the specified wrappers.

    Note that by default a ``TimeLimit`` wrapper is used to limit episode lengths
    to the default benchmarks defined by the registered environments.

    Args:
        environment_name: Name for the environment to load.
        env_id: A scalar ``Tensor`` of the environment ID of the time step.
        discount: Discount to use for the environment.
        max_episode_steps: If None the ``max_episode_steps`` will be set to the
            default step limit defined in the environment's spec. No limit is
            applied if set to 0 or if there is no ``timestep_limit`` set in the
            environment's spec.
        sparse_reward (bool): If True, the game ends once the goal is achieved.
            Rewards will be added by 1, changed from -1/0 to 0/1.
        use_success_wrapper (bool): If True, wraps the environment with the
            SuccessWrapper which will record Success info after a specified
            amount of timesteps.
        gym_env_wrappers: Iterable with references to wrapper classes to use
            directly on the gym environment.
        alf_env_wrappers: Iterable with references to wrapper classes to use on
            the torch environment.

    Returns:
        An AlfEnvironment instance.
    """
    assert (environment_name.startswith("Fetch")
            or environment_name.startswith("HandManipulate")), (
                "This suite only supports OpenAI's Fetch and ShadowHand envs!")

    _unwrapped_env_checker_.check_and_update(wrap_with_process)

    gym_spec = gym.spec(environment_name)
    env = gym_spec.make()

    if max_episode_steps is None:
        if gym_spec.max_episode_steps is not None:
            max_episode_steps = gym_spec.max_episode_steps
        else:
            max_episode_steps = 0

    def env_ctor(env_id=None):
        return suite_gym.wrap_env(
            env,
            env_id=env_id,
            discount=discount,
            max_episode_steps=max_episode_steps,
            gym_env_wrappers=gym_env_wrappers,
            alf_env_wrappers=alf_env_wrappers,
            image_channel_first=False)

    # concat robot's observation and the goal location
    if concat_desired_goal:
        keys = ["observation", "desired_goal"]
        try:  # for modern Gym (>=0.15.4)
            from gym.wrappers import FilterObservation, FlattenObservation
            env = FlattenObservation(FilterObservation(env, keys))
        except ImportError:  # for older gym (<=0.15.3)
            from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
            env = FlattenDictWrapper(env, keys)

    if use_success_wrapper:
        env = SuccessWrapper(env, max_episode_steps)
    env = ObservationClipWrapper(env)
    if sparse_reward:
        env = SparseReward(env)

    if wrap_with_process:
        process_env = process_environment.ProcessEnvironment(
            functools.partial(env_ctor))
        process_env.start()
        torch_env = alf_wrappers.AlfEnvironmentBaseWrapper(process_env)
    else:
        torch_env = env_ctor(env_id=env_id)
    return torch_env
def train(self):
    """Method for training the Network"""
    for epoch in range(self.n_epochs):
        for episode in range(self.n_episodes):
            done = False
            score = 0
            episode_experience = []

            # Reset the environment to its initial state. The goal env returns
            # a dict observation with 'observation', 'achieved_goal' and
            # 'desired_goal' keys, which the code below indexes directly.
            observation = self.env.reset()

            # Because we are not working with a continuous action space,
            # we are limiting ourselves to a finite number of timesteps
            # per episode, otherwise the below for loop would be replaced
            # with `while not done:`
            for _ in range(self.n_time_steps):
                self.env.render()
                action = self.act(observation['observation'])
                print(action)
                new_observation, reward, done, info = self.env.step(action)
                score += reward
                episode_experience.append(
                    (observation['observation'], action, reward,
                     new_observation['observation'], done))
                self.save(np.asarray(observation['observation']), action, reward,
                          new_observation['observation'], done)
                observation = new_observation
                self.learn()
                # break if we finish the environment
                if done is True:
                    break

            # HER Algorithm
            for t in range(len(episode_experience)):
                for _ in range(self.K):
                    future = np.random.randint(t, len(episode_experience))
                    goal = episode_experience[future][3]
                    state = episode_experience[t][0]
                    action = episode_experience[t][1]
                    next_state = episode_experience[t][3]
                    done = np.array_equal(next_state, goal)
                    reward = 0 if done else -1
                    self.save(state, action, reward, next_state, done)

        # save model every 10 epochs
        # this is an arbitrary number and will change
        if epoch % 10 == 0 and epoch > 0:
            self.save_model()