def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Population-based runs allocate two workers per population member
    if "num_population" in args.__dict__:
        args.num_cpu = args.num_population * 2
    assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
        "Error: cannot have more than 1 CPU for the environment {}".format(args.env)

    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        # The SRL model runs in its own process; the envs query it through a pipe
        srl_model = MultiprocessSRLModel(args.num_cpu, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    # NOTE: the makeEnv called below is the module-level env factory, not this
    # classmethod (class scope does not enclose method bodies, so the name
    # resolves to the imported function)
    envs = [makeEnv(args.env, args.seed, i, args.log_dir, allow_early_resets=True, env_kwargs=env_kwargs)
            for i in range(args.num_cpu)]
    envs = SubprocVecEnv(envs)
    envs = VecFrameStack(envs, args.num_stack)

    if args.srl_model != "raw_pixels" and args.algo_type == "v2":
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
        envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)
    return envs
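# --- Usage sketch (illustrative only, not part of the repo) ---
# A minimal sketch of how this classmethod could be driven. The namespace below
# is a hypothetical stand-in for the project's parsed argparse arguments (its
# fields mirror the attributes makeEnv() actually reads), and `PPO2Model` is an
# assumed name for the class that owns the method.
from types import SimpleNamespace

args = SimpleNamespace(
    env="OmnirobotEnv-v0",     # any id present in registered_env
    seed=0,
    num_cpu=4,
    num_stack=1,
    log_dir="logs/",
    srl_model="ground_truth",
    algo_type="v2",
)
envs = PPO2Model.makeEnv(args, env_kwargs={"use_srl": False})  # PPO2Model is hypothetical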
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Even though DeepQ is single core only, we still need the pipe system,
    # since the SRL model runs in a separate process
    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    env = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])

    if args.srl_model != "raw_pixels":
        # Normalize learned/ground-truth states, but never the reward
        env = VecNormalize(env, norm_reward=False)
        env = loadRunningAverage(env, load_path_normalise=load_path_normalise)
    return env
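# --- Sketch of the pipe pattern MultiprocessSRLModel relies on (assumed behaviour) ---
# The SRL network lives in one dedicated process; each env worker sends raw
# observations through a pipe and blocks until the encoded state comes back.
# This is a self-contained toy version of that request/response loop; the real
# class additionally handles model loading and multiple client queues.
import multiprocessing as mp
import numpy as np

def srl_worker(conn):
    """Toy SRL server: 'encodes' each observation by flattening and truncating it."""
    while True:
        obs = conn.recv()
        if obs is None:                        # sentinel: shut down
            break
        state = np.asarray(obs).ravel()[:2]    # stand-in for model.encode(obs)
        conn.send(state)

if __name__ == "__main__":
    parent_conn, child_conn = mp.Pipe()
    proc = mp.Process(target=srl_worker, args=(child_conn,))
    proc.start()
    parent_conn.send(np.zeros((4, 4)))   # env side: submit an observation
    print(parent_conn.recv())            # receive the encoded state
    parent_conn.send(None)
    proc.join()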
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Even though SAC is single core only, we still need the pipe system,
    # since the SRL model runs in a separate process
    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    env = CustomDummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])

    if args.srl_model != "raw_pixels":
        env = VecNormalize(env, norm_obs=True, norm_reward=False)
        env = loadRunningAverage(env, load_path_normalise=load_path_normalise)

    # Normalize frames only when working on raw pixels
    # WARNING: when using framestacking, the memory used by the replay buffer can grow quickly
    return WrapFrameStack(env, args.num_stack, normalize=args.srl_model == "raw_pixels")
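# --- Back-of-the-envelope check for the framestacking warning above ---
# A rough estimate of the replay-buffer footprint when stacking raw frames;
# the 224x224x3 resolution and 500k buffer size are assumptions chosen purely
# for illustration, not values taken from the repo.
def replay_buffer_bytes(buffer_size, n_stack, height, width, channels, bytes_per_px=1):
    # each transition stores obs and next_obs, hence the factor 2
    return 2 * buffer_size * n_stack * height * width * channels * bytes_per_px

print(replay_buffer_bytes(500_000, 4, 224, 224, 3) / 1e9, "GB")  # ~600 GB: why the warning matters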
def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
    # Even though DeepQ is single core only, we need to use the pipe system to work
    if env_kwargs is not None and env_kwargs.get("use_srl", False):
        srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
        env_kwargs["state_dim"] = srl_model.state_dim
        env_kwargs["srl_pipe"] = srl_model.pipe

    envs = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])
    envs = VecFrameStack(envs, args.num_stack)

    if args.srl_model != "raw_pixels":
        printYellow("Using MLP policy because working on state representation")
        envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
        envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)
    return envs
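# --- Sketch of the running-average normalisation behind VecNormalize ---
# A self-contained version of the standard parallel mean/variance update that
# VecNormalize applies to observations; loadRunningAverage is assumed to
# restore previously saved statistics of exactly this kind. This mirrors the
# common RunningMeanStd implementation, not the repo's own code.
import numpy as np

class RunningMeanStd:
    def __init__(self, shape, epsilon=1e-4):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = epsilon

    def update(self, batch):
        # Chan et al. parallel update: merge batch statistics into the running ones
        batch_mean, batch_var, n = batch.mean(axis=0), batch.var(axis=0), batch.shape[0]
        delta = batch_mean - self.mean
        tot = self.count + n
        m2 = self.var * self.count + batch_var * n + delta ** 2 * self.count * n / tot
        self.mean = self.mean + delta * n / tot
        self.var = m2 / tot
        self.count = tot

    def normalize(self, obs, clip=10.0):
        return np.clip((obs - self.mean) / np.sqrt(self.var + 1e-8), -clip, clip)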
def env_thread(args, thread_num, partition=True):
    """
    Run a session of an environment
    :param args: (argparse.Namespace) the parsed command-line arguments
    :param thread_num: (int) The thread ID of the environment session
    :param partition: (bool) If the output should be in multiple parts (default=True)
    """
    env_kwargs = {
        "max_distance": args.max_distance,
        "random_target": args.random_target,
        "force_down": True,
        "is_discrete": not args.continuous_actions,
        "renders": thread_num == 0 and args.display,
        "record_data": not args.no_record_data,
        "multi_view": args.multi_view,
        "save_path": args.save_path,
        "shape_reward": args.shape_reward,
        "simple_continual_target": args.simple_continual,
        "circular_continual_move": args.circular_continual,
        "square_continual_move": args.square_continual,
        "short_episodes": args.short_episodes
    }

    if partition:
        env_kwargs["name"] = args.name + "_part-" + str(thread_num)
    else:
        env_kwargs["name"] = args.name

    load_path, train_args, algo_name, algo_class = None, None, None, None
    model = None
    srl_model = None
    srl_state_dim = 0
    generated_obs = None
    env_norm = None

    if args.run_policy in ["walker", "custom"]:
        if args.latest:
            args.log_dir = latestPath(args.log_custom_policy)
        else:
            args.log_dir = args.log_custom_policy
        args.render = args.display
        args.plotting, args.action_proba = False, False

        train_args, load_path, algo_name, algo_class, _, env_kwargs_extra = loadConfigAndSetup(args)
        env_kwargs["srl_model"] = env_kwargs_extra["srl_model"]
        env_kwargs["random_target"] = env_kwargs_extra.get("random_target", False)
        env_kwargs["use_srl"] = env_kwargs_extra.get("use_srl", False)

        # TODO REFACTOR
        env_kwargs["simple_continual_target"] = env_kwargs_extra.get("simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_kwargs_extra.get("circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_kwargs_extra.get("square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_kwargs_extra.get("eight_continual_move", False)

        eps = 0.2
        env_kwargs["state_init_override"] = np.array([MIN_X + eps, MAX_X - eps]) \
            if args.run_policy == 'walker' else None

        if env_kwargs["use_srl"]:
            env_kwargs["srl_model_path"] = env_kwargs_extra.get("srl_model_path", None)
            env_kwargs["state_dim"] = getSRLDim(env_kwargs_extra.get("srl_model_path", None))
            srl_model = MultiprocessSRLModel(num_cpu=args.num_cpu, env_id=args.env, env_kwargs=env_kwargs)
            env_kwargs["srl_pipe"] = srl_model.pipe

    env_class = registered_env[args.env][0]
    env = env_class(**env_kwargs)

    if env_kwargs.get('srl_model', None) not in ["raw_pixels", None]:
        # TODO: Remove env duplication
        # This is a dirty trick to normalize the obs: since we override the SRL
        # environment functions (step, reset) for on-policy generation &
        # generative replay, using stable-baselines' normalisation wrappers
        # (step & reset) directly breaks
        env_norm = [makeEnv(args.env, args.seed, i, args.log_dir, allow_early_resets=False, env_kwargs=env_kwargs)
                    for i in range(args.num_cpu)]
        env_norm = DummyVecEnv(env_norm)
        env_norm = VecNormalize(env_norm, norm_obs=True, norm_reward=False)
        env_norm = loadRunningAverage(env_norm, load_path_normalise=args.log_custom_policy)

    using_real_omnibot = args.env == "OmnirobotEnv-v0" and USING_OMNIROBOT

    walker_path = None
    action_walker = None
    state_init_for_walker = None
    kwargs_reset, kwargs_step = {}, {}

    if args.run_policy in ['custom', 'ppo2', 'walker']:
        # Additional env when using a trained agent to generate data
        train_env = vecEnv(env_kwargs, env_class)

        if args.run_policy == 'ppo2':
            model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps)
        else:
            _, _, algo_args = createEnv(args, train_args, algo_name, algo_class, env_kwargs)
            tf.reset_default_graph()
            set_global_seeds(args.seed % 2 ** 32)  # was `% 2 ^ 32`: ^ is XOR in Python, not exponentiation
            printYellow("Compiling Policy function....")
            model = algo_class.load(load_path, args=algo_args)
            if args.run_policy == 'walker':
                walker_path = walkerPath()

    if len(args.replay_generative_model) > 0:
        srl_model = loadSRLModel(args.log_generative_model, th.cuda.is_available())
        srl_state_dim = srl_model.state_dim
        srl_model = srl_model.model.model

    frames = 0
    start_time = time.time()

    # divide episodes evenly across workers; the first (num_episode % num_cpu)
    # threads each run one extra episode so the total count is exact
    for i_episode in range(args.num_episode // args.num_cpu + 1 * (args.num_episode % args.num_cpu > thread_num)):
        # seed = base seed + position in this slice + size of slice (plus remainder if partitions are uneven)
        seed = args.seed + i_episode + args.num_episode // args.num_cpu * thread_num + \
            (thread_num if thread_num <= args.num_episode % args.num_cpu else args.num_episode % args.num_cpu)
        seed = seed % 2 ** 32  # numpy seeds must fit in 32 bits (was `% 2 ^ 32`, i.e. an XOR by mistake)

        if args.run_policy not in ['custom', 'walker']:
            env.seed(seed)
            env.action_space.seed(seed)  # this is for the sample() function from gym.space

        if len(args.replay_generative_model) > 0:
            sample = Variable(th.randn(1, srl_state_dim))
            if th.cuda.is_available():
                sample = sample.cuda()
            generated_obs = srl_model.decode(sample)
            generated_obs = generated_obs[0].detach().cpu().numpy()
            generated_obs = deNormalize(generated_obs)

        kwargs_reset['generated_observation'] = generated_obs
        obs = env.reset(**kwargs_reset)
        done = False
        action_proba = None
        t = 0
        episode_toward_target_on = False

        while not done:
            env.render()

            # Policy to run on the fly - to be trained before generation
            if args.run_policy == 'ppo2':
                action, _ = model.predict([obs])
            # Custom pre-trained policy (SRL or end-to-end)
            elif args.run_policy in ['custom', 'walker']:
                # normalize with the duplicated env's running statistics (see "dirty trick" above)
                obs = env_norm._normalize_observation(obs)
                action = [model.getAction(obs, done)]
                action_proba = model.getActionProba(obs, done)
                if args.run_policy == 'walker':
                    action_walker = np.array(walker_path[t])
            # Random policy
            else:
                # Using a target reaching policy (untrained, from camera) when collecting data from real OmniRobot
                if episode_toward_target_on and np.random.rand() < args.toward_target_timesteps_proportion and \
                        using_real_omnibot:
                    action = [env.actionPolicyTowardTarget()]
                else:
                    action = [env.action_space.sample()]

            # Generative replay +/- for on-policy action
            if len(args.replay_generative_model) > 0:
                if args.run_policy == 'custom':
                    obs = obs.reshape(1, srl_state_dim)
                    obs = th.from_numpy(obs.astype(np.float32)).cuda()
                    z = obs
                    generated_obs = srl_model.decode(z)
                else:
                    sample = Variable(th.randn(1, srl_state_dim))
                    if th.cuda.is_available():
                        sample = sample.cuda()
                    generated_obs = srl_model.decode(sample)
                generated_obs = generated_obs[0].detach().cpu().numpy()
                generated_obs = deNormalize(generated_obs)

            action_to_step = action[0]
            kwargs_step = {k: v for (k, v) in [("generated_observation", generated_obs),
                                               ("action_proba", action_proba),
                                               ("action_grid_walker", action_walker)] if v is not None}

            obs, _, done, _ = env.step(action_to_step, **kwargs_step)

            frames += 1
            t += 1
            if done:
                # occasionally switch the next episode to the target-reaching policy on the real robot
                episode_toward_target_on = np.random.rand() < args.toward_target_timesteps_proportion \
                    and using_real_omnibot
                print("Episode finished after {} timesteps".format(t + 1))

    if thread_num == 0:
        print("{:.2f} FPS".format(frames * args.num_cpu / (time.time() - start_time)))
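# --- Sanity check for the per-episode seeding scheme above (illustrative) ---
# Reproduces the seed arithmetic outside the env loop to show that every
# (thread, episode) pair receives a distinct seed; the concrete numbers below
# are arbitrary test values, not defaults from the repo.
def episode_seeds(base_seed, num_episode, num_cpu):
    seeds = []
    for thread_num in range(num_cpu):
        n_episodes = num_episode // num_cpu + 1 * (num_episode % num_cpu > thread_num)
        for i_episode in range(n_episodes):
            seed = base_seed + i_episode + num_episode // num_cpu * thread_num + \
                (thread_num if thread_num <= num_episode % num_cpu else num_episode % num_cpu)
            seeds.append(seed % 2 ** 32)
    return seeds

seeds = episode_seeds(base_seed=0, num_episode=10, num_cpu=4)
assert len(seeds) == 10 and len(set(seeds)) == 10  # one unique seed per episode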