def train(hours):
    conn = Connection()
    env = Monitor(SpireEnv(conn), "./tmp/")
    env.reset()
    logdir = "./tboard_log"
    try:
        model = MODEL_CLASS.load(MODEL_NAME, env=env, tensorboard_log=logdir)
    except FileNotFoundError:
        model = MODEL_CLASS(MlpPolicy, env, tensorboard_log=logdir, **KWARGS)
    start = time.time()
    steps_per_hour = 7000
    steps = steps_per_hour * hours
    callback = TensorboardCallback(env)
    model.learn(total_timesteps=steps, reset_num_timesteps=False, callback=callback)
    model.save(MODEL_NAME)
    elapsed = time.time() - start
    print(f"{steps} steps processed")
    print(f"{timedelta(seconds=elapsed)} time elapsed")
    print(f"{env.total_floors} floors climbed")
    print(f"{env.total_games} games played")
    if env.total_games > 0:
        print("{:.2f} floors per game".format(env.total_floors / env.total_games))
def train():
    best_reward, best_reward_timesteps = None, None
    save_path = "model_save/" + MODEL_PATH + "/"
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
    # log_dir = f"model_save/"
    log_dir = save_path
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                    clip_obs=10.)
    if PARAM['algo'] == 'td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                     seed=PARAM['seed'], learning_starts=PARAM['learning_starts'])
    elif PARAM['algo'] == 'ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'],
                    seed=PARAM['seed'])
    eval_callback = EvalCallback(env_eval,
                                 best_model_save_path=save_path + MODEL_PATH + '_best_model',
                                 log_path=log_dir,
                                 eval_freq=PARAM['eval_freq'],
                                 save_freq=PARAM['save_freq'],
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=int(PARAM['total_time_step']),
                callback=eval_callback,
                log_interval=500)
    print("best mean reward:", eval_callback.best_mean_reward_overall,
          "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path + MODEL_PATH + '_final_timesteps')
def make_envs(env_id, log_dir, gamma, max_train_ep_length, max_eval_ep_length, seed):
    """Make training and evaluation environments (vectorized envs)."""
    # Training env
    train_env = gym.make(env_id)
    train_env.seed(seed)  # Set random seed
    train_env = TimeLimitWrapper(train_env, max_train_ep_length)  # Limit length of training episodes
    train_env = Monitor(train_env, log_dir)  # Monitor training
    train_env = NormalizeActionWrapper(train_env)  # Normalize action space
    train_env = DummyVecEnv([lambda: train_env])  # Vectorize environment
    train_env = VecNormalize(train_env, gamma=gamma)  # Normalise observations and rewards

    # Eval env
    eval_env = gym.make(env_id)
    eval_env.seed(seed)  # Set random seed
    eval_env = TimeLimitWrapper(eval_env, max_eval_ep_length)  # Set a maximum number of timesteps during eval
    eval_env = Monitor(eval_env)  # Used to ensure original action space is not modified by `NormalizeActionWrapper`
    eval_env = NormalizeActionWrapper(eval_env)  # Normalize action space
    eval_env = DummyVecEnv([lambda: eval_env])  # Vectorize environment
    eval_env = VecNormalize(eval_env, gamma=gamma, training=False, norm_reward=False)  # Normalise observations
    # (obs/reward normalization gets synchronised with `train_env` in `EvalCallback`)

    return train_env, eval_env
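# Usage sketch (added for illustration, not part of the original source): how the envs returned by
# `make_envs` above are typically consumed. stable-baselines3's `EvalCallback` synchronises the
# VecNormalize statistics from the training env into the eval env before each evaluation, which is
# why `eval_env` is created with `training=False, norm_reward=False`. The env id and hyperparameters
# below are assumptions; `make_envs` and its wrappers are assumed to be defined in this module.
def _example_make_envs_usage():
    from stable_baselines3 import PPO
    from stable_baselines3.common.callbacks import EvalCallback

    train_env, eval_env = make_envs("Pendulum-v0", log_dir="./logs/", gamma=0.99,
                                    max_train_ep_length=200, max_eval_ep_length=200, seed=0)
    model = PPO("MlpPolicy", train_env, verbose=0)
    # Evaluate on eval_env every 1000 steps; normalization stats are synced by the callback
    model.learn(total_timesteps=10_000, callback=EvalCallback(eval_env, eval_freq=1_000))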
def _init_envs(image_observations, num_skip_steps, opponent_pred_obs, adversarial_training):
    """
    Initialize the environments with the necessary wrappers for training.
    Wrappers are determined by settings in the arguments.
    """
    # In order to ensure symmetry for the agent when playing on either side,
    # change the second agent to red, so both have the same color
    if image_observations:
        pong_duel.AGENT_COLORS[1] = 'red'
        # Initialize environment
        train_env = gym.make('PongDuel-v0')
        train_env = RewardZeroToNegativeBiAgentWrapper(train_env)
        train_env_rule_based = ObservationVectorToImage(train_env, 'p1')
        train_env_rule_based = MAGymCompatibilityWrapper(train_env_rule_based,
                                                         num_skip_steps=num_skip_steps,
                                                         image_observations='main')
        if adversarial_training is not None:
            train_env_rule_based = AdversarialTrainingWrapper(train_env_rule_based,
                                                              adversarial_probability=adversarial_training,
                                                              img_obs=image_observations)
        train_env_rule_based = Monitor(train_env_rule_based)
        train_env = ObservationVectorToImage(train_env, 'both')
        train_env = MAGymCompatibilityWrapper(train_env, num_skip_steps=num_skip_steps,
                                              image_observations='both')
        if adversarial_training is not None:
            train_env = AdversarialTrainingWrapper(train_env,
                                                   adversarial_probability=adversarial_training,
                                                   img_obs=image_observations)
        train_env = Monitor(train_env)
        eval_env_rule_based = gym.make('PongDuel-v0')
        eval_env_rule_based = ObservationVectorToImage(eval_env_rule_based, 'p1')
        eval_env_rule_based = MAGymCompatibilityWrapper(eval_env_rule_based,
                                                        num_skip_steps=num_skip_steps,
                                                        image_observations='main')
        eval_op = SimpleRuleBasedAgent(eval_env_rule_based)
        eval_env_rule_based.set_opponent(eval_op)
        eval_env = gym.make('PongDuel-v0')
        eval_env = ObservationVectorToImage(eval_env, 'both')
        eval_env = MAGymCompatibilityWrapper(eval_env, num_skip_steps=num_skip_steps,
                                             image_observations='both')
    else:
        # Init for feature observations
        train_env = gym.make('PongDuel-v0')
        train_env = ObserveOpponent(train_env, 'both')
        train_env = RewardZeroToNegativeBiAgentWrapper(train_env)
        train_env = MAGymCompatibilityWrapper(train_env, num_skip_steps=num_skip_steps,
                                              image_observations='none')
        if opponent_pred_obs:
            train_env = OpponentPredictionObs(train_env)
        if adversarial_training is not None:
            train_env = AdversarialTrainingWrapper(train_env,
                                                   adversarial_probability=adversarial_training,
                                                   img_obs=image_observations)
        train_env = Monitor(train_env)
        eval_env = gym.make('PongDuel-v0')
        eval_env = ObserveOpponent(eval_env, 'both')
        eval_env = MAGymCompatibilityWrapper(eval_env, num_skip_steps=num_skip_steps,
                                             image_observations='none')
        # For feature observations we don't need separate environments
        # for rule-based and non-rule-based agents
        train_env_rule_based = train_env
        eval_env_rule_based = eval_env
        eval_op = SimpleRuleBasedAgent(eval_env_rule_based)
        eval_env_rule_based.set_opponent(eval_op)
    return eval_env, eval_env_rule_based, eval_op, train_env, train_env_rule_based
def _init() -> gym.Env:
    env = gym.make(env_id)
    # Create folder if needed
    if log_dir is not None:
        os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir)
    env.seed(seed + rank)
    return env
def train_ppo(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_ppo_{itr}")
    obs = env.reset()
    model = PPO("CnnPolicy",
                env,
                verbose=1,
                learning_rate=1e-5,
                tensorboard_log=f"./ppo_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"ppo_flappy_{itr}")
def update_env(self, env, support_multi_env: bool = False, eval_env: Optional[GymEnv] = None,
               monitor_wrapper: bool = True, reset_optimizers: bool = False, **kwargs):
    """
    Replace the current env with a new env.

    :param env: Gym environment (activated, not a string).
    :param support_multi_env: Whether the algorithm supports training with multiple environments (as in A2C)
    :param eval_env: Environment to use for evaluation (optional).
    :param monitor_wrapper: When creating an environment, whether to wrap it or not in a Monitor wrapper.
    :param reset_optimizers: Whether to reset optimizers (momentums, etc.).
    :param kwargs: Does nothing; accepted only so extra arguments can be passed without the method failing.
    :return:
    """
    if reset_optimizers:
        optimizers = []
        if self.actor is not None:
            optimizers.append(self.actor.optimizer)
        if self.critic is not None:
            optimizers.append(self.critic.optimizer)
        if self.ent_coef_optimizer is not None:
            optimizers.append(self.ent_coef_optimizer)
        # Reset optimizers:
        for i_optimizer, optimizer in enumerate(optimizers):
            optimizer.__init__(optimizer.param_groups[0]['params'])
            optimizers[i_optimizer] = optimizer
    if env is not None:
        if eval_env is not None:
            self.eval_env = eval_env
            if monitor_wrapper:
                self.eval_env = Monitor(self.eval_env, filename=None)
        if monitor_wrapper:
            env = Monitor(env, filename=None)
        env = self._wrap_env(env, self.verbose)
        self.observation_space = env.observation_space
        self.action_space = env.action_space
        self.n_envs = env.num_envs
        self.env = env
        if not support_multi_env and self.n_envs > 1:
            raise ValueError(
                "Error: the model does not support multiple envs; it requires "
                "a single vectorized environment.")
def train_dqn(itr=0, timesteps=1e7, use_dummy_video=True):
    env = flappy_env.FlappyEnv(use_dummy_video)
    env = Monitor(env, f"flappy_dqn_{itr}")
    obs = env.reset()
    model = DQN("CnnPolicy",
                env,
                verbose=1,
                optimize_memory_usage=True,
                buffer_size=500000,
                learning_rate=1e-5,
                tensorboard_log=f"./dqn_flappy_tensorboard_{itr}/")
    model.learn(total_timesteps=timesteps)
    model.save(f"dqn_flappy_{itr}")
def train_policy_ppo(path='policy_ppo', org_path='prob_ppo'):
    """
    Train against an environment whose opponent uses a previously trained policy.

    Args:
        path      Path for saving the trained model file.
        org_path  Path of the trained model file loaded as the opponent's source policy.
    """
    print(f'train ppo with prob_player path={path}, org_path={org_path}')
    # Load the trained model file
    model = PPO.load(org_path)
    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(AIPlayer(model))
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    # Set the environment on the model
    model.set_env(env)
    # Run training
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - elapsed}sec')
    # Save the trained model
    model.save(path)
    # Close the rock-paper-scissors environment
    env.close()
def make_experiment_env(params, train):
    clear = MortalKombat2.make_mortal_kombat2_env(difficulties=params["difficulties"],
                                                  arenas=params["arenas"],
                                                  left_players=params["left_players"],
                                                  right_players=params["right_players"],
                                                  controllable_players=params["controllable_players"],
                                                  actions=params["actions"],
                                                  state_versions=params["state_versions"])
    env = FrameskipWrapper(clear, skip=params["frameskip"])
    if params["max_episode_length"]:
        env = MaxEpLenWrapper(env, max_len=params["max_episode_length"] // params["frameskip"])
    env = WarpFrame(env, 48, 48)
    if train:
        env = Monitor(env, info_keywords=("P1_rounds", "P2_rounds", "P1_health", "P2_health",
                                          "steps", "difficulty", "arena", "P1", "P2",
                                          "state_version"))
        return env
    else:
        return clear, env, env
def train_pa_ppo(path='pa_ppo'):
    """
    Train against an environment whose opponent plays each hand with a probability of 1/3.

    Args:
        path  Path for saving the trained model file.

    Returns:
        None
    """
    print(f'train ppo with jurina_player path={path}')
    # Build the rock-paper-scissors environment
    env = RockPaperScissorsEnv(JurinaPlayer())
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])
    # Initialize the PPO model
    model = PPO('MlpPolicy', env, verbose=1)
    # Run training
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapsed time: {time.time() - elapsed}sec')
    # Save the trained model
    model.save(path)
    # Close the rock-paper-scissors environment
    env.close()
def _wrap_env(env: GymEnv, verbose: int = 0, monitor_wrapper: bool = True) -> VecEnv:
    """
    Wrap environment with the appropriate wrappers if needed.
    For instance, to have a vectorized environment or to re-order the image channels.

    :param env:
    :param verbose:
    :param monitor_wrapper: Whether to wrap the env in a ``Monitor`` when possible.
    :return: The wrapped environment.
    """
    if not isinstance(env, VecEnv):
        if not is_wrapped(env, Monitor) and monitor_wrapper:
            if verbose >= 1:
                print("Wrapping the env with a `Monitor` wrapper")
            env = Monitor(env)
        if verbose >= 1:
            print("Wrapping the env in a DummyVecEnv.")
        env = DummyVecEnv([lambda: env])
    if (is_image_space(env.observation_space)
            and not is_vecenv_wrapped(env, VecTransposeImage)
            and not is_image_space_channels_first(env.observation_space)):
        if verbose >= 1:
            print("Wrapping the env in a VecTransposeImage.")
        env = VecTransposeImage(env)
    # Check if wrapper for dict support is needed when using HER
    if isinstance(env.observation_space, gym.spaces.dict.Dict):
        env = ObsDictWrapper(env)
    return env
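# Minimal sketch (illustrative, not from the original source): for a plain, non-image Gym env,
# `_wrap_env` above effectively reduces to wrapping with `Monitor` and then `DummyVecEnv`.
# The env id here is an assumption.
def _example_wrap_env_equivalent():
    import gym
    from stable_baselines3.common.monitor import Monitor
    from stable_baselines3.common.vec_env import DummyVecEnv

    raw_env = gym.make("CartPole-v1")
    # Same result as _wrap_env(raw_env) for a non-image, non-dict observation space
    vec_env = DummyVecEnv([lambda: Monitor(raw_env)])
    return vec_env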
def main(config: str, agent: str):
    with open(config) as fp:
        json_data = json.load(fp)
        config = GameConfig.deserialize(json_data)

    log_dir = config.agents_config[agent]["save_path"]

    # if agent == "DQN":
    #     env = make_atari_env(config.game_name, n_envs=1,
    #                          seed=0, monitor_dir=log_dir)
    # elif agent == "PPO":
    #     env = make_atari_env(config.game_name, n_envs=8,
    #                          seed=0, monitor_dir=log_dir)
    # else:
    #     env = make_atari_env(config.game_name, n_envs=16,
    #                          seed=0, monitor_dir=log_dir)
    env = gym_super_mario_bros.make(config.game_name)
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = Monitor(env, log_dir)
    # env = VecFrameStack(env, n_stack=4)

    agent = AgentLoader.get_agent(agent, config.agents_config, env)
def test_her(model_class, online_sampling, image_obs_space):
    """
    Test Hindsight Experience Replay.
    """
    n_bits = 4
    env = BitFlippingEnv(
        n_bits=n_bits,
        continuous=not (model_class == DQN),
        image_obs_space=image_obs_space,
    )
    model = model_class(
        "MultiInputPolicy",
        env,
        replay_buffer_class=HerReplayBuffer,
        replay_buffer_kwargs=dict(
            n_sampled_goal=2,
            goal_selection_strategy="future",
            online_sampling=online_sampling,
            max_episode_length=n_bits,
        ),
        train_freq=4,
        gradient_steps=1,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        buffer_size=int(2e4),
    )
    model.learn(total_timesteps=150)
    evaluate_policy(model, Monitor(env))
def Main():
    pp = pprint.PrettyPrinter(indent=4)

    # Make environment and wrap
    env = gym.make('ur5e_reacher-v1')
    env = Monitor(env, filename="logs", allow_early_resets=True)

    # ***define model***
    # hyperparams
    # n_actions = env.action_space.shape[-1]
    # action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    model_class = DDPG
    # kwargs are the parameters for DDPG model init
    kwargs = {"device": "cuda", "action_noise": NormalActionNoise}
    model = HER(
        'MlpPolicy',
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        verbose=1,
        learning_rate=0.005,
        online_sampling=True,
        # max_episode_steps=4800
        **kwargs)

    # Train model
    train = False
    if train:
        model.learn(2 * 10e5)
        model.save("./her_ur5e_model/model_")

    # Load model, not really necessary
    evaluate = True
def main(do_render: bool, seed: int, as_gdads: bool, name: str, do_train: bool):
    drop_abs_position = True

    conf: Conf = CONFS[name]
    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=True)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env)
    else:
        flat_env = flatten_env(dict_env, drop_abs_position)
    flat_env = TransformReward(flat_env, f=lambda r: r * conf.reward_scaling)
    flat_env = Monitor(flat_env)

    dict_env = get_env(name=name, drop_abs_position=drop_abs_position, is_training=False)
    if as_gdads:
        use_slider = False
        if use_slider:
            eval_env = SliderWrapper(env=dict_env)
        else:
            eval_env = GDADSEvalWrapper(dict_env, sw=BestSkillProvider(flat_env))
    else:
        eval_env = flatten_env(dict_env=dict_env, drop_abs_position=drop_abs_position)

    filename = f"modelsCommandSkills/{name}/asGDADS{as_gdads}/resamplingFalse_goalSpaceTrue-seed-{seed}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename + ".zip", env=flat_env)
        print(f"loaded model {filename}")
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=filename, buffer_size=conf.buffer_size,
                  batch_size=conf.batch_size, gamma=gamma(conf.ep_len),
                  learning_starts=100 * conf.ep_len,
                  policy_kwargs=dict(log_std_init=-3, net_arch=[conf.layer_size] * 2),
                  seed=seed, device="cuda", train_freq=4)

    if do_train:
        train(model=sac, conf=conf, save_fname=filename, eval_env=eval_env)
    if do_render:
        show(model=sac, env=eval_env, conf=conf)

    do_eval = not do_train and not do_render
    if do_eval:
        results = ant_grid_evaluation(model=sac, env=eval_env, episode_len=conf.ep_len)
        dump_ant_grid_evaluation(results)
def train():
    train_images, test_images = load_data("dataset")
    env = Monitor(
        PuzzleEnv(images=train_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  max_step_num=100,
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))
    policy_kwargs = dict(
        features_extractor_class=CustomCNN,
        features_extractor_kwargs=dict(features_dim=128),
    )
    model = PPO('CnnPolicy',
                env,
                policy_kwargs=policy_kwargs,
                verbose=1,
                learning_rate=0.0005,
                seed=42)
    model.learn(total_timesteps=1000000)
    test(model, test_images)
def new_test():
    processed = pd.read_csv(
        os.path.abspath('./me/datasets/new_data_with_techs_turb.csv'), index_col=0)
    train = data_split(processed, '2009-01-01', '2018-01-01')
    trade = data_split(processed, '2018-01-01', '2021-01-01')

    stock_dimension = len(train.tic.unique())
    state_space = 1 + 2 * stock_dimension + len(
        config.TECHNICAL_INDICATORS_LIST) * stock_dimension
    print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")

    env_kwargs = {
        "hmax": 100,
        "initial_amount": 1000000,
        "transaction_cost_pct": 0.001,
        "state_space": state_space,
        "stock_dim": stock_dimension,
        "tech_indicator_list": config.TECHNICAL_INDICATORS_LIST,
        "action_space": stock_dimension,
        "reward_scaling": 1e-4
    }
    e_train_gym = StockTradingEnv(df=train, **env_kwargs)
    env_train, _ = e_train_gym.get_sb_env()

    log_dir = "me/tmp/"
    os.makedirs(log_dir, exist_ok=True)
    env_train.envs[0] = Monitor(env_train.envs[0], log_dir)

    agent = DRLAgent(env=env_train)
    model_a2c = agent.get_model("a2c", verbose=0)
    trained_a2c = agent.train_model(model=model_a2c,
                                    tb_log_name='a2c',
                                    total_timesteps=100000)

    data_turbulence = processed[(processed.date < '2018-01-01')
                                & (processed.date >= '2009-01-01')]
    insample_turbulence = data_turbulence.drop_duplicates(subset=['date'])
    turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)

    e_trade_gym = StockTradingEnv(df=trade, turbulence_threshold=380, **env_kwargs)
    env_trade, obs_trade = e_trade_gym.get_sb_env()

    print("BEGIN PREDICTION")
    df_account_value, df_actions = DRLAgent.DRL_prediction(model=trained_a2c,
                                                           test_data=trade,
                                                           test_env=env_trade,
                                                           test_obs=obs_trade)
    print(df_account_value)
    print("END PREDICTION")
def main(args):
    # 1. Start a W&B run
    wandb.init(project='pearl', entity='adlr-ss-21-05')
    wandb.config.update(args)
    print("wandb name: ", wandb.run.name)

    log_dir = "tmp/" + wandb.run.name + "/"
    os.makedirs(log_dir, exist_ok=True)
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                check_log=1,
                                                log_dir=log_dir,
                                                model_name=wandb.run.name)

    env = gym.make('kuka_iiwa_insertion-v0',
                   use_gui=False,
                   steps_per_action=args.steps_per_action,
                   max_steps=args.max_steps,
                   action_step_size=args.action_step_size)
    env = Monitor(env, log_dir)

    model = SAC("MlpPolicy",
                env,
                verbose=args.verbosity,
                train_freq=(args.train_freq_num, args.train_freq_type),
                batch_size=args.batch_size)

    i = 0
    save_interval = 1000000
    while True:
        i += save_interval
        model.learn(total_timesteps=save_interval, callback=callback)
def train_model(self):
    auto_save_callback = SaveOnBestTrainingRewardCallback(log_dir=self.log_dir)
    auto_save_callback_every_1000_steps = EveryNTimesteps(n_steps=1000,
                                                          callback=auto_save_callback)

    self.environment = Monitor(self.environment, self.log_dir)
    self.model = self.algorithm('MlpPolicy',
                                self.environment,
                                verbose=1,
                                tensorboard_log=self.log_dir)

    name = self.model_name + "_full_model"
    checkpoint_callback = SavePerformanceOnCheckpoints(resource_manager=self,
                                                       name=name,
                                                       checkpoint_results=self.checkpoint_results)
    checkpoint_callback_every_1000_steps = EveryNTimesteps(n_steps=1000,
                                                           callback=checkpoint_callback)

    with ProgressBarManager(self.training_steps) as progress_callback:
        self.model.learn(total_timesteps=self.training_steps,
                         callback=[
                             progress_callback,
                             auto_save_callback_every_1000_steps,
                             checkpoint_callback_every_1000_steps
                         ])
    self.save_episode_rewards_as_csv()

    model_path = os.path.abspath("models/" + name)
    self.model.save(model_path)
def main():
    # Instantiate the env
    env = Gaze(fitts_W=fitts_W, fitts_D=fitts_D, ocular_std=ocular_std, swapping_std=swapping_std)
    env = Monitor(env, log_dir)

    # Instantiate the agent
    model = PPO('MlpPolicy', env, verbose=0, clip_range=0.15)

    # Save a checkpoint periodically
    # (defined here because `checkpoint_callback` is passed to `learn` below)
    save_freq_n = int(timesteps / 10)
    checkpoint_callback = CheckpointCallback(save_freq=save_freq_n,
                                             save_path=f'{log_dir}savedmodel/',
                                             name_prefix='model_ppo')

    # Train the agent
    model.learn(total_timesteps=int(timesteps), callback=checkpoint_callback)

    # Save the model
    model.save(f'{log_dir}savedmodel/model_ppo')

    # Plot the learning curve
    plot_results2(log_dir)
    save_learned_behaviour()
def make_eval_env(with_monitor, wrapper_class=gym.Wrapper):
    # Make eval environment with or without monitor in root,
    # and additionally wrapped with another wrapper (after Monitor).
    env = None
    if vec_env_class is None:
        # No vecenv, traditional env
        env = gym.make(env_id)
        if with_monitor:
            env = Monitor(env)
        env = wrapper_class(env)
    else:
        if with_monitor:
            env = vec_env_class([lambda: wrapper_class(Monitor(gym.make(env_id)))] * n_envs)
        else:
            env = vec_env_class([lambda: wrapper_class(gym.make(env_id))] * n_envs)
    return env
def _init():
    if isinstance(env_id, str):
        env = gym.make(env_id, **env_kwargs)
    else:
        env = env_id(**env_kwargs)
    if seed is not None:
        env.seed(seed + rank)
        env.action_space.seed(seed + rank)
    # Hide the score
    env = HideScore(env)
    # Wrap the env in a Monitor wrapper
    # to have additional training information
    monitor_path = os.path.join(monitor_dir, str(rank)) if monitor_dir is not None else None
    # Create the monitor folder if needed
    if monitor_path is not None:
        os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, filename=monitor_path)
    # Optionally, wrap the environment with the provided wrapper
    if wrapper_class is not None:
        env = wrapper_class(env)
    return env
def create(self, n_envs=1):
    """Create the agent"""
    self.env = self.agent_helper.env
    log_dir = self.agent_helper.config_dir
    os.makedirs(log_dir, exist_ok=True)
    self.env = Monitor(self.env, log_dir)

    # TODO: Create the DDPG policy and define its hyperparameters here,
    # including the action space and observation space.
    # Add policy
    policy_name = self.agent_helper.config['policy']
    self.policy = eval(policy_name)

    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    n_actions = int(self.agent_helper.env.action_space.shape[0])
    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))

    # FIXME: test:
    # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1,
    #                   tensorboard_log=self.agent_helper.graph_path)
    # TODO: fix the observation space and action space later.
    # Test whether the observation space input and the output action space are correct.
    # activ_function_name = self.agent_helper.config['nn_activ']
    # activ_function = eval(activ_function_name)
    # policy_kwargs = dict(activation_fn=activ_function,
    #                      net_arch=[dict(pi=[32, 32], qf=[32, 32])])
    policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
    self.model = OffPolicyAlgorithm(
        self.policy,
        self.env,
        learning_rate=self.agent_helper.config['learning_rate'],
        buffer_size=self.agent_helper.config['buffer_size'],
        batch_size=self.agent_helper.config['batch_size'],
        tau=self.agent_helper.config['tau'],
        gamma=self.agent_helper.config['gamma'],
        gradient_steps=self.agent_helper.config['gradient_steps'],
        action_noise=action_noise,
        optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
        create_eval_env=self.agent_helper.config['create_eval_env'],
        policy_kwargs=policy_kwargs,
        verbose=self.agent_helper.config['verbose'],
        learning_starts=self.agent_helper.config['learning_starts'],
        tensorboard_log=self.agent_helper.graph_path,
        seed=self.agent_helper.seed)
def test(model, test_images):
    test_env = Monitor(
        PuzzleEnv(images=test_images,
                  img_size=IMG_SIZE,
                  channel_num=CHANNEL_NUM,
                  puzzle_size=(3, 3),
                  puzzle_type="switch",
                  dist_type="manhattan",
                  penalty_for_step=-0.2,
                  reward_for_completiton=20,
                  positive_reward_coefficient=1.0,
                  obs_conf=OBS_CONF))
    solutions = []
    rews = []
    steps = []
    sample = len(test_images)
    errors = 0
    for iter in range(sample):
        i = 0
        done = False
        obs = test_env.reset()
        frames = [obs]
        while not done:
            i += 1
            action, _states = model.predict(obs)
            obs, rewards, done, info = test_env.step(action)
            frames.append(obs)
            rews.append(rewards)
            if i == 10000:
                errors += 1
                break
        solutions.append(frames)
        done = False
        print(i, sum(rews), rews)
        rews = []
        steps.append(i)
    print('Average steps taken: ', sum(steps) / sample)
    print('Median of steps taken: ', statistics.median(steps))
    print('Number of errors: ', errors)
    plt.hist(steps, bins=9)
    plt.savefig('fig.png')
def test_vec_monitor_warn():
    env = DummyVecEnv([lambda: Monitor(gym.make("CartPole-v1"))])
    # We should warn the user when the env is already wrapped with a Monitor wrapper
    with pytest.warns(UserWarning):
        VecMonitor(env)

    with pytest.warns(UserWarning):
        VecMonitor(VecNormalize(env))
def _thunk():
    if env_id.startswith("dm"):
        _, domain, task = env_id.split('.')
        env = dmc2gym.make(domain_name=domain, task_name=task)
        env = ClipAction(env)
    elif env_id.startswith("rrc"):
        _, ac_type, ac_wrapper = env_id.split('.')
        ts_relative, sa_relative = False, False
        scaled_ac, task_space = False, False
        if ac_wrapper.split('-')[0] == 'task':
            task_space = True
            ts_relative = ac_wrapper.split('-')[-1] == 'rel'
        elif ac_wrapper.split('-')[0] == 'scaled':
            scaled_ac = True
            sa_relative = ac_wrapper.split('-')[-1] == 'rel'
        env = rrc_utils.build_env_fn(
            action_type=ac_type, initializer=None,
            scaled_ac=scaled_ac, task_space=task_space,
            sa_relative=sa_relative, ts_relative=ts_relative,
            goal_relative=True, rew_fn='step')()
    else:
        env = gym.make(env_id)

    is_atari = hasattr(gym.envs, 'atari') and isinstance(
        env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
    if is_atari:
        env = NoopResetEnv(env, noop_max=30)
        env = MaxAndSkipEnv(env, skip=4)

    env.seed(seed + rank)

    if str(env.__class__.__name__).find('TimeLimit') >= 0:
        env = TimeLimitMask(env)

    if log_dir is not None:
        env = Monitor(env,
                      os.path.join(log_dir, str(rank)),
                      allow_early_resets=allow_early_resets)

    if is_atari:
        if len(env.observation_space.shape) == 3:
            env = EpisodicLifeEnv(env)
            if "FIRE" in env.unwrapped.get_action_meanings():
                env = FireResetEnv(env)
            env = WarpFrame(env, width=84, height=84)
            env = ClipRewardEnv(env)
    elif len(env.observation_space.shape) == 3:
        raise NotImplementedError(
            "CNN models work only for atari,\n"
            "please use a custom wrapper for a custom pixel input env.\n"
            "See wrap_deepmind for an example.")

    # If the input has shape (W,H,3), wrap for PyTorch convolutions
    obs_shape = env.observation_space.shape
    if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
        env = TransposeImage(env, op=[2, 0, 1])

    return env
def _inner() -> gym.Env:
    env = gym.make(ENV_NAME, verbose=0)
    env.seed(seed)
    if not is_eval:
        env = Monitor(env, run_dir)
    env = GrayScaleObservation(env, keep_dim=True)
    if frame_skip > 0:
        env = MaxAndSkipEnv(env, skip=frame_skip)
    return env
def load(self, name: str, env, replace_parameters=None):
    self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(":", "-")
    os.makedirs(self.log_dir, exist_ok=True)
    monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
    vec_env = DummyVecEnv([lambda: monitor_env])
    self.model = PPO.load(name, env=vec_env, custom_objects=replace_parameters)
def _init():
    set_random_seed(seed + rank)
    env = gym.make(env_id, **env_kwargs)
    # Wrap first with a monitor (e.g. for Atari env where reward clipping is used)
    log_file = os.path.join(log_dir, str(rank)) if log_dir is not None else None
    # Monitor success rate too for the real robot
    info_keywords = ('is_success',) if 'NeckEnv' in env_id else ()
    env = Monitor(env, log_file, info_keywords=info_keywords)
    # Dict observation space is currently not supported.
    # https://github.com/hill-a/stable-baselines/issues/321
    # We allow a Gym env wrapper (a subclass of gym.Wrapper)
    if wrapper_class:
        env = wrapper_class(env)
    env.seed(seed + rank)
    return env