# Ornstein-Uhlenbeck exploration noise for the TD3 agent.
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.02 * np.ones(n_actions),
                                            theta=0.15, dt=0.01,
                                            initial_noise=None)

model = TD3(MlpPolicyTD3, env,
            action_noise=action_noise,
            verbose=1,
            policy_kwargs=dict(layers=[400, 300]),
            tensorboard_log=workDirectory + "/log")

# Separate evaluation environment, with reward logging enabled.
model.test_env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                                      render=False,
                                                      on_rack=False,
                                                      default_reward=args.default_reward,
                                                      height_weight=args.height_weight,
                                                      orientation_weight=args.orientation_weight,
                                                      direction_weight=args.direction_weight,
                                                      speed_weight=args.speed_weight,
                                                      mimic_weight=args.mimic_weight,
                                                      consistancy_weight=args.consistancy_weight,
                                                      logReward=True)])
if normalize:
    model.test_env = VecNormalize(model.test_env, gamma=args.gamma)


def callback(_locals, _globals):
    """
    Callback for monitoring learning progress.

    :param _locals: (dict)
    :param _globals: (dict)
    :return: (bool) If False: stop training
    """
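# A minimal sketch (not from the original script) of how this TD3 model would
# typically be launched once the callback body is filled in. The timestep
# budget, tb_log_name and save path below are assumptions, not source values.
model.learn(total_timesteps=int(1e6), callback=callback, tb_log_name="TD3")
model.save(workDirectory + "/td3_aida")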
def objective(trial):
    kwargs = hyperparams.copy()
    trial.model_class = None
    kwargs.update(sample_sac_params(trial))

    def callback(_locals, _globals):
        """
        Callback for monitoring learning progress.

        :param _locals: (dict)
        :param _globals: (dict)
        :return: (bool) If False: stop training
        """
        self_ = _locals['self']
        trial = self_.trial

        # Initialize variables
        if not hasattr(self_, 'is_pruned'):
            self_.is_pruned = False
            self_.last_mean_test_reward = -np.inf
            self_.last_time_evaluated = 0
            self_.eval_idx = 0

        if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
            return True

        self_.last_time_evaluated = self_.num_timesteps

        # Evaluate the trained agent on the test env
        rewards = []
        n_steps_done, reward_sum = 0, 0.0

        # Sync the obs rms if using vecnormalize
        # NOTE: this does not cover all the possible cases
        if isinstance(self_.test_env, VecNormalize):
            self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
            self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
            # Do not normalize reward
            self_.test_env.norm_reward = False

        obs = self_.test_env.reset()
        while n_steps_done < n_test_steps:
            # Use default value for deterministic
            action, _ = self_.predict(obs)
            obs, reward, done, _ = self_.test_env.step(action)
            reward_sum += reward
            n_steps_done += 1
            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                obs = self_.test_env.reset()
        # Include the (possibly unfinished) last episode
        rewards.append(reward_sum)
        mean_reward = np.mean(rewards)

        summary = tf.Summary(value=[
            tf.Summary.Value(tag='evaluation', simple_value=mean_reward)
        ])
        _locals['writer'].add_summary(summary, self_.num_timesteps)

        self_.last_mean_test_reward = mean_reward
        self_.eval_idx += 1

        # report best or report current ?
        # report num_timesteps or elapsed time ?
        trial.report(-1 * mean_reward, self_.eval_idx)
        # Prune trial if needed
        if trial.should_prune(self_.eval_idx):
            self_.is_pruned = True
            return False

        return True

    commands = [[1, 0], [2, 0], [3, 0]]
    env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                               render=False,
                                               on_rack=False,
                                               default_reward=2,
                                               height_weight=5,
                                               orientation_weight=3,
                                               direction_weight=2,
                                               speed_weight=4)])
    model = SAC(MlpPolicy, env,
                gamma=kwargs['gamma'],
                learning_rate=kwargs['learning_rate'],
                batch_size=kwargs['batch_size'],
                buffer_size=kwargs['buffer_size'],
                learning_starts=kwargs['learning_starts'],
                train_freq=kwargs['train_freq'],
                gradient_steps=kwargs['gradient_steps'],
                ent_coef=kwargs['ent_coef'],
                target_entropy=kwargs['target_entropy'],
                policy_kwargs=kwargs['policy_kwargs'],
                tensorboard_log="./optimisationSAC/logOPTI")
    model.test_env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                                          render=False,
                                                          on_rack=False,
                                                          default_reward=2,
                                                          height_weight=5,
                                                          orientation_weight=3,
                                                          direction_weight=2,
                                                          speed_weight=4)])
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=callback, tb_log_name="SAC_" + str(trial.number))
        # Free memory
        model.env.close()
        model.test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        # Free memory
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf
    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned
        cost = -1 * model.last_mean_test_reward

    try:
        os.mkdir("./optimisationSAC/resultats/" + str(trial.number))
    except FileExistsError:
        print("Directory already exists")
    model.save("./optimisationSAC/resultats/" + str(trial.number) + "/" + str(trial.number))

    del model.env, model.test_env
    del model

    if is_pruned:
        try:
            # Optuna >= 0.19.0
            raise optuna.exceptions.TrialPruned()
        except AttributeError:
            raise optuna.structs.TrialPruned()
    return cost
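# sample_sac_params() is called in objective() but defined elsewhere. The
# version below is a plausible sketch only (assumed, following the common
# rl-baselines-zoo pattern); every range and choice here is hypothetical,
# chosen so that each key objective() reads from kwargs is covered.
def sample_sac_params(trial):
    """Sampler for SAC hyperparameters (hypothetical ranges)."""
    net_arch = trial.suggest_categorical('net_arch', ['small', 'medium', 'big'])
    net_arch = {'small': [64, 64], 'medium': [256, 256], 'big': [400, 300]}[net_arch]
    return {
        'gamma': trial.suggest_categorical('gamma', [0.95, 0.98, 0.99, 0.999]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-2),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256]),
        'buffer_size': trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)]),
        'learning_starts': trial.suggest_categorical('learning_starts', [0, 1000, 10000]),
        'train_freq': trial.suggest_categorical('train_freq', [1, 10, 100]),
        'gradient_steps': trial.suggest_categorical('gradient_steps', [1, 100]),
        'ent_coef': trial.suggest_categorical('ent_coef', ['auto', 0.1, 0.01]),
        'target_entropy': 'auto',
        'policy_kwargs': dict(layers=net_arch),
    }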
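# How objective() would typically be driven (a sketch; the sampler, pruner and
# trial budget are assumptions, not taken from the original file). Optuna
# minimizes by default, which matches the cost = -mean_reward returned above.
if __name__ == '__main__':
    study = optuna.create_study(sampler=optuna.samplers.TPESampler(),
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=5))
    study.optimize(objective, n_trials=50)
    print('Best trial:', study.best_trial.number, 'cost:', study.best_value)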