def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward

    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
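For context, a minimal sketch of how a sampler like this plugs into an Optuna study. The trial.n_actions attribute the sampler reads is not part of Optuna's API; the objective has to attach it to the trial, as rl-baselines-zoo does. IdentityEnvBox stands in for a real environment, the timestep and trial budgets are illustrative, and imports assume stable-baselines 2.x.

import optuna
from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.identity_env import IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

def objective(trial):
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
    # The sampler above expects the action dimensionality on the trial object
    trial.n_actions = env.action_space.shape[0]
    model = TD3('MlpPolicy', env, **sample_td3_params(trial))
    model.learn(total_timesteps=5000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)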
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    kwargs = dict(seed=0, gamma=0.95, buffer_size=int(1e5))  # buffer_size must be an int
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
def test_ddpg_popart():
    """
    Test DDPG with pop-art normalization
    """
    n_actions = 1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000,
                 normalize_observations=True, normalize_returns=True,
                 nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, action_noise=action_noise, enable_popart=True)
    model.learn(1000)
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
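The 'adaptive-param' branch above pairs parameter noise with policy_kwargs=dict(layer_norm=True) because parameter-space perturbation only works with a layer-normalized policy. A minimal standalone sketch of that combination; 'Pendulum-v0', the noise levels, and the timestep budget are illustrative stand-ins, with imports as in the stable-baselines 2.x DDPG docs.

from stable_baselines import DDPG
from stable_baselines.ddpg.policies import LnMlpPolicy
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec

# Adaptive parameter noise rescales the weight perturbation so that the
# induced action-space noise stays close to desired_action_stddev
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.2, desired_action_stddev=0.2)
# LnMlpPolicy applies layer normalization, which parameter noise requires
model = DDPG(LnMlpPolicy, 'Pendulum-v0', param_noise=param_noise, verbose=1)
model.learn(total_timesteps=10000)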
n_actions = env.action_space.shape[0]
if 'adaptive-param' in noise_type:
    assert algo_ == 'ddpg', 'Parameter noise is not supported by SAC'
    hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                        desired_action_stddev=noise_std)
elif 'normal' in noise_type:
    if 'lin' in noise_type:
        hyperparams['action_noise'] = LinearNormalActionNoise(mean=np.zeros(n_actions),
                                                              sigma=noise_std * np.ones(n_actions),
                                                              final_sigma=hyperparams.get('noise_std_final', 0.0) * np.ones(n_actions),
                                                              max_steps=n_timesteps)
    else:
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(n_actions),
                                                        sigma=noise_std * np.ones(n_actions))
elif 'ornstein-uhlenbeck' in noise_type:
    hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                               sigma=noise_std * np.ones(n_actions))
else:
    raise RuntimeError('Unknown noise type "{}"'.format(noise_type))

print("Applying {} noise with std {}".format(noise_type, noise_std))

del hyperparams['noise_type']
del hyperparams['noise_std']
if 'noise_std_final' in hyperparams:
    del hyperparams['noise_std_final']
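LinearNormalActionNoise is not part of stable-baselines itself; rl-baselines-zoo defines it. A minimal sketch consistent with the keyword arguments used above: Gaussian action noise whose standard deviation decays linearly from sigma to final_sigma over max_steps calls. The _mu and _sigma attribute names follow the internals of stable-baselines' NormalActionNoise, and the import path assumes stable-baselines 2.x (newer releases expose the class under stable_baselines.common.noise).

import numpy as np
from stable_baselines.ddpg.noise import NormalActionNoise

class LinearNormalActionNoise(NormalActionNoise):
    """Gaussian action noise with a linearly decaying standard deviation."""

    def __init__(self, mean, sigma, max_steps, final_sigma=None):
        super().__init__(mean, sigma)
        self._step = 0
        self._max_steps = max_steps
        self._final_sigma = np.zeros_like(sigma) if final_sigma is None else final_sigma

    def __call__(self):
        # Interpolation factor: 0 at the first call, 1 from max_steps onwards
        t = min(1.0, self._step / self._max_steps)
        sigma = (1 - t) * self._sigma + t * self._final_sigma
        self._step += 1
        return np.random.normal(self._mu, sigma)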
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name]['config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = multiprocessing.cpu_count()
    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR,
        'sac': SAC,
        'trpo': TRPO
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id, envconfig_play, test_mode=True, render_mode=args.render,
                         pilot=args.pilot, verbose=True)
        print('Created environment instance')
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(vec_env, args.video_dir,
                                        record_video_trigger=lambda x: x == 0,
                                        video_length=args.recording_length,
                                        name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif args.mode == 'enjoy':
        agent = model.load(args.agent)

        figure_folder = os.path.join(DIR_PATH, 'logs', 'enjoys', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id, envconfig, test_mode=True, render_mode=args.render, pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(vec_env, video_folder,
                                        record_video_trigger=lambda x: x == 0,
                                        video_length=args.recording_length,
                                        name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        t_steps = 0
        ep_number = 1
        done = [False for _ in range(vec_env.num_envs)]
        for _ in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(observation=obs, state=state, mask=done,
                                                deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
            t_steps += 1

            if t_steps % 800 == 0 or done:
                if not done:
                    env.save_latest_episode(save_history=False)
                gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                  fig_prefix=(args.env + '_ep{}_step{}'.format(ep_number, t_steps)))
                gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                  fig_prefix=(args.env + '_ep{}_step{}_local'.format(ep_number, t_steps)),
                                                  local=True)
            if done:
                ep_number += 1
        recorded_env.close()

    elif args.mode == 'train':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env, EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard', args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if args.nomp or model == DDPG or model == TD3 or model == SAC or model == TRPO:
            num_cpu = 1
            vec_env = DummyVecEnv([lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                                     for i in range(num_cpu)])

        if args.agent is not None:
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if model == PPO2:
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                     n_lstm=256, reuse=False, **_kwargs):
                            super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                                             n_lstm, reuse,
                                             net_arch=[256, 256, 'lstm', dict(vf=[64], pi=[64])],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy, vec_env, verbose=True,
                                 tensorboard_log=tensorboard_log, **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    layers = [256, 128, 64]
                    #layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log,
                                 **hyperparams, policy_kwargs=policy_kwargs)
                    #dataset = ExpertDataset(expert_path='gail_expert.npz', traj_limitation=1, batch_size=128)
                    #print('Pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
                    #agent.pretrain(dataset, n_epochs=1000)
                    #print('Done pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
            elif model == DDPG:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'memory_limit': 50000,
                #     'normalize_observations': True,
                #     'normalize_returns': False,
                #     'gamma': 0.98,
                #     'actor_lr': 0.00156,
                #     'critic_lr': 0.00156,
                #     'batch_size': 256,
                #     'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
                # }
                hyperparams = {
                    'memory_limit': 1000000,
                    'normalize_observations': True,
                    'normalize_returns': False,
                    'gamma': 0.98,
                    'actor_lr': 0.00156,
                    'critic_lr': 0.00156,
                    'batch_size': 256,
                    'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.287, desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy, vec_env, verbose=True,
                             tensorboard_log=tensorboard_log, **hyperparams)
            elif model == TD3:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'buffer_size': 50000,
                #     'learning_starts': 1000
                # }
                hyperparams = {
                    'buffer_size': 1000000,
                    'train_freq': 1000,
                    'gradient_steps': 1000,
                    'learning_starts': 10000
                }
                action_noise = NormalActionNoise(mean=np.zeros(2), sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy, vec_env, verbose=True,
                            tensorboard_log=tensorboard_log, action_noise=action_noise, **hyperparams)
            elif model == A2C:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'n_steps': 5,
                #     'gamma': 0.995,
                #     'ent_coef': 0.00001,
                #     'learning_rate': 0.00083,
                #     'lr_schedule': 'linear'
                # }
                # layers = [256, 128, 64]
                hyperparams = {
                    'n_steps': 16,
                    'gamma': 0.99,
                    'ent_coef': 0.001,
                    'learning_rate': 2e-4,
                    'lr_schedule': 'linear'
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log,
                            **hyperparams, policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'gamma': 0.99,
                #     'n_steps': 16,
                #     'ent_coef': 0.0,
                #     'learning_rate': 0.06,
                #     'lr_schedule': 'constant'
                # }
                # agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log)
            elif model == SAC:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'learning_starts': 1000
                # }
                # agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log)
            elif model == TRPO:
                agent = TRPO(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config["n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if episode < len(env_histories[env_idx]):
                        report_env.history.append(env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr('total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder, str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 10 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR or model == SAC or model == TRPO:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env, report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = ('python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" '
                           '--recording-length {} --algo {} --pilot {} --envconfig {}{}').format(
                               args.env, agent_filepath, video_folder,
                               args.env + '-' + str(total_t_steps), recording_length,
                               args.algo, args.pilot, envconfig_string,
                               ' --recurrent' if args.recurrent else '')
                else:
                    cmd = ('python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" '
                           '--recording-length {} --algo {} --envconfig {}{}').format(
                               args.env, agent_filepath, video_folder,
                               args.env + '-' + str(total_t_steps), recording_length,
                               args.algo, envconfig_string,
                               ' --recurrent' if args.recurrent else '')
                # shell=True so the command string is parsed correctly on POSIX systems
                subprocess.Popen(cmd, shell=True)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(total_timesteps=1500000, tb_log_name='log', callback=callback)

    elif args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']:
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot)
                valuedict_str = '_'.join((key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(env, agent, fig_dir=figure_folder, fig_prefix=valuedict_str)
        else:
            env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env, agent, fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env, agent, fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env, EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        # video_name_prefix defaults to None so the testvals paths below,
        # which only override envconfig, can call this without a prefix
        def create_test_env(video_name_prefix=None, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id, envconfig, test_mode=True,
                             render_mode=args.render if args.video else None, pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env, video_folder,
                                                record_video_trigger=lambda x: (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env
            return env, active_env

        failed_tests = []

        def run_test(id, reset=True, report_dir=figure_folder, scenario=None,
                     max_t_steps=None, env=None, active_env=None):
            nonlocal failed_tests
            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env + '_' + id)
            if scenario is not None:
                obs = active_env.reset()
                env.load(args.scenario)
                print('Loaded', args.scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env, fig_dir=scenario_folder, fig_postfix=id, show=args.onlyplot)
            if args.onlyplot:
                return

            cumulative_reward = 0
            t_steps = 0
            done = False
            # Stop when the episode ends or, if given, when max_t_steps is reached
            while not done and (max_t_steps is None or t_steps < max_t_steps):
                action, _states = agent.predict(obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 1000 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env, fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) + '_' + str(size) + '_' + id),
                            local=True, size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                      fig_prefix=(args.env + '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'), 'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions', 'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join((key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = -np.log10(valuedict['reward_lambda'])  # should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode), report_dir=rep_subfolder)
                    episode_dict[valuedict_str] = [last_episode, colorval]

                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                run_test("ep0", reset=True, scenario=args.scenario)
        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join((key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = np.log10(valuedict['reward_lambda'])  # should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode), report_dir=rep_subfolder)
                    episode_dict['Agent ' + str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env, fig_dir=figure_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode), env=env, active_env=active_env)

        if args.video and active_env:
            active_env.close()
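For context, a minimal sketch of the command-line interface main() expects. The flag names are inferred from the args attributes used above and from the enjoy command the training callback spawns; the defaults and help strings are illustrative assumptions, not the script's actual values.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', choices=['play', 'enjoy', 'train', 'test',
                                         'policyplot', 'vectorfieldplot', 'streamlinesplot'])
    parser.add_argument('env', help='Name of the gym_auv scenario to run')
    parser.add_argument('--algo', default='ppo')
    parser.add_argument('--agent', default=None, help='Path to a saved agent')
    parser.add_argument('--envconfig', default=None)
    parser.add_argument('--render', default='2d')
    parser.add_argument('--pilot', default=None)
    parser.add_argument('--scenario', default=None)
    parser.add_argument('--video-dir', default='logs/videos')
    parser.add_argument('--video-name', default='auto')
    parser.add_argument('--recording-length', type=int, default=2000)
    parser.add_argument('--episodes', type=int, default=1)
    parser.add_argument('--testvals', default=None)
    parser.add_argument('--recurrent', action='store_true')
    parser.add_argument('--stochastic', action='store_true')
    parser.add_argument('--video', action='store_true')
    parser.add_argument('--onlyplot', action='store_true')
    parser.add_argument('--save-snapshots', action='store_true')
    parser.add_argument('--nomp', action='store_true')
    main(parser.parse_args())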
def train(algo, df, model_name, uniqueId, lr=None, gamma=None, noBacktest=1,
          cutoff_date=None, commission=0, addTA='N'):
    # NOTE: seed, cliprange and evaluate() are expected to be defined at module level
    before = np.zeros(noBacktest)
    after = np.zeros(noBacktest)
    backtest = np.zeros(noBacktest)
    train_dates = np.empty(noBacktest, dtype="datetime64[s]")
    start_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    end_test_dates = np.empty(noBacktest, dtype="datetime64[s]")

    # print(str(df.columns.tolist()))
    dates = np.unique(df.date)
    logfile = "./log/"
    print("noBacktest", noBacktest)

    # noBacktest=1 uses the cutoff date to split train/test
    cutoff_date = np.datetime64(cutoff_date)
    print("cutoff_date", cutoff_date)
    if noBacktest == 1:
        a = np.where(dates <= cutoff_date)[0]
        b = np.where(dates > cutoff_date)[0]
        s = [(a, b)]
    else:
        # ref https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
        splits = TimeSeriesSplit(n_splits=noBacktest)
        s = splits.split(dates)

    loop = 0
    for train_date_index, test_date_index in s:
        print("loop", loop)
        train = df[df.date.isin(dates[train_date_index])]
        test = df[df.date.isin(dates[test_date_index])]
        runtimeId = uniqueId + "_" + str(loop)
        train_dates[loop] = max(train.date)
        start_test_dates[loop] = min(test.date)
        end_test_dates[loop] = max(test.date)

        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

        global env
        title = (runtimeId + "_Train lr=" + str(lr) + ", cliprange=" + str(cliprange)
                 + ", commission=" + str(commission))
        env = DummyVecEnv([lambda: StockEnvPlayer(train, logfile + runtimeId + ".csv", title,
                                                  seed=seed, commission=commission, addTA=addTA)])
        # Automatically normalize the input features
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

        model = algo(
            MlpPolicy,
            env,
            seed=seed,
            gamma=gamma,
            n_steps=128,
            ent_coef=0.01,
            learning_rate=lr,
            vf_coef=0.5,
            max_grad_norm=0.5,
            lam=0.95,
            nminibatches=4,
            noptepochs=4,
            cliprange=cliprange,
            cliprange_vf=None,
            # tensorboard_log="./tensorlog",
            _init_setup_model=True,
            policy_kwargs=None,
            full_tensorboard_log=False,
        )

        # Random agent, before training
        print("\n*** Agent before learning ***")
        steps = len(np.unique(train.date))
        before[loop] = evaluate(model, num_steps=steps)

        model.learn(total_timesteps=steps)

        print("\n*** Evaluate the trained agent ***")
        after[loop] = evaluate(model, num_steps=steps)

        print("\n*** Run agent on unseen data ***")
        title = (runtimeId + "_Test lr=" + str(lr) + ", cliprange=" + str(cliprange)
                 + ", commission=" + str(commission))
        env = DummyVecEnv([lambda: StockEnvPlayer(test, logfile + runtimeId + ".csv", title,
                                                  seed=seed, commission=commission, addTA=addTA)])
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        steps = len(np.unique(test.date))
        backtest[loop] = evaluate(model, num_steps=steps)

        del model
        env.close()
        loop += 1

    # display results on screen
    for i in range(noBacktest):
        print("\ntrain_dates:", min(df.date), train_dates[i])
        print("test_dates:", start_test_dates[i], end_test_dates[i])
        print("backtest {} : SUM reward : before | after | backtest : {: 8.2f} | {: 8.2f} | {: 8.2f}"
              .format(i, before[i], after[i], backtest[i]))

    return pd.DataFrame({
        "Model": uniqueId,
        "addTA": addTA,
        "Columns": str(df.columns.tolist()),
        "commission": commission,
        "Seed": seed,
        "cliprange": cliprange,
        "learningRate": lr,
        "gamma": gamma,
        "backtest # ": np.arange(noBacktest),
        "StartTrainDate": min(df.date),
        "EndTrainDate": train_dates,
        "before": before,
        "after": after,
        "testDate": end_test_dates,
        "Sum Reward@roadTest": backtest
    })
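One caveat in the routine above: the test split is wrapped in a fresh VecNormalize, so observation statistics are re-estimated on unseen data instead of being reused from training. A sketch of carrying them over, meant to slot into the loop above and assuming stable-baselines 2.x, where VecNormalize exposes save_running_average and load_running_average (newer releases replace these with save and load); the "./log" folder is illustrative.

from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# After training: persist the running mean/std of observations (and returns)
env.save_running_average("./log")

# For the test split: freeze the statistics (training=False) and reuse them,
# rather than re-estimating them on unseen data
test_env = DummyVecEnv([lambda: StockEnvPlayer(test, logfile + runtimeId + ".csv",
                                               title, seed=seed,
                                               commission=commission, addTA=addTA)])
test_env = VecNormalize(test_env, training=False, norm_obs=True,
                        norm_reward=False, clip_obs=10.)
test_env.load_running_average("./log")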
# Create 4 artificial transitions per real transition
n_sampled_goal = 4

# SAC hyperparams:
# model = HER('MlpPolicy', env, SAC, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             learning_rate=1e-3,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

# DDPG hyperparams:
# NOTE: it works even without action noise
n_actions = env.action_space.shape[0]
noise_std = 0.2
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal,
            goal_selection_strategy='future',
            verbose=1, buffer_size=int(1e6),
            actor_lr=1e-3, critic_lr=1e-3,
            action_noise=action_noise,
            gamma=0.95, batch_size=256,
            policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
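A short follow-up on persistence: with stable-baselines' HER wrapper, a saved model has to be reloaded through HER.load with the goal-based environment re-attached, since the wrapper cannot reconstruct the goal space on its own. The file name below is illustrative.

model.save('./her_ddpg_model')
# HER wraps the underlying algorithm; reload it with the environment attached
model = HER.load('./her_ddpg_model', env=env)

obs = env.reset()
action, _ = model.predict(obs)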