Example #1
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy",
                        env,
                        gamma=0.1,
                        seed=0,
                        action_noise=action_noise,
                        buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    n_trials = 1000
    reward_sum = 0
    set_global_seeds(0)
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials
    # Free memory
    del model, env
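
For context: IdentityEnvBox (from stable_baselines.common.identity_env) is a one-dimensional Box environment whose optimal policy simply echoes the observation. The assertion reward_sum > 0.9 * n_trials implies a per-step reward of 1 when the action lands within eps of the observation and 0 otherwise. The following is a minimal sketch of such an environment, written against the gym API under that assumption; it is an illustration, not the library's actual implementation.

import numpy as np
import gym
from gym import spaces


class IdentityBoxSketch(gym.Env):
    """Identity environment sketch: the optimal action equals the observation."""

    def __init__(self, eps=0.5, ep_length=100):
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,), dtype=np.float32)
        self.eps = eps
        self.ep_length = ep_length
        self.current_step = 0
        self.state = None

    def reset(self):
        self.current_step = 0
        self.state = self.observation_space.sample()
        return self.state

    def step(self, action):
        # Reward 1 if the action matches the last observation within eps (assumed).
        reward = 1.0 if abs(float(action[0]) - float(self.state[0])) <= self.eps else 0.0
        self.state = self.observation_space.sample()
        self.current_step += 1
        done = self.current_step >= self.ep_length
        return self.state, reward, done, {}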
Example #2
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
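
The sampler above is meant to be called from inside an Optuna objective function. Note that trial.n_actions is not a standard Optuna attribute; the caller is expected to attach it before sampling. Below is a hedged sketch of that surrounding plumbing, with a placeholder environment and a deliberately small training budget.

import gym
import optuna
from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy


def objective(trial):
    env = gym.make('Pendulum-v0')  # placeholder continuous-action environment
    # sample_td3_params() reads trial.n_actions, so attach it explicitly.
    trial.n_actions = env.action_space.shape[0]
    hyperparams = sample_td3_params(trial)
    model = TD3('MlpPolicy', env, verbose=0, **hyperparams)
    model.learn(total_timesteps=10000)  # small budget for illustration only
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    env.close()
    return mean_reward


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
print(study.best_params)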
Example #3
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy",
                        env,
                        gamma=0.1,
                        seed=0,
                        action_noise=action_noise,
                        buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
Example #4
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    n_steps = {SAC: 700, TD3: 500, DDPG: 2000}[model_class]

    kwargs = dict(seed=0, gamma=0.95, buffer_size=1e5)
    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.05 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    if model_class == DDPG:
        kwargs["actor_lr"] = 1e-3
        kwargs["batch_size"] = 100

    model = model_class("MlpPolicy", env, **kwargs)
    model.learn(total_timesteps=n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
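
Both variants above delegate the rollout to evaluate_policy from stable_baselines.common.evaluation, which runs n_eval_episodes episodes, returns the mean and standard deviation of the episode reward, and raises an assertion error if the mean falls below reward_threshold. Used on its own it looks like this:

from stable_baselines.common.evaluation import evaluate_policy

# Returns (mean_episode_reward, std_episode_reward); with reward_threshold set,
# the call asserts that the mean reward exceeds the threshold and fails otherwise.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20,
                                          reward_threshold=90)
print('mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))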
Example #5
def test_ddpg_popart():
    """
    Test DDPG with pop-art normalization
    """
    n_actions = 1
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    model = DDPG('MlpPolicy', 'Pendulum-v0', memory_limit=50000, normalize_observations=True,
                 normalize_returns=True, nb_rollout_steps=128, nb_train_steps=1,
                 batch_size=64, action_noise=action_noise, enable_popart=True)
    model.learn(1000)
Example #6
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns',
                                                  [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    return hyperparams
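
When the 'adaptive-param' branch is taken, the sampler also sets policy_kwargs=dict(layer_norm=True), since parameter-space noise perturbs the network weights and the DDPG implementation expects a layer-normalized policy in that case. Feeding the sampled dictionary into a model is then straightforward; the sketch below assumes, as in the TD3 example, that the caller has attached trial.n_actions and created a continuous-action env.

from stable_baselines import DDPG

# Note the DDPG-specific key names ('actor_lr', 'critic_lr', 'memory_limit')
# compared with the generic names used by the TD3 sampler.
hyperparams = sample_ddpg_params(trial)
model = DDPG('MlpPolicy', env, verbose=0, **hyperparams)
model.learn(total_timesteps=10000)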
Example #7
            n_actions = env.action_space.shape[0]
            if 'adaptive-param' in noise_type:
                assert algo_ == 'ddpg', 'Parameter is not supported by SAC'
                hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
                    initial_stddev=noise_std, desired_action_stddev=noise_std)
            elif 'normal' in noise_type:
                if 'lin' in noise_type:
                    hyperparams['action_noise'] = LinearNormalActionNoise(
                        mean=np.zeros(n_actions),
                        sigma=noise_std * np.ones(n_actions),
                        final_sigma=hyperparams.get('noise_std_final', 0.0) *
                        np.ones(n_actions),
                        max_steps=n_timesteps)
                else:
                    hyperparams['action_noise'] = NormalActionNoise(
                        mean=np.zeros(n_actions),
                        sigma=noise_std * np.ones(n_actions))
            elif 'ornstein-uhlenbeck' in noise_type:
                hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
            else:
                raise RuntimeError(
                    'Unknown noise type "{}"'.format(noise_type))
            print("Applying {} noise with std {}".format(
                noise_type, noise_std))
            del hyperparams['noise_type']
            del hyperparams['noise_std']
            if 'noise_std_final' in hyperparams:
                del hyperparams['noise_std_final']
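
This excerpt (in the style of rl-baselines-zoo's training script) turns the noise_type/noise_std entries of a hyperparameter dictionary into concrete noise objects and then strips those keys before the dictionary is forwarded to the model constructor. An illustrative input is sketched below; the key names follow the excerpt, the values are made up, and LinearNormalActionNoise is a zoo-style utility rather than part of stable-baselines itself.

# Hypothetical hyperparameter dictionary before the block above runs:
hyperparams = {
    'gamma': 0.98,
    'buffer_size': int(1e6),
    'noise_type': 'lin_normal',   # contains 'lin' and 'normal' -> LinearNormalActionNoise
    'noise_std': 0.3,             # initial standard deviation
    'noise_std_final': 0.05,      # final standard deviation reached after n_timesteps
}
# Afterwards hyperparams contains 'action_noise' and none of the noise_* keys,
# so it can be passed straight on, e.g. TD3('MlpPolicy', env, **hyperparams).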
Example #8
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = multiprocessing.cpu_count()

    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR,
        'sac': SAC,
        'trpo': TRPO
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        #envconfig_play['autocamera3d'] = False
        env = create_env(env_id,
                         envconfig_play,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot,
                         verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)

        figure_folder = os.path.join(DIR_PATH, 'logs', 'enjoys', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)

        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id,
                         envconfig,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env
                         if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        t_steps = 0
        ep_number = 1
        done = [False for _ in range(vec_env.num_envs)]
        for _ in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
            t_steps += 1

            if t_steps % 800 == 0 or done:
                if not done:
                    env.save_latest_episode(save_history=False)
                gym_auv.reporting.plot_trajectory(
                    env,
                    fig_dir=scenario_folder,
                    fig_prefix=(args.env +
                                '_ep{}_step{}'.format(ep_number, t_steps)))
                gym_auv.reporting.plot_trajectory(
                    env,
                    fig_dir=scenario_folder,
                    fig_prefix=(
                        args.env +
                        '_ep{}_step{}_local'.format(ep_number, t_steps)),
                    local=True)
            if done:
                ep_number += 1
        recorded_env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env,
                                    EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env,
                                    EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard',
                                       args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3 or model == SAC
                or model == TRPO):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if (args.agent is not None):
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self,
                                     sess,
                                     ob_space,
                                     ac_space,
                                     n_env,
                                     n_steps,
                                     n_batch,
                                     n_lstm=256,
                                     reuse=False,
                                     **_kwargs):
                            super().__init__(sess,
                                             ob_space,
                                             ac_space,
                                             n_env,
                                             n_steps,
                                             n_batch,
                                             n_lstm,
                                             reuse,
                                             net_arch=[
                                                 256, 256, 'lstm',
                                                 dict(vf=[64], pi=[64])
                                             ],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    #policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    #policy_kwargs = dict(net_arch=[64, 64, 64])
                    layers = [256, 128, 64]
                    #layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams,
                                 policy_kwargs=policy_kwargs)
                    #dataset = ExpertDataset(expert_path='gail_expert.npz', traj_limitation=1, batch_size=128)
                    #print('Pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
                    #agent.pretrain(dataset, n_epochs=1000)
                    #print('Done pretraining {} agent on "{}"'.format(args.algo.upper(), env_id))
            elif (model == DDPG):
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'memory_limit': 50000,
                #     'normalize_observations': True,
                #     'normalize_returns': False,
                #     'gamma': 0.98,
                #     'actor_lr': 0.00156,
                #     'critic_lr': 0.00156,
                #     'batch_size': 256,
                #     'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)
                # }
                hyperparams = {
                    'memory_limit': 1000000,
                    'normalize_observations': True,
                    'normalize_returns': False,
                    'gamma': 0.98,
                    'actor_lr': 0.00156,
                    'critic_lr': 0.00156,
                    'batch_size': 256,
                    'param_noise': AdaptiveParamNoiseSpec(initial_stddev=0.287,
                                                          desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log,
                             **hyperparams)
            elif (model == TD3):
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'buffer_size': 50000,
                #     'learning_starts': 1000
                # }
                hyperparams = {
                    'buffer_size': 1000000,
                    'train_freq': 1000,
                    'gradient_steps': 1000,
                    'learning_starts': 10000
                }
                action_noise = NormalActionNoise(mean=np.zeros(2),
                                                 sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            action_noise=action_noise,
                            **hyperparams)
            elif model == A2C:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'n_steps': 5,
                #     'gamma': 0.995,
                #     'ent_coef': 0.00001,
                #     'learning_rate': 0.00083,
                #     'lr_schedule': 'linear'
                # }
                # layers = [256, 128, 64]
                hyperparams = {
                    'n_steps': 16,
                    'gamma': 0.99,
                    'ent_coef': 0.001,
                    'learning_rate': 2e-4,
                    'lr_schedule': 'linear'
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'gamma': 0.99,
                #     'n_steps': 16,
                #     'ent_coef': 0.0,
                #     'learning_rate': 0.06,
                #     'lr_schedule': 'constant'
                # }
                # agent = ACKTR(MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = ACKTR(MlpPolicy,
                              vec_env,
                              verbose=True,
                              tensorboard_log=tensorboard_log)
            elif model == SAC:
                # rl-baselines-zoo inspired:
                # hyperparams = {
                #     'batch_size': 256,
                #     'learning_starts': 1000
                # }
                # agent = SAC(stable_baselines.sac.MlpPolicy, vec_env, verbose=True, tensorboard_log=tensorboard_log, **hyperparams)
                agent = SAC(stable_baselines.sac.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log)
            elif model == TRPO:
                agent = TRPO(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(
                            env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1

            total_t_steps = _self.get_env().get_attr(
                'total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder,
                                          str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 10 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR or model == SAC or model == TRPO:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(
                    report_env.history) > 0 and report_criteria:
                try:
                    #gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env,
                                             report_dir=figure_folder)
                    #vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(total_timesteps=1500000,
                    tb_log_name='log',
                    callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env,
                                     EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id,
                                 envconfig,
                                 test_mode=True,
                                 pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(
                    args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env,
                                                   agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(
                        env,
                        agent,
                        fig_dir=figure_folder,
                        fig_prefix=valuedict_str)

        else:
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env,
                                               agent,
                                               fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env,
                                                    agent,
                                                    fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env,
                                                   agent,
                                                   fig_dir=figure_folder)

        print('Output folder: ', figure_folder)

    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env,
                                     EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(vec_env,
                                                video_folder,
                                                record_video_trigger=lambda x:
                                                (x % video_length) == 0,
                                                video_length=video_length,
                                                name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env

            return env, active_env

        failed_tests = []

        def run_test(id,
                     reset=True,
                     report_dir=figure_folder,
                     scenario=None,
                     max_t_steps=None,
                     env=None,
                     active_env=None):
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env +
                                                  '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(args.scenario)
                print('Loaded', args.scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env,
                                            fig_dir=scenario_folder,
                                            fig_postfix=id,
                                            show=args.onlyplot)
            if args.onlyplot:
                return
            cumulative_reward = 0
            t_steps = 0
            if max_t_steps is None:
                done = False
            else:
                done = t_steps > max_t_steps

            while not done:
                action, _states = agent.predict(
                    obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 1000 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) +
                                        '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(env,
                                                      fig_dir=scenario_folder,
                                                      fig_prefix=(args.env +
                                                                  '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            #gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            #env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'),
                          'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(
            args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = -np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                        episode_dict[valuedict_str] = [last_episode, colorval]
                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=scenario_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)

            else:
                run_test("ep0", reset=True, scenario=args.scenario)

        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val)
                         for key, val in valuedict.items()))

                    colorval = np.log10(
                        valuedict['reward_lambda'])  #should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' +
                                                str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' +
                                 str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=figure_folder,
                                                  fig_prefix=(args.env +
                                                              '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode),
                             env=env,
                             active_env=active_env)

        if args.video and active_env:
            active_env.close()
Example #9
def train(algo,
          df,
          model_name,
          uniqueId,
          lr=None,
          gamma=None,
          noBacktest=1,
          cutoff_date=None,
          commission=0,
          addTA='N'):
    before = np.zeros(noBacktest)
    after = np.zeros(noBacktest)
    backtest = np.zeros(noBacktest)
    train_dates = np.empty(noBacktest, dtype="datetime64[s]")
    start_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    end_test_dates = np.empty(noBacktest, dtype="datetime64[s]")
    # print(str(df.columns.tolist()))

    dates = np.unique(df.date)
    logfile = "./log/"
    print("noBacktest", noBacktest)

    # noBacktest == 1: use the cutoff date to split the data into train/test
    cutoff_date = np.datetime64(cutoff_date)
    print("cutoff_date", cutoff_date)

    if noBacktest == 1:
        a = np.where(dates <= cutoff_date)[0]
        b = np.where(dates > cutoff_date)[0]
        s = []
        s.append((a, b))

    else:
        # ref https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
        splits = TimeSeriesSplit(n_splits=noBacktest)
        s = splits.split(dates)

    loop = 0

    for train_date_index, test_date_index in s:
        print("loop", loop)
        train = df[df.date.isin(dates[train_date_index])]
        test = df[df.date.isin(dates[test_date_index])]
        runtimeId = uniqueId + "_" + str(loop)
        train_dates[loop] = max(train.date)
        start_test_dates[loop] = min(test.date)
        end_test_dates[loop] = max(test.date)

        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        global env

        title = runtimeId + "_Train lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(train,
                                   logfile + runtimeId + ".csv",
                                   title,
                                   seed=seed,
                                   commission=commission,
                                   addTA=addTA)
        ])

        # Automatically normalize the input features
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

        model = algo(
            MlpPolicy,
            env,
            seed=seed,
            gamma=gamma,
            n_steps=128,
            ent_coef=0.01,
            learning_rate=lr,
            vf_coef=0.5,
            max_grad_norm=0.5,
            lam=0.95,
            nminibatches=4,
            noptepochs=4,
            cliprange=cliprange,
            cliprange_vf=None,  # tensorboard_log="./tensorlog",
            _init_setup_model=True,
            policy_kwargs=None,
            full_tensorboard_log=False,
        )

        # Random Agent, before training
        print("\n*** Agent before learning ***")
        steps = len(np.unique(train.date))
        before[loop] = evaluate(model, num_steps=steps)

        model.learn(total_timesteps=round(steps))

        print("\n*** Evaluate the trained agent ***")
        after[loop] = evaluate(model, num_steps=steps)

        print("\n*** Run agent on unseen data ***")
        title = runtimeId + "_Test lr=" + \
            str(lr) + ", cliprange=" + str(cliprange) + ", commission=" + str(commission)
        env = DummyVecEnv([
            lambda: StockEnvPlayer(test,
                                   logfile + runtimeId + ".csv",
                                   title,
                                   seed=seed,
                                   commission=commission,
                                   addTA=addTA)
        ])
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        steps = len(np.unique(test.date))
        backtest[loop] = evaluate(model, num_steps=steps)

        del model
        env.close()

        loop += 1

    # display result on screen
    for i in range(noBacktest):
        print("\ntrain_dates:", min(df.date), train_dates[i])
        print("test_dates:", start_test_dates[i], end_test_dates[i])
        print(
            "backtest {} : SUM reward : before | after | backtest : {: 8.2f} | {: 8.2f} | {: 8.2f}"
            .format(i, before[i], after[i], backtest[i]))

    return pd.DataFrame({
        "Model": uniqueId,
        "addTA": addTA,
        "Columns": str(df.columns.tolist()),
        "commission": commission,
        "Seed": seed,
        "cliprange": cliprange,
        "learningRate": lr,
        "gamma": g,
        "backtest  # ": np.arange(noBacktest),
        "StartTrainDate": min(train.date),
        "EndTrainDate": train_dates,
        "before": before,
        "after": after,
        "testDate": end_test_dates,
        "Sum Reward@roadTest": backtest
    })
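
The walk-forward evaluation above relies on scikit-learn's TimeSeriesSplit, which yields an expanding training window followed by the next chronological block as the test set. A small standalone sketch of what splits.split(dates) produces (the dates and split count are illustrative):

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

dates = np.array(['2019-01', '2019-02', '2019-03',
                  '2019-04', '2019-05', '2019-06'], dtype='datetime64[M]')
splits = TimeSeriesSplit(n_splits=3)
for train_idx, test_idx in splits.split(dates):
    print('train:', dates[train_idx], '-> test:', dates[test_idx])
# train: ['2019-01' '2019-02' '2019-03'] -> test: ['2019-04']
# train: ['2019-01' '2019-02' '2019-03' '2019-04'] -> test: ['2019-05']
# train: ['2019-01' '2019-02' '2019-03' '2019-04' '2019-05'] -> test: ['2019-06']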
Example #10
# Create 4 artificial transitions per real transition
n_sampled_goal = 4

# # SAC hyperparams:
# model = HER('MlpPolicy', env, SAC, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             learning_rate=1e-3,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

# DDPG Hyperparams:
# NOTE: it works even without action noise
n_actions = env.action_space.shape[0]
noise_std = 0.2
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=noise_std * np.ones(n_actions))
model = HER('MlpPolicy',
            env,
            DDPG,
            n_sampled_goal=n_sampled_goal,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            actor_lr=1e-3,
            critic_lr=1e-3,
            action_noise=action_noise,
            gamma=0.95,
            batch_size=256,
            policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
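
After training, the HER wrapper can be saved and reloaded like any other stable-baselines model; HER.load restores the wrapped algorithm (DDPG here) as long as the goal-based environment is supplied again. A short usage sketch following the pattern from the stable-baselines HER documentation (the file name is arbitrary):

from stable_baselines import HER  # already imported in the snippet above

model.save('./her_ddpg_model')

# HER.load rebuilds the wrapper around the stored algorithm; the env is required.
model = HER.load('./her_ddpg_model', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()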