def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns',
                                                  [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    return hyperparams
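
A minimal usage sketch for the sampler above, assuming an Optuna study over a hypothetical continuous-action env; note that `sample_ddpg_params` reads `trial.n_actions`, so that attribute has to be attached to the trial first (the objective body and env name here are placeholders, not part of the original code):

import gym
import optuna


def objective(trial):
    env = gym.make('Pendulum-v0')                # hypothetical continuous-action env
    trial.n_actions = env.action_space.shape[0]  # attribute read by sample_ddpg_params
    hyperparams = sample_ddpg_params(trial)
    # ... build a DDPG model from `hyperparams`, train briefly, evaluate ...
    mean_reward = 0.0                            # placeholder for a real evaluation
    return -mean_reward                          # Optuna minimizes by default


study = optuna.create_study()
study.optimize(objective, n_trials=10)
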
def DDPGgive_results(files, balance, shares=None):
    env = create_stock_env(files, train=False, balance=balance, shares=shares)
    max_steps = env.max_steps - env.num_prev
    env = DummyVecEnv([lambda: env])
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=2.0 * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=1,
                                         desired_action_stddev=0.1,
                                         adoption_coefficient=1.01)
    model = DDPG(CustomDDPGPolicy,
                 env,
                 verbose=0,
                 param_noise=param_noise,
                 action_noise=action_noise)

    # model = DDPG.load("/home/harshit/Documents/itsp-trade agent/Reinforcement-Learning-Stock-Trader/WebPortal/StockApp/Stock_stable.zip",env=env)
    model.learn(total_timesteps=100)
    profit = 0
    profitst = np.zeros((max_steps - 1, 2))
    actionst = np.zeros((n_actions // 2, max_steps - 1, 2))
    shares = np.zeros((len(files), max_steps - 1, 2))
    obs = env.reset()
    for i in range(max_steps - 1):  # the tracking arrays above hold max_steps - 1 entries
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        act = info[0]['action'][0]
        actionst[:, i, 1] = act[n_actions // 2:] - act[:n_actions // 2]
        actionst[:, i, 0] = i
        shares[:, i, 1] = info[0]['shares_held']
        shares[:, i, 0] = i
        profit += rewards[0]  # VecEnv returns arrays; take the single environment's values
        profitst[i] = [i, profit]
        if dones[0]:
            break
    print(info[0]['action'][0])
    print(actionst)
    return profitst.tolist(), shares.tolist(), actionst.tolist()
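
A hypothetical call, just to show how the three returned series line up; the file names and balance are placeholders, and `create_stock_env`/`CustomDDPGPolicy` are project-specific dependencies not shown on this page:

# Hypothetical usage: returns (profit curve, shares held per file, net buy/sell actions).
profits, shares_held, net_actions = DDPGgive_results(['AAPL.csv', 'GOOG.csv'], balance=10000)
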
Example no. 3
eval_env = Gait2DGenAct(integrator_accuracy=3e-2)
#env = Arm2DVecEnv(visualize=True)
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=1000,
                                                 verbose=1)
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=callback_on_best,
                             verbose=1)

n_actions = env.action_space.shape[-1]

param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.1 * np.ones(n_actions),
                                            theta=0.05)
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.287)


class CustomTD3Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__(*args,
                                              **kwargs,
                                              layers=[400, 400],
                                              layer_norm=True,
                                              feature_extraction="mlp")


model = TD3(CustomTD3Policy,
            env,
            verbose=1,
            action_noise=action_noise,
Example no. 4
    if 'frame_stack' in hyperparams:
        del hyperparams['frame_stack']

    # Stop env processes to free memory
    if args.optimize_hyperparameters and n_envs > 1:
        env.close()

    # Parse noise string for DDPG and SAC
    if algo_ in ['ddpg', 'sac', 'td3'] and hyperparams.get('noise_type') is not None:
        noise_type = hyperparams['noise_type'].strip()
        noise_std = hyperparams['noise_std']
        n_actions = env.action_space.shape[0]
        if 'adaptive-param' in noise_type:
            assert algo_ == 'ddpg', 'Parameter noise is only supported by DDPG'
            hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
                initial_stddev=noise_std, desired_action_stddev=noise_std)
        elif 'normal' in noise_type:
            if 'lin' in noise_type:
                hyperparams['action_noise'] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=hyperparams.get('noise_std_final', 0.0) *
                    np.ones(n_actions),
                    max_steps=n_timesteps)
            else:
                hyperparams['action_noise'] = NormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
        elif 'ornstein-uhlenbeck' in noise_type:
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
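
For reference, the noise-related entries that this parsing expects to find in `hyperparams` would look roughly like the dictionary below; the concrete values are illustrative, not taken from any real config:

# Illustrative only (hypothetical values), matching the keys handled above.
example_noise_hyperparams = {
    'noise_type': 'ornstein-uhlenbeck',  # or 'normal', 'lin-normal', 'adaptive-param'
    'noise_std': 0.5,                    # standard deviation of the exploration noise
    'noise_std_final': 0.0,              # only read by the linear-schedule variant
}
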
Example no. 5

                    if self.verbose > 0:
                        print("Saving new best model to {}".format(
                            self.save_path))
                    self.model.save(self.save_path)

        return True


# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment

env = gym.make('SatelliteEnvironment-v0')
env = Monitor(env, log_dir)

# Add some param noise for exploration
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                     desired_action_stddev=0.1)
# Because we use parameter noise, we should use a MlpPolicy with layer normalization
model = DDPG(LnMlpPolicy, env, param_noise=param_noise, verbose=0)
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
# Train the agent
time_steps = 1e5
model.learn(total_timesteps=int(time_steps), callback=callback)

results_plotter.plot_results([log_dir], time_steps,
                             results_plotter.X_TIMESTEPS, "DDPG Satellite")
plt.show()
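
Only the tail of the reward-tracking callback appears above. A minimal sketch of such a callback, written against the standard stable_baselines BaseCallback API, is given below; the original implementation is not shown on this page, so the details here are an assumption:

import os

import numpy as np

from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Sketch: save the model whenever the rolling mean training reward improves."""

    def __init__(self, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # Episode rewards logged by the Monitor wrapper
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    if self.verbose > 0:
                        print("Saving new best model to {}".format(self.save_path))
                    self.model.save(self.save_path)
        return True
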
Example no. 6

            shutil.copyfile(original_adr, target_adr)

        else:
            save_path = 'logs/'
            env = Monitor(env, 'logs/',
                          info_keywords=('reserved', ))  # logging monitor
        model_dir = save_path + '{}_final_model'.format(
            cfg.POLICY.NAME)  # model save/load directory

        if cfg.POLICY.NAME == 'DDPG':
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(cfg.POLICY.ACTION_NOISE) * np.ones(n_actions))

            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(cfg.POLICY.PARAM_NOISE_STD),
                desired_action_stddev=float(cfg.POLICY.PARAM_NOISE_STD))
            model = DDPG(policy[cfg.POLICY.NET],
                         env,
                         verbose=1,
                         param_noise=param_noise,
                         action_noise=action_noise,
                         policy_kwargs={
                             'cnn_extractor': eval(cfg.POLICY.CNN_EXTRACTOR)
                         })
        elif cfg.POLICY.NAME == 'PPO2':
            model = PPO2(policy[cfg.POLICY.NET],
                         env,
                         verbose=1,
                         model_dir=save_path,
                         policy_kwargs={
Example no. 7
    def _preprocess_hyperparams(self, _hyperparams):
        # Convert to python object if needed
        if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
            _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

        n_timesteps = _hyperparams.pop("n_timesteps", None)
        n_envs = _hyperparams.pop("n_envs", None)
        log_every = _hyperparams.pop("log_every", None)
        if not self.continue_learning:
            if not log_every:
                self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
                log_every = self.log_every
            else:
                self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
        else:
            self.logger.debug("priority to command line log_every {}".format(self.log_every))
            log_every = self.log_every

        # Parse noise string
        if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
            noise_type = _hyperparams["noise_type"].strip()
            noise_std = _hyperparams["noise_std"]
            n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
            self.logger.debug("n_actions: {}".format(n_actions))
            if "adaptive-param" in noise_type:
                assert self.algo_name == "ddpg", "Parameter noise is only supported by DDPG"
                _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(initial_stddev=noise_std, desired_action_stddev=noise_std)
            elif "normal" in noise_type:
                if "lin" in noise_type:
                    _hyperparams["action_noise"] = LinearNormalActionNoise(
                        mean=np.zeros(n_actions),
                        sigma=noise_std * np.ones(n_actions),
                        final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                        max_steps=n_timesteps,
                    )
                else:
                    _hyperparams["action_noise"] = NormalActionNoise(
                        mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                    )
            elif "ornstein-uhlenbeck" in noise_type:
                _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
            else:
                raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
            self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
            del _hyperparams["noise_type"]
            del _hyperparams["noise_std"]
            if "noise_std_final" in _hyperparams:
                del _hyperparams["noise_std_final"]

        normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

        if n_envs is None:
            self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
            n_envs = self.num_envs
        else:
            self.logger.debug("using n_envs as num of envs defined in yml file:".format(n_envs))

        if not self.continue_learning:
            # priority to yml defined n_timesteps
            if n_timesteps is None:
                self.logger.debug(
                    "n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps)
                )
                n_timesteps = self.train_total_timesteps
            else:
                self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
                n_timesteps = int(n_timesteps)
        else:
            if self.train_total_timesteps and self.train_total_timesteps != -1:
                assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                    self.train_total_timesteps, n_timesteps
                )
                # priority to command line n_timesteps
                self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
                n_timesteps = self.train_total_timesteps
            elif self.train_total_timesteps == -1:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps)
                self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
            else:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps // 2)
                self.logger.debug(
                    "train_total_timesteps not specified in continue_learning: "
                    "taking half of original n_timesteps defined in yml file {}".format(n_timesteps)
                )

        assert n_timesteps % log_every == 0, "n_timesteps must be divisible by log_every: {}, {}".format(
            n_timesteps, log_every
        )
        return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams
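
A hypothetical call site, only to illustrate the order of the returned tuple; `trainer` stands in for an instance of this class and `loaded` for a dict parsed from a hyperparameter YAML file (both are placeholders):

# Hypothetical usage of the method above.
loaded = {"n_timesteps": 100000, "n_envs": 4, "log_every": 10000}
normalize_kwargs, n_envs, n_timesteps, log_every, hyperparams = trainer._preprocess_hyperparams(loaded)
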
Example no. 8
    def learn(self):
        # Use deterministic actions for evaluation
        eval_path = self.model_dir + "/best_model"
        # TODO save checkpoints with vecnormalize callback pkl file
        save_vec_normalize = SaveVecNormalizeCallback(save_freq=1, save_path=eval_path)
        if self.norm:
            # Don't normalize the reward for test env
            self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=False,
                                        clip_obs=10.)
        eval_callback = EvalCallback(self.test_env, best_model_save_path=eval_path,
                                    log_path=eval_path+'/logs', eval_freq=50000,
                                    n_eval_episodes=10, callback_on_new_best=save_vec_normalize,
                                    deterministic=True, render=False)
        checkpoint_callback = CheckpointCallback(save_freq=25000, save_path=self.model_dir+'/logs/',
                                         name_prefix='rl_model')
        time_callback = TrainingTimeCallback()
        tensorboard_file = None if self.config[self.algo]['tensorboard_logs'] is None else "tensorboard_logs/"+self.model_dir
        if self.algo == 'SAC':
            if not self.env.envs[0].is_simplified() and (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
                policy_kwargs = {
                    "layers": self.config[self.algo]['layers'],
                    "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
                policy = sacCnn
            elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
                policy_kwargs = {}
                policy = sacCnn
            else:
                policy_kwargs = {"layers": self.config[self.algo]['layers'], "layer_norm": False}
                policy = sacMlp
            if self.load_dir:
                top_folder_idx = self.load_dir.rfind('/')
                top_folder_str = self.load_dir[0:top_folder_idx]
                if self.norm:
                    self.env = VecNormalize(self.env, training=True, norm_obs=False, norm_reward=False,
                                            clip_obs=10.)
                    self.env = VecNormalize.load(os.path.join(top_folder_str, 'vecnormalize.pkl'), self.env)
                model = sb.SAC(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=1,
                            gamma=self.config['discount_factor'],
                            buffer_size=self.config[self.algo]['buffer_size'],
                            batch_size=self.config[self.algo]['batch_size'],
                            learning_rate=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
                model_load = sb.SAC.load(self.load_dir, self.env)
                params = model_load.get_parameters()
                model.load_parameters(params, exact_match=False)
            else:
                if self.norm:
                    self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True,
                                            clip_obs=10.)
                model = sb.SAC(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            buffer_size=self.config[self.algo]['buffer_size'],
                            batch_size=self.config[self.algo]['batch_size'],
                            learning_rate=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'TRPO':
            model = sb.TRPO(MlpPolicy, 
                            self.env, 
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            timesteps_per_batch=self.config[self.algo]['max_iters'],
                            vf_stepsize=self.config[self.algo]['step_size'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'PPO':
            if not self.env.envs[0].is_simplified() and (self.env.envs[0].depth_obs or self.env.envs[0].full_obs):
                policy_kwargs = {
                    "layers": self.config[self.algo]['layers'],
                    "cnn_extractor": custom_obs_policy.create_augmented_nature_cnn(1)}
                policy = CnnPolicy
            elif self.env.envs[0].depth_obs or self.env.envs[0].full_obs:
                policy_kwargs = {}
                policy = CnnPolicy
            else:
                policy_kwargs = {"layers": self.config[self.algo]['layers'], "layer_norm": False}
                policy = MlpPolicy
            model = sb.PPO2(policy,
                            self.env,
                            policy_kwargs=policy_kwargs,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            learning_rate=self.config[self.algo]['learning_rate'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == 'DQN':
            if self.load_dir:
                model = self.load_params()
            else:
                model = sb.DQN(DQNMlpPolicy, 
                            self.env, 
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            batch_size=self.config[self.algo]['batch_size'],
                            prioritized_replay=self.config[self.algo]['prioritized_replay'],
                            tensorboard_log=tensorboard_file)
        elif self.algo == "DDPG":
            param_noise = AdaptiveParamNoiseSpec()
            model = sb.DDPG(ddpgMlp,
                            self.env,
                            verbose=2,
                            gamma=self.config['discount_factor'],
                            param_noise=param_noise,
                            tensorboard_log=tensorboard_file)
        try:
            model.learn(total_timesteps=int(self.config[self.algo]['total_timesteps']), 
                        callback=[TensorboardCallback(self.env, tensorboard_file, self.algo, self.log_freq, self.model_dir), 
                                   eval_callback])
        except KeyboardInterrupt:
            pass

        self.save(model, self.model_dir)
Example no. 9
            repo = git.Repo(search_parent_directories=False)
            commit_id = repo.head.object.hexsha
            with open('logs/agent_{}/reproduction_info.txt'.format(args.agent_id), 'w') as f:
                f.write('Git commit id: {}\n\n'.format(commit_id))
                f.write('Program arguments:\n\n{}'.format(args))
        else:
            save_path = '../logs/'
            env = Monitor(env, '../logs/')                                   # logging monitor
        model_dir = save_path + '{}_final_model'.format(args.alg)                                       # model save/load directory

        if args.alg == 'ddpg':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.action_noise * np.ones(n_actions))

            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                                 desired_action_stddev=float(args.param_noise_stddev))
            model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                         render=args.play)
        elif args.alg == 'ppo2':
            model = PPO2(CommonMlpPolicy, env, verbose=1)
        elif args.alg == 'trpo':
            model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
        elif args.alg == 'a2c':
            model = A2C(CommonMlpPolicy, env, verbose=1)
        else:
            print(args.alg)
            raise Exception('Algorithm name is not defined!')

        print('Model is Created')
        try:
            print('Training Started')