Example #1
def test_sync_vec_normalize():
    env = DummyVecEnv([make_env])

    assert unwrap_vec_normalize(env) is None

    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    env = VecFrameStack(env, 1)

    assert isinstance(unwrap_vec_normalize(env), VecNormalize)

    eval_env = DummyVecEnv([make_env])
    eval_env = VecNormalize(eval_env, training=False, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
    eval_env = VecFrameStack(eval_env, 1)

    env.reset()
    # Initialize running mean
    for _ in range(100):
        env.step([env.action_space.sample()])

    obs = env.reset()
    original_obs = env.get_original_obs()
    # Normalization must be different
    assert not np.allclose(obs, eval_env.normalize_obs(original_obs))

    sync_envs_normalization(env, eval_env)

    # Now they must be synced
    assert np.allclose(obs, eval_env.normalize_obs(original_obs))
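For reference, a minimal sketch of persisting the normalization statistics after training and restoring them for evaluation, assuming a stable-baselines version that provides VecNormalize.save/.load (the same API Example #18 relies on; older code uses save_running_average/load_running_average instead):

import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

venv = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
venv = VecNormalize(venv, norm_obs=True, norm_reward=True, clip_obs=10.)
# ... train a model on `venv` ...
venv.save("vecnormalize.pkl")        # persist running mean/std

eval_venv = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
eval_venv = VecNormalize.load("vecnormalize.pkl", eval_venv)  # restore statistics
eval_venv.training = False           # do not update statistics at evaluation time
eval_venv.norm_reward = False        # report unnormalized rewards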
Example #2
    def optimize_agent(self, trial):
        #self.env_params = self.optimize_envs(trial)
        env_id = "default"
        num_e = 1  # Number of processes to use
        self.train_env = DummyVecEnv([lambda: Template_Gym(eval=False)])
        #self.train_env = SubprocVecEnv([self.make_env(env_id, i, eval=False) for i in range(num_e)])
        self.train_env = VecNormalize(self.train_env,
                                      norm_obs=True,
                                      norm_reward=True)
        self.test_env = DummyVecEnv([lambda: Template_Gym(eval=True)])
        #self.test_env = SubprocVecEnv([self.make_env(env_id, i, eval=True) for i in range(num_e)])
        self.test_env = VecNormalize(self.test_env,
                                     norm_obs=True,
                                     norm_reward=True)

        self.model_params = self.optimize_ppo2(trial)
        self.model = PPO2(CustomPolicy_2,
                          self.train_env,
                          verbose=0,
                          nminibatches=1,
                          tensorboard_log=Path("./tensorboard2").name,
                          **self.model_params)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4, nminibatches=1, tensorboard_log="./min1" )

        last_reward = -np.finfo(np.float16).max
        #evaluation_interval = int(len(train_df) / self.n_evaluations)
        evaluation_interval = 3000

        for eval_idx in range(self.n_evaluations):
            try:
                self.model.learn(evaluation_interval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            obs = self.test_env.reset()
            while n_episodes < self.n_test_episodes:
                action, _ = self.model.predict(obs)
                obs, reward, done, _ = self.test_env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self.test_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
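Note that self.test_env above keeps running statistics that are independent of self.train_env, so evaluation observations are normalized differently from training. A sketch (hypothetical placement inside optimize_agent, before the evaluation loop) of syncing the two wrappers with the sync_envs_normalization helper shown in Example #1:

# Hypothetical addition before the evaluation loop in optimize_agent:
sync_envs_normalization(self.train_env, self.test_env)  # copy obs/ret statistics train -> test
self.test_env.training = False       # freeze statistics while evaluating
self.test_env.norm_reward = False    # report raw rewards to Optuna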
Example #3
def _add_normalization_wrapper(env, n_envs, normalize):
    if isinstance(normalize, bool):
        env = VecNormalize(env)
    elif isinstance(normalize, dict):
        if 'trained_agent' in normalize:
            path = normalize.pop('trained_agent')
            env = VecNormalize.load(path, env)
            env.training = normalize.pop('training', True)
        elif normalize.pop('precompute', False):
            samples = normalize.pop('samples', 10000)
            env = _precompute_normalization(env, n_envs, samples, normalize)
        else:
            env = VecNormalize(env, **normalize)
    return env
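The _precompute_normalization helper referenced above is not shown here. A hypothetical sketch of what it might do, modeled on the random-action warm-start pattern of Example #23, assuming the same VecNormalize API:

def _precompute_normalization(env, n_envs, samples, normalize_kwargs):
    # Hypothetical sketch: seed the running obs/reward statistics with random
    # actions, then freeze them so later training does not shift the scale.
    env = VecNormalize(env, **normalize_kwargs)
    env.reset()
    for _ in range(samples // n_envs):
        actions = [env.action_space.sample() for _ in range(n_envs)]
        env.step(actions)
    env.training = False
    return env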
Example #4
def make_alrs_env(args, test=False, baseline=False):
    """
    Make a new ALRS environment with parameters specified as command line arguments.
    """
    from environment import AdaptiveLearningRateOptimizer

    env = make_vec_env(
        env_id=AdaptiveLearningRateOptimizer,
        n_envs=1 if test else args.num_envs,
        env_kwargs={
            'dataset': args.dataset,
            'architecture': args.architecture,
            'batch_size': args.batch_size,
            'update_freq': args.update_freq,
            'num_train_steps': args.num_train_steps,
            'initial_lr': args.initial_lr,
            'discrete': args.discrete,
            'action_range': np.inf if baseline else args.action_range,
            'lr_noise': not (test or baseline)
        }
    )
    env = VecNormalize(
        venv=env,
        norm_obs=args.ppo2_norm_obs,
        norm_reward=args.ppo2_norm_reward,
        clip_obs=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        clip_reward=args.ppo2_cliprange if args.ppo2_cliprange > 0 else 10,
        gamma=args.ppo2_gamma
    )
    env.alrs = env.venv.envs[0].env

    return env
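The line env.alrs = env.venv.envs[0].env reaches through the wrapper layers by hand. For reference, a sketch of the wrapper-agnostic route via the VecEnv get_attr API (the attribute name is hypothetical):

# Wrapper-agnostic alternative to env.venv.envs[0].env attribute access
# ('current_lr' is a hypothetical attribute of the underlying env):
values = env.get_attr('current_lr')   # one value per sub-environment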
Example #5
    def train(self,
              num_e=1,
              n_timesteps=100000000,
              save_fraction=0.1,
              save='saves/min1'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])
        #Ramona
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.model = PPO2(CustomPolicy_2,
                          self.env,
                          verbose=0,
                          learning_rate=1e-4,
                          nminibatches=1,
                          tensorboard_log="./min1")

        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
        n_timesteps = n_timesteps * save_fraction
        n_timesteps = int(n_timesteps)
        training_loop = 1 / save_fraction
        training_loop = int(training_loop)
        log_dir = "saves"
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(save + str(i))
            self.env.save_running_average(log_dir)
        self.env.save_running_average(log_dir)
Example #6
    def __init__(self, **params):
        super().__init__(**params)
        self.Model = PPO2
        self.solver_signature = "gym_" + ParameterManager.get_param_footprint(self.get_footprint_params())

        # parameters from our config, not the original one
        self.days = self.params['dataset']["days"]
        env_id = "TaxiEnv-v01"
        self.env_params = self.load_env_params()

        seed = np.random.randint(1,10000)
        self.log['seed'] = seed

        if self.params.get("lstm", 0) == 1:
            Policy = MlpLstmPolicy
            nminibatches = 1
            num_cpu = 1 # One current limitation of recurrent policies is that you must test them with the same number of environments they have been trained on.
        else:
            Policy = MlpPolicy
            nminibatches = 4
            num_cpu = self.params['num_cpu']
        # Create the vectorized environment
        self.train_env = SubprocVecEnv([self.make_env(env_id, i, seed+i, self.env_params) for i in range(num_cpu)])

        self.train_env = VecNormalize(self.train_env, norm_obs=False, norm_reward=False)

        # self.model = self.Model(Policy, self.train_env, verbose=0, nminibatches=nminibatches, tensorboard_log=os.path.join(self.dpath,self.solver_signature))
                                # minibatches are important, and no parallelism
                                #n_steps=self.params['dataset']['time_periods']+1,
        self.model = self.Model(Policy, self.train_env, verbose=0, nminibatches=4, tensorboard_log=os.path.join(self.dpath,self.solver_signature), n_steps=self.params['dataset']['time_periods']+1)
Example #7
def create_env(n_envs, eval_env=False, no_log=False):

    global hyperparams, env_kwargs
    log_dir = None if eval_env or no_log else save_path

    if n_envs == 1:
        env = DummyVecEnv([
            make_env(env_id,
                     0,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs)
        ])
    else:
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     wrapper_class=env_wrapper,
                     log_dir=log_dir,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])
        if normalize:
            local_normalize_kwargs = {'norm_reward': False}
            env = VecNormalize(env, **local_normalize_kwargs)

    return env
Example #8
def createEnvs(args,
               allow_early_resets=False,
               env_kwargs=None,
               load_path_normalise=None):
    """
    :param args: (argparse.Namespace Object)
    :param allow_early_resets: (bool) Allow reset before the environment is done, usually used in ES to halt the envs
    :param env_kwargs: (dict) The extra arguments for the environment
    :param load_path_normalise: (str) the path to loading the rolling average, None if not available or wanted.
    :return: (Gym VecEnv)
    """
    # imported here to prevent cyclic imports

    envs = [
        makeEnv(args.env,
                args.seed,
                i,
                args.log_dir,
                allow_early_resets=allow_early_resets,
                env_kwargs=env_kwargs) for i in range(args.num_cpu)
    ]

    if len(envs) == 1:
        # No need for subprocesses when having only one env
        envs = DummyVecEnv(envs)
    else:
        envs = SubprocVecEnv(envs)

    envs = VecFrameStack(envs, args.num_stack)

    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
    # envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)

    return envs
Example #9
def main(log_dir, easy, n_steps=450):
    exp_dir, seed_offset = get_exp_dir(
        os.path.join(log_dir, "reacher-obstacle-default" + ("-easy" if easy else "") + "-ppo"))
    print("Seed offset: " + str(seed_offset))

    log_path = os.path.join(exp_dir, "ppo-reach-avoid.log")
    avg_log_path = exp_dir
    if not os.path.exists(log_path):
        n_envs = 8
        env = VecNormalize(
            SubprocVecEnv([create_env_fn(seed_offset * n_envs + i, easy=easy) for i in range(0, n_envs)]),
            gamma=0.999)
        model = PPO2(policy='MlpPolicy', env=env, n_steps=n_steps, nminibatches=5, verbose=1, gamma=0.999,
                     noptepochs=15, ent_coef=1e-3, lam=1, policy_kwargs=dict(layers=[164, 164]))

        average_rewards = []

        def log_callback(local_vars, global_vars):
            avg_r = np.mean([ep_info['r'] for ep_info in local_vars["ep_info_buf"]])
            average_rewards.append(avg_r)
            return True

        # 3067500 = 409 iterations (400 + 9 for buffer initialization) * 50 trajectories * 150 timesteps
        model.learn(3067500, seed=seed_offset, callback=log_callback)
        model.save(log_path)
        env.save_running_average(avg_log_path)
        np.save(os.path.join(exp_dir, "rewards.npy"), np.array(average_rewards))
Example #10
def create_env(n_envs, env_name=None, log_dir=None):
    return VecNormalize(make_vec_env(ENVS[env_name][env_id],
                                     n_envs=n_envs,
                                     env_kwargs=ENVS[env_name][env_kwargs],
                                     monitor_dir=log_dir),
                        norm_obs=False,
                        norm_reward=True)
Example #11
    def train(self,
              num_e=1,
              n_timesteps=1000000,
              save_fraction=0.0125,
              save='saves/audbuyh4120',
              config=config):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])
        #Ramona
        self.config = config
        self.env = SubprocVecEnv([
            self.make_env(env_id, i, eval=False, config=self.config)
            for i in range(num_e)
        ])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        self.env = VecNormalize(self.env, norm_obs=False, norm_reward=True)
        self.model = PPO2(CnnPolicy, self.env, verbose=0)
        #self.model = PPO2("MlpPolicy", self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5  )
        #self.model = PPO2(CustomPolicy_4, self.env, verbose=0, nminibatches=1, tensorboard_log="./gbp_chf_4h_r", **self.config.params )
        #self.model = PPO2(CustomPolicy_5, self.env, verbose=0, nminibatches=1, tensorboard_log="./aud_chf", learning_rate=1e-5  )#**self.config.params
        #self.model = PPO2.load('saves/playerdetails39', self.env, policy=CustomPolicy,  tensorboard_log="./playerdetailsex" )
        #self.model = PPO2.load(self.config.path+str(79)+'.pkl', self.env, policy=CustomPolicy_5,  tensorboard_log="./default/" )
        #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
        n_timesteps = n_timesteps * save_fraction
        n_timesteps = int(n_timesteps)
        training_loop = 1 / save_fraction
        training_loop = int(training_loop)
        log_dir = "saves"
        #self.env.load_running_average(log_dir)
        for i in range(training_loop):
            self.model.learn(n_timesteps)
            self.model.save(self.config.save + str(i))
Example #12
    def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])

        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(policy=CnnPolicy,
        #env=SubprocVecEnv(self.env_fns),
        #n_steps=8192,
        #nminibatches=8,
        #lam=0.95,
        #gamma=0.99,
        #noptepochs=4,
        #ent_coef=0.001,
        #learning_rate=lambda _: 2e-5,
        #cliprange=lambda _: 0.2,
        #verbose=1,
        #tensorboard_log="./breakorbust")
        self.model = PPO2(CustomPolicy,
                          env=self.env,
                          verbose=0,
                          learning_rate=1e-5,
                          tensorboard_log=save)
        for i in range(10):
            self.model.learn(n_timesteps)
            self.model.save(save)
Example #13
    def evaluate(self, num_env=1, num_steps=1461, load='saves/audbuyh1', runs=80, config=pc.configgbpchf4h):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = config.year+config.pair
        num_e = 1
        self.config = config
        log_dir = self.config.log
        
        #log_dir = self.config.norm
        #self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True) for i in range(num_env)])
        self.env = SubprocVecEnv([self.make_env(env_id, i, eval=True, config=self.config) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default" )
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        try:
            self.env.load_running_average(log_dir)
        except Exception:
            print("could not load the running average")
        for i in range(runs):
            #self.model = PPO2(CustomPolicy, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./moose14" )
            #self.model = PPO2.load(self.config.path, self.env, policy=CustomPolicy_2,  tensorboard_log="./default/" )
            self.model = PPO2.load(self.config.path+'8'+str(i)+'.pkl', self.env, policy=CustomPolicy_5,  tensorboard_log="./default/" )
            #self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for i in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs, state=state, mask=done, deterministic=True)
                obs, rewards , dones, _ = self.env.step(action)
                #actions, _states = self.model.predict(obs)
                # # here, action, rewards and dones are arrays
                # # because we are using vectorized env
                #obs, rewards, dones, info = self.env.step(actions)
                #self.total_pips.append(self.env.player.placement)

                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)
            #self.env.save_running_average(log_dir)
            mean_rewards =  [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])     
                n_episodes += len(episode_rewards[i])   

        # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)
            #self.env.save(log_dir)

        return mean_reward
Example #14
    def create_env(n_envs, eval_env=False):
        """
        Create the environment and wrap it if necessary
        :param n_envs: (int)
        :param eval_env: (bool) Whether is it an environment used for evaluation or not
        :return: (Union[gym.Env, VecEnv])
        """
        global hyperparams

        # Do not log eval env (issue with writing the same file)
        log_dir = None if eval_env else save_path

        if is_atari:
            if args.verbose > 0:
                print("Using Atari wrapper")
            env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
            # Frame-stacking with 4 frames
            env = VecFrameStack(env, n_stack=4)
        elif algo_ in ['dqn', 'ddpg']:
            if hyperparams.get('normalize', False):
                print("WARNING: normalization not supported yet for DDPG/DQN")
            env = gym.make(env_id)
            env.seed(args.seed)
            if env_wrapper is not None:
                env = env_wrapper(env)
        else:
            if n_envs == 1:
                env = DummyVecEnv([
                    make_env(env_id,
                             0,
                             args.seed,
                             wrapper_class=env_wrapper,
                             log_dir=log_dir)
                ])
            else:
                # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                # On most env, SubprocVecEnv does not help and is quite memory hungry
                env = DummyVecEnv([
                    make_env(env_id,
                             i,
                             args.seed,
                             log_dir=log_dir,
                             wrapper_class=env_wrapper) for i in range(n_envs)
                ])
            if normalize:
                if args.verbose > 0:
                    if len(normalize_kwargs) > 0:
                        print("Normalization activated: {}".format(
                            normalize_kwargs))
                    else:
                        print("Normalizing input and reward")
                env = VecNormalize(env, **normalize_kwargs)
        # Optional Frame-stacking
        if hyperparams.get('frame_stack', False):
            n_stack = hyperparams['frame_stack']
            env = VecFrameStack(env, n_stack)
            print("Stacking {} frames".format(n_stack))
            del hyperparams['frame_stack']
        return env
Example #15
def main(cfg, run_dir):
    run_name = make_run_name(cfg)
    output_dir = run_dir / run_name
    output_dir.mkdir(parents=True)

    with (output_dir / 'config.json').open('w') as fp:
        json.dump(cfg, fp, indent=2)

    # Setting log levels to cut out minor errors
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    tf.logging.set_verbosity(tf.logging.ERROR)

    log_dir = output_dir / cfg['log_dir']
    tensorboard_dir = output_dir / cfg['tb_dir']

    configure(log_dir=str(log_dir),
              format_strs=['log', 'csv', 'tensorboard'],
              tensorboard_dir=str(tensorboard_dir))

    # Create and wrap the environment
    logging.info('Starting {env_name}'.format(**cfg))
    env = make_atari_env(env_id=cfg['env_name'],
                         num_env=8,
                         seed=cfg['train_seed'])
    env = VecFrameStack(env, n_stack=4)
    if cfg['normalize']:
        env = VecNormalize(env)

    # Setting all known random seeds (Python, Numpy, TF, Gym if available)
    set_global_seeds(cfg['train_seed'])

    logging.info('Running {algo}'.format(**cfg))

    algo = get_algo(cfg['algo'])
    policy = cfg['policy_type']
    feature_extractor = get_network_builder(cfg['network'])
    attn_loss = get_loss(cfg['attn_loss'])()
    model = algo(
        policy=policy,
        env=env,
        verbose=1,
        learning_rate=lambda frac: 0.00025 * frac,
        attn_loss=attn_loss,
        attn_coef=cfg['attn_coef'],
        policy_kwargs={
            'cnn_extractor': feature_extractor,
        },
        tensorboard_log=str(tensorboard_dir),
    )

    logging.info('Training for {time_steps} steps'.format(**cfg))

    # Training
    model.learn(
        total_timesteps=cfg['time_steps'],
        log_interval=cfg['log_interval'],
        tb_log_name=None,
        callback=Callback(output_dir),
    )
Example #16
def test_vec_normalize():
    env = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
    normalized_vec_env = VecNormalize(env)
    obs = normalized_vec_env.reset()
    for _ in range(10):
        action = [normalized_vec_env.action_space.sample()]
        obs, reward, _, _ = normalized_vec_env.step(action)
        print(obs, reward)
Example #17
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS,
                          continuous=model_class in [DDPG, SAC],
                          max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
Example #18
def create_test_env(env_id,
                    n_envs=1,
                    is_atari=False,
                    stats_path=None,
                    seed=0,
                    log_dir='',
                    should_render=True,
                    hyperparams=None,
                    env_kwargs=None):

    if hyperparams is None:
        hyperparams = {}

    if env_kwargs is None:
        env_kwargs = {}

    # Create the environment and wrap it if necessary
    if is_atari:
        print("Using Atari wrapper")
        env = make_atari_env(env_id, num_env=n_envs, seed=seed)
        # Frame-stacking with 4 frames
        env = VecFrameStack(env, n_stack=4)
    else:
        # start_method = 'spawn' for thread safety
        env = DummyVecEnv([
            make_env(env_id,
                     i,
                     seed,
                     log_dir,
                     wrapper_class=None,
                     env_kwargs=env_kwargs) for i in range(n_envs)
        ])

    # Load saved stats for normalizing input and rewards
    # And optionally stack frames
    if stats_path is not None:
        if hyperparams['normalize']:
            print("Loading running average")
            print("with params: {}".format(hyperparams['normalize_kwargs']))
            env = VecNormalize(env,
                               training=False,
                               **hyperparams['normalize_kwargs'])

            if os.path.exists(os.path.join(stats_path, 'vecnormalize.pkl')):
                env = VecNormalize.load(
                    os.path.join(stats_path, 'vecnormalize.pkl'), env)
                # Deactivate training and reward normalization
                env.training = False
                env.norm_reward = False
            else:
                # Legacy:
                env.load_running_average(stats_path)

        n_stack = hyperparams.get('frame_stack', 0)
        if n_stack > 0:
            print("Stacking {} frames".format(n_stack))
            env = VecFrameStack(env, n_stack)
    return env
Example #19
def vec_env(env_name, num_envs=4, seed=33, norm_rew=True, load_path=None):
    '''creates environments, vectorizes them and sets different seeds
    :param norm_rew: reward should only be normalized during training
    :param load_path: if set, the VecNormalize environment will
                      load the running means from this path.
    :returns: VecNormalize (wrapped Subproc- or Dummy-VecEnv) '''

    from gym_mimic_envs.mimic_env import MimicEnv
    from gym_mimic_envs.monitor import Monitor as EnvMonitor

    def make_env_func(env_name, seed, rank):
        def make_env():
            env = gym.make(env_name)
            env.seed(seed + rank * 100)
            if isinstance(env, MimicEnv):
                # wrap a MimicEnv in the EnvMonitor
                # has to be done before converting into a VecEnv!
                env = EnvMonitor(env)
            return env

        return make_env

    if num_envs == 1:
        vec_env = DummyVecEnv([make_env_func(env_name, seed, 0)])
    else:
        env_fncts = [
            make_env_func(env_name, seed, rank) for rank in range(num_envs)
        ]
        vec_env = SubprocVecEnv(env_fncts)

    # normalize environments
    # if a load_path was specified, load the running mean and std of obs and rets from this path
    if load_path is not None:
        vec_normed = VecNormalize.load(load_path, vec_env)
    # todo: think the whole else statement can be deleted.
    #  In case, we want to load obs_rms from an earlier run,
    #  we should be able to do it by just specifying a load_path...
    #  the same way as when we load a complete trained model.
    else:
        try:
            from scripts.common.config import is_mod, MOD_LOAD_OBS_RMS
            if not is_mod(MOD_LOAD_OBS_RMS): raise Exception
            # load the obs_rms from a previously trained model
            init_obs_rms_path = abs_project_path + \
                                'models/behav_clone/models/rms/env_999'
            vec_normed = VecNormalize.load(init_obs_rms_path, vec_env)
            log('Successfully loaded OBS_RMS from a previous model:', [
                f'file:\t {init_obs_rms_path}',
                f'mean:\t {vec_normed.obs_rms.mean}',
                f'var:\t {vec_normed.obs_rms.var}'
            ])
        except:
            log('NOT loading obs_rms from a previous run.')
            vec_normed = VecNormalize(vec_env,
                                      norm_obs=True,
                                      norm_reward=norm_rew)

    return vec_normed
Example #20
    def evaluate(self, num_env=1, num_steps=175200, load="saves/min", runs=2):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_env)])
        #self.model = PPO2(CustomPolicy, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./default" )
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        self.env.load_running_average(log_dir)
        for i in range(runs):
            self.model = PPO2.load(load + str(i),
                                   self.env,
                                   policy=CustomPolicy_2,
                                   tensorboard_log="./default/")
            self.env.load_running_average(log_dir)
            episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
            #self.total_pips = []
            obs = self.env.reset()
            state = None
            # When using VecEnv, done is a vector
            done = [False for _ in range(self.env.num_envs)]
            for i in range(num_steps):
                # _states are only useful when using LSTM policies
                action, state = self.model.predict(obs,
                                                   state=state,
                                                   mask=done,
                                                   deterministic=False)
                obs, rewards, dones, _ = self.env.step(action)
                #actions, _states = self.model.predict(obs)
                # # here, action, rewards and dones are arrays
                # # because we are using vectorized env
                #obs, rewards, dones, info = self.env.step(actions)
                #self.total_pips.append(self.env.player.placement)

                # Stats
                for i in range(self.env.num_envs):
                    episode_rewards[i][-1] += rewards[i]
                    if dones[i]:
                        episode_rewards[i].append(0.0)

            mean_rewards = [0.0 for _ in range(self.env.num_envs)]
            n_episodes = 0
            for i in range(self.env.num_envs):
                mean_rewards[i] = np.mean(episode_rewards[i])
                n_episodes += len(episode_rewards[i])

        # Compute mean reward
            mean_reward = np.mean(mean_rewards)
            print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
Example #21
def run():
    """
    The main function of the agent
    Parses argv and executes accordingly
    """
    visualize = sys.argv[1] == "v" if len(sys.argv) > 1 else False
    resume = sys.argv[1] == "r" if len(sys.argv) > 1 else False
    evaluate = visualize or (sys.argv[1] == "e"
                             if len(sys.argv) > 1 else False)
    loadpath = sys.argv[2] if resume or evaluate else ""
    print("Setting up env")
    env = SubprocVecEnv([make_env(ENV, i) for i in range(N_PROCS)],
                        start_method='spawn')

    eval_env = DummyVecEnv([make_env(ENV, i) for i in range(N_PROCS)])
    eval_env = VecNormalize(eval_env,
                            norm_obs=True,
                            norm_reward=False,
                            clip_obs=10.)

    print("Setting up model")

    if not (resume or evaluate):
        env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
        model = Model(env=env,
                      eval_env=eval_env,
                      env_name=ENV_NAME,
                      seed=SEED,
                      n_procs=N_PROCS,
                      num_steps=NUM_STEPS)
    else:
        model = Model.load(loadpath,
                           env,
                           eval_env=eval_env,
                           env_name=ENV_NAME,
                           seed=SEED,
                           n_procs=N_PROCS,
                           num_steps=NUM_STEPS)
        #model = Model(env=None, eval_env=eval_env, env_name="FieldEnv", seed=SEED, n_procs=N_PROCS, num_steps=NUM_STEPS)

    if not evaluate:
        model.trainAndSave()
    else:
        model.evaluate(visualize)
Example #22
def main():
    all_ports = []
    parser = argparse.ArgumentParser()
    parser.add_argument("algorithm",
                        help='Which algorithm are you using',
                        type=str)
    parser.add_argument("training_timesteps",
                        help="How many traning steps are there?",
                        type=int)
    parser.add_argument("testing_timesteps",
                        help="How many testing steps are there?",
                        type=int)
    parser.add_argument("training_iterations",
                        help="How many traning iterations are there?",
                        type=int)
    parser.add_argument("testing_iterations",
                        help="How many traning iterations are there?",
                        type=int)
    parser.add_argument("learning_rate",
                        help="What is the learning rate?",
                        type=float)
    parser.add_argument("batch_size", help="What is the batch size?", type=int)
    parser.add_argument("building_port",
                        help="What is the building_port?",
                        type=int)
    parser.add_argument("reward_port",
                        help="What is the reward_port?",
                        type=int)
    parser.add_argument("agent_port", help="What is the agent_port?", type=int)
    args = parser.parse_args()
    all_ports = [args.building_port, args.reward_port, args.agent_port]

    df11 = pd.DataFrame(all_ports)
    df11.to_csv('allports.csv', index=False)

    hostname = socket.gethostname()
    # Path
    path = os.path.join(sys.path[0], hostname)
    # os.mkdir(path)
    path_for_kill_file = os.path.join(sys.path[0], "kill.sh")

    env = gym.make('RCRS-v2')
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    # Automatically normalize the input features
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    run_model(args.algorithm,
              args.training_timesteps,
              args.testing_timesteps,
              args.training_iterations,
              args.testing_iterations,
              args.learning_rate,
              args.batch_size,
              env=env,
              hostname=hostname,
              path_for_kill_file=path_for_kill_file)
Example #23
def _make_warmstart_cartpole():
    """Warm-start VecNormalize by stepping through CartPole"""
    venv = DummyVecEnv([lambda: gym.make("CartPole-v1")])
    venv = VecNormalize(venv)
    venv.reset()
    venv.get_original_obs()

    for _ in range(100):
        actions = [venv.action_space.sample()]
        venv.step(actions)
    return venv
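A minimal usage sketch for the helper above (assumed test code): after the warm-up, the observations returned by the wrapper are normalized and clipped to the configured range.

venv = _make_warmstart_cartpole()
obs = venv.reset()
# Normalized observations are clipped to [-clip_obs, clip_obs] (default 10).
assert np.all(np.abs(obs) <= venv.clip_obs)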
Example #24
def load_train_env(num_envs, robot_radius, rew_fnc, num_stacks, stack_offset,
                   debug, task_mode, policy, disc_action_space, normalize):
    # Choosing environment wrapper according to the policy
    if policy == "CnnPolicy" or policy == "CnnLnLstmPolicy" or policy == "CnnLstmPolicy":
        if disc_action_space:
            env_temp = RosEnvDiscImg
        else:
            env_temp = RosEnvContImg
    elif policy == "CNN1DPolicy":
        if disc_action_space:
            env_temp = RosEnvDiscRawScanPrepWp
        else:
            env_temp = RosEnvContRawScanPrepWp
    elif policy == "CNN1DPolicy_multi_input":
        if disc_action_space:
            env_temp = RosEnvDiscRaw
        else:
            env_temp = RosEnvContRaw
    elif policy == "CnnPolicy_multi_input_vel" or policy == "CnnPolicy_multi_input_vel2":
        if disc_action_space:
            env_temp = RosEnvDiscImgVel
        else:
            env_temp = RosEnvContImgVel

    env = SubprocVecEnv([
        lambda k=k: Monitor(env_temp(
            "sim%d" % (k + 1), StateCollector("sim%s" %
                                              (k + 1), "train"), stack_offset,
            num_stacks, robot_radius, rew_fnc, debug, "train", task_mode),
                            '%s/%s/sim_%d' %
                            (path_to_models, agent_name, k + 1),
                            allow_early_resets=True) for k in range(num_envs)
    ])

    # Normalizing?
    if normalize:
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_obs=100.0,
                           clip_reward=10.0,
                           gamma=0.99,
                           epsilon=1e-08)

    # Stack of data?
    if num_stacks > 1:
        env = VecFrameStack(env, n_stack=num_stacks, n_offset=stack_offset)

    return env
Example #25
def run_ppo_policies(easy, main_dir, n_exps):
    env = VecNormalize(DummyVecEnv(
        [create_env_fn(0, monitored=False, easy=easy)]),
                       gamma=0.999,
                       training=False)

    states = []
    for i in range(1, n_exps + 1):
        states.append(
            np.array(
                run_ppo_policy(env, os.path.join(main_dir, "exp-" + str(i)))))

    return states
Example #26
def make_env(env_id, env_args, seed, is_train, with_vecnorm):

    monitor_dir = os.path.join(env_args['log_file'], 'log')

    if is_train:
        # env for training
        env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                           seed=seed,
                           monitor_dir=monitor_dir,
                           n_envs=1)

        if with_vecnorm:
            env = VecNormalize(env,
                               norm_obs=True,
                               norm_reward=True,
                               clip_obs=10.,
                               clip_reward=10.)

        # env for evaluation during training
        env_args['renders'] = False
        if 'dset' in env_args:
            env_args['dset'] = 'eval'
        eval_env = make_vec_env(env_id=lambda: gym.make(env_id, **env_args),
                                seed=seed + 1,
                                monitor_dir=monitor_dir + '/eval',
                                n_envs=1)

        if with_vecnorm:
            eval_env = VecNormalize(eval_env,
                                    norm_obs=True,
                                    norm_reward=True,
                                    clip_obs=10.,
                                    clip_reward=10.)

    else:
        env = gym.make(env_id, **env_args)
        eval_env = None

    return env, eval_env
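In the example above the evaluation wrapper still updates its statistics and normalizes rewards. A sketch of the evaluation-time configuration used elsewhere in these examples (e.g. Example #18), assuming the same API; whether it fits this training setup is an assumption:

# Alternative evaluation wrapper (sketch): freeze statistics, report raw rewards.
eval_env = VecNormalize(eval_env,
                        training=False,
                        norm_obs=True,
                        norm_reward=False,
                        clip_obs=10.)
# Optionally copy the training statistics before each evaluation pass:
# sync_envs_normalization(env, eval_env)   # helper shown in Example #1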
Example #27
        def create_env(n_envs):
            """
            Create the environment and wrap it if necessary
            :param n_envs: (int)
            :return: (gym.Env)
            """
            global hyperparams

            if is_atari:
                if args.verbose > 0:
                    print("Using Atari wrapper")
                env = make_atari_env(env_id, num_env=n_envs, seed=args.seed)
                # Frame-stacking with 4 frames
                env = VecFrameStack(env, n_stack=4)
            elif args.algo in ['dqn', 'ddpg']:
                if hyperparams.get('normalize', False):
                    print(
                        "WARNING: normalization not supported yet for DDPG/DQN"
                    )
                # No env_wrapper applied for now as not using make_env()
                env = gym.make(env_id)
                env.seed(args.seed)
            else:
                if n_envs == 1:
                    env = DummyVecEnv([
                        make_env(env_id,
                                 0,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                    ])
                else:
                    # env = SubprocVecEnv([make_env(env_id, i, args.seed) for i in range(n_envs)])
                    # On most env, SubprocVecEnv does not help and is quite memory hungry
                    env = DummyVecEnv([
                        make_env(env_id,
                                 i,
                                 args.seed,
                                 wrapper_class=env_wrapper)
                        for i in range(n_envs)
                    ])
                if normalize:
                    if args.verbose > 0:
                        print("Normalizing input and return")
                    env = VecNormalize(env, **normalize_kwargs)
            # Optional Frame-stacking
            if hyperparams.get('frame_stack', False):
                n_stack = hyperparams['frame_stack']
                env = VecFrameStack(env, n_stack)
                print("Stacking {} frames".format(n_stack))
                del hyperparams['frame_stack']
            return env
Example #28
def vecEnv(env_kwargs_local, env_class):
    """
    Local Env Wrapper
    :param env_kwargs_local: arguments related to the environment wrapper
    :param env_class: class of the env
    :return: env for the pretrained algo
    """
    train_env = env_class(**{
        **env_kwargs_local, "record_data": False,
        "renders": False
    })
    train_env = DummyVecEnv([lambda: train_env])
    train_env = VecNormalize(train_env, norm_obs=True, norm_reward=False)
    return train_env
Example #29
def create_env(env_name, config=None, n_workers=8, image_based=True, **kwargs):
    """
    Parses the environment to correctly return the attributes based on the spec and type
    Creates a corresponding vectorized environment
    """
    def make_rl(**kwargs):
        """
        Decorator for custom RL environments
        """
        def _init():
            env_obj = getattr(rl.environments, env_name)
            env = env_obj(config)
            return env

        return _init

    def make_gym(rank, seed=0, **kwargs):
        """
        Decorator for gym environments
        """
        def _init():
            env = gym.make(env_name)
            env.seed(seed + rank)
            return env

        return _init

    if config is not None:
        n_workers = config['main']['n_workers']
    mapping = {'gym': make_gym, 'rl': make_rl}
    env_type = get_env_type(env_name)
    env_decorator = mapping[env_type]
    vectorized_decorator = [env_decorator(rank=x) for x in range(n_workers)]

    # Parallelize
    if n_workers > 1:
        method = 'spawn' if sys.platform == 'win32' else 'forkserver'
        vectorized = SubprocVecEnv(vectorized_decorator, start_method=method)
    else:  # Non multi-processing env
        vectorized = DummyVecEnv(vectorized_decorator)

    # Frame-stacking for CNN based environments
    if 'frame_stack' in config['main'].keys():
        if config['main']['frame_stack'] != 0:
            vectorized = VecFrameStack(vectorized,
                                       n_stack=config['main']['frame_stack'])
    if 'normalize' in config['main'].keys():
        vectorized = VecNormalize(vectorized, clip_obs=1, clip_reward=1)

    return vectorized
Example #30
File: ddpg.py Project: s206283/gcrl
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        # Even though DeepQ is single core only, we need to use the pipe system to work
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        env = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])

        if args.srl_model != "raw_pixels":
            env = VecNormalize(env, norm_reward=False)
            env = loadRunningAverage(env, load_path_normalise=load_path_normalise)

        return env