Example #1
    def test_pickle_yaml(self):
        # Create some data
        a = {'one': 1, 'two': [2, 3]}
        b = {'three': 3, 'four': [4, 5]}
        c = [a, b]

        def _check(x):
            assert isinstance(x, list)
            assert len(x) == 2
            assert all([isinstance(i, dict) for i in x])
            assert list(x[0].keys()) == ['one', 'two']
            assert list(x[1].keys()) == ['three', 'four']
            assert list(x[0].values()) == [1, [2, 3]]
            assert list(x[1].values()) == [3, [4, 5]]

        # Pickle
        pickle_dump(c, '.tmp_pickle')
        _check(pickle_load('.tmp_pickle.pkl'))
        # remove the file
        os.unlink('.tmp_pickle.pkl')

        # Yaml
        yaml_dump(c, '.tmp_yaml')
        _check(yaml_load('.tmp_yaml.yml'))
        # remove the file
        os.unlink('.tmp_yaml.yml')
Example #2
    def save_running_average(self, f):
        r"""Save the running averages for observation and reward in a dictionary by pickling. 
        
        It saves the mean and standard deviation of the observation running average and the standard deviation
        of the reward running average. A dictionary with keys 'obs_avg' and 'r_avg' will be created, and each key
        contains the sub-keys ['mu', 'sigma']. 
        
        Args:
            f (str): saving path
        """
        # Get running average dictionary
        out = self.running_averages

        # Pickle it
        pickle_dump(obj=out, f=f, ext='.pkl')
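
A minimal sketch of reading the file back, assuming the pickle_load helper used in the test above and an illustrative path 'running_avg' (so the dump produces 'running_avg.pkl'); the key layout follows the docstring.

# Hypothetical round trip: reuse the saved statistics, e.g. as constant values
# when standardizing an evaluation environment.
out = pickle_load('running_avg.pkl')      # path is illustrative
obs_mu = out['obs_avg']['mu']             # mean of the observation running average
obs_sigma = out['obs_avg']['sigma']       # std of the observation running average
r_sigma = out['r_avg']['sigma']           # std of the reward running average
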
Example #3
    def save_configs(self, f, method='pickle'):
        r"""Save the list of configurations returned from :meth:`make_configs`. 
        
        Args:
            f (str): file path
            method (str): the method used to save the list of configurations, either 'pickle' or 'yaml'
        """
        assert isinstance(method, str)
        methods = ['pickle', 'yaml']
        assert method in methods, f'expected {methods}, got {method}'

        if method == 'pickle':
            pickle_dump(obj=self.configs, f=f, ext='.pkl')
        elif method == 'yaml':
            yaml_dump(obj=self.configs, f=f, ext='.yml')
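
A hedged usage sketch, assuming an experiment-like object (here called experiment) that exposes save_configs as above, and that pickle_load/yaml_load mirror the dump helpers as in the test in Example #1; the file names are illustrative only.

# Illustrative only: save the configurations in either format, then load them back.
experiment.save_configs(f='configs', method='pickle')   # writes configs.pkl
configs = pickle_load('configs.pkl')

experiment.save_configs(f='configs', method='yaml')     # writes configs.yml
configs = yaml_load('configs.yml')
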
Example #4
 def _process_es_result(self, result):
     best_f_val = result['best_f_val']
     best_return = -best_f_val  # negate to get back reward
     
     # logging
     self.logger.log('generation', self.generation)
     self.logger.log('best_return', best_return)
     
     if self.generation == 0 or (self.generation+1) % self.config['log.interval'] == 0:
         print('-'*50)
         self.logger.dump(keys=None, index=-1, indent=0)
         print('-'*50)
         
     # Save the logs and the final parameters
     if (self.generation+1) == self.num_iteration:
         pickle_dump(obj=self.logger.logs, f=self.logdir/'result', ext='.pkl')
         np.save(self.logdir/'trained_param', result['best_param'])
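
A short sketch of reading the artifacts written at the final generation, assuming the same log directory is available as logdir and that pickle_load accepts a pathlib path (pickle_dump does elsewhere in these examples); pickle_dump was called with ext='.pkl' and np.save appends '.npy' by default.

import numpy as np

# Illustrative: recover the logged metrics and the best parameter vector.
logs = pickle_load(logdir / 'result.pkl')            # logged generations and best returns
best_param = np.load(logdir / 'trained_param.npy')   # array saved via np.save above
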
Example #5
    def __call__(self, config, seed, device_str):
        # Set random seeds
        set_global_seeds(seed)
        # Create device
        device = torch.device(device_str)
        # Use log dir for current job (run_experiment)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        # Make environment (VecEnv) for training and evaluating
        env = make_vec_env(
            vec_env_class=SerialVecEnv,
            make_env=make_gym_env,
            env_id=config['env.id'],
            num_env=config['train.N'],  # batch size for multiple environments
            init_seed=seed)
        eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                                make_env=make_gym_env,
                                env_id=config['env.id'],
                                num_env=1,
                                init_seed=seed)
        if config['env.standardize']:  # wrap with VecStandardize for running averages of observation and rewards
            env = VecStandardize(venv=env,
                                 use_obs=True,
                                 use_reward=True,
                                 clip_obs=10.,
                                 clip_reward=10.,
                                 gamma=0.99,
                                 eps=1e-8)
            eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                      use_obs=True,
                                      use_reward=False,  # do not process rewards, no training
                                      clip_obs=env.clip_obs,
                                      clip_reward=env.clip_reward,
                                      gamma=env.gamma,
                                      eps=env.eps,
                                      constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                      constant_obs_std=env.obs_runningavg.sigma)
        env_spec = EnvSpec(env)

        # Create policy
        network = Network(config=config, env_spec=env_spec)
        if env_spec.control_type == 'Discrete':
            policy = CategoricalPolicy(config=config,
                                       network=network,
                                       env_spec=env_spec,
                                       learn_V=True)
        elif env_spec.control_type == 'Continuous':
            policy = GaussianPolicy(
                config=config,
                network=network,
                env_spec=env_spec,
                learn_V=True,
                min_std=config['agent.min_std'],
                std_style=config['agent.std_style'],
                constant_std=config['agent.constant_std'],
                std_state_dependent=config['agent.std_state_dependent'],
                init_std=config['agent.init_std'])
        network = network.to(device)

        # Create optimizer and learning rate scheduler
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo.lr'])
        if config['algo.use_lr_scheduler']:
            if 'train.iter' in config:  # iteration-based training
                max_epoch = config['train.iter']
            elif 'train.timestep' in config:  # timestep-based training
                max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
            lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                       lr_lambda=lambda_f)

        # Create agent
        kwargs = {'device': device}
        if config['algo.use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = A2CAgent(config=config,
                         policy=policy,
                         optimizer=optimizer,
                         **kwargs)

        # Create runner
        runner = SegmentRunner(agent=agent,
                               env=env,
                               gamma=config['algo.gamma'])
        eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

        # Create engine
        engine = Engine(agent=agent,
                        runner=runner,
                        config=config,
                        eval_runner=eval_runner)

        # Training and evaluation
        train_logs = []
        eval_logs = []

        for i in count():  # incremental iteration
            if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
                break

            # train and evaluation
            train_output = engine.train(n=i)

            # logging
            if i == 0 or (i + 1) % config['log.record_interval'] == 0 or (i + 1) % config['log.print_interval'] == 0:
                train_log = engine.log_train(train_output)

                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

                if i == 0 or (i + 1) % config['log.record_interval'] == 0:  # record logs
                    train_logs.append(train_log)
                    eval_logs.append(eval_log)

        # Save all logs
        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
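
The learning-rate schedule above is a plain linear decay toward zero. The following standalone sketch (standard PyTorch, with a toy parameter and a made-up max_epoch) shows the same 1 - epoch/max_epoch rule in isolation.

import torch
import torch.optim as optim

# Toy parameter and optimizer, purely illustrative.
param = torch.nn.Parameter(torch.zeros(1))
optimizer = optim.Adam([param], lr=1e-3)

max_epoch = 100  # assumed training length
lambda_f = lambda epoch: 1 - epoch / max_epoch  # linear decay: 1.0 at epoch 0, ~0.0 at the end
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

for epoch in range(max_epoch):
    optimizer.step()        # one (dummy) update
    lr_scheduler.step()     # rescales the base lr by lambda_f at the new epoch count
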
Example #6
 def __call__(self, config, seed, device_str):
     set_global_seeds(seed)
     device = torch.device(device_str)
     logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
     
     # Environment related
     env = make_vec_env(vec_env_class=SerialVecEnv, 
                        make_env=make_gym_env, 
                        env_id=config['env.id'], 
                        num_env=config['train.N'],  # batched environment
                        init_seed=seed, 
                        rolling=True)
     eval_env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=config['eval.N'], 
                             init_seed=seed, 
                             rolling=False)
     if config['env.standardize']:  # running averages of observation and reward
         env = VecStandardize(venv=env, 
                              use_obs=True, 
                              use_reward=False,  # A2C
                              clip_obs=10., 
                              clip_reward=10., 
                              gamma=0.99, 
                              eps=1e-8)
         eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                   use_obs=True, 
                                   use_reward=False,  # do not process rewards, no training
                                   clip_obs=env.clip_obs, 
                                   clip_reward=env.clip_reward, 
                                   gamma=env.gamma, 
                                   eps=env.eps, 
                                   constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                   constant_obs_std=env.obs_runningavg.sigma)
     env_spec = EnvSpec(env)
     
     # Network and policy
     if config['network.recurrent']:
         network = LSTM(config=config, device=device, env_spec=env_spec)
     else:
         network = Network(config=config, device=device, env_spec=env_spec)
     if env_spec.control_type == 'Discrete':
         policy = CategoricalPolicy(config=config, 
                                    network=network, 
                                    env_spec=env_spec, 
                                    device=device,
                                    learn_V=True)
     elif env_spec.control_type == 'Continuous':
         policy = GaussianPolicy(config=config, 
                                 network=network, 
                                 env_spec=env_spec, 
                                 device=device,
                                 learn_V=True,
                                 min_std=config['agent.min_std'], 
                                 std_style=config['agent.std_style'], 
                                 constant_std=config['agent.constant_std'],
                                 std_state_dependent=config['agent.std_state_dependent'],
                                 init_std=config['agent.init_std'])
     
     # Optimizer and learning rate scheduler
     optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
     if config['algo.use_lr_scheduler']:
         if 'train.iter' in config:  # iteration-based
             max_epoch = config['train.iter']
         elif 'train.timestep' in config:  # timestep-based
             max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
         lambda_f = lambda epoch: 1 - epoch/max_epoch
         lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
     
     # Agent
     kwargs = {'device': device}
     if config['algo.use_lr_scheduler']:
         kwargs['lr_scheduler'] = lr_scheduler
     agent = A2CAgent(config=config, 
                      policy=policy, 
                      optimizer=optimizer, 
                      **kwargs)
     
     # Runner
     runner = SegmentRunner(agent=agent, 
                            env=env, 
                            gamma=config['algo.gamma'])
     eval_runner = TrajectoryRunner(agent=agent, 
                                    env=eval_env, 
                                    gamma=1.0)
     
     # Engine
     engine = Engine(agent=agent, 
                     runner=runner, 
                     config=config, 
                     eval_runner=eval_runner)
     
     # Training and evaluation
     train_logs = []
     eval_logs = []
     
     if config['network.recurrent']:
         rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
     
     for i in count():
         if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
             break
         elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
             break
         
         if config['network.recurrent']:
             if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                 rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
             else:
                 rnn_states_buffer = rnn_states_buffer.detach()
             agent.policy.rnn_states = rnn_states_buffer
             
         train_output = engine.train(n=i)
         
         # Logging
         if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
             train_log = engine.log_train(train_output)
             
             if config['network.recurrent']:
                 rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
                 
             with torch.no_grad():  # disable grad, save memory
                 eval_output = engine.eval(n=i)
             eval_log = engine.log_eval(eval_output)
             
             if i == 0 or (i+1) % config['log.record_interval'] == 0:
                 train_logs.append(train_log)
                 eval_logs.append(eval_log)
     
      # Save all logs
     pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
     pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
     
     return None
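
The rnn_states_buffer handling above truncates backpropagation through time between segments. A minimal sketch of the same detach pattern on a bare LSTM (names, sizes, and the dummy loss are made up) looks like this.

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16)
h = torch.zeros(1, 1, 16)
c = torch.zeros(1, 1, 16)

for segment in range(5):
    # Detach the carried-over states so gradients do not flow across segment
    # boundaries, mirroring the [h, c] detach in the training loop above.
    h, c = h.detach(), c.detach()
    x = torch.randn(10, 1, 8)           # one segment of inputs (seq_len, batch, features)
    out, (h, c) = lstm(x, (h, c))
    loss = out.pow(2).mean()
    loss.backward()                     # gradients stop at the detached states
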