Code example #1
File: test_runner.py Project: lewisKit/lagom
def test_trajectory_runner(env_id):
    env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, False)  # num_env=3, init_seed=0, rolling=False
    env_spec = EnvSpec(env)

    agent = RandomAgent(None, env_spec)

    runner = TrajectoryRunner(None, agent, env)
    D = runner(4)

    assert len(D) == 3
    assert all([isinstance(d, Trajectory) for d in D])
    assert all([d.T == 4 for d in D])

    # Check that each transition's s equals the previous transition's s_next
    for d in D:
        for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
            assert np.allclose(s1.s_next, s2.s)

    # Long horizon
    D = runner(T=1000)
    for d in D:
        if d.T < 1000:
            assert d.all_done[-1]  # ended early, so the final done flag must be True

    with pytest.raises(AssertionError):
        env = make_vec_env(SerialVecEnv, make_gym_env, env_id, 3, 0, True)  # rolling=True
        TrajectoryRunner(None, agent, env)  # a rolling VecEnv is rejected by TrajectoryRunner
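The assertions above rely on lagom's Trajectory and Transition containers. As a rough sketch of the interface they exercise, inferred from the fields used in these examples (not the library's actual definition):

from dataclasses import dataclass, field
from typing import Any, List

@dataclass
class Transition:
    s: Any          # observation at time t
    a: Any          # action taken at time t
    r: float        # reward received
    s_next: Any     # observation at time t+1
    done: bool      # True if the episode terminated at this step
    info: dict = field(default_factory=dict)  # extras, e.g. 'action_logprob'

@dataclass
class Trajectory:
    transitions: List[Transition] = field(default_factory=list)
    gamma: float = 1.0  # discount factor stamped on by the runner

    @property
    def T(self) -> int:
        return len(self.transitions)  # number of collected timesteps

    @property
    def all_done(self) -> List[bool]:
        return [t.done for t in self.transitions]

    @property
    def all_r(self) -> List[float]:
        return [t.r for t in self.transitions]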
Code example #2
File: test_runner.py Project: wolegechu/lagom
        def check(agent_name, env_name):
            # Create environment
            list_make_env = make_envs(make_env=make_gym_env, 
                                      env_id=env_name, 
                                      num_env=1, 
                                      init_seed=0)
            env = SerialVecEnv(list_make_env=list_make_env)
            env_spec = EnvSpec(env)
            
            # Create agent
            if agent_name == 'random':
                agent = RandomAgent(env_spec=env_spec, config=None)
            elif agent_name == 'agent1':
                agent = Agent1(config=None)
            elif agent_name == 'agent2':
                agent = Agent2(config=None)
            else:
                raise ValueError('Wrong agent name')
            
            # Test: TrajectoryRunner does not allow more than one environment
            with pytest.raises(AssertionError):
                list_make_env2 = make_envs(make_env=make_gym_env,
                                           env_id=env_name,
                                           num_env=2,
                                           init_seed=0)
                env2 = SerialVecEnv(list_make_env=list_make_env2)

                runner2 = TrajectoryRunner(agent=agent, env=env2, gamma=1.0)
            
            # Create runner
            runner = TrajectoryRunner(agent=agent, env=env, gamma=1.0)

            # Small batch
            D = runner(N=3, T=4)

            assert len(D) == 3
            assert all([isinstance(d, Trajectory) for d in D])
            assert all([d.T == 4 for d in D])
            assert all([d.gamma == 1.0 for d in D])

            # Check additional information
            for d in D:
                for t in d.transitions:
                    if agent_name != 'random':
                        assert 'action_logprob' in t.info

            # Check that each transition's s equals the previous transition's s_next
            for d in D:
                for s1, s2 in zip(d.transitions[:-1], d.transitions[1:]):
                    assert np.allclose(s1.s_next, s2.s)
        
            # Long horizon
            D = runner(N=3, T=1000)
            for d in D:
                if d.T < 1000:
                    assert d.all_done[-1]  # ended early, so the final done flag must be True
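Since the runner stamps gamma onto every trajectory, a discounted return is easy to compute from the pieces shown here. A minimal sketch, using the all_r accessor that example #3 below relies on (discounted_return is a hypothetical helper, not part of lagom):

def discounted_return(trajectory):
    # G = sum over t of gamma^t * r_t
    return sum(trajectory.gamma**t * r for t, r in enumerate(trajectory.all_r))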
Code example #3
File: algo.py Project: wolegechu/lagom
    def f(self, solution, seed, config):
        if self.agent is None:
            self.init(seed, config)

        # load the solution parameters into the agent's internal network
        msg = f'expected {self.network.num_params}, got {np.asarray(solution).size}'
        assert np.asarray(solution).size == self.network.num_params, msg
        self.agent.policy.network.from_vec(torch.from_numpy(solution).float())
        
        # seed the environment
        self.env.list_env[0].seed(seed)
        
        # create runner
        runner = TrajectoryRunner(agent=self.agent, env=self.env, gamma=1.0)
        
        # collect rollouts and compute the mean return (no discounting)
        with torch.no_grad():
            D = runner(N=config['train.N'], T=config['train.T'])
        
        mean_return = np.mean([sum(trajectory.all_r) for trajectory in D])
        
        # Negate the return to get the objective value, since ES performs minimization
        function_value = -mean_return
        
        return function_value
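f returns the negated mean return because evolution strategies conventionally minimize an objective. The toy below is a self-contained illustration of that sign convention using a crude ask/tell-style population search; it is not a real ES implementation and not lagom code:

import numpy as np

def f(solution):
    # stand-in objective: pretend the 'return' peaks at solution == [1, 1, 1]
    mean_return = -np.sum((np.asarray(solution) - 1.0) ** 2)
    return -mean_return  # negate: minimizing f maximizes the return

rng = np.random.default_rng(0)
mu = np.zeros(3)
for generation in range(100):
    population = mu + 0.1 * rng.standard_normal((16, 3))  # 'ask': sample candidates
    values = np.array([f(x) for x in population])         # evaluate the objective
    mu = population[values.argmin()]                      # 'tell': keep the best
print(mu)  # moves towards [1, 1, 1]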
Code example #4
    def eval(self, n):
        # Set network as evaluation mode
        self.agent.policy.network.eval()

        # Create a new instance of the environment
        env = make_gym_env(env_id=self.config['env:id'],
                           seed=self.config['seed'],
                           monitor=False,
                           monitor_dir=None)
        # Create a TrajectoryRunner
        runner = TrajectoryRunner(agent=self.agent,
                                  env=env,
                                  gamma=self.config['algo:gamma'])
        # Evaluate the agent for a set of trajectories
        D = runner(N=self.config['eval:N'], T=self.config['eval:T'])

        # Return evaluation output
        eval_output = {}
        eval_output['D'] = D
        eval_output['n'] = n

        return eval_output
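Note that network.eval() only switches layers such as Dropout and BatchNorm to inference behavior; it does not turn off autograd. That is why examples #5 and #6 additionally wrap evaluation in torch.no_grad(). The standard PyTorch idiom, as a minimal self-contained sketch:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
net.eval()               # inference behavior for Dropout/BatchNorm-style layers
with torch.no_grad():    # stop recording the autograd graph, saving memory
    out = net(torch.randn(1, 4))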
Code example #5
File: algo.py Project: wolegechu/lagom
    def __call__(self, config, seed, device_str):
        # Set random seeds
        set_global_seeds(seed)
        # Create device
        device = torch.device(device_str)
        # Use log dir for current job (run_experiment)
        logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)

        # Make environment (VecEnv) for training and evaluating
        env = make_vec_env(
            vec_env_class=SerialVecEnv,
            make_env=make_gym_env,
            env_id=config['env.id'],
            num_env=config['train.N'],  # batch size for multiple environments
            init_seed=seed)
        eval_env = make_vec_env(vec_env_class=SerialVecEnv,
                                make_env=make_gym_env,
                                env_id=config['env.id'],
                                num_env=1,
                                init_seed=seed)
        if config['env.standardize']:  # wrap with VecStandardize for running averages of observations and rewards
            env = VecStandardize(venv=env,
                                 use_obs=True,
                                 use_reward=True,
                                 clip_obs=10.,
                                 clip_reward=10.,
                                 gamma=0.99,
                                 eps=1e-8)
            eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                      use_obs=True,
                                      use_reward=False,  # do not process rewards, no training
                                      clip_obs=env.clip_obs,
                                      clip_reward=env.clip_reward,
                                      gamma=env.gamma,
                                      eps=env.eps,
                                      constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                      constant_obs_std=env.obs_runningavg.sigma)
        env_spec = EnvSpec(env)

        # Create policy
        network = Network(config=config, env_spec=env_spec)
        if env_spec.control_type == 'Discrete':
            policy = CategoricalPolicy(config=config,
                                       network=network,
                                       env_spec=env_spec,
                                       learn_V=True)
        elif env_spec.control_type == 'Continuous':
            policy = GaussianPolicy(
                config=config,
                network=network,
                env_spec=env_spec,
                learn_V=True,
                min_std=config['agent.min_std'],
                std_style=config['agent.std_style'],
                constant_std=config['agent.constant_std'],
                std_state_dependent=config['agent.std_state_dependent'],
                init_std=config['agent.init_std'])
        network = network.to(device)

        # Create optimizer and learning rate scheduler
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo.lr'])
        if config['algo.use_lr_scheduler']:
            if 'train.iter' in config:  # iteration-based training
                max_epoch = config['train.iter']
            elif 'train.timestep' in config:  # timestep-based training
                max_epoch = config['train.timestep'] + 1  # +1 to avoid 0.0 lr in final iteration
            lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate for each training epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer,
                                                       lr_lambda=lambda_f)

        # Create agent
        kwargs = {'device': device}
        if config['algo.use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = A2CAgent(config=config,
                         policy=policy,
                         optimizer=optimizer,
                         **kwargs)

        # Create runner
        runner = SegmentRunner(agent=agent,
                               env=env,
                               gamma=config['algo.gamma'])
        eval_runner = TrajectoryRunner(agent=agent, env=eval_env, gamma=1.0)

        # Create engine
        engine = Engine(agent=agent,
                        runner=runner,
                        config=config,
                        eval_runner=eval_runner)

        # Training and evaluation
        train_logs = []
        eval_logs = []

        for i in count():  # incremental iteration
            if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
                break
            elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
                break

            # train and evaluation
            train_output = engine.train(n=i)

            # logging
            if i == 0 or (i + 1) % config['log.record_interval'] == 0 or (i + 1) % config['log.print_interval'] == 0:
                train_log = engine.log_train(train_output)

                with torch.no_grad():  # disable grad, save memory
                    eval_output = engine.eval(n=i)
                eval_log = engine.log_eval(eval_output)

                if i == 0 or (i + 1) % config['log.record_interval'] == 0:  # record loggings
                    train_logs.append(train_log)
                    eval_logs.append(eval_log)

        # Save all loggings
        pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
        pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')

        return None
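The eval_env wrapper above freezes the training env's current running statistics (constant_obs_mean / constant_obs_std), so the evaluated agent sees observations scaled the same way as during training. A minimal sketch of the normalization presumably applied, under the assumed center-scale-clip form (not VecStandardize's exact code):

import numpy as np

def standardize_obs(obs, mu, sigma, clip_obs=10., eps=1e-8):
    # center by the running mean, scale by the running std, then clip
    return np.clip((obs - mu) / (sigma + eps), -clip_obs, clip_obs)

obs = np.array([0.3, -1.2])
print(standardize_obs(obs, mu=np.array([0.1, -1.0]), sigma=np.array([0.5, 0.4])))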
Code example #6
 def __call__(self, config, seed, device_str):
     set_global_seeds(seed)
     device = torch.device(device_str)
     logdir = Path(config['log.dir']) / str(config['ID']) / str(seed)
     
     # Environment related
     env = make_vec_env(vec_env_class=SerialVecEnv, 
                        make_env=make_gym_env, 
                        env_id=config['env.id'], 
                        num_env=config['train.N'],  # batched environment
                        init_seed=seed, 
                        rolling=True)
     eval_env = make_vec_env(vec_env_class=SerialVecEnv, 
                             make_env=make_gym_env, 
                             env_id=config['env.id'], 
                             num_env=config['eval.N'], 
                             init_seed=seed, 
                             rolling=False)
     if config['env.standardize']:  # running averages of observation and reward
         env = VecStandardize(venv=env, 
                              use_obs=True, 
                              use_reward=False,  # A2C
                              clip_obs=10., 
                              clip_reward=10., 
                              gamma=0.99, 
                              eps=1e-8)
         eval_env = VecStandardize(venv=eval_env,  # remember to synchronize running averages during evaluation !!!
                                   use_obs=True, 
                                   use_reward=False,  # do not process rewards, no training
                                   clip_obs=env.clip_obs, 
                                   clip_reward=env.clip_reward, 
                                   gamma=env.gamma, 
                                   eps=env.eps, 
                                   constant_obs_mean=env.obs_runningavg.mu,  # use current running average as constant
                                   constant_obs_std=env.obs_runningavg.sigma)
     env_spec = EnvSpec(env)
     
     # Network and policy
     if config['network.recurrent']:
         network = LSTM(config=config, device=device, env_spec=env_spec)
     else:
         network = Network(config=config, device=device, env_spec=env_spec)
     if env_spec.control_type == 'Discrete':
         policy = CategoricalPolicy(config=config, 
                                    network=network, 
                                    env_spec=env_spec, 
                                    device=device,
                                    learn_V=True)
     elif env_spec.control_type == 'Continuous':
         policy = GaussianPolicy(config=config, 
                                 network=network, 
                                 env_spec=env_spec, 
                                 device=device,
                                 learn_V=True,
                                 min_std=config['agent.min_std'], 
                                 std_style=config['agent.std_style'], 
                                 constant_std=config['agent.constant_std'],
                                 std_state_dependent=config['agent.std_state_dependent'],
                                 init_std=config['agent.init_std'])
     
     # Optimizer and learning rate scheduler
     optimizer = optim.Adam(policy.network.parameters(), lr=config['algo.lr'])
     if config['algo.use_lr_scheduler']:
         if 'train.iter' in config:  # iteration-based
             max_epoch = config['train.iter']
         elif 'train.timestep' in config:  # timestep-based
             max_epoch = config['train.timestep'] + 1  # avoid zero lr in final iteration
         lambda_f = lambda epoch: 1 - epoch/max_epoch
         lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)
     
     # Agent
     kwargs = {'device': device}
     if config['algo.use_lr_scheduler']:
         kwargs['lr_scheduler'] = lr_scheduler
     agent = A2CAgent(config=config, 
                      policy=policy, 
                      optimizer=optimizer, 
                      **kwargs)
     
     # Runner
     runner = SegmentRunner(agent=agent, 
                            env=env, 
                            gamma=config['algo.gamma'])
     eval_runner = TrajectoryRunner(agent=agent, 
                                    env=eval_env, 
                                    gamma=1.0)
     
     # Engine
     engine = Engine(agent=agent, 
                     runner=runner, 
                     config=config, 
                     eval_runner=eval_runner)
     
     # Training and evaluation
     train_logs = []
     eval_logs = []
     
     if config['network.recurrent']:
         rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
     
     for i in count():
         if 'train.iter' in config and i >= config['train.iter']:  # enough iterations
             break
         elif 'train.timestep' in config and agent.total_T >= config['train.timestep']:  # enough timesteps
             break
         
         if config['network.recurrent']:
             if isinstance(rnn_states_buffer, list):  # LSTM: [h, c]
                 rnn_states_buffer = [buf.detach() for buf in rnn_states_buffer]
             else:
                 rnn_states_buffer = rnn_states_buffer.detach()
             agent.policy.rnn_states = rnn_states_buffer
             
         train_output = engine.train(n=i)
         
         # Logging
         if i == 0 or (i+1) % config['log.record_interval'] == 0 or (i+1) % config['log.print_interval'] == 0:
             train_log = engine.log_train(train_output)
             
             if config['network.recurrent']:
                 rnn_states_buffer = agent.policy.rnn_states  # for SegmentRunner
                 
             with torch.no_grad():  # disable grad, save memory
                 eval_output = engine.eval(n=i)
             eval_log = engine.log_eval(eval_output)
             
             if i == 0 or (i+1) % config['log.record_interval'] == 0:
                 train_logs.append(train_log)
                 eval_logs.append(eval_log)
     
     # Save all loggings
     pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
     pickle_dump(obj=eval_logs, f=logdir/'eval_logs', ext='.pkl')
     
     return None
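The rnn_states_buffer detach above is truncated backpropagation through time: the recurrent state is carried across training iterations as data, but its autograd history is cut so each backward pass spans only one segment. A self-contained sketch of the same pattern with a toy recurrence (not lagom's LSTM):

import torch

W = torch.randn(8, 8, requires_grad=True)  # stand-in for the recurrent weights
h = torch.zeros(1, 8)                      # recurrent state carried across segments
for segment in range(3):
    h = h.detach()                         # keep the value, drop its gradient history
    for t in range(4):                     # one rollout segment
        h = torch.tanh(h @ W + 0.1)
    loss = h.pow(2).sum()
    loss.backward()                        # gradients flow only within this segment
    W.data -= 0.01 * W.grad                # toy parameter update
    W.grad = None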
Code example #7
File: algo.py Project: shubhampachori12110095/lagom
    def __call__(self, config):
        # Set random seeds: PyTorch, numpy.random, random
        set_global_seeds(seed=config['seed'])

        # Create an environment
        env = make_gym_env(env_id=config['env:id'],
                           seed=config['seed'],
                           monitor=False,
                           monitor_dir=None)
        # Create environment specification
        env_spec = EnvSpec(env)

        # Create device (only touch CUDA when it is actually enabled)
        if config['cuda']:
            torch.cuda.set_device(config['cuda_id'])
        device = torch.device(f'cuda:{config["cuda_id"]}' if config['cuda'] else 'cpu')

        # Create policy
        network = MLP(config=config).to(device)
        policy = CategoricalPolicy(network=network, env_spec=env_spec)

        # Create optimizer
        optimizer = optim.Adam(policy.network.parameters(),
                               lr=config['algo:lr'])
        # Create learning rate scheduler
        if config['algo:use_lr_scheduler']:
            max_epoch = config['train:iter']  # maximum number of lr decay steps; note where lr_scheduler is created
            lambda_f = lambda epoch: 1 - epoch / max_epoch  # decay learning rate each training epoch
            lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_f)

        # Create agent
        kwargs = {'device': device}
        if config['algo:use_lr_scheduler']:
            kwargs['lr_scheduler'] = lr_scheduler
        agent = ActorCriticAgent(policy=policy,
                                 optimizer=optimizer,
                                 config=config,
                                 **kwargs)

        # Create runner
        runner = TrajectoryRunner(agent=agent,
                                  env=env,
                                  gamma=config['algo:gamma'])

        # Create engine
        engine = Engine(agent=agent, runner=runner, config=config, logger=None)

        # Training and evaluation
        train_logs = []
        eval_logs = []
        for i in range(config['train:iter']):
            train_output = engine.train(i)

            # Logging and evaluation
            if i == 0 or (i + 1) % config['log:interval'] == 0:
                # Log training and record the loggings
                train_logger = engine.log_train(train_output)
                train_logs.append(train_logger.logs)
                # Log evaluation and record the loggings
                eval_output = engine.eval(i)
                eval_logger = engine.log_eval(eval_output)
                eval_logs.append(eval_logger.logs)

        # Save the loggings
        np.save(Path(config['log:dir']) / str(config['ID']) / 'train', train_logs)
        np.save(Path(config['log:dir']) / str(config['ID']) / 'eval', eval_logs)

        return None
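LambdaLR multiplies the base learning rate by lambda_f(epoch), so the schedule above decays linearly from algo:lr towards zero over train:iter epochs. A small runnable demo of that behavior:

import torch
import torch.optim as optim

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = optim.Adam(params, lr=1e-3)
max_epoch = 5
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 1 - epoch / max_epoch)
for epoch in range(max_epoch):
    optimizer.step()       # optimizer first, then scheduler (PyTorch >= 1.1 order)
    scheduler.step()
    print(epoch, optimizer.param_groups[0]['lr'])
# lr per epoch: 8e-04, 6e-04, 4e-04, 2e-04, 0.0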