Example #1
 def train(self, n=None, **kwargs):
     self.model.train()
     
     logger = Logger()
     for i, (data, label) in enumerate(self.train_loader):
         start_time = perf_counter()
         data = data.to(self.model.device)
         re_x, mu, logvar = self.model(data)
         out = vae_loss(re_x, data, mu, logvar, 'BCE')
         loss = out['loss']
         self.optimizer.zero_grad()
         loss.backward()
         self.optimizer.step()
         
         logger('epoch', n)
         self.model.total_iter += 1
         logger('iteration', self.model.total_iter)
         logger('mini-batch', i)
         logger('train_loss', out['loss'].item())
         logger('reconstruction_loss', out['re_loss'].item())
         logger('KL_loss', out['KL_loss'].item())
         logger('num_seconds', round(perf_counter() - start_time, 1))
         if i == 0 or (i+1) % self.config['log.freq'] == 0:
             logger.dump(keys=None, index=-1, indent=0, border='-'*50)
     mean_loss = np.mean(logger.logs['train_loss'])
     print(f'====> Average loss: {mean_loss}')
     
     # Use decoder to sample images from standard Gaussian noise
     with torch.no_grad():  # disable gradient tracking for faster sampling
         z = torch.randn(64, self.config['nn.z_dim']).to(self.model.device)
         re_x = self.model.decode(z).cpu()
         save_image(re_x.view(64, 1, 28, 28), f'{kwargs["logdir"]}/sample_{n}.png')
     return logger
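The vae_loss helper used above is not shown; it is assumed to return a dict holding the total loss and its two components under the keys 'loss', 're_loss' and 'KL_loss'. A minimal sketch of such a helper, assuming the standard VAE objective (binary cross-entropy reconstruction plus the analytic Gaussian KL term); only the call signature and the dict keys come from the example, the rest is illustrative:

import torch
import torch.nn.functional as F

def vae_loss(re_x, x, mu, logvar, loss_type='BCE'):
    # Reconstruction term, summed over the mini-batch
    if loss_type == 'BCE':
        re_loss = F.binary_cross_entropy(re_x, x.view_as(re_x), reduction='sum')
    else:
        re_loss = F.mse_loss(re_x, x.view_as(re_x), reduction='sum')
    # Analytic KL divergence between N(mu, diag(exp(logvar))) and N(0, I)
    KL_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return {'loss': re_loss + KL_loss, 're_loss': re_loss, 'KL_loss': KL_loss}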
Example #2
    def log_eval(self, eval_output, **kwargs):
        # Create evaluation logger
        logger = Logger(name='eval_logger')

        # Unpack evaluation for logging
        D = eval_output['D']
        n = eval_output['n']
        T = eval_output['T']

        # Log trajectory statistics (use item() on tensors to save memory)
        batch_returns = [sum(trajectory.all_r) for trajectory in D]
        batch_T = [trajectory.T for trajectory in D]

        logger.log('evaluation_iteration', n + 1)
        logger.log('num_trajectories', len(D))
        logger.log('max_allowed_horizon', T)
        logger.log('average_horizon', np.mean(batch_T))
        logger.log('num_timesteps', np.sum(batch_T))
        logger.log('accumulated_trained_timesteps', self.agent.total_T)
        logger.log('average_return', np.mean(batch_returns))
        logger.log('std_return', np.std(batch_returns))
        logger.log('min_return', np.min(batch_returns))
        logger.log('max_return', np.max(batch_returns))

        # Dump loggings
        if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
            print(color_str('+' * 50, 'yellow', 'bold'))
            logger.dump(keys=None, index=None, indent=0)
            print(color_str('+' * 50, 'yellow', 'bold'))

        return logger.logs
Example #3
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']]*agent.num_params, config['train.std0'], 
               {'popsize': config['train.popsize'], 
                'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'], initializer=initializer, initargs=(config, seed, device)) as executor:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation+1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation+1)%config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation+1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation+1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
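Examples #3 and #12 negate the episode returns before calling es.tell because evolution strategies minimize their objective while reinforcement learning maximizes return. A minimal sketch of the same ask/tell loop written against the pycma package directly (an assumption: the CMAES class above is presumably a thin wrapper around something similar, and the toy fitness function is purely illustrative):

import cma

def fitness(solution):
    # Stand-in for a policy rollout that would return the episode return
    return -sum(x ** 2 for x in solution)

es = cma.CMAEvolutionStrategy([0.0] * 10, 0.5, {'popsize': 16, 'seed': 1})
for generation in range(100):
    solutions = es.ask()
    Rs = [fitness(x) for x in solutions]
    es.tell(solutions, [-R for R in Rs])  # negate: CMA-ES minimizes, we maximize return
print(es.result.fbest)  # best (minimized) objective value seen so far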
Example #4
 def eval(self, n=None, **kwargs):
     start_time = perf_counter()
     returns = []
     horizons = []
     for _ in range(self.config['eval.num_episode']):
         observation = self.eval_env.reset()
         for _ in range(self.eval_env.spec.max_episode_steps):
             with torch.no_grad():
                 action = self.agent.choose_action(observation, mode='eval')['action']
             next_observation, reward, done, info = self.eval_env.step(action)
             if done[0]:  # [0] single environment
                 returns.append(info[0]['episode']['return'])
                 horizons.append(info[0]['episode']['horizon'])
                 break
             observation = next_observation
     logger = Logger()
     logger('num_seconds', round(perf_counter() - start_time, 1))
     logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
     logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
     logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))
     
     monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
     logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
     return logger.logs
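The running_return / running_horizon entries read from the VecMonitor wrapper in Examples #4 and #8 are presumably bounded deques of the most recently completed episodes (Example #8 reads return_queue.maxlen). A minimal sketch of how such running statistics can be kept, as an illustration rather than the wrapper's actual implementation:

from collections import deque

import numpy as np

class RunningEpisodeStats:
    def __init__(self, maxlen=100):
        # Keep only the most recent episodes, mirroring return_queue / horizon_queue
        self.return_queue = deque(maxlen=maxlen)
        self.horizon_queue = deque(maxlen=maxlen)

    def add(self, episode_return, episode_horizon):
        self.return_queue.append(episode_return)
        self.horizon_queue.append(episode_horizon)

    def summary(self):
        return {'running_mean_return': float(np.mean(self.return_queue)),
                'running_mean_horizon': float(np.mean(self.horizon_queue))}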
Example #5
 def train(self, n=None, **kwargs):
     train_logs, eval_logs = [], []
     checkpoint_count = 0
     for iteration in count():
         if self.agent.total_timestep >= self.config['train.timestep']:
             break
         t0 = time.perf_counter()
         
         if iteration < self.config['replay.init_trial']:
             [traj] = self.runner(self.random_agent, self.env, 1)
         else:
             [traj] = self.runner(self.agent, self.env, 1, mode='train')
         self.replay.add(traj)
         # Number of gradient updates = collected episode length
         out_agent = self.agent.learn(D=None, replay=self.replay, T=traj.T)
         
         logger = Logger()
         logger('train_iteration', iteration+1)
         logger('num_seconds', round(time.perf_counter() - t0, 1))
         [logger(key, value) for key, value in out_agent.items()]
         logger('episode_return', sum(traj.rewards))
         logger('episode_horizon', traj.T)
         logger('accumulated_trained_timesteps', self.agent.total_timestep)
         train_logs.append(logger.logs)
         if iteration == 0 or (iteration+1) % self.config['log.freq'] == 0:
             logger.dump(keys=None, index=0, indent=0, border='-'*50)
         if self.agent.total_timestep >= int(self.config['train.timestep']*(checkpoint_count/(self.config['checkpoint.num'] - 1))):
             self.agent.checkpoint(self.logdir, iteration + 1)
             checkpoint_count += 1
             
         if self.agent.total_timestep >= int(self.config['train.timestep']*(len(eval_logs)/(self.config['eval.num'] - 1))):
             eval_logs.append(self.eval(n=len(eval_logs)))
     return train_logs, eval_logs
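The checkpointing condition used in Examples #3, #5, #10 and #12 spreads checkpoint.num checkpoints evenly over the training budget: the k-th checkpoint fires once progress passes k/(checkpoint.num - 1) of the total, so the first checkpoint is taken immediately and the last at the very end. A small illustration of the implied thresholds (the numbers are made up):

# e.g. train.timestep = 1_000_000 and checkpoint.num = 5
total_timestep = 1_000_000
num_checkpoints = 5
thresholds = [int(total_timestep * (k / (num_checkpoints - 1))) for k in range(num_checkpoints)]
print(thresholds)  # [0, 250000, 500000, 750000, 1000000]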
Example #6
    def log_eval(self, eval_output, **kwargs):
        D = eval_output['D']
        n = eval_output['n']
        T = eval_output['T']
        num_sec = eval_output['num_sec']

        logger = Logger()

        batch_returns = D.numpy_rewards.sum(1)

        logger('evaluation_iteration', n + 1)
        logger('num_seconds', round(num_sec, 1))
        logger('num_trajectories', D.N)
        logger('max_allowed_horizon', T)
        logger('mean_horizon', D.Ts.mean())
        logger('total_timesteps', D.total_T)
        logger('accumulated_trained_timesteps', self.agent.total_T)
        logger('mean_return', batch_returns.mean())
        logger('std_return', batch_returns.std())
        logger('min_return', batch_returns.min())
        logger('max_return', batch_returns.max())

        print(color_str('+' * 50, 'yellow', 'bold'))
        logger.dump(keys=None, index=None, indent=0)
        print(color_str('+' * 50, 'yellow', 'bold'))

        return logger.logs
Example #7
    def train(self, n=None):
        self.agent.train()

        logger = Logger()

        for i, (data, label) in enumerate(self.agent.train_loader):
            data = data.to(self.agent.device)

            self.agent.optimizer.zero_grad()
            re_x, mu, logvar = self.agent(data)
            out = self.agent.vae_loss(re_x=re_x,
                                      x=data,
                                      mu=mu,
                                      logvar=logvar,
                                      loss_type='BCE')
            loss = out['loss']
            loss.backward()
            self.agent.optimizer.step()

            logger('epoch', n)
            logger('iteration', i)
            logger('train_loss', out['loss'].item())
            logger('reconstruction_loss', out['re_loss'].item())
            logger('KL_loss', out['KL_loss'].item())

            if i == 0 or (i + 1) % self.config['log.interval'] == 0:
                print('-' * 50)
                logger.dump(keys=None, index=-1, indent=0)
                print('-' * 50)

        return logger.logs
Example #8
    def log_train(self, train_output, **kwargs):
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']
        num_sec = train_output['num_sec']

        logger = Logger()
        logger('train_iteration', n + 1)  # starts from 1
        logger('num_seconds', round(num_sec, 1))

        [logger(key, value) for key, value in out_agent.items()]

        batch_returns = D.numpy_rewards.sum(1)

        logger('num_trajectories', D.N)
        logger('num_timesteps', D.total_T)
        logger('accumulated_trained_timesteps', self.agent.total_T)
        logger('mean_return', batch_returns.mean())
        logger('std_return', batch_returns.std())
        logger('min_return', batch_returns.min())
        logger('max_return', batch_returns.max())

        monitor_env = get_wrapper(self.runner.env, 'VecMonitor')
        infos = list(
            filter(lambda info: 'episode' in info,
                   chain.from_iterable(D.infos)))
        if len(infos) > 0:
            online_returns = np.asarray(
                [info['episode']['return'] for info in infos])
            online_horizons = np.asarray(
                [info['episode']['horizon'] for info in infos])
            logger('online_N', len(infos))
            logger('online_mean_return', online_returns.mean())
            logger('online_std_return', online_returns.std())
            logger('online_min_return', online_returns.min())
            logger('online_max_return', online_returns.max())
            logger('online_mean_horizon', online_horizons.mean())
            logger('online_std_horizon', online_horizons.std())
            logger('online_min_horizon', online_horizons.min())
            logger('online_max_horizon', online_horizons.max())
        running_returns = np.asarray(monitor_env.return_queue)
        running_horizons = np.asarray(monitor_env.horizon_queue)
        if running_returns.size > 0 and running_horizons.size > 0:
            logger('running_queue', [
                len(monitor_env.return_queue), monitor_env.return_queue.maxlen
            ])
            logger('running_mean_return', running_returns.mean())
            logger('running_std_return', running_returns.std())
            logger('running_min_return', running_returns.min())
            logger('running_max_return', running_returns.max())
            logger('running_mean_horizon', running_horizons.mean())
            logger('running_std_horizon', running_horizons.std())
            logger('running_min_horizon', running_horizons.min())
            logger('running_max_horizon', running_horizons.max())

        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)

        return logger.logs
Example #9
    def log_eval(self, eval_output):
        # Create evaluation logger
        logger = Logger(name='eval_logger')

        # Unpack evaluation for logging
        D = eval_output['D']
        n = eval_output['n']

        # Compute some metrics
        batch_returns = [sum(trajectory.all_r) for trajectory in D]
        batch_T = [trajectory.T for trajectory in D]

        # Loggings
        # Use item() for tensor to save memory
        logger.log(key='evaluation_iteration', val=n + 1)
        logger.log(key='num_trajectories', val=len(D))
        logger.log(key='max_allowed_horizon', val=self.config['eval:T'])
        logger.log(key='average_horizon', val=np.mean(batch_T))
        logger.log(key='num_timesteps', val=np.sum(batch_T))
        logger.log(key='accumulated_trained_timesteps',
                   val=self.accumulated_trained_timesteps)
        logger.log(key='average_return', val=np.mean(batch_returns))
        logger.log(key='std_return', val=np.std(batch_returns))
        logger.log(key='min_return', val=np.min(batch_returns))
        logger.log(key='max_return', val=np.max(batch_returns))

        # Dump the loggings
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)

        return logger
Example #10
    def train(self, n=None, **kwargs):
        train_logs = []
        eval_logs = []
        eval_togo = 0
        dump_togo = 0
        num_episode = 0
        checkpoint_count = 0
        observation = self.env.reset()
        for i in count():
            if i >= self.config['train.timestep']:
                break
            if i < self.config['replay.init_size']:
                action = [self.env.action_space.sample()]
            else:
                action = self.agent.choose_action(observation,
                                                  mode='stochastic')['action']
            next_observation, reward, done, info = self.env.step(action)
            eval_togo += 1
            dump_togo += 1
            if done[0]:  # [0] due to single environment
                start_time = perf_counter()
                # NOTE: must use latest TimeLimit
                reach_time_limit = info[0].get('TimeLimit.truncated', False)
                reach_terminal = not reach_time_limit
                self.replay.add(observation[0], action[0], reward[0],
                                info[0]['last_observation'], reach_terminal)

                # update at the end of each episode: one gradient step per collected timestep
                out_agent = self.agent.learn(
                    D=None,
                    replay=self.replay,
                    episode_length=info[0]['episode']['horizon'])
                num_episode += 1
                if i >= int(self.config['train.timestep'] *
                            (checkpoint_count /
                             (self.config['checkpoint.num'] - 1))):
                    self.agent.checkpoint(self.logdir, num_episode)
                    checkpoint_count += 1
                logger = Logger()
                logger('num_seconds', round(perf_counter() - start_time, 1))
                logger('accumulated_trained_timesteps', i + 1)
                logger('accumulated_trained_episodes', num_episode)
                [logger(key, value) for key, value in out_agent.items()]
                logger('episode_return', info[0]['episode']['return'])
                logger('episode_horizon', info[0]['episode']['horizon'])
                train_logs.append(logger.logs)
                if dump_togo >= self.config['log.freq']:
                    dump_togo %= self.config['log.freq']
                    logger.dump(keys=None, index=0, indent=0, border='-' * 50)
                if eval_togo >= self.config['eval.freq']:
                    eval_togo %= self.config['eval.freq']
                    eval_logs.append(
                        self.eval(accumulated_trained_timesteps=(i + 1),
                                  accumulated_trained_episodes=num_episode))
            else:
                self.replay.add(observation[0], action[0], reward[0],
                                next_observation[0], done[0])
            observation = next_observation
        return train_logs, eval_logs
Example #11
    def log_train(self, train_output):
        # Create training logger
        logger = Logger(name='train_logger')

        # Unpack training output for logging
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']

        # Loggings
        # Use item() for tensor to save memory
        logger.log(key='train_iteration', val=n + 1)  # iteration starts from 1
        if self.config['algo:use_lr_scheduler']:
            logger.log(key='current_lr', val=out_agent['current_lr'])

        logger.log(key='loss', val=out_agent['loss'].item())
        policy_loss = torch.stack(out_agent['batch_policy_loss']).mean().item()
        logger.log(key='policy_loss', val=policy_loss)
        entropy_loss = torch.stack(
            out_agent['batch_entropy_loss']).mean().item()
        logger.log(key='policy_entropy',
                   val=-entropy_loss)  # negation of entropy loss
        value_loss = torch.stack(out_agent['batch_value_loss']).mean().item()
        logger.log(key='value_loss', val=value_loss)

        # Get some data from trajectory list
        batch_returns = [trajectory.all_returns[0] for trajectory in D]
        batch_discounted_returns = [
            trajectory.all_discounted_returns[0] for trajectory in D
        ]
        num_timesteps = sum([trajectory.T for trajectory in D])

        # Log more information
        logger.log(key='num_trajectories', val=len(D))
        logger.log(key='num_timesteps', val=num_timesteps)
        logger.log(key='accumulated_trained_timesteps',
                   val=self.accumulated_trained_timesteps)
        logger.log(key='average_return', val=np.mean(batch_returns))
        logger.log(key='average_discounted_return',
                   val=np.mean(batch_discounted_returns))
        logger.log(key='std_return', val=np.std(batch_returns))
        logger.log(key='min_return', val=np.min(batch_returns))
        logger.log(key='max_return', val=np.max(batch_returns))

        # Dump the loggings
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)

        return logger
Example #12
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK

    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES(
        [config['train.mu0']] * agent.num_params, config['train.std0'], {
            'popsize': config['train.popsize'],
            'seed': seed,
            'sigma_scheduler_args': config['train.sigma_scheduler_args'],
            'lr': config['train.lr'],
            'lr_decay': config['train.lr_decay'],
            'min_lr': config['train.min_lr'],
            'antithetic': config['train.antithetic'],
            'rank_transform': config['train.rank_transform']
        })
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize'] //
              config['train.worker_chunksize']) as pool:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness),
                           data,
                           chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns',
                   describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons',
                   describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] *
                                       (checkpoint_count /
                                        (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None
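Example #12 wraps the fitness function in a CloudpickleWrapper before handing it to Pool.map. The usual reason for this pattern is that multiprocessing serializes arguments with the standard pickle module, which cannot handle closures or lambdas, whereas cloudpickle can. A minimal sketch of what such a wrapper commonly looks like (an assumption based on the widespread pattern, not necessarily the exact implementation used above):

import pickle

import cloudpickle

class CloudpickleWrapper:
    def __init__(self, fn):
        self.fn = fn

    def __call__(self, *args, **kwargs):
        return self.fn(*args, **kwargs)

    def __getstate__(self):
        # Serialize the wrapped callable with cloudpickle instead of pickle
        return cloudpickle.dumps(self.fn)

    def __setstate__(self, state):
        self.fn = pickle.loads(state)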
Example #13
class ESMaster(BaseESMaster):
    @property
    def _num_params(self):
        worker = ESWorker()
        worker._prepare(self.config)
        num_params = worker.agent.num_params
        del worker

        return num_params

    def make_es(self, config):
        if self.config['es.algo'] == 'CMAES':
            es = CMAES(mu0=[self.config['es.mu0']] * self._num_params,
                       std0=self.config['es.std0'],
                       popsize=self.config['es.popsize'])
        elif self.config['es.algo'] == 'OpenAIES':
            es = OpenAIES(mu0=[self.config['es.mu0']] * self._num_params,
                          std0=self.config['es.std0'],
                          popsize=self.config['es.popsize'],
                          std_decay=0.999,
                          min_std=0.01,
                          lr=1e-1,
                          lr_decay=0.99,
                          min_lr=1e-3,
                          antithetic=True,
                          rank_transform=True)

        self.logger = Logger()

        return es

    def process_es_result(self, result):
        best_f_val = result['best_f_val']
        best_return = -best_f_val

        self.logger('generation', self.generation + 1)
        self.logger('best_return', best_return)

        if self.generation == 0 or (self.generation +
                                    1) % self.config['log.interval'] == 0:
            print('-' * 50)
            self.logger.dump(keys=None, index=-1, indent=0)
            print('-' * 50)

        # Save the loggings and final parameters
        if (self.generation + 1) == self.config['train.num_iteration']:
            pickle_dump(obj=self.logger.logs,
                        f=self.logdir / 'result',
                        ext='.pkl')
            np.save(self.logdir / 'trained_param', result['best_param'])
Example #14
    def log_train(self, train_output):
        # Create training logger
        logger = Logger(name='train_logger')

        # Unpack training output for logging
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']

        # Loggings
        # Use item() for tensor to save memory
        logger.log(key='train_iteration', val=n + 1)  # iteration starts from 1
        if self.config['algo:use_lr_scheduler']:
            logger.log(key='current_lr', val=out_agent['current_lr'])

        logger.log(key='loss', val=out_agent['loss'].item())
        policy_loss = torch.stack(out_agent['batch_policy_loss']).mean().item()
        logger.log(key='policy_loss', val=policy_loss)
        entropy_loss = torch.stack(
            out_agent['batch_entropy_loss']).mean().item()
        logger.log(key='policy_entropy',
                   val=-entropy_loss)  # negation of entropy loss
        value_loss = torch.stack(out_agent['batch_value_loss']).mean().item()
        logger.log(key='value_loss', val=value_loss)

        # Get some data from segment list
        all_immediate_reward = [segment.all_r for segment in D]
        num_timesteps = sum([segment.T for segment in D])

        # Log more information
        logger.log(key='num_segments',
                   val=sum([len(segment.split_transitions) for segment in D]))
        logger.log(key='num_timesteps', val=num_timesteps)
        logger.log(key='accumulated_trained_timesteps',
                   val=self.accumulated_trained_timesteps)
        logger.log(key='average_immediate_reward',
                   val=np.mean(all_immediate_reward))
        logger.log(key='std_immediate_reward',
                   val=np.std(all_immediate_reward))
        logger.log(key='min_immediate_reward',
                   val=np.min(all_immediate_reward))
        logger.log(key='max_immediate_reward',
                   val=np.max(all_immediate_reward))

        # Dump the loggings
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)

        return logger
Example #15
 def eval(self, n=None, **kwargs):
     t0 = time.perf_counter()
     with torch.no_grad():
         D = self.runner(self.agent, self.eval_env, 10, mode='eval')
     
     logger = Logger()
     logger('eval_iteration', n+1)
     logger('num_seconds', round(time.perf_counter() - t0, 1))
     logger('accumulated_trained_timesteps', self.agent.total_timestep)
     logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
     logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
     return logger.logs
Example #16
    def log_train(self, train_output, **kwargs):
        logger = Logger()
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']
        logger('train_iteration', n + 1)  # starts from 1
        logger('params', self.agent.policy.state_dict())

        logger('num_segments', D.N)
        logger('num_timesteps', D.total_T)
        logger('accumulated_trained_timesteps', self.agent.total_T)
        print('-' * 50)
        logger.dump(keys=None, index=None, indent=0)
        print('-' * 50)
        return logger.logs
Example #17
    def log_train(self, train_output, **kwargs):
        # Create training logger
        logger = Logger(name='train_logger')

        # Unpack training output for logging
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']

        # Loggings: use item() to save memory
        logger.log('train_iteration', n + 1)  # iteration starts from 1
        if self.config['algo.use_lr_scheduler']:
            logger.log('current_lr', out_agent['current_lr'])

        logger.log('loss', out_agent['loss'])
        logger.log('policy_loss', out_agent['policy_loss'])
        logger.log(
            'policy_entropy',
            -out_agent['entropy_loss'])  # negated entropy loss is the entropy
        logger.log('value_loss', out_agent['value_loss'])

        # Log something about trajectories
        batch_returns = [sum(trajectory.all_r) for trajectory in D]
        batch_discounted_returns = [
            trajectory.all_discounted_returns[0] for trajectory in D
        ]
        num_timesteps = sum([trajectory.T for trajectory in D])

        logger.log('num_trajectories', len(D))
        logger.log('num_timesteps', num_timesteps)
        logger.log('accumulated_trained_timesteps', self.agent.total_T)
        logger.log('average_return', np.mean(batch_returns))
        logger.log('average_discounted_return',
                   np.mean(batch_discounted_returns))
        logger.log('std_return', np.std(batch_returns))
        logger.log('min_return', np.min(batch_returns))
        logger.log('max_return', np.max(batch_returns))

        # Dump loggings
        if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
            print('-' * 50)
            logger.dump(keys=None, index=None, indent=0)
            print('-' * 50)

        return logger.logs
Example #18
    def train(self, n=None):
        self.agent.train()  # set to training mode

        # Create a logger
        train_output = Logger()

        # Iterate over data batches for one epoch
        for i, (data, label) in enumerate(self.train_loader):
            # Put data to device
            data = data.to(self.device)
            # Zero-out gradient buffer
            self.optimizer.zero_grad()
            # Forward pass of data
            re_x, mu, logvar = self.agent(data)
            # Calculate loss
            out = self.agent.calculate_loss(re_x=re_x,
                                            x=data,
                                            mu=mu,
                                            logvar=logvar,
                                            loss_type='BCE')
            loss = out['loss']
            # Backward pass to calculate gradients
            loss.backward()
            # Take a gradient step
            self.optimizer.step()

            # Record train output
            train_output.log('epoch', n)
            train_output.log('iteration', i)
            train_output.log('train_loss',
                             out['loss'].item())  # item() saves memory
            train_output.log('reconstruction_loss', out['re_loss'].item())
            train_output.log('KL_loss', out['KL_loss'].item())

            # Dump logging
            if i == 0 or (i + 1) % self.config['log.interval'] == 0:
                print('-' * 50)
                train_output.dump(keys=None, index=-1, indent=0)
                print('-' * 50)

        return train_output.logs
Example #19
    def log_train(self, train_output, **kwargs):
        # Unpack
        D = train_output['D']
        out_agent = train_output['out_agent']
        n = train_output['n']

        # Loggings
        logger = Logger(name='train_logger')
        logger.log('train_iteration', n + 1)  # starts from 1
        if self.config['algo.use_lr_scheduler']:
            logger.log('current_lr', out_agent['current_lr'])

        logger.log('loss', out_agent['loss'])
        logger.log('policy_loss', out_agent['policy_loss'])
        logger.log(
            'policy_entropy',
            -out_agent['entropy_loss'])  # entropy: negative entropy loss
        logger.log('value_loss', out_agent['value_loss'])

        all_immediate_reward = [segment.all_r for segment in D]
        num_timesteps = sum([segment.T for segment in D])

        logger.log('num_segments', len(D))
        logger.log('num_subsegments',
                   sum([len(segment.trajectories) for segment in D]))
        logger.log('num_timesteps', num_timesteps)
        logger.log('accumulated_trained_timesteps', self.agent.total_T)
        logger.log('average_immediate_reward', np.mean(all_immediate_reward))
        logger.log('std_immediate_reward', np.std(all_immediate_reward))
        logger.log('min_immediate_reward', np.min(all_immediate_reward))
        logger.log('max_immediate_reward', np.max(all_immediate_reward))

        # Dump loggings
        if n == 0 or (n + 1) % self.config['log.print_interval'] == 0:
            print('-' * 50)
            logger.dump(keys=None, index=None, indent=0)
            print('-' * 50)

        return logger.logs
Example #20
class ESMaster(BaseESMaster):
    def _network_size(self):
        worker = ESWorker()
        tmp_agent = worker.init(seed=0, config=self.config)
        num_params = worker.network.num_params
        
        del worker, tmp_agent
        
        return num_params
    
    def make_es(self, config):
        es = CMAES(mu0=[self.config['es.mu0']]*self._network_size(),
                   std0=self.config['es.std0'], 
                   popsize=self.config['es.popsize'])
        
        self.logger = Logger()
        
        return es
        
    def _process_es_result(self, result):
        best_f_val = result['best_f_val']
        best_return = -best_f_val  # negate to get back reward
        
        # logging
        self.logger.log('generation', self.generation)
        self.logger.log('best_return', best_return)
        
        if self.generation == 0 or (self.generation+1) % self.config['log.interval'] == 0:
            print('-'*50)
            self.logger.dump(keys=None, index=-1, indent=0)
            print('-'*50)
            
        # Save the loggings and final parameters
        if (self.generation+1) == self.num_iteration:
            pickle_dump(obj=self.logger.logs, f=self.logdir/'result', ext='.pkl')
            np.save(self.logdir/'trained_param', result['best_param'])
Example #21
    def test_logger(self):
        logger = Logger(name='logger')

        logger.log('iteration', 1)
        logger.log('learning_rate', 1e-3)
        logger.log('training_loss', 0.12)
        logger.log('evaluation_loss', 0.14)

        logger.log('iteration', 2)
        logger.log('learning_rate', 5e-4)
        logger.log('training_loss', 0.11)
        logger.log('evaluation_loss', 0.13)

        logger.log('iteration', 3)
        logger.log('learning_rate', 1e-4)
        logger.log('training_loss', 0.09)
        logger.log('evaluation_loss', 0.10)

        # Test dump; it only prints, so there is nothing to assert on
        logger.dump()
        logger.dump(keys=None, index=None, indent=1)
        logger.dump(keys=None, index=None, indent=2)
        logger.dump(keys=['iteration', 'evaluation_loss'],
                    index=None,
                    indent=0)
        logger.dump(keys=None, index=0, indent=0)
        logger.dump(keys=None, index=2, indent=0)
        logger.dump(keys=None, index=[0, 2], indent=0)
        logger.dump(keys=['iteration', 'training_loss'],
                    index=[0, 2],
                    indent=0)

        # Test save function
        file = './test_logger_file'
        logger.save(file=file)

        assert os.path.exists(file)

        # Load file
        logging = Logger.load(file)

        assert len(logging) == 4
        assert 'iteration' in logging
        assert 'learning_rate' in logging
        assert 'training_loss' in logging
        assert 'evaluation_loss' in logging

        assert np.allclose(logging['iteration'], [1, 2, 3])
        assert np.allclose(logging['learning_rate'], [1e-3, 5e-4, 1e-4])
        assert np.allclose(logging['training_loss'], [0.12, 0.11, 0.09])
        assert np.allclose(logging['evaluation_loss'], [0.14, 0.13, 0.1])

        # Delete the temp logger file
        os.unlink(file)
Example #22
def test_logger():
    logger = Logger()

    logger('iteration', 1)
    logger('learning_rate', 1e-3)
    logger('train_loss', 0.12)
    logger('eval_loss', 0.14)

    logger('iteration', 2)
    logger('learning_rate', 5e-4)
    logger('train_loss', 0.11)
    logger('eval_loss', 0.13)

    logger('iteration', 3)
    logger('learning_rate', 1e-4)
    logger('train_loss', 0.09)
    logger('eval_loss', 0.10)

    def check(logs):
        assert len(logs) == 4
        assert list(logs.keys()) == ['iteration', 'learning_rate', 'train_loss', 'eval_loss']
        assert logs['iteration'] == [1, 2, 3]
        assert np.allclose(logs['learning_rate'], [1e-3, 5e-4, 1e-4])
        assert np.allclose(logs['train_loss'], [0.12, 0.11, 0.09])
        assert np.allclose(logs['eval_loss'], [0.14, 0.13, 0.10])

    check(logger.logs)

    logger.dump()
    logger.dump(border='-'*50)
    logger.dump(keys=['iteration'])
    logger.dump(keys=['iteration', 'train_loss'])
    logger.dump(index=0)
    logger.dump(index=[1, 2])
    logger.dump(index=0)
    logger.dump(keys=['iteration', 'eval_loss'], index=1)
    logger.dump(keys=['iteration', 'learning_rate'], indent=1)
    logger.dump(keys=['iteration', 'train_loss'], index=[0, 2], indent=1, border='#'*50)

    f = Path('./logger_file')
    logger.save(f)
    f = f.with_suffix('.pkl')
    assert f.exists()

    logs = pickle_load(f)
    check(logs)

    f.unlink()
    assert not f.exists()

    logger.clear()
    assert len(logger.logs) == 0
Example #23
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(
                learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps',
                   learner_agent.total_timestep)

            infos = [
                info
                for info in chain.from_iterable([traj.infos for traj in D])
                if 'episode' in info
            ]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger(
                'online_return',
                describe(online_returns,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'online_horizon',
                describe(online_horizons,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))

            monitor_env = get_wrapper(env, 'VecMonitor')
            logger(
                'running_return',
                describe(monitor_env.return_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'running_horizon',
                describe(monitor_env.horizon_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger.dump(keys=None,
                        index=0,
                        indent=0,
                        border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)

            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')