Code Example #1
    def test_example(self):
        experiment_name = 'test_example'

        register_custom_components()

        # test training for a hundred thousand env steps
        cfg = custom_parse_args(argv=['--algo=APPO', '--env=my_custom_env_v1', f'--experiment={experiment_name}'])
        cfg.num_workers = 4
        cfg.train_for_env_steps = 100000
        cfg.save_every_sec = 1
        cfg.decorrelate_experience_max_seconds = 0
        cfg.seed = 0
        cfg.device = 'cpu'

        status = run_algorithm(cfg)
        self.assertEqual(status, ExperimentStatus.SUCCESS)

        # then test the evaluation of the saved model
        cfg = custom_parse_args(
            argv=['--algo=APPO', '--env=my_custom_env_v1', f'--experiment={experiment_name}'],
            evaluation=True,
        )
        cfg.device = 'cpu'
        status, avg_reward = enjoy(cfg, max_num_frames=1000)

        directory = experiment_dir(cfg=cfg)
        self.assertTrue(isdir(directory))
        shutil.rmtree(directory, ignore_errors=True)
        # self.assertFalse(isdir(directory))

        self.assertEqual(status, ExperimentStatus.SUCCESS)

        # not sure if we should check it here, it's optional
        # maybe a longer test where it actually has a chance to converge
        self.assertGreater(avg_reward, 60)
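Every snippet on this page passes a parsed cfg into experiment_dir(cfg=...) to locate the experiment's working directory. As a point of reference only, here is a minimal sketch of what such a helper could look like, assuming it simply joins a base train directory with the experiment name; this is an illustration, not the actual Sample Factory implementation.

import os
from os.path import join

def experiment_dir_sketch(cfg):
    # Assumed behavior: <train_dir>/<experiment>, created if missing.
    # cfg is any object with train_dir and experiment attributes.
    directory = join(cfg.train_dir, cfg.experiment)
    os.makedirs(directory, exist_ok=True)
    return directory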
Code Example #2
def ensure_initialized(cfg, env_name):
    global DMLAB_INITIALIZED
    if DMLAB_INITIALIZED:
        return

    dmlab_register_models()

    if env_name == 'dmlab_30':
        # extra functions to calculate human-normalized score etc.
        EXTRA_EPISODIC_STATS_PROCESSING.append(
            dmlab_extra_episodic_stats_processing)
        EXTRA_PER_POLICY_SUMMARIES.append(dmlab_extra_summaries)

    num_policies = cfg.num_policies if hasattr(cfg, 'num_policies') else 1
    all_levels = list_all_levels_for_experiment(env_name)
    level_cache_dir = cfg.dmlab_level_cache_path
    dmlab_ensure_global_cache_initialized(experiment_dir(cfg=cfg), all_levels,
                                          num_policies, level_cache_dir)

    DMLAB_INITIALIZED = True
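The module-level DMLAB_INITIALIZED flag implements a one-time, per-process initialization guard. A minimal standalone sketch of the same idiom (names are illustrative, not part of the library):

_SETUP_DONE = False

def ensure_setup_once(setup_fn):
    # Run the expensive global setup at most once per process.
    global _SETUP_DONE
    if _SETUP_DONE:
        return
    setup_fn()
    _SETUP_DONE = True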
Code Example #3
def policy_reward_shaping_file(cfg, policy_id):
    return join(experiment_dir(cfg=cfg),
                f'policy_{policy_id:02d}_reward_shaping.json')
Code Example #4
def policy_cfg_file(cfg, policy_id):
    return join(experiment_dir(cfg=cfg), f'policy_{policy_id:02d}_cfg.json')
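Both helpers only build file paths inside the experiment directory; reading and writing the JSON payloads happens elsewhere. A hedged usage sketch, with hypothetical save/load wrappers built on top of policy_cfg_file():

import json

def save_policy_cfg(cfg, policy_id, policy_cfg_dict):
    # Hypothetical wrapper: serialize a per-policy cfg dict to its JSON file.
    with open(policy_cfg_file(cfg, policy_id), 'w') as fobj:
        json.dump(policy_cfg_dict, fobj, indent=2)

def load_policy_cfg(cfg, policy_id):
    # Hypothetical wrapper: read the per-policy cfg dict back.
    with open(policy_cfg_file(cfg, policy_id)) as fobj:
        return json.load(fobj)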
Code Example #5
    def initialize(self):
        self._save_cfg()
        save_git_diff(experiment_dir(cfg=self.cfg))
        init_file_logger(experiment_dir(self.cfg))
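init_file_logger() itself is not shown here. A rough, hypothetical equivalent using only the standard logging module, writing a plain-text log into the experiment directory (the file name and format are assumptions, not Sample Factory's actual choices):

import logging
from os.path import join

def init_file_logger_sketch(directory):
    # Attach a file handler to the root logger; the log file name is illustrative.
    handler = logging.FileHandler(join(directory, 'run.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logging.getLogger().addHandler(handler)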
Code Example #6
    def __init__(self, cfg):
        super().__init__(cfg)

        # we should not use CUDA in the main thread, only on the workers
        set_global_cuda_envvars(cfg)

        tmp_env = make_env_func(self.cfg, env_config=None)
        self.obs_space = tmp_env.observation_space
        self.action_space = tmp_env.action_space
        self.num_agents = tmp_env.num_agents

        self.reward_shaping_scheme = None
        if self.cfg.with_pbt:
            self.reward_shaping_scheme = get_default_reward_shaping(tmp_env)

        tmp_env.close()

        # shared memory allocation
        self.traj_buffers = SharedBuffers(self.cfg, self.num_agents,
                                          self.obs_space, self.action_space)

        self.actor_workers = None

        self.report_queue = MpQueue(40 * 1000 * 1000)
        self.policy_workers = dict()
        self.policy_queues = dict()

        self.learner_workers = dict()

        self.workers_by_handle = None

        self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]
        self.policy_outputs = dict()
        for worker_idx in range(self.cfg.num_workers):
            for split_idx in range(self.cfg.worker_num_splits):
                self.policy_outputs[(worker_idx, split_idx)] = dict()

        self.policy_avg_stats = dict()
        self.policy_lag = [dict() for _ in range(self.cfg.num_policies)]

        self.last_timing = dict()
        self.env_steps = dict()
        self.samples_collected = [0 for _ in range(self.cfg.num_policies)]
        self.total_env_steps_since_resume = 0

        # currently this applies only to the current run, not experiment as a whole
        # to change this behavior we'd need to save the state of the main loop to a filesystem
        self.total_train_seconds = 0

        self.last_report = time.time()
        self.last_experiment_summaries = 0

        self.report_interval = 5.0  # sec
        self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

        self.avg_stats_intervals = (2, 12, 60)  # 10 seconds, 1 minute, 5 minutes

        self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
        self.throughput_stats = [
            deque([], maxlen=5) for _ in range(self.cfg.num_policies)
        ]
        self.avg_stats = dict()
        self.stats = dict()  # regular (non-averaged) stats

        init_wandb(self.cfg)

        self.writers = dict()
        writer_keys = list(range(self.cfg.num_policies))
        for key in writer_keys:
            summary_dir = join(summaries_dir(experiment_dir(cfg=self.cfg)),
                               str(key))
            summary_dir = ensure_dir_exists(summary_dir)
            self.writers[key] = SummaryWriter(summary_dir, flush_secs=20)

        self.pbt = PopulationBasedTraining(self.cfg,
                                           self.reward_shaping_scheme,
                                           self.writers)
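The fps_stats deque above is sized to the largest averaging window in avg_stats_intervals (60 report intervals of 5 seconds, i.e. 5 minutes). A hedged sketch of how a moving FPS average could be computed from it, assuming each entry is a (timestamp, total_env_steps) pair appended once per report interval; the helper name is illustrative:

def average_fps(fps_stats, num_intervals):
    # Compare the newest sample with the one num_intervals reports earlier.
    num_intervals = min(num_intervals, len(fps_stats) - 1)
    if num_intervals < 1:
        return 0.0
    past_time, past_steps = fps_stats[-num_intervals - 1]
    now_time, now_steps = fps_stats[-1]
    return (now_steps - past_steps) / max(now_time - past_time, 1e-9)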