def main(env_config, model_config, agent_config, replay_config,
         n, record=False, size=(128, 128), video_len=1000,
         fps=30, save=False):
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    try:
        # prefer an algorithm-specific env factory if one is provided
        make_env = pkg.import_module('env', algo_name, place=-1).make_env
    except Exception:
        make_env = None
    env_config.pop('reward_clip', False)
    env = create_env(env_config, env_fn=make_env)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    # evaluate at least one episode per environment copy
    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(
        env, agent, n, record=record, size=size, video_len=video_len)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    _, step = replay.count_episodes()
    step = max(agent.env_step, step)

    runner = Runner(env, agent, step=step)

    def random_actor(*args, **kwargs):
        prev_action = random_actor.prev_action
        random_actor.prev_action = action = env.random_action()
        return action, {'prev_action': prev_action}
    random_actor.prev_action = np.zeros_like(env.random_action()) \
        if isinstance(env.random_action(), np.ndarray) else 0
    while not replay.good_to_learn():
        step = runner.run(action_selector=random_actor, step_fn=collect)

    to_log = Every(agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < int(agent.MAX_STEPS):
        start_step = step
        start_t = time.time()
        agent.learn_log(step)
        step = runner.run(step_fn=collect, nsteps=agent.TRAIN_PERIOD)
        duration = time.time() - start_t
        agent.store(
            fps=(step - start_step) / duration,
            tps=agent.N_UPDATES / duration)

        if to_eval(step):
            with TempStore(agent.get_states, agent.reset_states):
                score, epslen, video = evaluate(
                    eval_env, agent, record=agent.RECORD, size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=score, eval_epslen=epslen)

        if to_log(step):
            agent.log(step)
            agent.save()
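# The training loops above and below gate logging/evaluation with `Every`.
# The sketch here is an assumption consistent with how it is called
# (`Every(period)` / `Every(period, start)` returning a step-triggered bool),
# not necessarily this repo's actual implementation.
class Every:
    def __init__(self, period, start=0):
        self._period = period
        self._next = start

    def __call__(self, step):
        # fire at most once per `period` steps, starting at `start`
        if self._period is None:
            return False
        if step >= self._next:
            while step >= self._next:
                self._next += self._period
            return True
        return False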
def train(agent, env, eval_env, buffer):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, buffer)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    exp_buffer = get_expert_data(f'{buffer.DATA_PATH}-{env.name}')

    if step == 0 and agent.is_obs_normalized:
        print('Start to initialize running stats...')
        for _ in range(10):
            runner.run(action_selector=env.random_action, step_fn=collect)
            agent.update_obs_rms(np.concatenate(buffer['obs']))
            agent.update_reward_rms(buffer['reward'], buffer['discount'])
            buffer.reset()
        buffer.clear()
        agent.save(print_terminal_info=True)
        runner.step = step

    # print("Initial running stats:", *[f'{k:.4g}' for k in agent.get_running_stats() if k])
    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        agent.before_run(env)
        with rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())

        buffer.reshape_to_sample()
        agent.disc_learn_log(exp_buffer)
        buffer.compute_reward_with_func(agent.compute_reward)
        buffer.reshape_to_store()
        # NOTE: normalizing rewards here may introduce some inconsistency
        # if normalized rewards are fed as an input to the network.
        # One can reconcile this by moving normalization to collect
        # or by feeding the network unnormalized rewards.
        # The latter is adopted in our implementation.
        # However, the following line currently doesn't store
        # a copy of the unnormalized rewards.
        agent.update_reward_rms(buffer['reward'], buffer['discount'])
        buffer.update('reward', agent.normalize_reward(buffer['reward']), field='all')
        agent.record_last_env_output(runner.env_output)
        value = agent.compute_value()
        buffer.finish(value)

        start_train_step = agent.train_step
        with tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step) or step > agent.MAX_STEPS:
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    eval_score, eval_epslen, video = evaluate(
                        eval_env, agent, n=agent.N_EVAL_EPISODES,
                        record=agent.RECORD, size=(64, 64))
                if agent.RECORD:
                    video_summary(f'{agent.name}/sim', video, step=step)
                agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            with lt:
                agent.store(**{
                    'train_step': agent.train_step,
                    'time/run': rt.total(),
                    'time/train': tt.total(),
                    'time/eval': et.total(),
                    'time/log': lt.total(),
                    'time/run_mean': rt.average(),
                    'time/train_mean': tt.average(),
                    'time/eval_mean': et.average(),
                    'time/log_mean': lt.average(),
                })
                agent.log(step)
                agent.save()
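# The loops above and below time their phases with `Timer` and read back
# `last()`, `total()`, and `average()`. The minimal sketch below matches that
# interface (including use as `with Timer('env') as rt:`); it is an assumed
# implementation, not necessarily the repo's own.
import time

class Timer:
    def __init__(self, name):
        self._name = name
        self._last = 0.
        self._total = 0.
        self._count = 0

    def __enter__(self):
        self._start = time.time()
        return self

    def __exit__(self, *exc):
        self._last = time.time() - self._start
        self._total += self._last
        self._count += 1

    def last(self):
        # duration of the most recent timed block, in seconds
        return self._last

    def total(self):
        # cumulative time spent inside this timer
        return self._total

    def average(self):
        # mean duration per timed block
        return self._total / max(self._count, 1)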
def train(agent, env, eval_env, buffer):
    def collect(env, step, reset, next_obs, **kwargs):
        buffer.add(**kwargs)

    step = agent.env_step
    runner = Runner(env, agent, step=step, nsteps=agent.N_STEPS)
    actsel = lambda *args, **kwargs: np.random.randint(
        0, env.action_dim, size=env.n_envs)
    if not agent.rnd_rms_restored():
        print('Start to initialize observation running stats...')
        for _ in range(50):
            runner.run(action_selector=actsel, step_fn=collect)
            agent.update_obs_rms(buffer['obs'])
            buffer.reset()
        buffer.clear()
        agent.save()
        runner.step = step

    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    print('Training starts...')
    while step < agent.MAX_STEPS:
        start_env_step = agent.env_step
        with Timer('env') as rt:
            step = runner.run(step_fn=collect)
        agent.store(fps=(step - start_env_step) / rt.last())

        agent.record_last_env_output(runner.env_output)
        value_int, value_ext = agent.compute_value()
        obs = buffer.get_obs(runner.env_output.obs)
        assert obs.shape[:2] == (env.n_envs, agent.N_STEPS + 1)
        assert obs.dtype == np.uint8
        agent.update_obs_rms(obs[:, :-1])
        norm_obs = agent.normalize_obs(obs)
        # compute intrinsic rewards from the next normalized observations
        reward_int = agent.compute_int_reward(norm_obs[:, 1:])
        agent.update_int_reward_rms(reward_int)
        reward_int = agent.normalize_int_reward(reward_int)
        buffer.finish(reward_int, norm_obs[:, :-1], value_int, value_ext)
        agent.store(
            reward_int_max=np.max(reward_int),
            reward_int_min=np.min(reward_int),
            reward_int=np.mean(reward_int),
            reward_int_std=np.std(reward_int),
        )

        start_train_step = agent.train_step
        with Timer('train') as tt:
            agent.learn_log(step)
        agent.store(tps=(agent.train_step - start_train_step) / tt.last())
        buffer.reset()

        if to_eval(agent.train_step):
            with TempStore(agent.get_states, agent.reset_states):
                scores, epslens, video = evaluate(
                    eval_env, agent, record=True, video_len=4500)
                video_summary(f'{agent.name}/sim', video, step=step)
                if eval_env.n_envs == 1:
                    rews_int, rews_ext = agent.retrieve_eval_rewards()
                    assert len(rews_ext) == len(rews_int) == video.shape[1], (
                        len(rews_ext), len(rews_int), video.shape[1])
                    n = 10
                    idxes_int = rews_int.argsort()[::-1][:n]
                    idxes_ext = rews_ext.argsort()[::-1][:n]
                    assert idxes_int.shape == idxes_ext.shape, (
                        idxes_int.shape, idxes_ext.shape)
                    imgs_int = video[0, idxes_int]
                    imgs_ext = video[0, idxes_ext]
                    rews_int = rews_int[idxes_int]
                    rews_ext = rews_ext[idxes_ext]
                    terms = {
                        **{f'eval/reward_int_{i}': rews_int[i] for i in range(n)},
                        **{f'eval/reward_ext_{i}': rews_ext[i] for i in range(n)},
                    }
                    agent.store(**terms)
                    imgs = np.concatenate([imgs_int[:n], imgs_ext[:n]], 0)
                    image_summary(f'{agent.name}/img', imgs, step=step)

                    # info = eval_env.info()[0]
                    # episode = info.get('episode', {'visited_rooms': 1})
                    # agent.store(visited_rooms_max=len(episode['visited_rooms']))
                    agent.histogram_summary(
                        {'eval/action': agent.retrieve_eval_actions()}, step=step)
                agent.store(eval_score=scores, eval_epslen=epslens)

        if to_log(agent.train_step) and agent.contains_stats('score'):
            agent.store(**{
                'episodes': runner.episodes,
                'train_step': agent.train_step,
                'time/run': rt.total(),
                'time/train': tt.total(),
            })
            agent.log(step)
            agent.save()
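# The RND loop above maintains running mean/std statistics for observations and
# intrinsic rewards (`update_obs_rms`, `normalize_obs`, `update_int_reward_rms`,
# `normalize_int_reward`). The generic RunningMeanStd sketch below illustrates
# the standard parallel-update formula behind such statistics; it is an
# illustration of the idea, not the agent's actual implementation.
import numpy as np

class RunningMeanStd:
    def __init__(self, shape=(), epsilon=1e-4, clip=5.):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon
        self.clip = clip

    def update(self, x):
        # x: a batch of samples with a leading batch dimension
        x = x.reshape(-1, *self.mean.shape)
        batch_mean = x.mean(axis=0)
        batch_var = x.var(axis=0)
        batch_count = x.shape[0]
        # Chan et al. parallel mean/variance update
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total

    def normalize(self, x):
        x = (x - self.mean) / np.sqrt(self.var + 1e-8)
        return np.clip(x, -self.clip, self.clip)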
def _run(self, record):
    score, epslen, video = evaluate(
        self.env, self, record=record, n=self.N_EVALUATION)
    self.store(score, epslen, video)
def train(agent, env, eval_env, replay):
    collect_fn = pkg.import_module('agent', algo=agent.name).collect
    collect = functools.partial(collect_fn, replay)

    env_step = agent.env_step
    runner = Runner(env, agent, step=env_step, nsteps=agent.TRAIN_PERIOD)
    while not replay.good_to_learn():
        env_step = runner.run(
            # NOTE: random actions below make a huge difference for Mujoco tasks;
            # by default we don't use them as it's not a conventional practice.
            # action_selector=env.random_action,
            step_fn=collect)

    to_log = Every(agent.LOG_PERIOD, agent.LOG_PERIOD)
    to_eval = Every(agent.EVAL_PERIOD)
    to_record = Every(agent.EVAL_PERIOD * 10)
    rt = Timer('run')
    tt = Timer('train')
    et = Timer('eval')
    lt = Timer('log')
    print('Training starts...')
    while env_step <= int(agent.MAX_STEPS):
        with rt:
            env_step = runner.run(step_fn=collect)
        with tt:
            agent.learn_log(env_step)

        if to_eval(env_step):
            with TempStore(agent.get_states, agent.reset_states):
                with et:
                    record = agent.RECORD and to_record(env_step)
                    eval_score, eval_epslen, video = evaluate(
                        eval_env, agent, n=agent.N_EVAL_EPISODES,
                        record=record, size=(64, 64))
                if record:
                    video_summary(f'{agent.name}/sim', video, step=env_step)
                agent.store(eval_score=eval_score, eval_epslen=eval_epslen)

        if to_log(env_step):
            with lt:
                # each run covers TRAIN_PERIOD env steps; each learn_log does N_UPDATES train steps
                fps = agent.TRAIN_PERIOD / rt.average()
                tps = agent.N_UPDATES / tt.average()
                agent.store(
                    env_step=agent.env_step,
                    train_step=agent.train_step,
                    fps=fps,
                    tps=tps,
                )
                agent.store(**{
                    'train_step': agent.train_step,
                    'time/run': rt.total(),
                    'time/train': tt.total(),
                    'time/eval': et.total(),
                    'time/log': lt.total(),
                    'time/run_mean': rt.average(),
                    'time/train_mean': tt.average(),
                    'time/eval_mean': et.average(),
                    'time/log_mean': lt.average(),
                })
                agent.log(env_step)
                agent.save()
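# Evaluation in the loops above runs inside
# `TempStore(agent.get_states, agent.reset_states)` so the evaluation rollout
# does not clobber the training-time recurrent states. A plausible sketch of
# such a context manager follows; the repo's actual class is assumed to behave
# along these lines.
class TempStore:
    def __init__(self, get_fn, set_fn):
        self._get_fn = get_fn
        self._set_fn = set_fn

    def __enter__(self):
        # remember the current (training) states
        self._states = self._get_fn()
        return self._states

    def __exit__(self, *exc):
        # restore them once the temporary (evaluation) rollout is done
        self._set_fn(self._states)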
def main(env_config, model_config, agent_config, replay_config,
         n, record=False, size=(128, 128), video_len=1000,
         fps=30, save=False):
    logging.basicConfig(level=logging.DEBUG)
    silence_tf_logs()
    configure_gpu()
    configure_precision(agent_config.get('precision', 32))

    use_ray = env_config.get('n_workers', 0) > 1
    if use_ray:
        import ray
        ray.init()
        sigint_shutdown_ray()

    algo_name = agent_config['algorithm']
    env_name = env_config['name']

    if record:
        env_config['log_episode'] = True
        env_config['n_workers'] = env_config['n_envs'] = 1
    env = create_env(env_config)

    create_model, Agent = pkg.import_agent(config=agent_config)
    models = create_model(model_config, env)
    agent = Agent(config=agent_config, models=models, dataset=None, env=env)

    if save:
        n_workers = env_config.get('n_workers', 1)
        n_envs = env_config.get('n_envs', 1)
        replay_config['n_envs'] = n_workers * n_envs
        replay_config['replay_type'] = 'uniform'
        replay_config['dir'] = f'data/{agent.name.lower()}-{env.name.lower()}'
        replay_config['n_steps'] = 1
        replay_config['save'] = True
        replay_config['save_temp'] = True
        replay_config['capacity'] = int(1e6)
        replay_config['has_next_obs'] = True
        replay = create_replay(replay_config)

        def collect(obs, action, reward, discount, next_obs, logpi, **kwargs):
            replay.add(obs=obs, action=action, reward=reward,
                       discount=discount, next_obs=next_obs, logpi=logpi)
    else:
        def collect(**kwargs):
            pass

    if n < env.n_envs:
        n = env.n_envs
    scores, epslens, video = evaluate(
        env, agent, n, record=record, size=size,
        video_len=video_len, step_fn=collect)
    pwc(f'After running {n} episodes',
        f'Score: {np.mean(scores):.3g}\tEpslen: {np.mean(epslens):.3g}',
        color='cyan')

    if save:
        replay.save()
    if record:
        save_video(f'{algo_name}-{env_name}', video, fps=fps)
    if use_ray:
        ray.shutdown()
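# A hypothetical invocation of the evaluation entry point above. The config
# dicts are placeholders that only show keys this function itself reads; in
# practice the full configs are assumed to come from the algorithm's
# configuration files, and the env/algorithm names here are illustrative.
if __name__ == '__main__':
    env_config = {'name': 'BreakoutNoFrameskip-v4', 'n_workers': 1, 'n_envs': 1}
    model_config = {}        # filled from the algorithm's model config in practice
    agent_config = {'algorithm': 'ppo', 'precision': 32}
    replay_config = {}
    main(env_config, model_config, agent_config, replay_config,
         n=10, record=True, size=(128, 128), video_len=1000, fps=30, save=False)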