def __init__(self, cfg):
    self.cfg = cfg

    if self.cfg.seed is not None:
        log.info('Setting fixed seed %d', self.cfg.seed)
        torch.manual_seed(self.cfg.seed)
        np.random.seed(self.cfg.seed)

    self.device = torch.device('cuda')

    # training progress counters
    self.train_step = self.env_steps = 0
    self.total_train_seconds = 0
    self.last_training_step = time.time()

    self.best_avg_reward = math.nan

    # write summaries and save checkpoints less frequently as training progresses
    self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000), (10000000, 10000)])
    self.last_summary_written = -1e9
    self.save_rate_decay = LinearDecay([(0, self.cfg.initial_save_rate), (1000000, 5000)], staircase=100)

    summary_dir = summaries_dir(experiment_dir(cfg=self.cfg))
    self.writer = SummaryWriter(summary_dir, flush_secs=10)
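# The LinearDecay helper used above is not shown here. The following is a minimal
# sketch of what such a schedule could look like, assuming a list of (step, value)
# milestones with linear interpolation and optional "staircase" quantization.
# Names and behavior are assumptions for illustration, not the repository's
# actual implementation.
class LinearDecay:
    def __init__(self, milestones, staircase=None):
        self.milestones = sorted(milestones)  # list of (step, value) pairs
        self.staircase = staircase

    def at(self, step):
        s0, v0 = self.milestones[0]
        if step <= s0:
            return v0
        for s1, v1 in self.milestones[1:]:
            if step <= s1:
                frac = (step - s0) / (s1 - s0)
                value = v0 + frac * (v1 - v0)
                break
            s0, v0 = s1, v1
        else:
            value = v0  # past the last milestone, hold the final value
        if self.staircase is not None:
            value = int(value // self.staircase) * self.staircase
        return value

# e.g. LinearDecay([(0, 100), (1000000, 2000)]).at(500000) -> 1050.0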
def __init__(self, params):
    super(AgentLearner, self).__init__(params)
    self.session = None  # actually created in "initialize" method
    self.saver = None

    tf.reset_default_graph()

    if self.params.seed >= 0:
        tf.random.set_random_seed(self.params.seed)

    self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000), (10000000, 10000)], staircase=100)
    self.save_rate_decay = LinearDecay([(0, self.params.initial_save_rate), (1000000, 5000)], staircase=100)

    self.initial_best_avg_reward = tf.constant(-1e3)
    self.best_avg_reward = tf.Variable(self.initial_best_avg_reward)
    self.total_env_steps = tf.Variable(0, dtype=tf.int64)

    def update_best_value(best_value, new_value):
        return tf.assign(best_value, tf.maximum(new_value, best_value))

    self.avg_reward_placeholder = tf.placeholder(tf.float32, [], 'new_avg_reward')
    self.update_best_reward = update_best_value(self.best_avg_reward, self.avg_reward_placeholder)
    self.total_env_steps_placeholder = tf.placeholder(tf.int64, [], 'new_env_steps')
    self.update_env_steps = tf.assign(self.total_env_steps, self.total_env_steps_placeholder)

    summary_dir = summaries_dir(self.params.experiment_dir())
    self.summary_writer = tf.summary.FileWriter(summary_dir)

    self.position_histograms = deque([], maxlen=self.params.num_position_histograms)

    self._last_trajectory_summary = 0  # timestamp of the latest trajectory summary written
    self._last_coverage_summary = 0  # timestamp of the latest coverage summary written

    self.map_img = self.coord_limits = None
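# For reference: update_best_reward above is the standard TF1 assign-through-placeholder
# pattern. Once self.session exists (created in "initialize"), it would typically be
# driven as sketched below. The surrounding variable names are illustrative assumptions.
new_avg_reward = 42.0  # e.g. mean reward over recent episodes
best_so_far = self.session.run(
    self.update_best_reward,
    feed_dict={self.avg_reward_placeholder: new_avg_reward},
)
# tf.assign returns the updated variable value, so best_so_far equals
# max(previous best, new_avg_reward).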
def __init__(self, make_env_func, params): """Initialize PPO computation graph and some auxiliary tensors.""" super(AgentRandom, self).__init__(params) self.make_env_func = make_env_func env = make_env_func() self.action_space = env.action_space env.close() self.objectives = None self.last_action = None self.best_reward = None summary_dir = summaries_dir(self.params.experiment_dir()) self.summary_writer = tf.summary.FileWriter(summary_dir) if self.params.use_env_map: self.map_img, self.coord_limits = generate_env_map(make_env_func)
def __init__(self, make_env_func, params): """Initialize PPO computation graph and some auxiliary tensors.""" super(AgentPPO, self).__init__(params) self.actor_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='actor_step') self.critic_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='critic_step') self.make_env_func = make_env_func env = make_env_func() # we need the env to query observation shape, number of actions, etc. self.obs_shape = [-1] + list(main_observation_space(env).shape) self.ph_observations = placeholder_from_space(main_observation_space(env)) self.ph_actions = placeholder_from_space(env.action_space) # actions sampled from the policy self.ph_advantages, self.ph_returns, self.ph_old_action_probs = placeholders(None, None, None) self.actor_critic = ActorCritic(env, self.ph_observations, self.params) env.close() self.objectives = self.add_ppo_objectives( self.actor_critic, self.ph_actions, self.ph_old_action_probs, self.ph_advantages, self.ph_returns, self.params, self.actor_step, ) # optimizers actor_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='actor_opt') self.train_actor = actor_opt.minimize(self.objectives.actor_loss, global_step=self.actor_step) critic_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='critic_opt') self.train_critic = critic_opt.minimize(self.objectives.critic_loss, global_step=self.critic_step) self.add_ppo_summaries() summary_dir = summaries_dir(self.params.experiment_dir()) self.summary_writer = tf.summary.FileWriter(summary_dir) self.actor_summaries = merge_summaries(collections=['actor']) self.critic_summaries = merge_summaries(collections=['critic']) if self.params.use_env_map: self.map_img, self.coord_limits = generate_env_map(make_env_func)
def test_graph_tensorboard(self):
    graph = nx.DiGraph()

    for i in range(100):
        graph.add_node(i)

    for i in range(100):
        nonedges = list(nx.non_edges(graph))
        chosen_nonedge = random.choice(nonedges)
        graph.add_edge(chosen_nonedge[0], chosen_nonedge[1])

    params = AgentLearner.AgentParams('__test_graph__')
    summary_dir = summaries_dir(params.experiment_dir())
    summary_writer = tf.summary.FileWriter(summary_dir)

    start_summary = time.time()
    summary = visualize_graph_tensorboard(graph, tag='test/graph')
    log.debug('Took %.3f seconds to write graph summary', time.time() - start_summary)
    summary_writer.add_summary(summary, global_step=1)

    shutil.rmtree(params.experiment_dir())
def __init__(self, cfg):
    super().__init__(cfg)

    # we should not use CUDA in the main thread, only on the workers
    set_global_cuda_envvars(cfg)

    tmp_env = make_env_func(self.cfg, env_config=None)
    self.obs_space = tmp_env.observation_space
    self.action_space = tmp_env.action_space
    self.num_agents = tmp_env.num_agents

    self.reward_shaping_scheme = None
    if self.cfg.with_pbt:
        if hasattr(tmp_env.unwrapped, '_reward_shaping_wrapper'):
            # noinspection PyProtectedMember
            self.reward_shaping_scheme = tmp_env.unwrapped._reward_shaping_wrapper.reward_shaping_scheme
        else:
            try:
                from envs.doom.multiplayer.doom_multiagent_wrapper import MultiAgentEnv
                if isinstance(tmp_env.unwrapped, MultiAgentEnv):
                    self.reward_shaping_scheme = tmp_env.unwrapped.default_reward_shaping
            except ImportError:
                pass

    tmp_env.close()

    # shared memory allocation
    self.traj_buffers = SharedBuffers(self.cfg, self.num_agents, self.obs_space, self.action_space)

    self.actor_workers = None

    self.report_queue = MpQueue(20 * 1000 * 1000)
    self.policy_workers = dict()
    self.policy_queues = dict()

    self.learner_workers = dict()

    self.workers_by_handle = None

    self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]
    self.policy_outputs = dict()
    for worker_idx in range(self.cfg.num_workers):
        for split_idx in range(self.cfg.worker_num_splits):
            self.policy_outputs[(worker_idx, split_idx)] = dict()

    self.policy_avg_stats = dict()
    self.policy_lag = [dict() for _ in range(self.cfg.num_policies)]

    self.last_timing = dict()
    self.env_steps = dict()
    self.samples_collected = [0 for _ in range(self.cfg.num_policies)]
    self.total_env_steps_since_resume = 0

    # currently this applies only to the current run, not experiment as a whole
    # to change this behavior we'd need to save the state of the main loop to a filesystem
    self.total_train_seconds = 0

    self.last_report = time.time()
    self.last_experiment_summaries = 0

    self.report_interval = 5.0  # sec
    self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

    self.avg_stats_intervals = (2, 12, 60)  # 10 seconds, 1 minute, 5 minutes
    self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
    self.throughput_stats = [deque([], maxlen=5) for _ in range(self.cfg.num_policies)]
    self.avg_stats = dict()
    self.stats = dict()  # regular (non-averaged) stats

    self.writers = dict()
    writer_keys = list(range(self.cfg.num_policies))
    for key in writer_keys:
        summary_dir = join(summaries_dir(experiment_dir(cfg=self.cfg)), str(key))
        summary_dir = ensure_dir_exists(summary_dir)
        self.writers[key] = SummaryWriter(summary_dir, flush_secs=20)

    self.pbt = PopulationBasedTraining(self.cfg, self.reward_shaping_scheme, self.writers)
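# Sketch (assumption about the reporting logic, which is not shown here): fps_stats
# holds (timestamp, env_steps) pairs, one appended per ~5-second report, so looking
# back 2 / 12 / 60 entries approximates 10-second / 1-minute / 5-minute FPS windows.
now = time.time()
self.fps_stats.append((now, self.total_env_steps_since_resume))

fps = []
for interval in self.avg_stats_intervals:
    past_time, past_steps = self.fps_stats[max(0, len(self.fps_stats) - 1 - interval)]
    if now > past_time:
        fps.append((self.total_env_steps_since_resume - past_steps) / (now - past_time))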
def main(): """Script entry point.""" stop_at = 80 * 1000 * 1000 prefix = 'simple' # noinspection PyUnusedLocal experiments_very_sparse = [ Experiment('doom_curious_vs_vanilla/doom_maze_very_sparse/doom_maze_very_sparse_pre_0.0', 'A2C (no curiosity)'), Experiment('doom_sweep_very_sparse/doom_sweep_i_0.5_p_0.05', 'A2C+ICM (curious)'), ] # noinspection PyUnusedLocal experiments_sparse = [ Experiment('doom_curious_vs_vanilla/doom_maze_sparse/doom_maze_sparse_pre_0.0', 'A2C (no curiosity)'), Experiment('doom_curious_vs_vanilla/doom_maze_sparse/doom_maze_sparse_pre_0.05', 'A2C+ICM (curious)'), ] # noinspection PyUnusedLocal experiments_basic = [ Experiment('doom_curious_vs_vanilla/doom_maze/doom_maze_pre_0.0', 'A2C (no curiosity)'), Experiment('doom_curious_vs_vanilla/doom_maze/doom_maze_pre_0.05', 'A2C+ICM (curious)'), ] experiments = [ Experiment('doom_curious_vs_vanilla/doom_basic/doom_basic_pre_0.0', 'A2C (no curiosity)'), Experiment('doom_curious_vs_vanilla/doom_basic/doom_basic_pre_0.05', 'A2C+ICM (curious)'), ] plots = [ Plot('a2c_aux_summary/avg_reward', 'average reward', 'Avg. reward for the last 1000 episodes'), Plot( 'a2c_agent_summary/policy_entropy', 'policy entropy, nats', 'Stochastic policy entropy', ), ] for plot in plots: fig = plt.figure(figsize=(5, 4)) fig.add_subplot() for ex_i, experiment in enumerate(experiments): experiment_name = experiment.name.split(os.sep)[-1] experiments_root = join(*(experiment.name.split(os.sep)[:-1])) exp_dir = experiment_dir(experiment_name, experiments_root) path_to_events_dir = summaries_dir(exp_dir) events_files = [] for f in os.listdir(path_to_events_dir): if f.startswith('events'): events_files.append(join(path_to_events_dir, f)) if len(events_files) == 0: log.error('No events file for %s', experiment) continue events_files = sorted(events_files) steps, values = [], [] for events_file in events_files: iterator = tf.train.summary_iterator(events_file) while True: try: e = next(iterator, None) except Exception as exc: log.warning(str(exc)) break if e is None: break for v in e.summary.value: if e.step >= stop_at: break if v.tag == plot.name: steps.append(e.step) values.append(v.simple_value) # just in case values = np.nan_to_num(values) smooth = 10 values_smooth = running_mean(values, smooth) steps = steps[smooth:] values = values[smooth:] plt.plot(steps, values, color=COLORS[ex_i], alpha=0.2, label='__nolegend__') plt.plot(steps, values_smooth, color=COLORS[ex_i], label=experiment.descr, linewidth=2) plt.xlabel('environment steps') plt.ylabel(plot.axis) plt.title(plot.descr) plt.grid(True) plt.legend() plt.tight_layout() plots_dir = ensure_dir_exists(join(experiments_dir(), 'plots')) plot_name = plot.name.replace('/', '_') plt.savefig(join(plots_dir, f'{prefix}_{plot_name}.png')) plt.close() return 0