def _set_up(self, eval_mode):
  """Sets up the runner by creating and initializing the agent."""
  # Reset the tf default graph to avoid name collisions from previous runs
  # before doing anything else.
  tf.reset_default_graph()
  self._summary_writer = tf.summary.FileWriter(self._output_dir)
  if self._episode_log_file:
    self._episode_writer = tf.io.TFRecordWriter(
        os.path.join(self._output_dir, self._episode_log_file))
  # Set up a session and initialize variables.
  self._sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = self._create_agent_fn(
      self._sess,
      self._env,
      summary_writer=self._summary_writer,
      eval_mode=eval_mode)
  # Type check: env/agent must both be multi- or single-user.
  if self._agent.multi_user and not isinstance(
      self._env.environment, environment.MultiUserEnvironment):
    raise ValueError('Multi-user agent requires multi-user environment.')
  if not self._agent.multi_user and isinstance(
      self._env.environment, environment.MultiUserEnvironment):
    raise ValueError('Single-user agent requires single-user environment.')
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._sess.run(tf.local_variables_initializer())
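# The agent factory passed as `_create_agent_fn` is only constrained by the
# call in `_set_up` above: it must accept the session, the environment, a
# `summary_writer`, and an `eval_mode` flag, and return an agent that exposes
# a `multi_user` attribute. A minimal hypothetical sketch (the agent class
# name is illustrative, not part of the original code):
def create_single_user_agent(sess, env, summary_writer=None, eval_mode=False):
  """Hypothetical agent factory matching the `_create_agent_fn` interface."""
  return MySingleUserAgent(  # Any single-user agent implementation.
      sess,
      observation_space=env.observation_space,
      action_space=env.action_space,
      summary_writer=summary_writer,
      eval_mode=eval_mode)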
def _set_up(self, eval_mode):
  """Sets up the runner by creating and initializing the agent."""
  # Reset the tf default graph to avoid name collisions from previous runs
  # before doing anything else.
  tf.reset_default_graph()
  self._summary_writer = tf.summary.FileWriter(self._output_dir)
  if self._episode_log_file:
    self._episode_writer = tf.python_io.TFRecordWriter(
        os.path.join(self._output_dir, self._episode_log_file))
  # Set up a session and initialize variables.
  self._sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = self._create_agent_fn(
      self._sess,
      self._env,
      summary_writer=self._summary_writer,
      eval_mode=eval_mode)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._sess.run(tf.local_variables_initializer())
def __init__(self,
             base_dir,
             data_load_fn=load_data,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250,
             batch_size=100,
             evaluation_inputs=None,
             evaluation_size=None):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    data_load_fn: function that returns data as a tuple (inputs, outputs).
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater
      than start_iteration).
    training_steps: int, the number of training steps to perform.
    batch_size: int, batch size used for the training.
    evaluation_inputs: tuple of inputs to the generator that can be used
      during qualitative evaluation. If None, the inputs passed above will
      be used.
    evaluation_size: int, the number of images that should be generated by
      randomly sampling from the data specified in evaluation_inputs. If
      None, all evaluation_inputs are generated.

  This constructor will take the following actions:
    - Initialize a `tf.Session`.
    - Initialize a logger.
    - Initialize a generator.
    - Reload from the latest checkpoint, if available, and initialize the
      Checkpointer object.
  """
  assert base_dir is not None
  inputs, data_to_generate = data_load_fn()
  assert inputs is None or inputs.shape[0] == data_to_generate.shape[0]
  assert (evaluation_inputs is None or
          evaluation_inputs.shape[1:] == inputs.shape[1:])
  assert evaluation_inputs is not None or evaluation_size is not None, (
      'Either evaluation_inputs or evaluation_size has to be initialised.')
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._data_to_generate = data_to_generate
  self._inputs = inputs
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._batch_size = batch_size
  self._evaluation_inputs = evaluation_inputs
  if self._evaluation_inputs is None:
    self._evaluation_inputs = inputs
  self._evaluation_size = evaluation_size
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)
  config = tf.ConfigProto(allow_soft_placement=True)
  # Allocate only the subset of GPU memory that is needed, which allows
  # running multiple workers on the same GPU.
  config.gpu_options.allow_growth = True
  # Set up a session and initialize variables.
  self._sess = tf.Session('', config=config)
  self._generator = create_generator(self._sess,
                                     data_to_generate,
                                     inputs,
                                     summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
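# A hypothetical way to drive the constructor above, assuming the enclosing
# class is named `Runner` (as its docstring suggests) and a toy data loader
# returning an (inputs, outputs) pair of NumPy arrays. All names and shapes
# below are illustrative only:
import numpy as np

def toy_data_load_fn():
  # 1000 conditioning vectors and 1000 target images of shape (32, 32, 3).
  inputs = np.random.rand(1000, 10).astype(np.float32)
  outputs = np.random.rand(1000, 32, 32, 3).astype(np.float32)
  return inputs, outputs

runner = Runner(
    base_dir='/tmp/generator_experiment',
    data_load_fn=toy_data_load_fn,
    evaluation_size=16)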
def __init__(self,
             base_dir,
             create_agent_fn,
             create_environment_fn=atari_lib.create_atari_environment,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250000,
             evaluation_steps=125000,
             max_steps_per_episode=27000,
             reward_clipping=(-1, 1)):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    create_agent_fn: A function that takes as args a Tensorflow session and
      an environment, and returns an agent.
    create_environment_fn: A function which receives a problem name and
      creates a Gym environment for that problem (e.g. an Atari 2600 game).
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater
      than start_iteration).
    training_steps: int, the number of training steps to perform.
    evaluation_steps: int, the number of evaluation steps to perform.
    max_steps_per_episode: int, maximum number of steps after which an
      episode terminates.
    reward_clipping: Tuple[int, int], the minimum and maximum bounds for the
      reward at each step. If `None`, no clipping is applied.

  This constructor will take the following actions:
    - Initialize an environment.
    - Initialize a `tf.Session`.
    - Initialize a logger.
    - Initialize an agent.
    - Reload from the latest checkpoint, if available, and initialize the
      Checkpointer object.
  """
  assert base_dir is not None
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._evaluation_steps = evaluation_steps
  self._max_steps_per_episode = max_steps_per_episode
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)
  self._environment = create_environment_fn()
  # Set up a session and initialize variables.
  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  self._sess = tf.Session('', config=config)
  self._agent = create_agent_fn(self._sess, self._environment,
                                summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
  self._reward_clipping = reward_clipping
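# The constructor above only stores `reward_clipping`; one plausible way the
# bounds would be consumed in the per-step loop is a helper like the sketch
# below (illustrative, not code from this runner):
import numpy as np

def _clip_reward(self, reward):
  """Clips the reward to the configured bounds, if clipping is enabled."""
  if self._reward_clipping is None:
    return reward
  min_reward, max_reward = self._reward_clipping
  return np.clip(reward, min_reward, max_reward)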
def learn_metric(self, verbose=False):
  """Approximate the bisimulation metric by learning.

  Args:
    verbose: bool, whether to print verbose messages.
  """
  summary_writer = tf.summary.FileWriter(self.base_dir)
  global_step = tf.Variable(0, trainable=False)
  inc_global_step_op = tf.assign_add(global_step, 1)
  bisim_horizon = 0.0
  bisim_horizon_discount_value = 1.0
  if self.use_decayed_learning_rate:
    learning_rate = tf.train.exponential_decay(self.starting_learning_rate,
                                               global_step,
                                               self.num_iterations,
                                               self.learning_rate_decay,
                                               staircase=self.staircase)
  else:
    learning_rate = self.starting_learning_rate
  tf.summary.scalar('Learning/LearningRate', learning_rate)
  optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                     epsilon=self.epsilon)
  train_op = self._build_train_op(optimizer)
  sync_op = self._build_sync_op()
  eval_op = tf.stop_gradient(self._build_eval_metric())
  eval_states = []
  # Build the evaluation tensor.
  for state in range(self.num_states):
    row, col = self.inverse_index_states[state]
    # We make the evaluation states at the center of each grid cell.
    eval_states.append([row + 0.5, col + 0.5])
  eval_states = np.array(eval_states, dtype=np.float64)
  normalized_bisim_metric = (
      self.bisim_metric / np.linalg.norm(self.bisim_metric))
  metric_errors = []
  average_metric_errors = []
  normalized_metric_errors = []
  average_normalized_metric_errors = []
  saver = tf.train.Saver(max_to_keep=3)
  with tf.Session() as sess:
    summary_writer.add_graph(graph=tf.get_default_graph())
    sess.run(tf.global_variables_initializer())
    merged_summaries = tf.summary.merge_all()
    for i in range(self.num_iterations):
      sampled_states = np.random.randint(self.num_states,
                                         size=(self.batch_size,))
      sampled_actions = np.random.randint(4, size=(self.batch_size,))
      if self.add_noise:
        sampled_noise = np.clip(
            np.random.normal(0, 0.1, size=(self.batch_size, 2)), -0.3, 0.3)
      sampled_action_names = [self.actions[x] for x in sampled_actions]
      next_states = [self.next_states[a][s]
                     for s, a in zip(sampled_states, sampled_action_names)]
      rewards = np.array([self.rewards[a][s]
                          for s, a in zip(sampled_states,
                                          sampled_action_names)])
      states = np.array(
          [self.inverse_index_states[x] for x in sampled_states])
      next_states = np.array(
          [self.inverse_index_states[x] for x in next_states])
      states = states.astype(np.float64)
      states += 0.5  # Place points in center of grid.
      next_states = next_states.astype(np.float64)
      next_states += 0.5
      if self.add_noise:
        states += sampled_noise
        next_states += sampled_noise
      _, summary = sess.run(
          [train_op, merged_summaries],
          feed_dict={self.s1_ph: states,
                     self.s2_ph: next_states,
                     self.action_ph: sampled_actions,
                     self.rewards_ph: rewards,
                     self.bisim_horizon_ph: bisim_horizon,
                     self.eval_states_ph: eval_states})
      summary_writer.add_summary(summary, i)
      if self.double_period_halfway and i > self.num_iterations / 2.:
        self.target_update_period *= 2
        self.double_period_halfway = False
      if i % self.target_update_period == 0:
        bisim_horizon = 1.0 - bisim_horizon_discount_value
        bisim_horizon_discount_value *= self.bisim_horizon_discount
        sess.run(sync_op)
      # Now compute difference with exact metric.
      self.learned_distance = sess.run(
          eval_op, feed_dict={self.eval_states_ph: eval_states})
      self.learned_distance = np.reshape(self.learned_distance,
                                         (self.num_states, self.num_states))
      metric_difference = np.max(
          abs(self.learned_distance - self.bisim_metric))
      average_metric_difference = np.mean(
          abs(self.learned_distance - self.bisim_metric))
      normalized_learned_distance = (
          self.learned_distance / np.linalg.norm(self.learned_distance))
      normalized_metric_difference = np.max(
          abs(normalized_learned_distance - normalized_bisim_metric))
      average_normalized_metric_difference = np.mean(
          abs(normalized_learned_distance - normalized_bisim_metric))
      error_summary = tf.Summary(value=[
          tf.Summary.Value(tag='Approx/Error',
                           simple_value=metric_difference),
          tf.Summary.Value(tag='Approx/AvgError',
                           simple_value=average_metric_difference),
          tf.Summary.Value(tag='Approx/NormalizedError',
                           simple_value=normalized_metric_difference),
          tf.Summary.Value(tag='Approx/AvgNormalizedError',
                           simple_value=average_normalized_metric_difference),
      ])
      summary_writer.add_summary(error_summary, i)
      sess.run(inc_global_step_op)
      if i % 100 == 0:  # Collect statistics every 100 steps.
        metric_errors.append(metric_difference)
        average_metric_errors.append(average_metric_difference)
        normalized_metric_errors.append(normalized_metric_difference)
        average_normalized_metric_errors.append(
            average_normalized_metric_difference)
        saver.save(sess, os.path.join(self.base_dir, 'tf_ckpt'),
                   global_step=i)
      if self.debug and i % 100 == 0:
        self.pretty_print_metric(metric_type='learned')
        print('Iteration: {}'.format(i))
        print('Metric difference: {}'.format(metric_difference))
        print('Normalized metric difference: {}'.format(
            normalized_metric_difference))
    if self.add_noise:
      # Finally, if we have noise, we draw a bunch of samples to get
      # estimates of the distances between states.
      sampled_distances = {}
      for _ in range(self.total_final_samples):
        eval_states = []
        for state in range(self.num_states):
          row, col = self.inverse_index_states[state]
          # We make the evaluation states at the center of each grid cell.
          eval_states.append([row + 0.5, col + 0.5])
        eval_states = np.array(eval_states, dtype=np.float64)
        eval_noise = np.clip(
            np.random.normal(0, 0.1, size=(self.num_states, 2)), -0.3, 0.3)
        eval_states += eval_noise
        distance_samples = sess.run(
            eval_op, feed_dict={self.eval_states_ph: eval_states})
        distance_samples = np.reshape(distance_samples,
                                      (self.num_states, self.num_states))
        for s1 in range(self.num_states):
          for s2 in range(self.num_states):
            sampled_distances[(tuple(eval_states[s1]),
                               tuple(eval_states[s2]))] = (
                                   distance_samples[s1, s2])
    else:
      # Otherwise we just use the last evaluation metric.
      sampled_distances = self.learned_distance
  learned_statistics = {
      'num_iterations': self.num_iterations,
      'metric_errors': metric_errors,
      'average_metric_errors': average_metric_errors,
      'normalized_metric_errors': normalized_metric_errors,
      'average_normalized_metric_errors': average_normalized_metric_errors,
      'learned_distances': sampled_distances,
  }
  self.statistics['learned'] = learned_statistics
  if verbose:
    self.pretty_print_metric(metric_type='learned')
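# For reference, the schedule produced by `tf.train.exponential_decay` in
# `learn_metric` above (with decay_steps=self.num_iterations and
# decay_rate=self.learning_rate_decay) follows the standard formula below;
# `staircase=True` floors the exponent so the rate decays in discrete steps.
# This is a NumPy sketch of the documented behaviour, not code from the class:
import numpy as np

def decayed_learning_rate(starting_learning_rate, global_step, decay_steps,
                          decay_rate, staircase=False):
  exponent = global_step / float(decay_steps)
  if staircase:
    exponent = np.floor(exponent)
  return starting_learning_rate * decay_rate ** exponent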
def reset():
  # `tf.Graph` is not itself a context manager, and calling
  # `tf.reset_default_graph()` while a graph context is active raises an
  # error, so simply reset the default graph directly.
  tf.reset_default_graph()
def __init__(self,
             base_dir,
             agent_creator,
             create_environment_fn=create_atari_environment,
             game_name=None,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250000,
             evaluation_steps=125000,
             max_steps_per_episode=27000):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    agent_creator: A function that takes as args a Tensorflow session and an
      Atari 2600 Gym environment, and returns an agent.
    create_environment_fn: A function which receives a game name and creates
      an Atari 2600 Gym environment.
    game_name: str, name of the Atari 2600 domain to run.
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater
      than start_iteration).
    training_steps: int, the number of training steps to perform.
    evaluation_steps: int, the number of evaluation steps to perform.
    max_steps_per_episode: int, maximum number of steps after which an
      episode terminates.

  This constructor will take the following actions:
    - Initialize an environment.
    - Initialize a `tf.Session`.
    - Initialize a logger.
    - Initialize an agent.
    - Reload from the latest checkpoint, if available, and initialize the
      Checkpointer object.
  """
  assert base_dir is not None
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._evaluation_steps = evaluation_steps
  self._max_steps_per_episode = max_steps_per_episode
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)
  self._environment = create_environment_fn()
  # Set up a session and initialize variables.
  self._sess = tf.Session(
      '', config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = agent_creator(self._sess, self._environment,
                              summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._summary_helper = SummaryHelper(self._summary_writer)
  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
  self._steps_done = 0
  self._total_timer = None
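# A hypothetical end-to-end usage of the runner above. The class name `Runner`
# is taken from its docstring; the agent factory and agent class below are
# placeholders rather than names confirmed by this snippet:
def my_agent_creator(sess, environment, summary_writer=None):
  return MyDQNAgent(  # Any agent implementation with this constructor shape.
      sess,
      num_actions=environment.action_space.n,
      summary_writer=summary_writer)

runner = Runner(base_dir='/tmp/atari_experiment',
                agent_creator=my_agent_creator,
                game_name='Pong')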