def __init__(self, network_creator, environment_creator, args):
    super(ActorLearner, self).__init__()
    tf.reset_default_graph()

    self.global_step = 0
    self.environment_creator = environment_creator
    self.network_creator = network_creator
    self.n_steps = args.n_steps
    self.state_shape = args.state_shape
    self.num_actions = args.num_actions
    self.initial_lr = args.initial_lr
    self.lr_annealing_steps = args.lr_annealing_steps
    self.n_emulators_per_emulator_runner = args.n_emulators_per_emulator_runner
    self.n_emulator_runners = args.n_emulator_runners
    self.device = args.device

    self.debugging_folder = args.debugging_folder
    self.network_checkpoint_folder = os.path.join(self.debugging_folder, 'checkpoints/')
    self.optimizer_checkpoint_folder = os.path.join(
        self.debugging_folder, 'optimizer_checkpoints/')
    self.last_saving_step = 0
    self.summary_writer = tf.summary.FileWriter(
        os.path.join(self.debugging_folder, 'tf'))

    self.max_global_steps = args.max_global_steps
    self.gamma = args.gamma
    self.game = args.game
    self.arch = args.arch

    self.network = network_creator(name='value_learning')
    self.target_network = network_creator(name='value_target',
                                          learning_network=self.network)
    self.target_update_freq = args.target_update_freq

    (self.train_step, flat_raw_gradients, flat_clipped_gradients,
     global_norm, self.learning_rate) = q_network.train_operation(
         self.network, args)

    config = tf.ConfigProto()
    if 'gpu' in self.device:
        logger.debug('Dynamic gpu mem allocation')
        config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)

    self.network_saver = tf.train.Saver()

    # Summaries
    variable_summaries(flat_raw_gradients, 'raw_gradients')
    variable_summaries(flat_clipped_gradients, 'clipped_gradients')
    tf.summary.scalar('global_norm', global_norm)
    tf.summary.scalar('Weighted_TD_loss', self.network.weighted_td_loss)
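
# A minimal, self-contained sketch of the target-network pattern used above:
# a second set of variables ('value_target') periodically synced from the
# learning variables ('value_learning'). This is an illustrative assumption
# about what network_creator(name=..., learning_network=...) wires up; the
# variable names, shapes, and the sync schedule below are invented for the
# example and do not come from the original code.
import tensorflow as tf

learning_w = tf.Variable(tf.random_normal([4, 2]), name='value_learning/w')
target_w = tf.Variable(tf.zeros([4, 2]), name='value_target/w')

# One assign op per target variable, grouped into a single sync op.
sync_target = tf.group(tf.assign(target_w, learning_w))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1, 201):
        if step % 100 == 0:  # stands in for target_update_freq
            sess.run(sync_target)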
def __init__(self, network_creator, environment_creator, explo_policy, args):
    super(ActorLearner, self).__init__()

    # Folder and debug settings
    self.checkpoint_interval = args.checkpoint_interval
    self.debugging_folder = args.debugging_folder
    self.network_checkpoint_folder = os.path.join(self.debugging_folder, 'checkpoints/')
    self.optimizer_checkpoint_folder = os.path.join(
        self.debugging_folder, 'optimizer_checkpoints/')
    self.last_saving_step = 0
    self.device = args.device

    # Reinforcement learning settings
    self.game = args.game
    self.global_step = 0
    self.max_global_steps = args.max_global_steps
    self.max_local_steps = args.max_local_steps
    self.num_actions = args.num_actions
    self.explo_policy = explo_policy
    self.gamma = args.gamma
    self.initial_lr = args.initial_lr
    self.lr_annealing_steps = args.lr_annealing_steps
    self.emulator_counts = args.emulator_counts
    self.emulators = np.asarray([
        environment_creator.create_environment(i)
        for i in range(self.emulator_counts)
    ])

    self.network = network_creator()

    with tf.name_scope('Optimizer'):
        self.learning_rate = tf.placeholder(tf.float32, shape=[], name='lr')

        # Optimizer
        optimizer_variable_names = 'OptimizerVariables'
        self.optimizer = tf.train.RMSPropOptimizer(
            self.learning_rate,
            decay=args.alpha,
            epsilon=args.e,
            name=optimizer_variable_names)

        # Not really an operation, but a list of (gradient, variable) pairs;
        # calling run() on the gradient Tensors computes their values.
        grads_and_vars = self.optimizer.compute_gradients(self.network.loss)
        self.flat_raw_gradients = tf.concat(
            [tf.reshape(g, [-1]) for g, v in grads_and_vars], axis=0)

        if args.clip_norm_type == 'ignore':
            # Unclipped gradients
            global_norm = tf.global_norm([g for g, v in grads_and_vars],
                                         name='global_norm')
        elif args.clip_norm_type == 'global':
            # Clip network grads by network norm
            gradients_n_norm = tf.clip_by_global_norm(
                [g for g, v in grads_and_vars], args.clip_norm)
            global_norm = tf.identity(gradients_n_norm[1], name='global_norm')
            grads_and_vars = list(
                zip(gradients_n_norm[0], [v for g, v in grads_and_vars]))
        elif args.clip_norm_type == 'local':
            # Clip layer grads by layer norm
            gradients = [
                tf.clip_by_norm(g, args.clip_norm) for g, v in grads_and_vars
            ]
            grads_and_vars = list(
                zip(gradients, [v for g, v in grads_and_vars]))
            global_norm = tf.global_norm([g for g, v in grads_and_vars],
                                         name='global_norm')
        else:
            raise ValueError('Norm type not recognized')

        self.flat_clipped_gradients = tf.concat(
            [tf.reshape(g, [-1]) for g, v in grads_and_vars], axis=0)

        self.train_step = self.optimizer.apply_gradients(grads_and_vars)

    config = tf.ConfigProto(allow_soft_placement=True)
    if 'gpu' in self.device:
        logging.debug('Dynamic gpu mem allocation')
        config.gpu_options.allow_growth = True
    self.session = tf.Session(config=config)

    self.summary_writer = tf.summary.FileWriter(
        os.path.join(self.debugging_folder, 'tf'), self.session.graph)

    self.network_saver = tf.train.Saver()
    self.optimizer_variables = [
        var for var in tf.global_variables()
        if optimizer_variable_names in var.name
    ]
    self.optimizer_saver = tf.train.Saver(self.optimizer_variables,
                                          max_to_keep=1,
                                          name='OptimizerSaver')

    # Summaries
    variable_summaries(self.flat_raw_gradients, 'raw_gradients')
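
# A self-contained sketch of the three clip_norm_type modes handled above,
# run on toy gradient tensors so the clipping semantics can be checked in
# isolation. The toy values and clip_norm=1.0 are illustrative only and do
# not come from the original args.
import tensorflow as tf

raw = [tf.constant([3.0, 4.0]), tf.constant([1.0, 2.0])]

# 'ignore': no clipping; only the joint norm across all gradients is measured.
unclipped_norm = tf.global_norm(raw)

# 'global': rescale all gradients together so their joint norm is <= 1.0.
clipped_global, joint_norm = tf.clip_by_global_norm(raw, clip_norm=1.0)

# 'local': clip each gradient tensor to norm <= 1.0 independently.
clipped_local = [tf.clip_by_norm(g, clip_norm=1.0) for g in raw]

with tf.Session() as sess:
    print(sess.run([unclipped_norm, clipped_global, clipped_local]))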