def _buildInitialVars(self, shape, dev_list):
  values = []
  num_devices = len(dev_list)
  dim = np.prod(shape, dtype=int) if shape else 1
  for d in range(0, num_devices):
    with ops.device(dev_list[d]):
      npt = np.zeros(shape).astype(np.float32)
      alias = np.frombuffer(npt.data, dtype=np.float32)
      for i in range(0, dim):
        alias[i] = i + 0.01 * d
      var = state_ops.variable_op(shape, types_pb2.DT_FLOAT)
      state_ops.init_variable(var, npt).op.run()
      values.append(var)
  return values

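# Illustrative sketch (not part of the original code): the per-device fill
# pattern used by _buildInitialVars above, shown without the TF variable ops.
# For device index d, the flattened values are 0 + 0.01*d, 1 + 0.01*d, ...
# in row-major order. The shape and device index below are hypothetical.
import numpy as np

shape = [2, 2]   # hypothetical shape
d = 1            # hypothetical device index
npt = np.zeros(shape, dtype=np.float32)
alias = npt.reshape(-1)   # flat writable view, same effect as the frombuffer alias
alias[:] = np.arange(npt.size, dtype=np.float32) + 0.01 * d
print(npt)                # approximately [[0.01 1.01] [2.01 3.01]]
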
def _buildInitialVars(self, shape, dev_list):
  values = []
  num_devices = len(dev_list)
  dim = np.prod(shape) if shape else 1
  for d in range(0, num_devices):
    with ops.device(dev_list[d]):
      npt = np.zeros(shape).astype(np.float32)
      alias = np.frombuffer(npt.data, dtype=np.float32)
      for i in range(0, dim):
        alias[i] = i + 0.01 * d
      var = state_ops.variable_op(shape, types_pb2.DT_FLOAT)
      state_ops.init_variable(var, npt).op.run()
      values.append(var)
  return values

def _AddParam(self,
              shape,
              dtype,
              name,
              initializer=None,
              return_average=False):
  """Add a model parameter w.r.t. which we expect to compute gradients.

  _AddParam creates both regular parameters (usually for training) and
  averaged nodes (usually for inference). It returns one or the other based
  on the 'return_average' arg.

  Args:
    shape: int list, tensor shape of the parameter to create
    dtype: tf.DataType, data type of the parameter
    name: string, name of the parameter in the TF graph
    initializer: optional initializer for the parameter
    return_average: if False, return parameter otherwise return moving average

  Returns:
    parameter or averaged parameter
  """
  if name not in self.params:
    with tf.device('/cpu:0'):
      step = tf.cast(self.GetStep(), tf.float32)
      # Put all parameters and their initializing ops in their own scope
      # irrespective of the current scope (training or eval).
      with tf.name_scope(self._param_scope):
        self.params[name] = tf.get_variable(name, shape, dtype, initializer)
        param = self.params[name]
        if initializer is not None:
          self.inits[name] = state_ops.init_variable(param, initializer)
        if self._averaging_decay == 1:
          # logging.info('Using vanilla averaging of parameters.')
          ema = tf.train.ExponentialMovingAverage(
              decay=(step / (step + 1.0)), num_updates=None)
        else:
          ema = tf.train.ExponentialMovingAverage(
              decay=self._averaging_decay, num_updates=step)
        self._averaging[name + '_avg_update'] = ema.apply([param])
        self.variables[name + '_avg_var'] = ema.average(param)
        self.inits[name + '_avg_init'] = state_ops.init_variable(
            ema.average(param), tf.zeros_initializer)
  return (self.variables[name + '_avg_var'] if return_average else
          self.params[name])

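# Illustrative sketch (not part of the original code): with decay = step / (step + 1)
# the exponential-moving-average update
#     avg <- decay * avg + (1 - decay) * x
# reduces to the plain running mean of all observed values, which is why the
# decay=(step / (step + 1.0)) branch above is described as "vanilla averaging".
# This assumes 'step' counts the updates already applied before the current one.
def running_average(xs):
  avg = 0.0
  for step, x in enumerate(xs):        # step = updates already applied
    decay = step / (step + 1.0)
    avg = decay * avg + (1.0 - decay) * x
  return avg

assert abs(running_average([1.0, 2.0, 3.0, 4.0]) - 2.5) < 1e-9
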
def AddTraining(self,
                task_context,
                batch_size,
                learning_rate=0.1,
                decay_steps=4000,
                momentum=0.9,
                corpus_name='documents'):
  """Builds a trainer to minimize the cross entropy cost function.

  Args:
    task_context: file path from which to read the task context
    batch_size: batch size to request from reader op
    learning_rate: initial value of the learning rate
    decay_steps: decay learning rate by 0.96 every this many steps
    momentum: momentum parameter used when training with momentum
    corpus_name: name of the task input to read parses from

  Returns:
    Dictionary of named training nodes.
  """
  with tf.name_scope('training'):
    nodes = self.training
    nodes.update(self._AddGoldReader(task_context, batch_size, corpus_name))
    nodes.update(self._BuildNetwork(nodes['feature_endpoints'],
                                    return_average=False))
    nodes.update(self._AddCostFunction(batch_size, nodes['gold_actions'],
                                       nodes['logits']))
    # Add the optimizer
    if self._only_train:
      trainable_params = [v for k, v in self.params.iteritems()
                          if k in self._only_train]
    else:
      trainable_params = self.params.values()
    lr = self._AddLearningRate(learning_rate, decay_steps)
    optimizer = tf.train.MomentumOptimizer(lr,
                                           momentum,
                                           use_locking=self._use_locking)
    train_op = optimizer.minimize(nodes['cost'], var_list=trainable_params)
    for param in trainable_params:
      slot = optimizer.get_slot(param, 'momentum')
      self.inits[slot.name] = state_ops.init_variable(slot,
                                                      tf.zeros_initializer)
      self.variables[slot.name] = slot
    numerical_checks = [
        tf.check_numerics(param, message='Parameter is not finite.')
        for param in trainable_params
        if param.dtype.base_dtype in [tf.float32, tf.float64]
    ]
    check_op = tf.group(*numerical_checks)
    avg_update_op = tf.group(*self._averaging.values())
    train_ops = [train_op]
    if self._check_parameters:
      train_ops.append(check_op)
    if self._use_averaging:
      train_ops.append(avg_update_op)
    nodes['train_op'] = tf.group(*train_ops, name='train_op')
  return nodes

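# Illustrative sketch (not part of the original code), assuming a TF 1.x runtime
# like the code above: MomentumOptimizer keeps one 'momentum' accumulator slot
# per variable it updates, which AddTraining retrieves via optimizer.get_slot()
# so the slot can be registered for explicit initialization. The variable and
# loss below are hypothetical.
import tensorflow as tf

with tf.Graph().as_default():
  x = tf.get_variable('x', shape=[3], dtype=tf.float32,
                      initializer=tf.zeros_initializer())
  loss = tf.reduce_sum(tf.square(x - 1.0))
  optimizer = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
  train_op = optimizer.minimize(loss, var_list=[x])
  slot = optimizer.get_slot(x, 'momentum')   # accumulator with the same shape/dtype as x
  print(slot.name, slot.shape)
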
def _AddVariable(self, shape, dtype, name, initializer=None):
  if name in self.variables:
    return self.variables[name]
  self.variables[name] = tf.get_variable(name, shape, dtype, initializer)
  if initializer is not None:
    self.inits[name] = state_ops.init_variable(self.variables[name],
                                               initializer)
  return self.variables[name]

def _AddParam(self,
              shape,
              dtype,
              name,
              initializer=None,
              return_average=False):
  """Add a model parameter w.r.t. which we expect to compute gradients.

  _AddParam creates both regular parameters (usually for training) and
  averaged nodes (usually for inference). It returns one or the other based
  on the 'return_average' arg.

  Args:
    shape: int list, tensor shape of the parameter to create
    dtype: tf.DataType, data type of the parameter
    name: string, name of the parameter in the TF graph
    initializer: optional initializer for the parameter
    return_average: if False, return parameter otherwise return moving average

  Returns:
    parameter or averaged parameter
  """
  if name not in self.params:
    step = tf.cast(self.GetStep(), tf.float32)
    # Put all parameters and their initializing ops in their own scope
    # irrespective of the current scope (training or eval).
    with tf.name_scope(self._param_scope):
      self.params[name] = tf.get_variable(name, shape, dtype, initializer)
      param = self.params[name]
      if initializer is not None:
        self.inits[name] = state_ops.init_variable(param, initializer)
      if self._averaging_decay == 1:
        logging.info('Using vanilla averaging of parameters.')
        ema = tf.train.ExponentialMovingAverage(decay=(step / (step + 1.0)),
                                                num_updates=None)
      else:
        ema = tf.train.ExponentialMovingAverage(decay=self._averaging_decay,
                                                num_updates=step)
      self._averaging[name + '_avg_update'] = ema.apply([param])
      self.variables[name + '_avg_var'] = ema.average(param)
      self.inits[name + '_avg_init'] = state_ops.init_variable(
          ema.average(param), tf.zeros_initializer)
  return (self.variables[name + '_avg_var'] if return_average else
          self.params[name])

def addParam(self,
             shape,
             dtype,
             name,
             initializer=None,
             return_average=False):
    # This isn't a problem: we reload variables if they already exist.
    #if name in self.params:
    #    self.logger.warning(name + ' already exists!')

    if name not in self.params:
        step = tf.cast(self.getStep(), tf.float32)

        with tf.name_scope(self._param_scope):
            # Put all parameters and their initializing ops in their own
            # scope irrespective of the current scope (training or eval).
            self.params[name] = tf.get_variable(name, shape, dtype,
                                                initializer)
            param = self.params[name]
            if initializer is not None:
                self.inits[name] = state_ops.init_variable(param, initializer)
            if self.averaging_decay == 1:
                self.logger.info('Using vanilla averaging of parameters.')
                ema = tf.train.ExponentialMovingAverage(
                    decay=(step / (step + 1.0)), num_updates=None)
            else:
                ema = tf.train.ExponentialMovingAverage(
                    decay=self.averaging_decay, num_updates=step)

            self.averaging[name + '_avg_update'] = ema.apply([param])
            self.variables[name + '_avg_var'] = ema.average(param)
            self.inits[name + '_avg_init'] = state_ops.init_variable(
                ema.average(param), tf.zeros_initializer())

    return (self.variables[name + '_avg_var'] if return_average
            else self.params[name])

def AddTraining(self,
                task_context,
                batch_size,
                learning_rate=0.1,
                decay_steps=4000,
                momentum=None,
                corpus_name='documents'):
  with tf.name_scope('training'):
    n = self.training
    n['accumulated_alive_steps'] = self._AddVariable(
        [batch_size], tf.int32, 'accumulated_alive_steps',
        tf.zeros_initializer())
    n.update(self._AddBeamReader(task_context, batch_size, corpus_name))
    # This adds a required 'step' node too:
    learning_rate = tf.constant(learning_rate, dtype=tf.float32)
    n['learning_rate'] = self._AddLearningRate(learning_rate, decay_steps)
    # Call BuildNetwork *only* to set up the params outside of the main loop.
    self._BuildNetwork(list(n['features']))

    n.update(self._BuildSequence(batch_size, self._max_steps, n['features'],
                                 n['state']))

    flat_concat_scores = tf.reshape(n['concat_scores'], [-1])
    (indices_and_paths, beams_and_slots, n['gold_slot'],
     n['beam_path_scores']) = gen_parser_ops.beam_parser_output(n['state'])
    n['indices'] = tf.reshape(tf.gather(indices_and_paths, [0]), [-1])
    n['path_ids'] = tf.reshape(tf.gather(indices_and_paths, [1]), [-1])
    n['all_path_scores'] = tf.sparse_segment_sum(
        flat_concat_scores, n['indices'], n['path_ids'])
    n['beam_ids'] = tf.reshape(tf.gather(beams_and_slots, [0]), [-1])
    n.update(AddCrossEntropy(batch_size, n))

    if self._only_train:
      trainable_params = {k: v for k, v in self.params.iteritems()
                          if k in self._only_train}
    else:
      trainable_params = self.params
    for p in trainable_params:
      tf.logging.info('trainable_param: %s', p)

    regularized_params = [
        tf.nn.l2_loss(p) for k, p in trainable_params.iteritems()
        if k.startswith('weights') or k.startswith('bias')]
    l2_loss = 1e-4 * tf.add_n(regularized_params) if regularized_params else 0

    n['cost'] = tf.add(n['cross_entropy'], l2_loss, name='cost')

    n['gradients'] = tf.gradients(n['cost'], trainable_params.values())

    with tf.control_dependencies([n['alive_steps']]):
      update_accumulators = tf.group(
          tf.assign_add(n['accumulated_alive_steps'], n['alive_steps']))

    def ResetAccumulators():
      return tf.assign(n['accumulated_alive_steps'],
                       tf.zeros([batch_size], tf.int32))
    n['reset_accumulators_func'] = ResetAccumulators

    optimizer = tf.train.MomentumOptimizer(n['learning_rate'],
                                           momentum,
                                           use_locking=self._use_locking)
    train_op = optimizer.minimize(n['cost'],
                                  var_list=trainable_params.values())
    for param in trainable_params.values():
      slot = optimizer.get_slot(param, 'momentum')
      self.inits[slot.name] = state_ops.init_variable(slot,
                                                      tf.zeros_initializer())
      self.variables[slot.name] = slot

    def NumericalChecks():
      return tf.group(*[
          tf.check_numerics(param, message='Parameter is not finite.')
          for param in trainable_params.values()
          if param.dtype.base_dtype in [tf.float32, tf.float64]])
    check_op = cf.cond(tf.equal(tf.mod(self.GetStep(), self._check_every), 0),
                       NumericalChecks, tf.no_op)
    avg_update_op = tf.group(*self._averaging.values())
    train_ops = [train_op]
    if self._check_parameters:
      train_ops.append(check_op)
    if self._use_averaging:
      train_ops.append(avg_update_op)
    with tf.control_dependencies([update_accumulators]):
      n['train_op'] = tf.group(*train_ops, name='train_op')
    n['alive_steps'] = tf.identity(n['alive_steps'], name='alive_steps')
  return n

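# Illustrative sketch (not part of the original code), assuming a TF 1.x runtime:
# tf.sparse_segment_sum gathers data[indices] and sums the gathered entries per
# segment id, which is how the beam trainer above accumulates per-path scores
# out of the flattened concat_scores tensor. The scores and ids below are toy values.
import tensorflow as tf

with tf.Graph().as_default():
  flat_scores = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0])
  indices = tf.constant([0, 2, 3])      # which flattened scores participate
  path_ids = tf.constant([0, 0, 1])     # path id for each selected score (sorted)
  path_scores = tf.sparse_segment_sum(flat_scores, indices, path_ids)
  with tf.Session() as sess:
    print(sess.run(path_scores))        # [4. 4.] -> path 0: 1 + 3, path 1: 4
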
def buildNetwork(self, mode='train'):
    assert mode == 'train' or mode == 'eval'

    if mode == 'train':
        return_average = False
        nodes = self.training
    else:
        return_average = self.use_averaging
        nodes = self.evaluation

    learningRate = self.modelParams.cfg['learningRate']
    decaySteps = self.modelParams.cfg['decaySteps']
    # FIXME: does momentum/learning rate reload properly when retraining?
    momentum = self.modelParams.cfg['momentum']
    topK = self.modelParams.cfg['topK']
    hiddenLayerSizes = self.modelParams.cfg['hiddenLayerSizes']
    batchSize = self.modelParams.cfg['batchSize']

    with tf.name_scope(mode):
        weights = []
        biases = []
        embeddings = []
        nodes['feature_endpoints'] = []
        for i in range(len(self.featureMajorTypeGroups)):
            major_type = self.featureMajorTypeGroups[i]
            # shape will be [-1, number of sparse integer features in group]
            nodes['feature_endpoints'].append(tf.placeholder(
                tf.int32, [None, len(self.featureNames[i])],
                name="ph_feature_endpoints_%s" % major_type))
            embeddings.append(self.addEmbedding(
                nodes['feature_endpoints'][i],
                len(self.featureNames[i]),
                self.featureDomainSizes[i],
                self.featureEmbeddingSizes[i],
                major_type,
                return_average=return_average))

        # Input layer
        last_layer = tf.concat(embeddings, 1)
        last_layer_size = self.BAG_OF_FEATURES_LEN

        # Hidden layers
        for i in range(len(hiddenLayerSizes)):
            h = hiddenLayerSizes[i]
            weights.append(
                self.addParam(
                    [last_layer_size, h], tf.float32, 'layer_%d_weights' % i,
                    tf.random_normal_initializer(stddev=1e-4, seed=0),
                    return_average=return_average))
            biases.append(
                self.addParam(
                    [h], tf.float32, 'layer_%d_biases' % i,
                    tf.constant_initializer(0.2),
                    return_average=return_average))
            last_layer = tf.nn.relu_layer(last_layer,
                                          weights[-1],
                                          biases[-1],
                                          name='layer_%d' % i)
            last_layer_size = h

        # Output layer
        weights.append(
            self.addParam(
                [last_layer_size, self.ACTION_COUNT], tf.float32,
                'softmax_weights',
                tf.random_normal_initializer(stddev=1e-4, seed=0),
                return_average=return_average))
        biases.append(
            self.addParam(
                [self.ACTION_COUNT], tf.float32, 'softmax_biases',
                tf.zeros_initializer(),
                return_average=return_average))
        logits = tf.nn.xw_plus_b(last_layer,
                                 weights[-1],
                                 biases[-1],
                                 name='logits')

        if mode == 'train':
            nodes['gold_actions'] = tf.placeholder(tf.int32, [None],
                                                   name='ph_gold_actions')
            nodes['filled_slots'] = tf.placeholder(tf.int32,
                                                   name='ph_filled_slots')

            # one-hot encoding for each batch
            dense_golden = batchedSparseToDense(nodes['gold_actions'],
                                                self.ACTION_COUNT)

            #cross_entropy = tf.div(
            #    tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
            #        logits=logits, labels=dense_golden)),
            #    tf.cast(nodes['filled_slots'], tf.float32))

            # We should divide by batch size here, not filled slots.
            # This seems to fix the accuracy issue for whatever reason,
            # even though cost seems to go crazy momentarily
            # (it plummets because only a few slots are filled).
            cross_entropy = tf.div(
                tf.reduce_sum(
                    tf.nn.softmax_cross_entropy_with_logits(
                        logits=logits, labels=dense_golden)),
                batchSize)

            # regularize all parameters except output layer
            regularized_params = [tf.nn.l2_loss(p) for p in weights[:-1]]
            regularized_params += [tf.nn.l2_loss(p) for p in biases[:-1]]
            l2_loss = 1e-4 * tf.add_n(regularized_params) \
                if regularized_params else 0

            cost = tf.add(cross_entropy, l2_loss, name='cost')

            lr = self.addLearningRate(learningRate, decaySteps)
            optimizer = tf.train.MomentumOptimizer(lr,
                                                   momentum,
                                                   use_locking=False)

            trainableParams = self.params.values()
            train_op = optimizer.minimize(cost, var_list=trainableParams)

            for param in trainableParams:
                slot = optimizer.get_slot(param, 'momentum')
                self.inits[slot.name] = state_ops.init_variable(
                    slot, tf.zeros_initializer())
                self.variables[slot.name] = slot

            numerical_checks = [
                tf.check_numerics(param, message='Parameter is not finite.')
                for param in trainableParams
                if param.dtype.base_dtype in [tf.float32, tf.float64]
            ]
            check_op = tf.group(*numerical_checks)
            avg_update_op = tf.group(*self.averaging.values())

            train_ops = [train_op]
            if self.check_parameters:
                train_ops.append(check_op)
            if self.use_averaging:
                train_ops.append(avg_update_op)

            nodes['train_op'] = tf.group(*train_ops, name='train_op')
            nodes['cost'] = cost
            nodes['logits'] = logits
            #nodes['softmax'] = tf.nn.softmax(logits)
        else:
            nodes['logits'] = logits

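# Illustrative sketch (not part of the original code), assuming a TF 1.x runtime:
# the training cost above sums per-example softmax cross entropy against one-hot
# gold actions and divides by the batch size rather than by the number of filled
# slots, i.e. an ordinary mean over the batch. The logits and labels below are toy
# values and tf.one_hot stands in for the project-specific batchedSparseToDense helper.
import tensorflow as tf

with tf.Graph().as_default():
  logits = tf.constant([[2.0, 0.5, 0.1],
                        [0.2, 1.5, 0.3]])
  gold_actions = tf.constant([0, 1])
  dense_golden = tf.one_hot(gold_actions, depth=3)   # one-hot row per example
  batch_size = 2
  cross_entropy = tf.div(
      tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(
          logits=logits, labels=dense_golden)),
      float(batch_size))
  with tf.Session() as sess:
    print(sess.run(cross_entropy))   # mean per-example cross entropy over the batch
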