def evaluate(self):
    '''evaluate the performance of the model

    For every linked set a filename queue and an input pipeline are created
    over a whole number of batches. The inputs, sequence lengths and targets
    of all linked sets are gathered in dictionaries, after which the model
    outputs and the loss are computed once.

    Returns:
        - the loss (first output of compute_loss)
        - the norm (second output of compute_loss)
        - the number of batches in the validation set as an integer (the
            value computed for the last linked set)
        - the model outputs as returned by _get_outputs
        - the input sequence lengths as a dictionary
    '''

    with tf.name_scope('evaluate'):
        inputs = dict()
        seq_lengths = dict()
        targets = dict()

        for linkedset in self.linkedsets:
            dataconfs = (self.input_dataconfs[linkedset] +
                         self.target_dataconfs[linkedset])

            data_queue_elements, _ = input_pipeline.get_filenames(dataconfs)

            max_number_of_elements = len(data_queue_elements)
            number_of_elements = min(
                [max_number_of_elements, self.requested_utts])

            #compute the number of batches in the validation set. Floor
            #division keeps the count an integer (it is used as a slice
            #index below), also when true division is active.
            numbatches = number_of_elements // self.batch_size
            number_of_elements = numbatches * self.batch_size
            print('%d utterances will be used for evaluation'
                  % (number_of_elements))

            #cut the data so it has a whole number of batches
            data_queue_elements = data_queue_elements[:number_of_elements]

            #create the data queue and queue runners (the data was already
            #shuffled beforehand, so shuffle is set to False)
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=self.batch_size*2)

            #create the input pipeline
            data, seq_length = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=self.batch_size,
                numbuckets=1,
                dataconfs=dataconfs)

            #split data into inputs and targets: the inputs come first in
            #data, the targets follow
            input_names = self.linkedsets[linkedset]['inputs']
            for ind, input_name in enumerate(input_names):
                inputs[input_name] = data[ind]
                seq_lengths[input_name] = seq_length[ind]

            target_names = self.linkedsets[linkedset]['targets']
            for ind, target_name in enumerate(target_names):
                targets[target_name] = data[len(input_names) + ind]

        #get the logits
        logits = self._get_outputs(
            inputs=inputs,
            seq_lengths=seq_lengths)

        loss, norm = self.compute_loss(targets, logits, seq_lengths)

    return loss, norm, numbatches, logits, seq_lengths
def begin(self):
    '''this will be run at session creation

    Builds the ops that accumulate the Fisher information: every sample is
    read with batch size 1, the log probability produced by the decoder is
    differentiated with respect to the variables in self.fisher, and the
    squared gradients divided by the number of samples are added to the
    stored values.

    Sets:
        - self.num_samples: the number of elements in the data queue
        - self.update_fisher: an op that groups all the update operations
    '''

    #pylint: disable=W0201

    with tf.variable_scope('compute_fisher'):

        #materialise the dictionary views once: they are indexed below and
        #dict views cannot be indexed in Python 3
        names = list(self.dataconfs.keys())
        dataconfs = list(self.dataconfs.values())
        fisher_vars = list(self.fisher.keys())

        data_queue_elements, _ = input_pipeline.get_filenames(dataconfs)

        self.num_samples = len(data_queue_elements)

        data_queue = tf.train.string_input_producer(
            string_tensor=data_queue_elements,
            shuffle=False,
            seed=None,
            capacity=1)

        inputs, input_seq_length, _ = input_pipeline.input_pipeline(
            data_queue=data_queue,
            batch_size=1,
            numbuckets=1,
            dataconfs=dataconfs,
            variable_batch_size=False)

        inputs = {names[i]: d for i, d in enumerate(inputs)}
        input_seq_length = {
            names[i]: d for i, d in enumerate(input_seq_length)}

        #get the input log likelihood using the random sample decoder
        decoded = self.decoder(inputs, input_seq_length)
        logprob = list(decoded.values())[0][2][0]

        #get the derivative the logprob
        gradients = tf.gradients(logprob, fisher_vars)

        #create an op to update the fisher information
        update_ops = [
            self.fisher[var].assign_add(
                tf.square(grad) / self.num_samples).op
            for var, grad in zip(fisher_vars, gradients)]

        self.update_fisher = tf.group(*update_ops)
def set_dataqueues(self):
    """sets the data queues

    Creates one filename queue per linked set and stores it in
    self.data_queue, optionally using only a fraction of the training set
    when 'trainset_frac' is present in the task configuration.

    Returns:
        - the number of steps, computed for the last linked set
        - a list holding a single no-op as the done operation
    """

    self.data_queue = dict()
    for linkedset in self.linkedsets:
        data_queue_name = 'data_queue_%s_%s' % (self.task_name, linkedset)

        data_queue_elements, _ = input_pipeline.get_filenames(
            self.input_dataconfs[linkedset] +
            self.target_dataconfs[linkedset])

        number_of_elements = len(data_queue_elements)
        if 'trainset_frac' in self.taskconf:
            number_of_elements = int(
                float(number_of_elements) *
                float(self.taskconf['trainset_frac']))
        print('%d utterances will be used for training'
              % number_of_elements)

        data_queue_elements = data_queue_elements[:number_of_elements]

        # create the data queue and queue runners
        self.data_queue[linkedset] = tf.train.string_input_producer(
            string_tensor=data_queue_elements,
            shuffle=False,
            seed=None,
            capacity=self.batch_size * 2,
            shared_name=data_queue_name)

        # compute the number of steps. Floor division keeps the step count
        # an integer, also when true division is active.
        total_elements = (int(self.trainerconf['num_epochs']) *
                          len(data_queue_elements))
        if int(self.trainerconf['numbatches_to_aggregate']) == 0:
            num_steps = total_elements // self.batch_size
        else:
            num_steps = total_elements // (
                self.batch_size *
                int(self.trainerconf['numbatches_to_aggregate']))

        done_ops = [tf.no_op()]

    return num_steps, done_ops
def __init__(self, conf, modelconf, dataconf, server, task_index):
    '''
    constructor, creates the graph with the shared queues

    The task with index 0 creates the shared data queue holding the
    filenames. Every task creates a queue on which the workers signal that
    they are done, together with an op that waits for all of them.

    Args:
        conf: the trainer config
        modelconf: the model configuration
        dataconf: the data configuration as a ConfigParser
        server: the server, its cluster definition is used to count the
            workers
        task_index: index of this task in the cluster
    '''

    self.graph = tf.Graph()
    self.server = server
    self.task_index = task_index

    #distributed training
    cluster = tf.train.ClusterSpec(server.server_def.cluster)
    num_workers = len(cluster.as_dict()['worker'])

    with self.graph.as_default():

        #the chief parameter server should create the data queue
        if task_index == 0:

            #get the database configurations of the inputs
            input_names = modelconf.get('io', 'inputs').split(' ')
            input_names = [] if input_names == [''] else input_names
            input_sections = [conf[name] for name in input_names]
            input_dataconfs = [
                dict(dataconf.items(section))
                for section in input_sections]

            #get the database configurations of the targets
            output_names = modelconf.get('io', 'outputs').split(' ')
            output_names = [] if output_names == [''] else output_names
            target_sections = [conf[name] for name in output_names]
            target_dataconfs = [
                dict(dataconf.items(section))
                for section in target_sections]

            filenames, _ = input_pipeline.get_filenames(
                input_dataconfs + target_dataconfs)

            tf.train.string_input_producer(
                string_tensor=filenames,
                shuffle=True,
                seed=None,
                capacity=int(conf['batch_size']) * 2,
                shared_name='data_queue')

        #create a queue for the workers to signify that they are done
        queue_name = 'done_queue%d' % task_index
        done_queue = tf.FIFOQueue(
            capacity=num_workers,
            dtypes=[tf.bool],
            shapes=[[]],
            shared_name=queue_name,
            name=queue_name)

        self.wait_op = done_queue.dequeue_many(num_workers).op

        self.scaffold = tf.train.Scaffold()
def _data(self, chief_ps):
    '''
    create the input pipeline

    args:
        -chief_ps: the chief parameter server device, None when not
            running in distributed mode

    returns:
        - the inputs
        - the input sequence lengths
        - the targets
        - the target sequence lengths
        - the number of global steps in an epoch
        - an operation to read a batch of data
        - the number of local steps in this step
    '''

    with tf.name_scope('get_batch'):

        #get the database configurations of the inputs
        input_names = self.model.conf.get('io', 'inputs').split(' ')
        input_names = [] if input_names == [''] else input_names
        input_sections = [
            self.conf[name].split(' ') for name in input_names]
        input_dataconfs = [
            [dict(self.dataconf.items(section)) for section in sectionset]
            for sectionset in input_sections]

        #get the database configurations of the targets
        output_names = self.conf['targets'].split(' ')
        output_names = [] if output_names == [''] else output_names
        target_sections = [
            self.conf[name].split(' ') for name in output_names]
        target_dataconfs = [
            [dict(self.dataconf.items(section)) for section in sectionset]
            for sectionset in target_sections]

        #check if running in distributed model
        if chief_ps is not None:
            with tf.device(chief_ps):

                #get the data queue
                data_queue = tf.FIFOQueue(
                    capacity=int(self.conf['batch_size']) * 2,
                    shared_name='data_queue',
                    name='data_queue',
                    dtypes=[tf.string],
                    shapes=[[]])
        else:
            #get the filenames
            filenames, _ = input_pipeline.get_filenames(
                input_dataconfs + target_dataconfs)

            #create the data queue and queue runners
            data_queue = tf.train.string_input_producer(
                string_tensor=filenames,
                shuffle=True,
                seed=None,
                capacity=int(self.conf['batch_size']) * 2,
                shared_name='data_queue')

        #create the input pipeline
        pipeline = input_pipeline.input_pipeline(
            data_queue=data_queue,
            batch_size=int(self.conf['batch_size']),
            numbuckets=int(self.conf['numbuckets']),
            dataconfs=input_dataconfs + target_dataconfs,
            variable_batch_size=(
                self.conf['variable_batch_size'] == 'True')
        )
        data, seq_length, num_steps, max_length = pipeline

        if int(self.conf['cut_sequence_length']):

            #make sure that all the sequence lengths are the same
            length_checks = [
                tf.assert_equal(seq_length[0], length)
                for length in seq_length]

            with tf.control_dependencies(length_checks):

                #cut each data component
                read_ops = []
                components = []
                component_lengths = []
                for ind, batch in enumerate(data):
                    cut_result = _cut_sequence(
                        batch,
                        seq_length[ind],
                        int(self.conf['cut_sequence_length']),
                        max_length)
                    cut, cut_length, read_op, num_local_steps = cut_result
                    components.append(cut)
                    component_lengths.append(cut_length)
                    read_ops.append(read_op)
        else:
            num_local_steps = tf.constant(1)

            #pass every component through a queue holding one element
            queues = [tf.FIFOQueue(1, batch.dtype) for batch in data]
            length_queues = [
                tf.FIFOQueue(1, length.dtype) for length in seq_length]
            components = [queue.dequeue() for queue in queues]
            component_lengths = [
                queue.dequeue() for queue in length_queues]
            for ind, component in enumerate(components):
                component.set_shape(data[ind].shape)
                component_lengths[ind].set_shape(seq_length[ind].shape)

            #create an op to read the data into the queues
            read_ops = [
                queue.enqueue(data[ind])
                for ind, queue in enumerate(queues)]
            read_ops += [
                queue.enqueue(seq_length[ind])
                for ind, queue in enumerate(length_queues)]

        #create an op for reading the data
        read_data = tf.group(*read_ops)

        #the inputs come first in the components, the targets follow
        num_inputs = len(input_sections)

        inputs = {
            input_names[ind]: component
            for ind, component in enumerate(components[:num_inputs])}

        input_seq_length = {
            input_names[ind]: length
            for ind, length in enumerate(component_lengths[:num_inputs])}

        targets = {
            output_names[ind]: component
            for ind, component in enumerate(components[num_inputs:])}

        target_seq_length = {
            output_names[ind]: length
            for ind, length in enumerate(component_lengths[num_inputs:])}

    return (inputs, input_seq_length, targets, target_seq_length,
            num_steps, read_data, num_local_steps)
def evaluate(self, start_utt_ind=0):
    """evaluate the performance of the model

    For every linked set a filename queue and an input pipeline are
    created, the model outputs are computed and the weighted loss and norm
    of the set are collected. The losses and norms of all sets are summed.

    Args:
        start_utt_ind: index of the first utterance that is used

    Returns:
        - the summed weighted loss
        - the summed weighted norm
        - the number of batches in the validation set as an integer (the
            value computed for the last linked set)
        - the model outputs of the last linked set
        - the targets as a dictionary
        - the output sequence lengths as a dictionary

    Raises:
        BaseException: if no full batch of utterances is available
    """

    with tf.name_scope('evaluate'):
        inputs = dict()
        seq_lengths = dict()
        targets = dict()
        loss = []
        norm = []

        for linkedset in self.linkedsets:
            dataconfs = (self.input_dataconfs[linkedset] +
                         self.target_dataconfs[linkedset])

            data_queue_elements, _ = input_pipeline.get_filenames(dataconfs)

            max_number_of_elements = len(data_queue_elements)
            number_of_elements = min(
                [max_number_of_elements, self.requested_utts])

            # compute the number of batches in the validation set. Floor
            # division is required: the result is used as a slice index
            # and true division would produce a float.
            numbatches = number_of_elements // self.batch_size
            number_of_elements = numbatches * self.batch_size
            if number_of_elements == 0:
                raise BaseException(
                    'The number of elements used for validation must be larger than 0.'
                )
            print('%d utterances will be used for evaluation'
                  % number_of_elements)

            # cut the data so it has a whole number of batches
            # NOTE(review): with start_utt_ind > 0 fewer than
            # numbatches * batch_size elements remain and numbatches is not
            # adjusted - confirm this is what callers expect
            data_queue_elements = data_queue_elements[
                start_utt_ind:number_of_elements]

            # create the data queue and queue runners (inputs are not
            # allowed to get shuffled. I already did this so set to False)
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=self.batch_size * 2)

            # create the input pipeline
            data, seq_length = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=self.batch_size,
                numbuckets=1,
                dataconfs=dataconfs)

            # split data into inputs and targets: the inputs come first in
            # data, the targets follow
            for ind, input_name in enumerate(self.input_names):
                inputs[input_name] = data[ind]
                seq_lengths[input_name] = seq_length[ind]

            # every output gets the sequence length of the first input
            out_seq_lengths = {
                output_name: seq_lengths[self.input_names[0]]
                for output_name in self.output_names
            }

            for ind, target_name in enumerate(self.target_names):
                targets[target_name] = data[len(self.input_names) + ind]

            # get the logits
            logits = self._get_outputs(
                inputs=inputs, seq_lengths=seq_lengths)

            set_loss, set_norm = self.compute_loss(
                targets, logits, seq_lengths)
            set_loss *= self.linkedset_weighting[linkedset]
            set_norm *= self.linkedset_weighting[linkedset]
            loss.append(set_loss)
            norm.append(set_norm)

        loss = tf.reduce_sum(loss)
        norm = tf.reduce_sum(norm)

    return loss, norm, numbatches, logits, targets, out_seq_lengths
def __init__(self, conf, modelconf, dataconf, server, task_index):
    '''
    constructor, currently disabled

    The class has not been adapted to the multi task trainer, so the
    constructor always raises. The graph construction below the raise is
    kept as a starting point for that adaptation.

    Args:
        conf: the trainer config
        modelconf: the model configuration
        dataconf: the data configuration as a ConfigParser
        server: optional server to be used for distributed training
        task_index: optional index of the worker task in the cluster

    Raises:
        NotImplementedError: always
    '''

    #a string can not be raised (that is a TypeError that hides the
    #message), so raise a proper exception carrying the message
    raise NotImplementedError(
        'class parameterserver has not yet been adapted to the multi taks trainer')

    #everything below is unreachable until the raise above is removed

    self.graph = tf.Graph()
    self.server = server
    self.task_index = task_index
    self.batch_size = int(conf['batch_size'])

    #distributed training
    cluster = tf.train.ClusterSpec(server.server_def.cluster)
    num_replicas = len(cluster.as_dict()['worker'])

    with self.graph.as_default():

        #the chief parameter server should create the data queue
        if task_index == 0:

            #get the database configurations
            inputs = modelconf.get('io', 'inputs').split(' ')
            if inputs == ['']:
                inputs = []
            input_sections = [conf[i].split(' ') for i in inputs]
            input_dataconfs = []
            for sectionset in input_sections:
                input_dataconfs.append([])
                for section in sectionset:
                    input_dataconfs[-1].append(
                        dict(dataconf.items(section)))

            output_names = conf['targets'].split(' ')
            if output_names == ['']:
                output_names = []
            target_sections = [conf[o].split(' ') for o in output_names]
            target_dataconfs = []
            for sectionset in target_sections:
                target_dataconfs.append([])
                for section in sectionset:
                    target_dataconfs[-1].append(
                        dict(dataconf.items(section)))

            data_queue_elements, _ = input_pipeline.get_filenames(
                input_dataconfs + target_dataconfs)

            tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=True,
                seed=None,
                capacity=self.batch_size * (num_replicas + 1),
                shared_name='data_queue')

            #floor division: the number of steps is put on an int32 queue
            if int(conf['numbatches_to_aggregate']) == 0:
                num_steps = (int(conf['num_epochs']) *
                             len(data_queue_elements) //
                             self.batch_size)
            else:
                num_steps = (int(conf['num_epochs']) *
                             len(data_queue_elements) //
                             (self.batch_size *
                              int(conf['numbatches_to_aggregate'])))

            #create a queue to communicate the number of steps
            num_steps_queue = tf.FIFOQueue(
                capacity=num_replicas,
                dtypes=[tf.int32],
                shapes=[[]],
                shared_name='num_steps_queue',
                name='num_steps_queue')

            self.set_num_steps = num_steps_queue.enqueue_many(
                tf.constant([num_steps] * num_replicas))

        #create a queue for the workers to signify that they are done
        done_queue = tf.FIFOQueue(
            capacity=num_replicas,
            dtypes=[tf.bool],
            shapes=[[]],
            shared_name='done_queue%d' % task_index,
            name='done_queue%d' % task_index)

        self.wait_op = done_queue.dequeue_many(num_replicas).op

        self.scaffold = tf.train.Scaffold()
def __init__(self, conf, tasksconf, dataconf, modelconf, evaluatorconf,
             expdir, init_filename, server, task_index):
    '''
    NnetTrainer constructor, creates the training graph

    The constructor creates and pickles the model, reads the data
    configurations of every task, and then builds the graph: the data
    queues, the training part (losses of all tasks averaged, gradients
    accumulated in local variables and applied with Adam) and, if an
    evaluator is configured, the validation part.

    Args:
        conf: the trainer config
        tasksconf: the config for each task, indexed by task name
        dataconf: the data configuration as a ConfigParser
        modelconf: the neural net model configuration
        evaluatorconf: the evaluator configuration for evaluating, if the
            evaluator type is 'None' no evaluation will be done
        expdir: directory where the summaries will be written
        init_filename: filename of the network that should be used to
            initialize the model. Put to None if no network is
            available/wanted.
        server: optional server to be used for distributed training
        task_index: optional index of the worker task in the cluster
    '''

    self.expdir = expdir
    self.server = server
    self.conf = conf
    self.tasksconf = tasksconf
    self.task_index = task_index
    self.init_filename = init_filename

    self.batch_size = int(conf['batch_size'])

    cluster = tf.train.ClusterSpec(server.server_def.cluster)

    #create the graph
    self.graph = tf.Graph()

    #3 model types for multi task: single one to one; single one to many;
    #multiple one to one
    #single one to one: the whole model is shared for all tasks, only loss
    #function can be different
    #single one to many: each task has a separate output so only part of
    #the network is shared, eg everything but the output layer
    #multiple one to one: each task has its own network. Possibly the
    #outputs are combined in a loss function

    #create the model and store it in the experiments directory
    modelfile = os.path.join(expdir, 'model', 'model.pkl')
    with open(modelfile, 'wb') as fid:
        self.model = model_factory.factory(
            modelconf.get('model', 'architecture'))(conf=modelconf)
        pickle.dump(self.model, fid)

    evaltype = evaluatorconf.get('evaluator', 'evaluator')

    #get the database configurations, all dictionaries are indexed by
    #task name
    input_dataconfs = dict()
    target_dataconfs = dict()
    loss_computers = dict()
    nr_input_sections = dict()
    if evaltype != 'None':
        evaluators = dict()

    for task in self.conf['tasks'].split(' '):
        taskconf = self.tasksconf[task]

        #get the database configurations
        input_names = modelconf.get('io', 'inputs').split(' ')
        if input_names == ['']:
            input_names = []
        input_sections = [taskconf[i].split(' ') for i in input_names]
        nr_input_sections[task] = len(input_sections)

        #one list of data configurations per input
        task_input_dataconfs = []
        for sectionset in input_sections:
            task_input_dataconfs.append([])
            for section in sectionset:
                task_input_dataconfs[-1].append(
                    dict(dataconf.items(section)))
        input_dataconfs[task] = task_input_dataconfs

        output_names = taskconf['targets'].split(' ')
        if output_names == ['']:
            output_names = []
        target_sections = [taskconf[o].split(' ') for o in output_names]

        #one list of data configurations per target
        task_target_dataconfs = []
        for sectionset in target_sections:
            task_target_dataconfs.append([])
            for section in sectionset:
                task_target_dataconfs[-1].append(
                    dict(dataconf.items(section)))
        target_dataconfs[task] = task_target_dataconfs

        #create the loss computer
        loss_computer = loss_computer_factory.factory(
            taskconf['loss_type'])(self.batch_size)
        loss_computers[task] = loss_computer

        if evaltype != 'None':
            evaluator = evaluator_factory.factory(evaltype)(
                conf=evaluatorconf,
                dataconf=dataconf,
                model=self.model,
                task=task)
            evaluators[task] = evaluator

    if 'local' in cluster.as_dict():
        num_replicas = 1
        device = tf.DeviceSpec(job='local')
    else:
        #distributed training
        num_replicas = len(cluster.as_dict()['worker'])
        num_servers = len(cluster.as_dict()['ps'])
        ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
            num_tasks=num_servers,
            load_fn=tf.contrib.training.byte_size_load_fn)
        device = tf.train.replica_device_setter(
            ps_tasks=num_servers,
            ps_strategy=ps_strategy)
        chief_ps = tf.DeviceSpec(job='ps', task=0)

    self.is_chief = task_index == 0

    #define the placeholders in the graph
    with self.graph.as_default():

        #create a local num_steps variable
        self.num_steps = tf.get_variable(
            name='num_steps',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False)

        #a variable to hold the amount of steps already taken
        self.global_step = tf.get_variable(
            name='global_step',
            shape=[],
            dtype=tf.int32,
            initializer=tf.constant_initializer(0),
            trainable=False)

        #a flag that can be set to stop the training
        should_terminate = tf.get_variable(
            name='should_terminate',
            shape=[],
            dtype=tf.bool,
            initializer=tf.constant_initializer(False),
            trainable=False)

        self.terminate = should_terminate.assign(True).op

        #create a check if training should continue
        self.should_stop = tf.logical_or(
            tf.greater_equal(self.global_step, self.num_steps),
            should_terminate)

        with tf.device(device):

            data_queues = dict()
            num_steps = []
            done_ops = []

            for task in self.conf['tasks'].split(' '):

                #check if running in distributed model
                if 'local' in cluster.as_dict():

                    #get the filenames
                    data_queue_elements, _ = input_pipeline.get_filenames(
                        input_dataconfs[task] + target_dataconfs[task])

                    #create the data queue and queue runners (inputs get
                    #shuffled! I already did this so set to False)
                    data_queue = tf.train.string_input_producer(
                        string_tensor=data_queue_elements,
                        shuffle=False,
                        seed=None,
                        capacity=self.batch_size * 2,
                        shared_name='data_queue_' + task)

                    data_queues[task] = data_queue

                    #compute the number of steps
                    #NOTE(review): '/' is only an integer division under
                    #Python 2 semantics - confirm before porting
                    if int(conf['numbatches_to_aggregate']) == 0:
                        task_num_steps = (int(conf['num_epochs']) *
                                          len(data_queue_elements) /
                                          self.batch_size)
                    else:
                        task_num_steps = (
                            int(conf['num_epochs']) *
                            len(data_queue_elements) /
                            (self.batch_size *
                             int(conf['numbatches_to_aggregate'])))

                    #set the number of steps
                    num_steps.append(task_num_steps)

                    done_ops.append(tf.no_op())

                else:
                    with tf.device(chief_ps):

                        #get the data queue
                        data_queue = tf.FIFOQueue(
                            capacity=self.batch_size * (num_replicas + 1),
                            shared_name='data_queue_' + task,
                            name='data_queue_' + task,
                            dtypes=[tf.string],
                            shapes=[[]])

                        data_queues[task] = data_queue

                        #get the number of steps from the parameter server
                        num_steps_queue = tf.FIFOQueue(
                            capacity=num_replicas,
                            dtypes=[tf.int32],
                            shared_name='num_steps_queue',
                            name='num_steps_queue',
                            shapes=[[]])

                        #set the number of steps
                        #NOTE(review): task_num_steps is not appended to
                        #num_steps in this branch, so min(num_steps) below
                        #would receive an empty list - verify the
                        #distributed path
                        task_num_steps = num_steps_queue.dequeue()

                    #get the done queues
                    for i in range(num_servers):
                        with tf.device('job:ps/task:%d' % i):
                            done_queue = tf.FIFOQueue(
                                capacity=num_replicas,
                                dtypes=[tf.bool],
                                shapes=[[]],
                                shared_name='done_queue%d' % i,
                                name='done_queue%d' % i)

                            done_ops.append(done_queue.enqueue(True))

            #the task with the fewest steps determines the number of steps
            self.set_num_steps = self.num_steps.assign(min(num_steps)).op
            self.done = tf.group(*done_ops)

            #training part
            with tf.variable_scope('train'):

                #a variable to scale the learning rate (used to reduce the
                #learning rate in case validation performance drops)
                learning_rate_fact = tf.get_variable(
                    name='learning_rate_fact',
                    shape=[],
                    initializer=tf.constant_initializer(1.0),
                    trainable=False)

                #compute the learning rate with exponential decay and scale
                #with the learning rate factor
                self.learning_rate = (tf.train.exponential_decay(
                    learning_rate=float(conf['initial_learning_rate']),
                    global_step=self.global_step,
                    decay_steps=self.num_steps,
                    decay_rate=float(conf['learning_rate_decay']))
                                      * learning_rate_fact)

                #create the optimizer
                optimizer = tf.train.AdamOptimizer(self.learning_rate)

                #a variable that accumulates the loss over batches
                self.total_loss = tf.get_variable(
                    name='total_loss',
                    shape=[],
                    dtype=tf.float32,
                    initializer=tf.constant_initializer(0),
                    trainable=False)

                self.reset_loss = self.total_loss.assign(0.0)

                loss = []

                for task in self.conf['tasks'].split(' '):
                    with tf.variable_scope(task):

                        #create the input pipeline
                        data, seq_length = input_pipeline.input_pipeline(
                            data_queue=data_queues[task],
                            batch_size=self.batch_size,
                            numbuckets=int(conf['numbuckets']),
                            dataconfs=input_dataconfs[task] +
                            target_dataconfs[task])

                        #NOTE(review): input_names and output_names still
                        #hold the values of the last task of the
                        #configuration loop above - confirm that all tasks
                        #share the same names
                        inputs = {
                            input_names[i]: d
                            for i, d in enumerate(
                                data[:nr_input_sections[task]])
                        }
                        seq_length = {
                            input_names[i]: d
                            for i, d in enumerate(
                                seq_length[:nr_input_sections[task]])
                        }
                        targets = {
                            output_names[i]: d
                            for i, d in enumerate(
                                data[nr_input_sections[task]:])
                        }
                        #target_seq_length = {
                        #output_names[i]: d
                        #for i, d in enumerate(seq_length[nr_input_sections[task]:])}

                        #compute the training outputs of the model
                        logits = self.model(
                            inputs=inputs,
                            input_seq_length=seq_length,
                            is_training=True)

                        #TODO: The proper way to exploit data paralellism
                        #is via the SyncReplicasOptimizer defined below.
                        #However for some reason it hangs and I have not
                        #yet found a solution for it. For the moment the
                        #gradients are accumulated in a way that does not
                        #allow data paralellism and there is no advantage
                        #on having multiple workers. (We also accumulate
                        #the loss)

                        #create an optimizer that aggregates gradients
                        #if int(conf['numbatches_to_aggregate']) > 0:
                        #optimizer = tf.train.SyncReplicasOptimizer(
                        #opt=optimizer,
                        #replicas_to_aggregate=int(
                        #conf['numbatches_to_aggregate'])#,
                        ##total_num_replicas=num_replicas
                        #)

                        #compute the loss
                        task_loss = loss_computers[task](
                            targets, logits, seq_length)

                        #append the task loss to the global loss
                        loss.append(task_loss)

                #accumulate losses from tasks
                with tf.variable_scope('accumulate_loss_from_tasks'):
                    loss = tf.reduce_mean(loss)

                #accumulate losses from batches
                self.acc_loss = self.total_loss.assign_add(loss)

                ##compute the gradients
                #grads_and_vars = optimizer.compute_gradients(self.loss)

                #with tf.variable_scope('clip'):
                #clip_value = float(conf['clip_grad_value'])
                ##clip the gradients
                #grads_and_vars = [(tf.clip_by_value(grad, -clip_value, clip_value), var)
                #for grad, var in grads_and_vars]

                self.params = tf.trainable_variables()

                #one non-trainable variable per parameter that accumulates
                #the gradients over batches
                grads = [
                    tf.get_variable(
                        param.op.name,
                        param.get_shape().as_list(),
                        initializer=tf.constant_initializer(0),
                        trainable=False)
                    for param in self.params
                ]

                self.reset_grad = tf.variables_initializer(grads)

                #compute the gradients
                minibatch_grads_and_vars = optimizer.compute_gradients(
                    loss)

                with tf.variable_scope('clip'):
                    clip_value = float(conf['clip_grad_value'])
                    #clip the gradients
                    minibatch_grads_and_vars = [
                        (tf.clip_by_value(grad, -clip_value, clip_value),
                         var)
                        for grad, var in minibatch_grads_and_vars
                    ]

                (minibatchgrads, minibatchvars) = zip(
                    *minibatch_grads_and_vars)

                #update gradients by accumulating them
                #NOTE(review): pairs grads (ordered as self.params) with
                #minibatchgrads (ordered as returned by compute_gradients)
                #- presumably both follow the trainable variables order,
                #verify
                self.update_gradients = [
                    grad.assign_add(batchgrad)
                    for batchgrad, grad in zip(minibatchgrads, grads)
                ]

                #operation to apply the gradients
                grads_and_vars = list(zip(grads, minibatchvars))

                apply_gradients_op = optimizer.apply_gradients(
                    grads_and_vars=grads_and_vars,
                    global_step=self.global_step,
                    name='apply_gradients')

                #all remaining operations with the UPDATE_OPS GraphKeys
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                #create an operation to update the gradients, the batch_loss
                #and do all other update ops
                self.update_op = tf.group(
                    *([apply_gradients_op] + update_ops),
                    name='update')

            if evaltype != 'None':

                #validation part
                with tf.variable_scope('validate'):

                    #create a variable to hold the validation loss
                    self.validation_loss = tf.get_variable(
                        name='validation_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0),
                        trainable=False)

                    #create a variable to save the last step where the model
                    #was validated
                    validated_step = tf.get_variable(
                        name='validated_step',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.constant_initializer(
                            -int(conf['valid_frequency'])),
                        trainable=False)

                    #a check if validation is due
                    self.should_validate = tf.greater_equal(
                        self.global_step - validated_step,
                        int(conf['valid_frequency']))

                    val_batch_loss = []
                    valbatches = []

                    for task in self.conf['tasks'].split(' '):
                        with tf.variable_scope(task):
                            task_val_batch_loss, task_valbatches, _, _ = evaluators[
                                task].evaluate()
                            val_batch_loss.append(task_val_batch_loss)
                            valbatches.append(task_valbatches)

                    val_batch_loss = tf.reduce_mean(val_batch_loss)
                    self.valbatches = min(valbatches)

                    #accumulate the validation loss over batches
                    self.update_loss = self.validation_loss.assign(
                        self.validation_loss +
                        val_batch_loss  #/self.valbatches
                    ).op

                    #update the learning rate factor
                    self.half_lr = learning_rate_fact.assign(
                        learning_rate_fact / 2).op

                    #create an operation to updated the validated step
                    self.update_validated_step = validated_step.assign(
                        self.global_step).op

                    #variable to hold the best validation loss so far
                    #NOTE(review): 1.79e+308 is outside the float32 range -
                    #presumably meant as "largest possible value", confirm
                    self.best_validation = tf.get_variable(
                        name='best_validation',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(1.79e+308),
                        trainable=False)

                    #op to update the best validation loss
                    self.update_best = self.best_validation.assign(
                        self.validation_loss).op

                    #a variable that holds the amount of workers at the
                    #validation point
                    waiting_workers = tf.get_variable(
                        name='waiting_workers',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.constant_initializer(0),
                        trainable=False)

                    #an operation to signal a waiting worker
                    self.waiting = waiting_workers.assign_add(1).op

                    #an operation to set the waiting workers to zero
                    self.reset_waiting = waiting_workers.initializer

                    #an operation to check if all workers are waiting
                    self.all_waiting = tf.equal(waiting_workers,
                                                num_replicas - 1)

                    tf.summary.scalar('validation loss',
                                      self.validation_loss)

            else:
                self.update_loss = None

            tf.summary.scalar('learning rate', self.learning_rate)

            #create a histogram for all trainable parameters
            for param in tf.trainable_variables():
                tf.summary.histogram(param.name, param)

            #create the scaffold
            self.scaffold = tf.train.Scaffold()
def _data(self, chief_ps):
    '''
    create the input pipeline

    args:
        -chief_ps: the chief parameter server device, None when not
            running in distributed mode

    returns:
        - the inputs
        - the input sequence lengths
        - the targets
        - the target sequence lengths
        - the number of steps in an epoch
    '''

    #get the database configurations of the inputs
    input_names = self.model.conf.get('io', 'inputs').split(' ')
    input_names = [] if input_names == [''] else input_names
    input_sections = [self.conf[name].split(' ') for name in input_names]
    input_dataconfs = [
        [dict(self.dataconf.items(section)) for section in sectionset]
        for sectionset in input_sections]

    #get the database configurations of the targets
    output_names = self.conf['targets'].split(' ')
    output_names = [] if output_names == [''] else output_names
    target_sections = [self.conf[name].split(' ') for name in output_names]
    target_dataconfs = [
        [dict(self.dataconf.items(section)) for section in sectionset]
        for sectionset in target_sections]

    #check if running in distributed model
    if chief_ps is not None:
        with tf.device(chief_ps):

            #get the data queue
            data_queue = tf.FIFOQueue(
                capacity=int(self.conf['batch_size']) * 2,
                shared_name='data_queue',
                name='data_queue',
                dtypes=[tf.string],
                shapes=[[]])
    else:
        #get the filenames
        filenames, _ = input_pipeline.get_filenames(
            input_dataconfs + target_dataconfs)

        #create the data queue and queue runners
        data_queue = tf.train.string_input_producer(
            string_tensor=filenames,
            shuffle=True,
            seed=None,
            capacity=int(self.conf['batch_size']) * 2,
            shared_name='data_queue')

    #create the input pipeline
    pipeline = input_pipeline.input_pipeline(
        data_queue=data_queue,
        batch_size=int(self.conf['batch_size']),
        numbuckets=int(self.conf['numbuckets']),
        dataconfs=input_dataconfs + target_dataconfs,
        variable_batch_size=(self.conf['variable_batch_size'] == 'True'))
    data, seq_length, num_steps = pipeline

    #the inputs come first in the data, the targets follow
    num_inputs = len(input_sections)

    inputs = {
        input_names[ind]: batch
        for ind, batch in enumerate(data[:num_inputs])}

    input_seq_length = {
        input_names[ind]: length
        for ind, length in enumerate(seq_length[:num_inputs])}

    targets = {
        output_names[ind]: batch
        for ind, batch in enumerate(data[num_inputs:])}

    target_seq_length = {
        output_names[ind]: length
        for ind, length in enumerate(seq_length[num_inputs:])}

    return inputs, input_seq_length, targets, target_seq_length, num_steps
def evaluate(self):
    '''evaluate the performance of the model

    Creates a filename queue and an input pipeline over a whole number of
    batches of the validation set, computes the model outputs and the loss.

    Returns:
        - the loss as returned by compute_loss
        - the number of batches in the validation set as an integer
        - the model outputs as returned by _get_outputs
        - the input sequence lengths as a dictionary
    '''

    batch_size = int(self.conf.get('evaluator', 'batch_size'))
    requested_utts = int(self.conf.get('evaluator', 'requested_utts'))

    with tf.name_scope('evaluate'):

        dataconfs = self.input_dataconfs + self.target_dataconfs

        #get the list of filenames for the validation set
        data_queue_elements, _ = input_pipeline.get_filenames(dataconfs)

        max_number_of_elements = len(data_queue_elements)
        number_of_elements = min([max_number_of_elements, requested_utts])

        #compute the number of batches in the validation set. Floor
        #division keeps the count an integer (it is used as a slice index
        #below), also when true division is active.
        numbatches = number_of_elements // batch_size
        number_of_elements = numbatches * batch_size
        print('%d utterances will be used for evaluation' % (
            number_of_elements))

        #cut the data so it has a whole number of batches
        data_queue_elements = data_queue_elements[:number_of_elements]

        #create a queue to hold the filenames
        data_queue = tf.train.string_input_producer(
            string_tensor=data_queue_elements,
            shuffle=False,
            seed=None,
            capacity=batch_size * 2)

        #create the input pipeline
        data, seq_length = input_pipeline.input_pipeline(
            data_queue=data_queue,
            batch_size=batch_size,
            numbuckets=1,
            dataconfs=dataconfs)

        #the inputs come first in the data, the targets follow
        num_inputs = len(self.input_dataconfs)

        inputs = {
            self.model.input_names[i]: d
            for i, d in enumerate(data[:num_inputs])}

        seq_length = {
            self.model.input_names[i]: d
            for i, d in enumerate(seq_length[:num_inputs])}

        target_names = self.conf.get(self.task, 'targets').split(' ')
        targets = {
            target_names[i]: d
            for i, d in enumerate(data[num_inputs:])}

        #target_seq_length = {
        #target_names[i]: d
        #for i, d in enumerate(seq_length[len(self.input_dataconfs):])}

        outputs = self._get_outputs(inputs, seq_length)

        loss = self.compute_loss(targets, outputs, seq_length)

    return loss, numbatches, outputs, seq_length
def evaluate(self):
    '''evaluate the performance of the model

    Creates a variable holding the validation loss, a filename queue and
    an input pipeline over a whole number of batches of the validation set
    and lets update_loss build the op that updates the loss.

    Returns:
        - the loss as a scalar variable
        - an operation to update the loss
        - the number of batches in the validation set as an integer
    '''

    batch_size = int(self.conf['batch_size'])

    with tf.name_scope('evaluate'):

        #a variable to hold the validation loss
        loss = tf.get_variable(
            name='validation_loss',
            shape=[],
            dtype=tf.float32,
            initializer=tf.zeros_initializer(),
            trainable=False)

        dataconfs = self.input_dataconfs + self.target_dataconfs

        #get the list of filenames for the validation set
        data_queue_elements, _ = input_pipeline.get_filenames(dataconfs)

        #compute the number of batches in the validation set. Floor
        #division keeps the count an integer (it is used in a slice index
        #below), also when true division is active.
        numbatches = len(data_queue_elements) // batch_size

        #cut the data so it has a whole number of batches
        data_queue_elements = data_queue_elements[:numbatches * batch_size]

        #create a queue to hold the filenames
        data_queue = tf.train.string_input_producer(
            string_tensor=data_queue_elements,
            shuffle=False,
            seed=None,
            capacity=batch_size * 2)

        #create the input pipeline
        data, seq_length, _ = input_pipeline.input_pipeline(
            data_queue=data_queue,
            batch_size=batch_size,
            numbuckets=1,
            dataconfs=dataconfs)

        #the inputs come first in the data, the targets follow
        num_inputs = len(self.input_dataconfs)

        inputs = {
            self.model.input_names[i]: d
            for i, d in enumerate(data[:num_inputs])}

        input_seq_length = {
            self.model.input_names[i]: d
            for i, d in enumerate(seq_length[:num_inputs])}

        target_names = self.conf['targets'].split(' ')
        targets = {
            target_names[i]: d
            for i, d in enumerate(data[num_inputs:])}

        target_seq_length = {
            target_names[i]: d
            for i, d in enumerate(seq_length[num_inputs:])}

        update_loss = self.update_loss(
            loss, inputs, input_seq_length, targets, target_seq_length)

    return loss, update_loss, numbatches
def __init__(self, model, conf, dataconf, expdir):
    '''Recognizer constructor

    Builds a graph that reads the input data in batches and decodes it
    with the model.

    Args:
        model: the model to be tested
        conf: the recognizer configuration as a configparser
        dataconf: the database configuration as a configparser
        expdir: the experiments directory
    '''

    self.conf = dict(conf.items('recognizer'))
    self.expdir = expdir
    self.model = model

    #every model input has a space separated list of database sections in
    #the recognizer configuration, read the configuration of each section
    section_names = [
        self.conf[name].split(' ') for name in self.model.input_names]
    self.input_dataconfs = [
        [dict(dataconf.items(section)) for section in sectionset]
        for sectionset in section_names]

    #create a decoder
    decoder_class = decoder_factory.factory(conf.get('decoder', 'decoder'))
    self.decoder = decoder_class(conf, self.model)

    self.batch_size = int(self.conf['batch_size'])

    #create the graph
    self.graph = tf.Graph()

    with self.graph.as_default():

        #get the list of filenames and the names that go with them
        filenames, self.names = input_pipeline.get_filenames(
            self.input_dataconfs)

        #the final batch is allowed to be smaller, so round up
        num_files = float(len(filenames))
        self.numbatches = int(math.ceil(num_files / self.batch_size))

        #create a queue that goes through the filenames once
        filename_queue = tf.train.string_input_producer(
            string_tensor=filenames,
            num_epochs=1,
            shuffle=False,
            seed=None,
            capacity=self.batch_size * 2)

        #create the input pipeline
        features, feature_lengths, _ = input_pipeline.input_pipeline(
            data_queue=filename_queue,
            batch_size=self.batch_size,
            numbuckets=1,
            allow_smaller_final_batch=True,
            dataconfs=self.input_dataconfs)

        #key the pipeline outputs by the name of the model input
        input_names = self.model.input_names
        inputs = dict(
            (input_names[idx], tensor)
            for idx, tensor in enumerate(features))
        input_seq_length = dict(
            (input_names[idx], length)
            for idx, length in enumerate(feature_lengths))

        self.decoded = self.decoder(inputs, input_seq_length)

        #create a histogram for all trainable parameters
        for param in tf.trainable_variables():
            tf.summary.histogram(param.name, param)
def set_dataqueues(self, cluster):
    '''sets the data queues

    A queue of filenames is stored in self.data_queue for every linked set.
    When the cluster contains a 'local' job the queue is created from the
    filenames of the linked set, otherwise a shared FIFO queue is used.

    Args:
        cluster: the cluster specification, only its as_dict() method is
            used here

    Returns:
        - the number of steps: computed from the number of utterances when
            running locally, dequeued from the num_steps_queue otherwise
        - the done ops as a list: a single no-op when running locally,
            otherwise one op per parameter server that enqueues True in
            its done queue

        The returned values are the ones created for the last linked set.
    '''

    self.data_queue = dict()

    #check if running in distributed mode, this is the same for all
    #linked sets
    is_local = 'local' in cluster.as_dict()

    for linkedset in self.linkedsets:
        data_queue_name = 'data_queue_%s_%s' % (self.task_name, linkedset)

        if is_local:
            data_queue_elements, _ = input_pipeline.get_filenames(
                self.input_dataconfs[linkedset] +
                self.target_dataconfs[linkedset])

            #optionally use only a fraction of the training set
            number_of_elements = len(data_queue_elements)
            if 'trainset_frac' in self.taskconf:
                number_of_elements = int(
                    float(number_of_elements) *
                    float(self.taskconf['trainset_frac']))
            print('%d utterances will be used for training'
                  % number_of_elements)

            data_queue_elements = data_queue_elements[:number_of_elements]

            #create the data queue and queue runners
            self.data_queue[linkedset] = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=self.batch_size * 2,
                shared_name=data_queue_name)

            #compute the number of steps, a configured value of zero
            #batches to aggregate means one batch per step
            aggregate = int(self.trainerconf['numbatches_to_aggregate'])
            num_epochs = int(self.trainerconf['num_epochs'])
            if aggregate == 0:
                utts_per_step = self.batch_size
            else:
                utts_per_step = self.batch_size * aggregate

            #the floor division is explicit so the number of steps is
            #always an integer
            num_steps = (
                num_epochs * len(data_queue_elements) // utts_per_step)

            done_ops = [tf.no_op()]

        else:
            #NOTE(review): num_replicas and num_servers are not defined in
            #this method, so they have to be module level names for this
            #branch to run; confirm where they are supposed to come from

            #get the data queue
            self.data_queue[linkedset] = tf.FIFOQueue(
                capacity=self.batch_size * (num_replicas + 1),
                shared_name=data_queue_name,
                name=data_queue_name,
                dtypes=[tf.string],
                shapes=[[]])

            #get the number of steps from the parameter server
            num_steps_queue = tf.FIFOQueue(
                capacity=num_replicas,
                dtypes=[tf.int32],
                shared_name='num_steps_queue',
                name='num_steps_queue',
                shapes=[[]])

            #set the number of steps
            num_steps = num_steps_queue.dequeue()

            #get the done queues, the list has to be created here because
            #it is only assigned in the local branch, appending to it
            #without this would raise an UnboundLocalError
            done_ops = []
            for i in range(num_servers):
                with tf.device('job:ps/task:%d' % i):
                    done_queue = tf.FIFOQueue(
                        capacity=num_replicas,
                        dtypes=[tf.bool],
                        shapes=[[]],
                        shared_name='done_queue%d' % i,
                        name='done_queue%d' % i)

                    done_ops.append(done_queue.enqueue(True))

    return num_steps, done_ops