Example no. 1
0
    def evaluate(self):
        '''evaluate the performance of the model

        Returns:
            - the loss as a scalar tensor
            - the number of batches in the validation set as an integer
        '''


        with tf.name_scope('evaluate'):
            inputs = dict()
            seq_lengths = dict()
            targets = dict()
            for linkedset in self.linkedsets:
                data_queue_elements, _ = input_pipeline.get_filenames(
                    self.input_dataconfs[linkedset] +
                    self.target_dataconfs[linkedset])

                max_number_of_elements = len(data_queue_elements)
                number_of_elements = min(
                    [max_number_of_elements, self.requested_utts])

                #compute the number of batches in the validation set
                numbatches = number_of_elements / self.batch_size
                number_of_elements = numbatches * self.batch_size
                print '%d utterances will be used for evaluation' % number_of_elements

                #cut the data so it has a whole number of batches
                data_queue_elements = data_queue_elements[:number_of_elements]

                #create the data queue and queue runners (the inputs were
                #already shuffled, so shuffle is set to False)
                data_queue = tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=False,
                    seed=None,
                    capacity=self.batch_size * 2)

                #create the input pipeline
                data, seq_length = input_pipeline.input_pipeline(
                    data_queue=data_queue,
                    batch_size=self.batch_size,
                    numbuckets=1,
                    dataconfs=self.input_dataconfs[linkedset] +
                    self.target_dataconfs[linkedset])

                #split data into inputs and targets
                for ind, input_name in enumerate(
                        self.linkedsets[linkedset]['inputs']):
                    inputs[input_name] = data[ind]
                    seq_lengths[input_name] = seq_length[ind]

                for ind, target_name in enumerate(
                        self.linkedsets[linkedset]['targets']):
                    targets[target_name] = data[
                        len(self.linkedsets[linkedset]['inputs']) + ind]

            #get the logits
            logits = self._get_outputs(
                inputs=inputs,
                seq_lengths=seq_lengths)

            loss, norm = self.compute_loss(targets, logits, seq_lengths)

        return loss, norm, numbatches, logits, seq_lengths
    def begin(self):
        '''this will be run at session creation'''

        #pylint: disable=W0201

        with tf.variable_scope('compute_fisher'):

            data_queue_elements, _ = input_pipeline.get_filenames(
                self.dataconfs.values())

            self.num_samples = len(data_queue_elements)

            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=1)

            inputs, input_seq_length, _ = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=1,
                numbuckets=1,
                dataconfs=self.dataconfs.values(),
                variable_batch_size=False)

            inputs = {
                self.dataconfs.keys()[i]: d
                for i, d in enumerate(inputs)
            }
            input_seq_length = {
                self.dataconfs.keys()[i]: d
                for i, d in enumerate(input_seq_length)
            }

            #get the input log likelihood using the random sample decoder
            logprob = self.decoder(inputs, input_seq_length).values()[0][2][0]

            #get the derivative of the logprob
            gradients = tf.gradients(logprob, self.fisher.keys())

            #create an op to update the fisher information
            update_ops = []
            for var, grad in zip(self.fisher.keys(), gradients):
                update_ops.append(self.fisher[var].assign_add(
                    tf.square(grad) / self.num_samples).op)
            self.update_fisher = tf.group(*update_ops)
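
The begin() hook above accumulates an empirical Fisher information estimate: for every parameter it averages the squared gradient of a per-sample log-likelihood over the samples. Below is a minimal, self-contained sketch of that accumulation pattern; the toy linear model, the variable names and the number of samples are illustrative assumptions, not part of the original code.

import numpy as np
import tensorflow as tf

num_samples = 4

# toy model: a single weight matrix mapping 3 features to 2 classes
inputs = tf.placeholder(tf.float32, shape=[1, 3], name='inputs')
weights = tf.get_variable('weights', shape=[3, 2])
logits = tf.matmul(inputs, weights)

# log-likelihood of the most likely class for this single sample
logprob = tf.reduce_max(tf.nn.log_softmax(logits))

# one zero-initialized, non-trainable accumulator per parameter
fisher = {weights: tf.get_variable('fisher_weights', shape=[3, 2],
                                   initializer=tf.zeros_initializer(),
                                   trainable=False)}

variables = list(fisher.keys())
gradients = tf.gradients(logprob, variables)

# accumulate the squared gradients, averaged over the samples
update_fisher = tf.group(*[
    fisher[var].assign_add(tf.square(grad) / num_samples).op
    for var, grad in zip(variables, gradients)])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(num_samples):
        sess.run(update_fisher, feed_dict={inputs: np.random.randn(1, 3)})
    print(sess.run(fisher[weights]))
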
Example no. 3
0
    def set_dataqueues(self):
        """sets the data queues"""

        # check if running in distributed mode
        self.data_queue = dict()
        for linkedset in self.linkedsets:
            data_queue_name = 'data_queue_%s_%s' % (self.task_name, linkedset)

            data_queue_elements, _ = input_pipeline.get_filenames(
                self.input_dataconfs[linkedset] +
                self.target_dataconfs[linkedset])

            number_of_elements = len(data_queue_elements)
            if 'trainset_frac' in self.taskconf:
                number_of_elements = int(
                    float(number_of_elements) *
                    float(self.taskconf['trainset_frac']))
            print '%d utterances will be used for training' % number_of_elements

            data_queue_elements = data_queue_elements[:number_of_elements]

            # create the data queue and queue runners
            self.data_queue[linkedset] = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=self.batch_size * 2,
                shared_name=data_queue_name)

            # compute the number of steps
            if int(self.trainerconf['numbatches_to_aggregate']) == 0:
                num_steps = int(self.trainerconf['num_epochs']) * len(
                    data_queue_elements) / self.batch_size
            else:
                num_steps = int(self.trainerconf['num_epochs']) * len(data_queue_elements) / \
                   (self.batch_size * int(self.trainerconf['numbatches_to_aggregate']))

            done_ops = [tf.no_op()]

        return num_steps, done_ops
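
The step-count arithmetic in set_dataqueues above divides the total number of utterances seen over all epochs by the effective batch size. A small worked example, with numbers chosen purely for illustration:

num_epochs = 10
num_utterances = 1000
batch_size = 32
numbatches_to_aggregate = 4

if numbatches_to_aggregate == 0:
    num_steps = num_epochs * num_utterances // batch_size
else:
    num_steps = num_epochs * num_utterances // (
        batch_size * numbatches_to_aggregate)

print(num_steps)  # 10 * 1000 // (32 * 4) = 78
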
Example no. 4
0
    def __init__(self, conf, modelconf, dataconf, server, task_index):
        '''
        NnetTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            modelconf: the model configuration
            dataconf: the data configuration as a ConfigParser
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        self.graph = tf.Graph()
        self.server = server
        self.task_index = task_index

        #distributed training
        cluster = tf.train.ClusterSpec(server.server_def.cluster)
        num_replicas = len(cluster.as_dict()['worker'])

        with self.graph.as_default():

            #the chief parameter server should create the data queue
            if task_index == 0:
                #get the database configurations
                inputs = modelconf.get('io', 'inputs').split(' ')
                if inputs == ['']:
                    inputs = []
                input_sections = [conf[i] for i in inputs]
                input_dataconfs = []
                for section in input_sections:
                    input_dataconfs.append(dict(dataconf.items(section)))
                outputs = modelconf.get('io', 'outputs').split(' ')
                if outputs == ['']:
                    outputs = []
                target_sections = [conf[o] for o in outputs]
                target_dataconfs = []
                for section in target_sections:
                    target_dataconfs.append(dict(dataconf.items(section)))

                data_queue_elements, _ = input_pipeline.get_filenames(
                    input_dataconfs + target_dataconfs)

                tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=True,
                    seed=None,
                    capacity=int(conf['batch_size']) * 2,
                    shared_name='data_queue')

                #create a queue for the workers to signify that they are done
                done_queue = tf.FIFOQueue(capacity=num_replicas,
                                          dtypes=[tf.bool],
                                          shapes=[[]],
                                          shared_name='done_queue%d' %
                                          task_index,
                                          name='done_queue%d' % task_index)

                self.wait_op = done_queue.dequeue_many(num_replicas).op

            self.scaffold = tf.train.Scaffold()
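
The done_queue created above implements a simple shutdown hand-shake: every worker enqueues a boolean when it finishes and the chief blocks on dequeue_many until all replicas have reported. Below is a minimal in-process sketch of that pattern; in the real trainer the queue is shared across processes via shared_name, here everything runs in one session purely for illustration.

import tensorflow as tf

num_replicas = 3

done_queue = tf.FIFOQueue(capacity=num_replicas,
                          dtypes=[tf.bool],
                          shapes=[[]],
                          name='done_queue0')

# one op per worker to signal that it is done
signal_done = done_queue.enqueue(True)

# the chief waits until every replica has signalled
wait_op = done_queue.dequeue_many(num_replicas).op

with tf.Session() as sess:
    for _ in range(num_replicas):
        sess.run(signal_done)   # in practice each worker runs this itself
    sess.run(wait_op)           # returns only once all replicas reported
    print('all %d workers are done' % num_replicas)
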
Example no. 5
0
    def _data(self, chief_ps):
        '''
        create the input pipeline

        args:
            -chief_ps: the chief parameter server device

        returns:
            - the inputs
            - the input sequence lengths
            - the targets
            - the target sequence lengths
            - the number of global steps in an epoch
            - an operation to read a batch of data
            - the number of local steps in this step
        '''

        with tf.name_scope('get_batch'):

            #get the database configurations
            input_names = self.model.conf.get('io', 'inputs').split(' ')
            if input_names == ['']:
                input_names = []
            input_sections = [self.conf[i].split(' ') for i in input_names]
            input_dataconfs = []
            for sectionset in input_sections:
                input_dataconfs.append([])
                for section in sectionset:
                    input_dataconfs[-1].append(
                        dict(self.dataconf.items(section)))

            output_names = self.conf['targets'].split(' ')
            if output_names == ['']:
                output_names = []
            target_sections = [self.conf[o].split(' ') for o in output_names]
            target_dataconfs = []
            for sectionset in target_sections:
                target_dataconfs.append([])
                for section in sectionset:
                    target_dataconfs[-1].append(
                        dict(self.dataconf.items(section)))

            #check if running in distributed mode
            if chief_ps is None:

                #get the filenames
                data_queue_elements, _ = input_pipeline.get_filenames(
                    input_dataconfs + target_dataconfs)

                #create the data queue and queue runners
                data_queue = tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=True,
                    seed=None,
                    capacity=int(self.conf['batch_size']) * 2,
                    shared_name='data_queue')

            else:
                with tf.device(chief_ps):

                    #get the data queue
                    data_queue = tf.FIFOQueue(
                        capacity=int(self.conf['batch_size']) * 2,
                        shared_name='data_queue',
                        name='data_queue',
                        dtypes=[tf.string],
                        shapes=[[]])

            #create the input pipeline
            data, seq_length, num_steps, max_length = \
                input_pipeline.input_pipeline(
                    data_queue=data_queue,
                    batch_size=int(self.conf['batch_size']),
                    numbuckets=int(self.conf['numbuckets']),
                    dataconfs=input_dataconfs + target_dataconfs,
                    variable_batch_size=(
                        self.conf['variable_batch_size'] == 'True')
                )

            if int(self.conf['cut_sequence_length']):

                #make sure that all the sequence lengths are the same
                assertops = [
                    tf.assert_equal(seq_length[0], l) for l in seq_length
                ]

                with tf.control_dependencies(assertops):
                    #cut each data component
                    read_ops = []
                    components = []
                    component_lengths = []
                    for i, batch in enumerate(data):
                        cut, cut_length, read_op, num_local_steps = \
                            _cut_sequence(
                                batch,
                                seq_length[i],
                                int(self.conf['cut_sequence_length']),
                                max_length)
                        components.append(cut)
                        component_lengths.append(cut_length)
                        read_ops.append(read_op)
            else:
                num_local_steps = tf.constant(1)

                queues = [tf.FIFOQueue(1, b.dtype) for b in data]
                length_queues = [tf.FIFOQueue(1, b.dtype) for b in seq_length]
                components = [q.dequeue() for q in queues]
                component_lengths = [q.dequeue() for q in length_queues]
                for i, c in enumerate(components):
                    c.set_shape(data[i].shape)
                    component_lengths[i].set_shape(seq_length[i].shape)

                #create an op to read the data into the queues
                read_ops = [q.enqueue(data[i]) for i, q in enumerate(queues)]
                read_ops += [
                    q.enqueue(seq_length[i])
                    for i, q in enumerate(length_queues)
                ]

            #create an op for reading the data
            read_data = tf.group(*read_ops)

            inputs = {
                input_names[i]: d
                for i, d in enumerate(components[:len(input_sections)])
            }
            input_seq_length = {
                input_names[i]: d
                for i, d in enumerate(component_lengths[:len(input_sections)])
            }
            targets = {
                output_names[i]: d
                for i, d in enumerate(components[len(input_sections):])
            }
            target_seq_length = {
                output_names[i]: d
                for i, d in enumerate(component_lengths[len(input_sections):])
            }

        return (inputs, input_seq_length, targets, target_seq_length,
                num_steps, read_data, num_local_steps)
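
The else-branch of _data above stages each batch component in a capacity-1 FIFOQueue so that reading a new batch (the read op) and consuming the current one become separate operations. Below is a minimal sketch of that staging pattern; the constant tensors stand in for the real input pipeline and are assumptions for illustration only.

import tensorflow as tf

batch = tf.constant([[1.0, 2.0], [3.0, 4.0]])   # stands in for one data component
seq_length = tf.constant([2, 2])                # and its sequence lengths

queues = [tf.FIFOQueue(1, batch.dtype), tf.FIFOQueue(1, seq_length.dtype)]
components = [q.dequeue() for q in queues]
for c, src in zip(components, [batch, seq_length]):
    c.set_shape(src.shape)

# one op that stages the next batch into the queues
read_data = tf.group(queues[0].enqueue(batch), queues[1].enqueue(seq_length))

with tf.Session() as sess:
    sess.run(read_data)             # read a batch
    print(sess.run(components))     # consume the staged batch
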
Example no. 6
0
    def evaluate(self, start_utt_ind=0):
        """evaluate the performance of the model

		Returns:
			- the loss as a scalar tensor
			- the number of batches in the validation set as an integer
		"""

        with tf.name_scope('evaluate'):
            inputs = dict()
            seq_lengths = dict()
            targets = dict()
            loss = []
            norm = []
            for set_ind, linkedset in enumerate(self.linkedsets):
                data_queue_elements, _ = input_pipeline.get_filenames(
                    self.input_dataconfs[linkedset] +
                    self.target_dataconfs[linkedset])

                max_number_of_elements = len(data_queue_elements)
                number_of_elements = min(
                    [max_number_of_elements, self.requested_utts])

                # compute the number of batches in the validation set
                numbatches = number_of_elements / self.batch_size
                number_of_elements = numbatches * self.batch_size
                if number_of_elements == 0:
                    raise BaseException(
                        'The number of elements used for validation must be larger than 0.'
                    )
                print('%d utterances will be used for evaluation' %
                      number_of_elements)

                # cut the data so it has a whole number of batches
                data_queue_elements = data_queue_elements[
                    start_utt_ind:number_of_elements]

                # create the data queue and queue runners (inputs are not allowed to get shuffled. I already did this so set to False)
                data_queue = tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=False,
                    seed=None,
                    capacity=self.batch_size * 2)

                # create the input pipeline
                data, seq_length = input_pipeline.input_pipeline(
                    data_queue=data_queue,
                    batch_size=self.batch_size,
                    numbuckets=1,
                    dataconfs=self.input_dataconfs[linkedset] +
                    self.target_dataconfs[linkedset])
                # split data into inputs and targets
                for ind, input_name in enumerate(self.input_names):
                    inputs[input_name] = data[ind]
                    seq_lengths[input_name] = seq_length[ind]

                # out_seq_lengths = {
                # 	seq_name: seq for seq_name, seq in seq_lengths.iteritems()
                # 	if seq_name in self.output_names}
                out_seq_lengths = {
                    output_name: seq_lengths[self.input_names[0]]
                    for output_name in self.output_names
                }

                for ind, target_name in enumerate(self.target_names):
                    targets[target_name] = data[len(self.input_names) + ind]

                # get the logits
                logits = self._get_outputs(inputs=inputs,
                                           seq_lengths=seq_lengths)

                set_loss, set_norm = self.compute_loss(targets, logits,
                                                       seq_lengths)
                set_loss *= self.linkedset_weighting[linkedset]
                set_norm *= self.linkedset_weighting[linkedset]
                loss.append(set_loss)
                norm.append(set_norm)
        loss = tf.reduce_sum(loss)
        norm = tf.reduce_sum(norm)

        return loss, norm, numbatches, logits, targets, out_seq_lengths
Example no. 7
0
    def __init__(self, conf, modelconf, dataconf, server, task_index):
        '''
        NnetTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            modelconf: the model configuration
            dataconf: the data configuration as a ConfigParser
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        raise NotImplementedError(
            'class parameterserver has not yet been adapted to the multi task '
            'trainer')

        self.graph = tf.Graph()
        self.server = server
        self.task_index = task_index
        self.batch_size = int(conf['batch_size'])

        #distributed training
        cluster = tf.train.ClusterSpec(server.server_def.cluster)
        num_replicas = len(cluster.as_dict()['worker'])

        with self.graph.as_default():

            #the chief parameter server should create the data queue
            if task_index == 0:
                #get the database configurations
                inputs = modelconf.get('io', 'inputs').split(' ')
                if inputs == ['']:
                    inputs = []
                input_sections = [conf[i].split(' ') for i in inputs]
                input_dataconfs = []
                for sectionset in input_sections:
                    input_dataconfs.append([])
                    for section in sectionset:
                        input_dataconfs[-1].append(
                            dict(dataconf.items(section)))
                output_names = conf['targets'].split(' ')
                if output_names == ['']:
                    output_names = []
                target_sections = [conf[o].split(' ') for o in output_names]
                target_dataconfs = []
                for sectionset in target_sections:
                    target_dataconfs.append([])
                    for section in sectionset:
                        target_dataconfs[-1].append(
                            dict(dataconf.items(section)))

                data_queue_elements, _ = input_pipeline.get_filenames(
                    input_dataconfs + target_dataconfs)

                tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=True,
                    seed=None,
                    capacity=self.batch_size * (num_replicas + 1),
                    shared_name='data_queue')
                if int(conf['numbatches_to_aggregate']) == 0:
                    num_steps = (int(conf['num_epochs']) *
                                 len(data_queue_elements) / self.batch_size)
                else:
                    num_steps = (int(conf['num_epochs']) *
                                 len(data_queue_elements) /
                                 (self.batch_size *
                                  int(conf['numbatches_to_aggregate'])))

                #create a queue to communicate the number of steps
                num_steps_queue = tf.FIFOQueue(capacity=num_replicas,
                                               dtypes=[tf.int32],
                                               shapes=[[]],
                                               shared_name='num_steps_queue',
                                               name='num_steps_queue')

                self.set_num_steps = num_steps_queue.enqueue_many(
                    tf.constant([num_steps] * num_replicas))

                #create a queue for the workers to signify that they are done
                done_queue = tf.FIFOQueue(capacity=num_replicas,
                                          dtypes=[tf.bool],
                                          shapes=[[]],
                                          shared_name='done_queue%d' %
                                          task_index,
                                          name='done_queue%d' % task_index)

                self.wait_op = done_queue.dequeue_many(num_replicas).op

            self.scaffold = tf.train.Scaffold()
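
The num_steps_queue above broadcasts the computed number of steps from the chief parameter server to the workers: the chief enqueues the value once per replica and every worker dequeues its own copy. A minimal in-process sketch of that hand-off, with illustrative numbers (in the real setup the queue is shared across processes via shared_name='num_steps_queue'):

import tensorflow as tf

num_replicas = 3
num_steps = 78

num_steps_queue = tf.FIFOQueue(capacity=num_replicas,
                               dtypes=[tf.int32],
                               shapes=[[]],
                               name='num_steps_queue')

# run once on the chief parameter server
set_num_steps = num_steps_queue.enqueue_many(
    tf.constant([num_steps] * num_replicas))

# run once on every worker
worker_num_steps = num_steps_queue.dequeue()

with tf.Session() as sess:
    sess.run(set_num_steps)
    for _ in range(num_replicas):
        print(sess.run(worker_num_steps))   # each worker reads 78
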
Example no. 8
0
    def __init__(self, conf, tasksconf, dataconf, modelconf, evaluatorconf,
                 expdir, init_filename, server, task_index):
        '''
        NnetTrainer constructor, creates the training graph

        Args:
            conf: the trainer config
            tasksconf: the config file for each task
            dataconf: the data configuration as a ConfigParser
            modelconf: the neural net model configuration
            evaluatorconf: the evaluator configuration for evaluating
                if None no evaluation will be done
            expdir: directory where the summaries will be written
            init_filename: filename of the network that should be used to
                initialize the model. Set to None if no network is
                available/wanted.
            server: optional server to be used for distributed training
            task_index: optional index of the worker task in the cluster
        '''

        self.expdir = expdir
        self.server = server
        self.conf = conf
        self.tasksconf = tasksconf
        self.task_index = task_index
        self.init_filename = init_filename

        self.batch_size = int(conf['batch_size'])

        cluster = tf.train.ClusterSpec(server.server_def.cluster)

        #create the graph
        self.graph = tf.Graph()

        #3 model types for multi task: single one to one; single one to many; multiple one to one
        #single one to one: the whole model is shared for all tasks, only the loss function can be different
        #single one to many: each task has a separate output so only part of the network is shared, e.g. everything but the output layer
        #multiple one to one: each task has its own network. Possibly the outputs are combined in a loss function

        #create the model
        modelfile = os.path.join(expdir, 'model', 'model.pkl')
        with open(modelfile, 'wb') as fid:
            self.model = model_factory.factory(
                modelconf.get('model', 'architecture'))(conf=modelconf)
            pickle.dump(self.model, fid)

        evaltype = evaluatorconf.get('evaluator', 'evaluator')

        #get the database configurations
        input_dataconfs = dict()
        target_dataconfs = dict()
        loss_computers = dict()
        nr_input_sections = dict()
        if evaltype != 'None':
            evaluators = dict()

        for task in self.conf['tasks'].split(' '):
            taskconf = self.tasksconf[task]

            #get the database configurations
            input_names = modelconf.get('io', 'inputs').split(' ')
            if input_names == ['']:
                input_names = []
            input_sections = [taskconf[i].split(' ') for i in input_names]
            nr_input_sections[task] = len(input_sections)
            task_input_dataconfs = []
            for sectionset in input_sections:
                task_input_dataconfs.append([])
                for section in sectionset:
                    task_input_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            input_dataconfs[task] = task_input_dataconfs

            output_names = taskconf['targets'].split(' ')
            if output_names == ['']:
                output_names = []
            target_sections = [taskconf[o].split(' ') for o in output_names]
            task_target_dataconfs = []
            for sectionset in target_sections:
                task_target_dataconfs.append([])
                for section in sectionset:
                    task_target_dataconfs[-1].append(
                        dict(dataconf.items(section)))
            target_dataconfs[task] = task_target_dataconfs

            #create the loss computer
            loss_computer = loss_computer_factory.factory(
                taskconf['loss_type'])(self.batch_size)

            loss_computers[task] = loss_computer

            if evaltype != 'None':
                evaluator = evaluator_factory.factory(evaltype)(
                    conf=evaluatorconf,
                    dataconf=dataconf,
                    model=self.model,
                    task=task)

                evaluators[task] = evaluator

        if 'local' in cluster.as_dict():
            num_replicas = 1
            device = tf.DeviceSpec(job='local')
        else:
            #distributed training
            num_replicas = len(cluster.as_dict()['worker'])
            num_servers = len(cluster.as_dict()['ps'])
            ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
                num_tasks=num_servers,
                load_fn=tf.contrib.training.byte_size_load_fn)
            device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                    ps_strategy=ps_strategy)
            chief_ps = tf.DeviceSpec(job='ps', task=0)

        self.is_chief = task_index == 0

        #define the placeholders in the graph
        with self.graph.as_default():

            #create a local num_steps variable
            self.num_steps = tf.get_variable(
                name='num_steps',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            #a variable to hold the amount of steps already taken
            self.global_step = tf.get_variable(
                name='global_step',
                shape=[],
                dtype=tf.int32,
                initializer=tf.constant_initializer(0),
                trainable=False)

            should_terminate = tf.get_variable(
                name='should_terminate',
                shape=[],
                dtype=tf.bool,
                initializer=tf.constant_initializer(False),
                trainable=False)

            self.terminate = should_terminate.assign(True).op

            #create a check if training should continue
            self.should_stop = tf.logical_or(
                tf.greater_equal(self.global_step, self.num_steps),
                should_terminate)

            with tf.device(device):
                data_queues = dict()
                num_steps = []
                done_ops = []
                for task in self.conf['tasks'].split(' '):

                    #check if running in distributed mode
                    if 'local' in cluster.as_dict():

                        #get the filenames
                        data_queue_elements, _ = input_pipeline.get_filenames(
                            input_dataconfs[task] + target_dataconfs[task])

                        #create the data queue and queue runners (the inputs
                        #were already shuffled, so shuffle is set to False)
                        data_queue = tf.train.string_input_producer(
                            string_tensor=data_queue_elements,
                            shuffle=False,
                            seed=None,
                            capacity=self.batch_size * 2,
                            shared_name='data_queue_' + task)

                        data_queues[task] = data_queue

                        #compute the number of steps
                        if int(conf['numbatches_to_aggregate']) == 0:
                            task_num_steps = (int(conf['num_epochs']) *
                                              len(data_queue_elements) /
                                              self.batch_size)
                        else:
                            task_num_steps = (
                                int(conf['num_epochs']) *
                                len(data_queue_elements) /
                                (self.batch_size *
                                 int(conf['numbatches_to_aggregate'])))

                        #set the number of steps
                        num_steps.append(task_num_steps)
                        done_ops.append(tf.no_op())

                    else:
                        with tf.device(chief_ps):

                            #get the data queue
                            data_queue = tf.FIFOQueue(
                                capacity=self.batch_size * (num_replicas + 1),
                                shared_name='data_queue_' + task,
                                name='data_queue_' + task,
                                dtypes=[tf.string],
                                shapes=[[]])

                            data_queues[task] = data_queue

                            #get the number of steps from the parameter server
                            num_steps_queue = tf.FIFOQueue(
                                capacity=num_replicas,
                                dtypes=[tf.int32],
                                shared_name='num_steps_queue',
                                name='num_steps_queue',
                                shapes=[[]])

                            #set the number of steps
                            task_num_steps = num_steps_queue.dequeue()

                        #get the done queues
                        for i in range(num_servers):
                            with tf.device('job:ps/task:%d' % i):
                                done_queue = tf.FIFOQueue(
                                    capacity=num_replicas,
                                    dtypes=[tf.bool],
                                    shapes=[[]],
                                    shared_name='done_queue%d' % i,
                                    name='done_queue%d' % i)

                                done_ops.append(done_queue.enqueue(True))

                self.set_num_steps = self.num_steps.assign(min(num_steps)).op
                self.done = tf.group(*done_ops)

                #training part
                with tf.variable_scope('train'):

                    #a variable to scale the learning rate (used to reduce the
                    #learning rate in case validation performance drops)
                    learning_rate_fact = tf.get_variable(
                        name='learning_rate_fact',
                        shape=[],
                        initializer=tf.constant_initializer(1.0),
                        trainable=False)

                    #compute the learning rate with exponential decay and scale
                    #with the learning rate factor
                    self.learning_rate = (tf.train.exponential_decay(
                        learning_rate=float(conf['initial_learning_rate']),
                        global_step=self.global_step,
                        decay_steps=self.num_steps,
                        decay_rate=float(conf['learning_rate_decay'])) *
                                          learning_rate_fact)

                    #create the optimizer
                    optimizer = tf.train.AdamOptimizer(self.learning_rate)

                    self.total_loss = tf.get_variable(
                        name='total_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.constant_initializer(0),
                        trainable=False)

                    self.reset_loss = self.total_loss.assign(0.0)

                    loss = []

                    for task in self.conf['tasks'].split(' '):

                        with tf.variable_scope(task):

                            #create the input pipeline
                            data, seq_length = input_pipeline.input_pipeline(
                                data_queue=data_queues[task],
                                batch_size=self.batch_size,
                                numbuckets=int(conf['numbuckets']),
                                dataconfs=input_dataconfs[task] +
                                target_dataconfs[task])

                            inputs = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    data[:nr_input_sections[task]])
                            }
                            seq_length = {
                                input_names[i]: d
                                for i, d in enumerate(
                                    seq_length[:nr_input_sections[task]])
                            }
                            targets = {
                                output_names[i]: d
                                for i, d in enumerate(
                                    data[nr_input_sections[task]:])
                            }
                            #target_seq_length = {
                            #output_names[i]: d
                            #for i, d in enumerate(seq_length[nr_input_sections[task]:])}

                            #compute the training outputs of the model
                            logits = self.model(inputs=inputs,
                                                input_seq_length=seq_length,
                                                is_training=True)

                            #TODO: The proper way to exploit data parallelism is via the
                            #SyncReplicasOptimizer defined below. However for some reason it hangs
                            #and I have not yet found a solution for it. For the moment the gradients
                            #are accumulated in a way that does not allow data parallelism and there
                            #is no advantage in having multiple workers. (We also accumulate the loss)

                            #create an optimizer that aggregates gradients
                            #if int(conf['numbatches_to_aggregate']) > 0:
                            #optimizer = tf.train.SyncReplicasOptimizer(
                            #opt=optimizer,
                            #replicas_to_aggregate=int(
                            #conf['numbatches_to_aggregate'])#,
                            ##total_num_replicas=num_replicas
                            #)

                            #compute the loss
                            task_loss = loss_computers[task](targets, logits,
                                                             seq_length)

                            #append the task loss to the global loss
                            loss.append(task_loss)

                    #accumulate losses from tasks
                    with tf.variable_scope('accumulate_loss_from_tasks'):
                        loss = tf.reduce_mean(loss)

                    #accumulate losses from batches
                    self.acc_loss = self.total_loss.assign_add(loss)

                    ##compute the gradients
                    #grads_and_vars = optimizer.compute_gradients(self.loss)

                    #with tf.variable_scope('clip'):
                    #clip_value = float(conf['clip_grad_value'])
                    ##clip the gradients
                    #grads_and_vars = [(tf.clip_by_value(grad, -clip_value, clip_value), var)
                    #for grad, var in grads_and_vars]

                    self.params = tf.trainable_variables()

                    grads = [
                        tf.get_variable(param.op.name,
                                        param.get_shape().as_list(),
                                        initializer=tf.constant_initializer(0),
                                        trainable=False)
                        for param in self.params
                    ]

                    self.reset_grad = tf.variables_initializer(grads)

                    #compute the gradients
                    minibatch_grads_and_vars = optimizer.compute_gradients(
                        loss)

                    with tf.variable_scope('clip'):
                        clip_value = float(conf['clip_grad_value'])
                        #clip the gradients
                        minibatch_grads_and_vars = [
                            (tf.clip_by_value(grad, -clip_value,
                                              clip_value), var)
                            for grad, var in minibatch_grads_and_vars
                        ]

                    (minibatchgrads,
                     minibatchvars) = zip(*minibatch_grads_and_vars)

                    #update gradients by accumulating them
                    self.update_gradients = [
                        grad.assign_add(batchgrad)
                        for batchgrad, grad in zip(minibatchgrads, grads)
                    ]

                    #operation to apply the gradients
                    grads_and_vars = list(zip(grads, minibatchvars))
                    apply_gradients_op = optimizer.apply_gradients(
                        grads_and_vars=grads_and_vars,
                        global_step=self.global_step,
                        name='apply_gradients')

                    #all remaining operations with the UPDATE_OPS GraphKeys
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

                    #create an operation to update the gradients, the batch_loss
                    #and do all other update ops
                    self.update_op = tf.group(*([apply_gradients_op] +
                                                update_ops),
                                              name='update')

                if evaltype != 'None':

                    #validation part
                    with tf.variable_scope('validate'):

                        #create a variable to hold the validation loss
                        self.validation_loss = tf.get_variable(
                            name='validation_loss',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #create a variable to save the last step where the model
                        #was validated
                        validated_step = tf.get_variable(
                            name='validated_step',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(
                                -int(conf['valid_frequency'])),
                            trainable=False)

                        #a check if validation is due
                        self.should_validate = tf.greater_equal(
                            self.global_step - validated_step,
                            int(conf['valid_frequency']))

                        val_batch_loss = []
                        valbatches = []

                        for task in self.conf['tasks'].split(' '):

                            with tf.variable_scope(task):

                                task_val_batch_loss, task_valbatches, _, _ = evaluators[
                                    task].evaluate()
                                val_batch_loss.append(task_val_batch_loss)
                                valbatches.append(task_valbatches)

                        val_batch_loss = tf.reduce_mean(val_batch_loss)
                        self.valbatches = min(valbatches)

                        self.update_loss = self.validation_loss.assign(
                            self.validation_loss +
                            val_batch_loss  #/self.valbatches
                        ).op

                        #update the learning rate factor
                        self.half_lr = learning_rate_fact.assign(
                            learning_rate_fact / 2).op

                        #create an operation to updated the validated step
                        self.update_validated_step = validated_step.assign(
                            self.global_step).op

                        #variable to hold the best validation loss so far
                        self.best_validation = tf.get_variable(
                            name='best_validation',
                            shape=[],
                            dtype=tf.float32,
                            initializer=tf.constant_initializer(1.79e+308),
                            trainable=False)

                        #op to update the best validation loss
                        self.update_best = self.best_validation.assign(
                            self.validation_loss).op

                        #a variable that holds the amount of workers at the
                        #validation point
                        waiting_workers = tf.get_variable(
                            name='waiting_workers',
                            shape=[],
                            dtype=tf.int32,
                            initializer=tf.constant_initializer(0),
                            trainable=False)

                        #an operation to signal a waiting worker
                        self.waiting = waiting_workers.assign_add(1).op

                        #an operation to set the waiting workers to zero
                        self.reset_waiting = waiting_workers.initializer

                        #an operation to check if all workers are waiting
                        self.all_waiting = tf.equal(waiting_workers,
                                                    num_replicas - 1)

                        tf.summary.scalar('validation loss',
                                          self.validation_loss)

                else:
                    self.update_loss = None

                tf.summary.scalar('learning rate', self.learning_rate)

                #create a histogram for all trainable parameters
                for param in tf.trainable_variables():
                    tf.summary.histogram(param.name, param)

                #create the scaffold
                self.scaffold = tf.train.Scaffold()
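
The commented TODO in the training part above explains why gradients are accumulated manually instead of using SyncReplicasOptimizer: minibatch gradients are added into non-trainable mirror variables and only applied to the parameters after several batches. Below is a minimal sketch of that accumulation pattern on a toy regression model; the model, the loss and the number of accumulation steps are assumptions chosen only for illustration.

import numpy as np
import tensorflow as tf

numbatches_to_aggregate = 4

# toy regression model
inputs = tf.placeholder(tf.float32, shape=[None, 3])
targets = tf.placeholder(tf.float32, shape=[None, 1])
weights = tf.get_variable('weights', shape=[3, 1])
loss = tf.reduce_mean(tf.square(tf.matmul(inputs, weights) - targets))

optimizer = tf.train.AdamOptimizer(1e-3)
params = tf.trainable_variables()

# one zero-initialized, non-trainable accumulator per trainable parameter
grads = [tf.get_variable(param.op.name + '_acc',
                         param.get_shape().as_list(),
                         initializer=tf.constant_initializer(0),
                         trainable=False)
         for param in params]
reset_grad = tf.variables_initializer(grads)

# add the minibatch gradients into the accumulators
minibatch_grads_and_vars = optimizer.compute_gradients(loss, params)
update_gradients = [acc.assign_add(grad)
                    for (grad, _), acc in zip(minibatch_grads_and_vars, grads)]

# apply the accumulated gradients to the original parameters
apply_gradients_op = optimizer.apply_gradients(list(zip(grads, params)))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(reset_grad)
    for _ in range(numbatches_to_aggregate):
        sess.run(update_gradients,
                 feed_dict={inputs: np.random.randn(8, 3),
                            targets: np.random.randn(8, 1)})
    sess.run(apply_gradients_op)
    sess.run(reset_grad)
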
Example no. 9
0
    def _data(self, chief_ps):
        '''
        create the input pipeline

        args:
            -chief_ps: the chief parameter server device

        returns:
            - the inputs
            - the input sequence lengths
            - the targets
            - the target sequence lengths
            - the number of steps in an epoch
        '''

        #get the database configurations
        input_names = self.model.conf.get('io', 'inputs').split(' ')
        if input_names == ['']:
            input_names = []
        input_sections = [self.conf[i].split(' ') for i in input_names]
        input_dataconfs = []
        for sectionset in input_sections:
            input_dataconfs.append([])
            for section in sectionset:
                input_dataconfs[-1].append(dict(self.dataconf.items(section)))

        output_names = self.conf['targets'].split(' ')
        if output_names == ['']:
            output_names = []
        target_sections = [self.conf[o].split(' ') for o in output_names]
        target_dataconfs = []
        for sectionset in target_sections:
            target_dataconfs.append([])
            for section in sectionset:
                target_dataconfs[-1].append(dict(self.dataconf.items(section)))

        #check if running in distributed mode
        if chief_ps is None:

            #get the filenames
            data_queue_elements, _ = input_pipeline.get_filenames(
                input_dataconfs + target_dataconfs)

            #create the data queue and queue runners
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=True,
                seed=None,
                capacity=int(self.conf['batch_size']) * 2,
                shared_name='data_queue')

        else:
            with tf.device(chief_ps):

                #get the data queue
                data_queue = tf.FIFOQueue(
                    capacity=int(self.conf['batch_size']) * 2,
                    shared_name='data_queue',
                    name='data_queue',
                    dtypes=[tf.string],
                    shapes=[[]])

        #create the input pipeline
        data, seq_length, num_steps = input_pipeline.input_pipeline(
            data_queue=data_queue,
            batch_size=int(self.conf['batch_size']),
            numbuckets=int(self.conf['numbuckets']),
            dataconfs=input_dataconfs + target_dataconfs,
            variable_batch_size=(self.conf['variable_batch_size'] == 'True'))

        inputs = {
            input_names[i]: d
            for i, d in enumerate(data[:len(input_sections)])
        }
        input_seq_length = {
            input_names[i]: d
            for i, d in enumerate(seq_length[:len(input_sections)])
        }
        targets = {
            output_names[i]: d
            for i, d in enumerate(data[len(input_sections):])
        }
        target_seq_length = {
            output_names[i]: d
            for i, d in enumerate(seq_length[len(input_sections):])
        }

        return inputs, input_seq_length, targets, target_seq_length, num_steps
Example no. 10
0
    def evaluate(self):
        '''evaluate the performance of the model

        Returns:
            - the loss as a scalar tensor
            - the number of batches in the validation set as an integer
        '''

        batch_size = int(self.conf.get('evaluator', 'batch_size'))
        requested_utts = int(self.conf.get('evaluator', 'requested_utts'))

        with tf.name_scope('evaluate'):

            #get the list of filenames of the validation set
            data_queue_elements, _ = input_pipeline.get_filenames(
                self.input_dataconfs + self.target_dataconfs)

            max_number_of_elements = len(data_queue_elements)
            number_of_elements = min([max_number_of_elements, requested_utts])

            #compute the number of batches in the validation set
            numbatches = number_of_elements / batch_size
            number_of_elements = numbatches * batch_size
            print '%d utterances will be used for evaluation' % (
                number_of_elements)

            #cut the data so it has a whole number of batches
            data_queue_elements = data_queue_elements[:number_of_elements]

            #create a queue to hold the filenames
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=batch_size * 2)

            #create the input pipeline
            data, seq_length = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=batch_size,
                numbuckets=1,
                dataconfs=self.input_dataconfs + self.target_dataconfs)

            inputs = {
                self.model.input_names[i]: d
                for i, d in enumerate(data[:len(self.input_dataconfs)])
            }

            seq_length = {
                self.model.input_names[i]: d
                for i, d in enumerate(seq_length[:len(self.input_dataconfs)])
            }

            target_names = self.conf.get(self.task, 'targets').split(' ')
            targets = {
                target_names[i]: d
                for i, d in enumerate(data[len(self.input_dataconfs):])
            }

            #target_seq_length = {
            #target_names[i]: d
            #for i, d in enumerate(seq_length[len(self.input_dataconfs):])}

            outputs = self._get_outputs(inputs, seq_length)

            loss = self.compute_loss(targets, outputs, seq_length)

        return loss, numbatches, outputs, seq_length
Example no. 11
0
    def evaluate(self):
        '''evaluate the performance of the model

        Returns:
            - the loss as a scalar tensor
            - an operation to update the loss
            - the number of batches in the validation set as an integer
        '''

        batch_size = int(self.conf['batch_size'])

        with tf.name_scope('evaluate'):

            #a variable to hold the validation loss
            loss = tf.get_variable(name='validation_loss',
                                   shape=[],
                                   dtype=tf.float32,
                                   initializer=tf.zeros_initializer(),
                                   trainable=False)

            #get the list of filenames of the validation set
            data_queue_elements, _ = input_pipeline.get_filenames(
                self.input_dataconfs + self.target_dataconfs)

            #compute the number of batches in the validation set
            numbatches = len(data_queue_elements) / batch_size

            #cut the data so it has a whole number of batches
            data_queue_elements = data_queue_elements[:numbatches * batch_size]

            #create a queue to hold the filenames
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                shuffle=False,
                seed=None,
                capacity=batch_size * 2)

            #create the input pipeline
            data, seq_length, _ = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=batch_size,
                numbuckets=1,
                dataconfs=self.input_dataconfs + self.target_dataconfs)

            inputs = {
                self.model.input_names[i]: d
                for i, d in enumerate(data[:len(self.input_dataconfs)])
            }

            input_seq_length = {
                self.model.input_names[i]: d
                for i, d in enumerate(seq_length[:len(self.input_dataconfs)])
            }

            target_names = self.conf['targets'].split(' ')
            targets = {
                target_names[i]: d
                for i, d in enumerate(data[len(self.input_dataconfs):])
            }

            target_seq_length = {
                target_names[i]: d
                for i, d in enumerate(seq_length[len(self.input_dataconfs):])
            }

            update_loss = self.update_loss(loss, inputs, input_seq_length,
                                           targets, target_seq_length)

        return loss, update_loss, numbatches
Example no. 12
0
    def __init__(self, model, conf, dataconf, expdir):
        '''Recognizer constructor

        Args:
            model: the model to be tested
            conf: the recognizer configuration as a configparser
            dataconf: the database configuration as a configparser
            expdir: the experiments directory
        '''

        self.conf = dict(conf.items('recognizer'))
        self.expdir = expdir
        self.model = model

        #get the database configurations
        input_sections = [
            self.conf[i].split(' ') for i in self.model.input_names
        ]
        self.input_dataconfs = []
        for sectionset in input_sections:
            self.input_dataconfs.append([])
            for section in sectionset:
                self.input_dataconfs[-1].append(dict(dataconf.items(section)))

        #create a decoder
        self.decoder = decoder_factory.factory(conf.get('decoder',
                                                        'decoder'))(conf,
                                                                    self.model)

        self.batch_size = int(self.conf['batch_size'])

        #create the graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            #get the list of filenames of the validation set
            data_queue_elements, self.names = input_pipeline.get_filenames(
                self.input_dataconfs)

            #compute the number of batches in the validation set
            self.numbatches = int(
                math.ceil(float(len(data_queue_elements)) / self.batch_size))

            #create a queue to hold the filenames
            data_queue = tf.train.string_input_producer(
                string_tensor=data_queue_elements,
                num_epochs=1,
                shuffle=False,
                seed=None,
                capacity=self.batch_size * 2)

            #create the input pipeline
            inputs, input_seq_length, _ = input_pipeline.input_pipeline(
                data_queue=data_queue,
                batch_size=self.batch_size,
                numbuckets=1,
                allow_smaller_final_batch=True,
                dataconfs=self.input_dataconfs)

            inputs = {
                self.model.input_names[i]: d
                for i, d in enumerate(inputs)
            }
            input_seq_length = {
                self.model.input_names[i]: d
                for i, d in enumerate(input_seq_length)
            }

            self.decoded = self.decoder(inputs, input_seq_length)

            #create a histogram for all trainable parameters
            for param in tf.trainable_variables():
                tf.summary.histogram(param.name, param)
Example no. 13
0
    def set_dataqueues(self, cluster):
        '''sets the data queues'''

        #check if running in distributed mode
        self.data_queue = dict()
        for linkedset in self.linkedsets:
            data_queue_name = 'data_queue_%s_%s' % (self.task_name, linkedset)
            if 'local' in cluster.as_dict():
                data_queue_elements, _ = input_pipeline.get_filenames(
                    self.input_dataconfs[linkedset] +
                    self.target_dataconfs[linkedset])

                number_of_elements = len(data_queue_elements)
                if 'trainset_frac' in self.taskconf:
                    number_of_elements = int(
                        float(number_of_elements) *
                        float(self.taskconf['trainset_frac']))
                print '%d utterances will be used for training' % (
                    number_of_elements)

                data_queue_elements = data_queue_elements[:number_of_elements]

                #create the data queue and queue runners
                self.data_queue[linkedset] = tf.train.string_input_producer(
                    string_tensor=data_queue_elements,
                    shuffle=False,
                    seed=None,
                    capacity=self.batch_size * 2,
                    shared_name=data_queue_name)

                #compute the number of steps
                if int(self.trainerconf['numbatches_to_aggregate']) == 0:
                    num_steps = (int(self.trainerconf['num_epochs']) *
                                 len(data_queue_elements) / self.batch_size)
                else:
                    num_steps = (
                        int(self.trainerconf['num_epochs']) *
                        len(data_queue_elements) /
                        (self.batch_size *
                         int(self.trainerconf['numbatches_to_aggregate'])))

                done_ops = [tf.no_op()]

            else:
                #get the data queue
                self.data_queue[linkedset] = tf.FIFOQueue(
                    capacity=self.batch_size * (num_replicas + 1),
                    shared_name=data_queue_name,
                    name=data_queue_name,
                    dtypes=[tf.string],
                    shapes=[[]])

                #get the number of steps from the parameter server
                num_steps_queue = tf.FIFOQueue(capacity=num_replicas,
                                               dtypes=[tf.int32],
                                               shared_name='num_steps_queue',
                                               name='num_steps_queue',
                                               shapes=[[]])

                #set the number of steps
                num_steps = num_steps_queue.dequeue()

                #get the done queues
                for i in range(num_servers):
                    with tf.device('job:ps/task:%d' % i):
                        done_queue = tf.FIFOQueue(capacity=num_replicas,
                                                  dtypes=[tf.bool],
                                                  shapes=[[]],
                                                  shared_name='done_queue%d' %
                                                  i,
                                                  name='done_queue%d' % i)

                        done_ops.append(done_queue.enqueue(True))

        return num_steps, done_ops