def update_progressbar(set_name):
        if not hasattr(update_progressbar, 'current_set_name'):
            update_progressbar.current_set_name = None

        if (update_progressbar.current_set_name != set_name or
            update_progressbar.current_job_index == update_progressbar.total_jobs):

            # finish prev pbar if it exists
            if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar:
                update_progressbar.pbar.finish()

            update_progressbar.total_jobs = None
            update_progressbar.current_job_index = 0

            current_epoch = coord._epoch-1

            if set_name == "train":
                log_info('Training epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_train
            else:
                log_info('Validating epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_dev

            # recreate pbar
            update_progressbar.pbar = progressbar.ProgressBar(max_value=update_progressbar.total_jobs,
                                                              redirect_stdout=True).start()

            update_progressbar.current_set_name = set_name

        if update_progressbar.pbar:
            update_progressbar.pbar.update(update_progressbar.current_job_index+1, force=True)

        update_progressbar.current_job_index += 1
Beispiel #2
0
    def update_progressbar(set_name):
        if not hasattr(update_progressbar, 'current_set_name'):
            update_progressbar.current_set_name = None

        if (update_progressbar.current_set_name != set_name or
            update_progressbar.current_job_index == update_progressbar.total_jobs):

            # finish prev pbar if it exists
            if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar:
                update_progressbar.pbar.finish()

            update_progressbar.total_jobs = None
            update_progressbar.current_job_index = 0

            current_epoch = coord._epoch-1

            if set_name == "train":
                log_info('Training epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_train
            else:
                log_info('Validating epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_dev

            # recreate pbar
            update_progressbar.pbar = progressbar.ProgressBar(max_value=update_progressbar.total_jobs,
                                                              redirect_stdout=True).start()

            update_progressbar.current_set_name = set_name

        if update_progressbar.pbar:
            update_progressbar.pbar.update(update_progressbar.current_job_index+1, force=True)

        update_progressbar.current_job_index += 1
Beispiel #3
0
def load_or_init_graph(session, method_order):
    '''
    Load variables from checkpoint or initialize variables following the method
    order specified in the method_order parameter.

    Valid methods are 'best', 'last' and 'init'.
    '''
    for method in method_order:
        # Load best validating checkpoint, saved in checkpoint file 'best_dev_checkpoint'
        if method == 'best':
            ckpt_path = _checkpoint_path_or_none('best_dev_checkpoint')
            if ckpt_path:
                log_info('Loading best validating checkpoint from {}'.format(ckpt_path))
                return _load_checkpoint(session, ckpt_path)
            log_info('Could not find best validating checkpoint.')

        # Load most recent checkpoint, saved in checkpoint file 'checkpoint'
        elif method == 'last':
            ckpt_path = _checkpoint_path_or_none('checkpoint')
            if ckpt_path:
                log_info('Loading most recent checkpoint from {}'.format(ckpt_path))
                return _load_checkpoint(session, ckpt_path)
            log_info('Could not find most recent checkpoint.')

        # Initialize all variables
        elif method == 'init':
            log_info('Initializing all variables.')
            return _initialize_all_variables(session)

        else:
            log_error('Unknown initialization method: {}'.format(method))
            sys.exit(1)

    log_error('All initialization methods failed ({}).'.format(method_order))
    sys.exit(1)
Beispiel #4
0
def try_loading(session,
                saver,
                checkpoint_filename,
                caption,
                load_step=True,
                log_success=True):
    try:
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir,
                                                   checkpoint_filename)
        if not checkpoint:
            return False
        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)
        if load_step:
            restored_step = session.run(tfv1.train.get_global_step())
            if log_success:
                log_info(
                    'Restored variables from %s checkpoint at %s, step %d' %
                    (caption, checkpoint_path, restored_step))
        elif log_success:
            log_info('Restored variables from %s checkpoint at %s' %
                     (caption, checkpoint_path))
        return True
    except tf.errors.InvalidArgumentError as e:
        log_error(str(e))
        log_error(
            'The checkpoint in {0} does not match the shapes of the model.'
            ' Did you change alphabet.txt or the --n_hidden parameter'
            ' between train runs using the same checkpoint dir? Try moving'
            ' or removing the contents of {0}.'.format(checkpoint_path))
        sys.exit(1)
    def next_job(self, job):
        '''Sends a finished job back to the coordinator and retrieves in exchange the next one.

        Kwargs:
            job (WorkerJob): job that was finished by a worker and who's results are to be
                digested by the coordinator

        Returns:
            WorkerJob. next job of one of the running epochs that will get
                associated with the worker from the finished job and put into state 'running'
        '''
        if self.is_chief:
            # Try to find the epoch the job belongs to
            epoch = next((epoch for epoch in self._epochs_running if epoch.id == job.epoch_id), None)
            if epoch:
                # We are going to manipulate things - let's avoid undefined state
                with self._lock:
                    # Let the epoch finish the job
                    epoch.finish_job(job)
                    # Check, if epoch is done now
                    if epoch.done():
                        # If it declares itself done, move it from 'running' to 'done' collection
                        self._epochs_running.remove(epoch)
                        self._epochs_done.append(epoch)
                        log_info('%s' % epoch)
            else:
                # There was no running epoch found for this job - this should never happen.
                log_error('There is no running epoch of ID %d for job with ID %d. This should never happen.' % (job.epoch_id, job.id))
            return self.get_job(job.worker)

        # We are a remote worker and have to hand over to the chief worker by HTTP
        result = self._talk_to_chief('', data=pickle.dumps(job))
        if result:
            result = pickle.loads(result)
        return result
Beispiel #6
0
def _load_checkpoint(session, checkpoint_path):
    # Load the checkpoint and put all variables into loading list
    # we will exclude variables we do not wish to load and then
    # we will initialize them instead
    ckpt = tfv1.train.load_checkpoint(checkpoint_path)
    load_vars = set(tfv1.global_variables())
    init_vars = set()

    if FLAGS.load_cudnn:
        # Initialize training from a CuDNN RNN checkpoint
        # Identify the variables which we cannot load, and set them
        # for initialization
        for v in load_vars:
            try:
                ckpt.get_tensor(v.op.name)
            except tf.errors.NotFoundError:
                log_error('CUDNN variable not found: %s' % (v.op.name))
                init_vars.add(v)

        load_vars -= init_vars

        # Check that the only missing variables (i.e. those to be initialised)
        # are the Adam moment tensors, if they aren't then we have an issue
        init_var_names = [v.op.name for v in init_vars]
        if any('Adam' not in v for v in init_var_names):
            log_error('Tried to load a CuDNN RNN checkpoint but there were '
                      'more missing variables than just the Adam moment '
                      'tensors. Missing variables: {}'.format(init_var_names))
            sys.exit(1)

    if FLAGS.drop_source_layers > 0:
        # This transfer learning approach requires supplying
        # the layers which we exclude from the source model.
        # Say we want to exclude all layers except for the first one,
        # then we are dropping five layers total, so: drop_source_layers=5
        # If we want to use all layers from the source model except
        # the last one, we use this: drop_source_layers=1
        if FLAGS.drop_source_layers >= 6:
            log_warn(
                'The checkpoint only has 6 layers, but you are trying to drop '
                'all of them or more than all of them. Continuing and '
                'dropping only 5 layers.')
            FLAGS.drop_source_layers = 5

        dropped_layers = ['2', '3', 'lstm', '5',
                          '6'][-1 * int(FLAGS.drop_source_layers):]
        # Initialize all variables needed for DS, but not loaded from ckpt
        for v in load_vars:
            if any(layer in v.op.name for layer in dropped_layers):
                init_vars.add(v)
        load_vars -= init_vars

    for v in sorted(load_vars, key=lambda v: v.op.name):
        log_info('Loading variable from checkpoint: %s' % (v.op.name))
        v.load(ckpt.get_tensor(v.op.name), session=session)

    for v in sorted(init_vars, key=lambda v: v.op.name):
        log_info('Initializing variable: %s' % (v.op.name))
        session.run(v.initializer)
Beispiel #7
0
def package_zip():
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    export_dir = os.path.join(os.path.abspath(FLAGS.export_dir), '') # Force ending '/'
    zip_filename = os.path.dirname(export_dir)

    shutil.copy(FLAGS.scorer_path, export_dir)

    archive = shutil.make_archive(zip_filename, 'zip', export_dir)
    log_info('Exported packaged model {}'.format(archive))
    def _next_epoch(self):
        # State-machine of the coordination process

        # Indicates, if there were 'new' epoch(s) provided
        result = False

        # Make sure that early stop is enabled and validation part is enabled
        if (FLAGS.early_stop is True) and (FLAGS.validation_step > 0) and (len(self._dev_losses) >= FLAGS.earlystop_nsteps):

            # Calculate the mean of losses for past epochs
            mean_loss = np.mean(self._dev_losses[-FLAGS.earlystop_nsteps:-1])
            # Calculate the standard deviation for losses from validation part in the past epochs
            std_loss = np.std(self._dev_losses[-FLAGS.earlystop_nsteps:-1])
            # Update the list of losses incurred
            self._dev_losses = self._dev_losses[-FLAGS.earlystop_nsteps:]
            log_debug('Checking for early stopping (last %d steps) validation loss: %f, with standard deviation: %f and mean: %f' % (FLAGS.earlystop_nsteps, self._dev_losses[-1], std_loss, mean_loss))

            # Check if validation loss has started increasing or is not decreasing substantially, making sure slight fluctuations don't bother the early stopping from working
            if self._dev_losses[-1] > np.max(self._dev_losses[:-1]) or (abs(self._dev_losses[-1] - mean_loss) < FLAGS.estop_mean_thresh and std_loss < FLAGS.estop_std_thresh):
                # Time to early stop
                log_info('Early stop triggered as (for last %d steps) validation loss: %f with standard deviation: %f and mean: %f' % (FLAGS.earlystop_nsteps, self._dev_losses[-1], std_loss, mean_loss))
                self._dev_losses = []
                self._end_training()
                self._train = False

        if self._train:
            # We are in train mode
            if self._num_jobs_train_left > 0:
                # There are still jobs left
                num_jobs_train = min(self._num_jobs_train_left, self._num_jobs_train)
                self._num_jobs_train_left -= num_jobs_train

                # Let's try our best to keep the notion of curriculum learning
                self._reset_counters()

                # Append the training epoch
                self._epochs_running.append(Epoch(self, self._epoch, num_jobs_train, set_name='train'))

                if FLAGS.validation_step > 0 and (FLAGS.validation_step == 1 or self._epoch > 0) and self._epoch % FLAGS.validation_step == 0:
                    # The current epoch should also have a validation part
                    self._epochs_running.append(Epoch(self, self._epoch, self._num_jobs_dev, set_name='dev'))


                # Indicating that there were 'new' epoch(s) provided
                result = True
            else:
                # No jobs left, but still in train mode: concluding training
                self._end_training()
                self._train = False

        if result:
            # Increment the epoch index
            self._epoch += 1
        return result
Beispiel #9
0
def package_zip():
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    export_dir = os.path.join(os.path.abspath(FLAGS.export_dir),
                              '')  # Force ending '/'
    zip_filename = os.path.dirname(export_dir)

    with open(os.path.join(export_dir, 'info.json'), 'w') as f:
        json.dump({
            'name': FLAGS.export_language,
        }, f)

    shutil.copy(FLAGS.scorer_path, export_dir)

    archive = shutil.make_archive(zip_filename, 'zip', export_dir)
    log_info('Exported packaged model {}'.format(archive))
Beispiel #10
0
def package_zip():
    # --export_dir path/to/export/LANG_CODE/ => path/to/export/LANG_CODE.zip
    export_dir = os.path.join(os.path.abspath(FLAGS.export_dir), '') # Force ending '/'
    zip_filename = os.path.dirname(export_dir)

    with open(os.path.join(export_dir, 'info.json'), 'w') as f:
        json.dump({
            'name': FLAGS.export_language,
            'parameters': {
                'beamWidth': FLAGS.beam_width,
                'lmAlpha': FLAGS.lm_alpha,
                'lmBeta': FLAGS.lm_beta
            }
        }, f)

    shutil.copy(FLAGS.lm_binary_path, export_dir)
    shutil.copy(FLAGS.lm_trie_path, export_dir)

    archive = shutil.make_archive(zip_filename, 'zip', export_dir)
    log_info('Exported packaged model {}'.format(archive))
Beispiel #11
0
def train():
    do_cache_dataset = True

    # pylint: disable=too-many-boolean-expressions
    if (FLAGS.data_aug_features_multiplicative > 0 or
            FLAGS.data_aug_features_additive > 0 or
            FLAGS.augmentation_spec_dropout_keeprate < 1 or
            FLAGS.augmentation_freq_and_time_masking or
            FLAGS.augmentation_pitch_and_tempo_scaling or
            FLAGS.augmentation_speed_up_std > 0 or
            FLAGS.augmentation_sparse_warp):
        do_cache_dataset = False

    exception_box = ExceptionBox()

    # Create training and validation datasets
    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               enable_cache=FLAGS.feature_cache and do_cache_dataset,
                               cache_path=FLAGS.feature_cache,
                               train_phase=True,
                               exception_box=exception_box,
                               process_ahead=len(Config.available_devices) * FLAGS.train_batch_size * 2,
                               buffering=FLAGS.read_buffer)

    iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                 tfv1.data.get_output_shapes(train_set),
                                                 output_classes=tfv1.data.get_output_classes(train_set))

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_sources = FLAGS.dev_files.split(',')
        dev_sets = [create_dataset([source],
                                   batch_size=FLAGS.dev_batch_size,
                                   train_phase=False,
                                   exception_box=exception_box,
                                   process_ahead=len(Config.available_devices) * FLAGS.dev_batch_size * 2,
                                   buffering=FLAGS.read_buffer) for source in dev_sources]
        dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

    # Dropout
    dropout_rates = [tfv1.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6)]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {
        rate: 0. for rate in dropout_rates
    }

    # Building the graph
    learning_rate_var = tfv1.get_variable('learning_rate', initializer=FLAGS.learning_rate, trainable=False)
    reduce_learning_rate_op = learning_rate_var.assign(tf.multiply(learning_rate_var, FLAGS.plateau_reduction))
    optimizer = create_optimizer(learning_rate_var)

    # Enable mixed precision training
    if FLAGS.automatic_mixed_precision:
        log_info('Enabling automatic mixed precision training.')
        optimizer = tfv1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)

    gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates)

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tfv1.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)

    # Summaries
    step_summaries_op = tfv1.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
        'dev': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tfv1.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.save_checkpoint_dir, 'train')

    best_dev_saver = tfv1.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.save_checkpoint_dir, 'best_dev')

    # Save flags next to checkpoints
    os.makedirs(FLAGS.save_checkpoint_dir, exist_ok=True)
    flags_file = os.path.join(FLAGS.save_checkpoint_dir, 'flags.txt')
    with open(flags_file, 'w') as fout:
        fout.write(FLAGS.flags_into_string())

    with tfv1.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        # Prevent further graph changes
        tfv1.get_default_graph().finalize()

        # Load checkpoint or initialize variables
        if FLAGS.load == 'auto':
            method_order = ['best', 'last', 'init']
        else:
            method_order = [FLAGS.load]
        load_or_init_graph(session, method_order)

        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data['mean_loss'] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(self, progress, data, **kwargs)

            prefix = 'Epoch {} | {:>10}'.format(epoch, 'Training' if is_train else 'Validation')
            widgets = [' | ', progressbar.widgets.Timer(),
                       ' | Steps: ', progressbar.widgets.Counter(),
                       ' | ', LossWidget()]
            suffix = ' | Dataset: {}'.format(dataset) if dataset else None
            pbar = create_progressbar(prefix=prefix, widgets=widgets, suffix=suffix).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, problem_files, step_summary = \
                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
                                    feed_dict=feed_dict)
                    exception_box.raise_if_set()
                except tf.errors.InvalidArgumentError as err:
                    if FLAGS.augmentation_sparse_warp:
                        log_info("Ignoring sparse warp error: {}".format(err))
                        continue
                    else:
                        raise
                except tf.errors.OutOfRangeError:
                    exception_box.raise_if_set()
                    break

                if problem_files.size > 0:
                    problem_files = [f.decode('utf8') for f in problem_files[..., 0]]
                    log_error('The following files caused an infinite (or NaN) '
                              'loss: {}'.format(','.join(problem_files)))

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session, checkpoint_path, global_step=current_step)
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count

        log_info('STARTING Optimization')
        train_start_time = datetime.utcnow()
        best_dev_loss = float('inf')
        dev_losses = []
        epochs_without_improvement = 0
        try:
            for epoch in range(FLAGS.epochs):
                # Training
                log_progress('Training epoch %d...' % epoch)
                train_loss, _ = run_set('train', epoch, train_init_op)
                log_progress('Finished training epoch %d - loss: %f' % (epoch, train_loss))
                checkpoint_saver.save(session, checkpoint_path, global_step=global_step)

                if FLAGS.dev_files:
                    # Validation
                    dev_loss = 0.0
                    total_steps = 0
                    for source, init_op in zip(dev_sources, dev_init_ops):
                        log_progress('Validating epoch %d on %s...' % (epoch, source))
                        set_loss, steps = run_set('dev', epoch, init_op, dataset=source)
                        dev_loss += set_loss * steps
                        total_steps += steps
                        log_progress('Finished validating epoch %d on %s - loss: %f' % (epoch, source, set_loss))

                    dev_loss = dev_loss / total_steps
                    dev_losses.append(dev_loss)

                    # Count epochs without an improvement for early stopping and reduction of learning rate on a plateau
                    # the improvement has to be greater than FLAGS.es_min_delta
                    if dev_loss > best_dev_loss - FLAGS.es_min_delta:
                        epochs_without_improvement += 1
                    else:
                        epochs_without_improvement = 0

                    # Save new best model
                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(session, best_dev_path, global_step=global_step, latest_filename='best_dev_checkpoint')
                        log_info("Saved new best validating model with loss %f to: %s" % (best_dev_loss, save_path))

                    # Early stopping
                    if FLAGS.early_stop and epochs_without_improvement == FLAGS.es_epochs:
                        log_info('Early stop triggered as the loss did not improve the last {} epochs'.format(
                            epochs_without_improvement))
                        break

                    # Reduce learning rate on plateau
                    if (FLAGS.reduce_lr_on_plateau and
                            epochs_without_improvement % FLAGS.plateau_epochs == 0 and epochs_without_improvement > 0):
                        # If the learning rate was reduced and there is still no improvement
                        # wait FLAGS.plateau_epochs before the learning rate is reduced again
                        session.run(reduce_learning_rate_op)
                        current_learning_rate = learning_rate_var.eval()
                        log_info('Encountered a plateau, reducing learning rate to {}'.format(
                            current_learning_rate))

        except KeyboardInterrupt:
            pass
        log_info('FINISHED optimization in {}'.format(datetime.utcnow() - train_start_time))
    log_debug('Session closed.')
 def _end_training(self):
     self._training_time = stopwatch(self._training_time)
     log_info('FINISHED Optimization - training time: %s' % format_duration(self._training_time))
    def start_coordination(self, model_feeder, step=0):
        '''Starts to coordinate epochs and jobs among workers on base of
        data-set sizes, the (global) step and FLAGS parameters.

        Args:
            model_feeder (ModelFeeder): data-sets to be used for coordinated training

        Kwargs:
            step (int): global step of a loaded model to determine starting point
        '''
        with self._lock:
            self._init()

            # Number of GPUs per worker - fixed for now by local reality or cluster setup
            gpus_per_worker = len(Config.available_devices)

            # Number of batches processed per job per worker
            batches_per_job  = gpus_per_worker * max(1, FLAGS.iters_per_worker)

            # Number of batches per global step
            batches_per_step = gpus_per_worker * max(1, FLAGS.replicas_to_agg)

            # Number of global steps per epoch - to be at least 1
            steps_per_epoch = max(1, model_feeder.train.total_batches // batches_per_step)

            # The start epoch of our training
            self._epoch = step // steps_per_epoch

            # Number of additional 'jobs' trained already 'on top of' our start epoch
            jobs_trained = (step % steps_per_epoch) * batches_per_step // batches_per_job

            # Total number of train/dev jobs covering their respective whole sets (one epoch)
            self._num_jobs_train = max(1, model_feeder.train.total_batches // batches_per_job)
            self._num_jobs_dev   = max(1, model_feeder.dev.total_batches   // batches_per_job)

            if FLAGS.epoch < 0:
                # A negative epoch means to add its absolute number to the epochs already computed
                self._target_epoch = self._epoch + abs(FLAGS.epoch)
            else:
                self._target_epoch = FLAGS.epoch

            # State variables
            # We only have to train, if we are told so and are not at the target epoch yet
            self._train = FLAGS.train and self._target_epoch > self._epoch

            if self._train:
                # The total number of jobs for all additional epochs to be trained
                # Will be decremented for each job that is produced/put into state 'open'
                self._num_jobs_train_left = (self._target_epoch - self._epoch) * self._num_jobs_train - jobs_trained
                log_info('STARTING Optimization')
                self._training_time = stopwatch()

            # Important for debugging
            log_debug('step: %d' % step)
            log_debug('epoch: %d' % self._epoch)
            log_debug('target epoch: %d' % self._target_epoch)
            log_debug('steps per epoch: %d' % steps_per_epoch)
            log_debug('number of batches in train set: %d' % model_feeder.train.total_batches)
            log_debug('batches per job: %d' % batches_per_job)
            log_debug('batches per step: %d' % batches_per_step)
            log_debug('number of jobs in train set: %d' % self._num_jobs_train)
            log_debug('number of jobs already trained in first epoch: %d' % jobs_trained)

            self._next_epoch()

        # The coordinator is ready to serve
        self.started = True
Beispiel #14
0
def train():
    # Create training and validation datasets
    train_set, train_batches = create_dataset(
        FLAGS.train_files.split(','),
        batch_size=FLAGS.train_batch_size,
        cache_path=FLAGS.train_cached_features_path)

    iterator = tf.data.Iterator.from_structure(
        train_set.output_types,
        train_set.output_shapes,
        output_classes=train_set.output_classes)

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_set, dev_batches = create_dataset(
            FLAGS.dev_files.split(','),
            batch_size=FLAGS.dev_batch_size,
            cache_path=FLAGS.dev_cached_features_path)
        dev_init_op = iterator.make_initializer(dev_set)

    # Dropout
    dropout_rates = [
        tf.placeholder(tf.float32, name='dropout_{}'.format(i))
        for i in range(6)
    ]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {rate: 0. for rate in dropout_rates}

    # Building the graph
    optimizer = create_optimizer()
    gradients, loss = get_tower_results(iterator, optimizer, dropout_rates)
    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)
    # global_step is automagically incremented by the optimizer
    global_step = tf.Variable(0, trainable=False, name='global_step')
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients,
                                                  global_step=global_step)

    # Summaries
    step_summaries_op = tf.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train':
        tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'),
                              max_queue=120),
        'dev':
        tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'),
                              max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')
    checkpoint_filename = 'checkpoint'

    best_dev_saver = tf.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')
    best_dev_filename = 'best_dev_checkpoint'

    initializer = tf.global_variables_initializer()

    with tf.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        tf.get_default_graph().finalize()

        # Loading or initializing
        loaded = False
        if FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, checkpoint_saver,
                                 checkpoint_filename, 'most recent epoch')
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, best_dev_saver, best_dev_filename,
                                 'best validation')
        if not loaded:
            if FLAGS.load in ['auto', 'init']:
                log_info('Initializing...')
                session.run(initializer)
            else:
                log_error(
                    'Unable to load %s model from specified checkpoint dir'
                    ' - consider using load option "auto" or "init".' %
                    FLAGS.load)
                sys.exit(1)

        # Retrieving global_step from restored model and setting training parameters accordingly
        step = session.run(global_step)
        num_gpus = len(Config.available_devices)
        steps_per_epoch = max(1, train_batches // num_gpus)
        current_epoch = step // steps_per_epoch
        target_epoch = current_epoch + abs(
            FLAGS.epoch) if FLAGS.epoch < 0 else FLAGS.epoch

        log_debug('step: %d' % step)
        log_debug('epoch: %d' % current_epoch)
        log_debug('target epoch: %d' % target_epoch)
        log_debug('steps per epoch: %d' % steps_per_epoch)
        log_debug('batches per step (GPUs): %d' % num_gpus)
        log_debug('number of batches in train set: %d' % train_batches)

        def run_set(set_name, init_op, num_batches):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict
            total_loss = 0.0
            step_summary_writer = step_summary_writers.get(set_name)
            num_steps = max(1, num_batches // num_gpus)
            checkpoint_time = time.time()

            if FLAGS.show_progressbar:
                pbar = progressbar.ProgressBar(max_value=num_steps,
                                               redirect_stdout=True).start()
            else:
                pbar = lambda i: i

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            for step_index in pbar(range(num_steps)):
                if coord.should_stop():
                    break

                _, current_step, batch_loss, step_summary = \
                    session.run([train_op, global_step, loss, step_summaries_op],
                                feed_dict=feed_dict)
                total_loss += batch_loss
                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time(
                ) - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=current_step)
                    checkpoint_time = time.time()

            return total_loss / num_steps

        if target_epoch > current_epoch:
            log_info('STARTING Optimization')
            best_dev_loss = float('inf')
            dev_losses = []
            coord = tf.train.Coordinator()
            with coord.stop_on_exception():
                for current_epoch in range(current_epoch, target_epoch):
                    if coord.should_stop():
                        break

                    # Training
                    log_info('Training epoch %d ...' % current_epoch)
                    train_loss = run_set('train', train_init_op, train_batches)
                    log_info('Finished training epoch %d - loss: %f' %
                             (current_epoch, train_loss))
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=global_step)

                    if FLAGS.dev_files:
                        # Validation
                        log_info('Validating epoch %d ...' % current_epoch)
                        dev_loss = run_set('dev', dev_init_op, dev_batches)
                        dev_losses.append(dev_loss)
                        log_info('Finished validating epoch %d - loss: %f' %
                                 (current_epoch, dev_loss))

                        if dev_loss < best_dev_loss:
                            best_dev_loss = dev_loss
                            save_path = best_dev_saver.save(
                                session,
                                best_dev_path,
                                latest_filename=best_dev_filename)
                            log_info(
                                "Saved new best validating model with loss %f to: %s"
                                % (best_dev_loss, save_path))

                        # Early stopping
                        if FLAGS.early_stop and len(
                                dev_losses) >= FLAGS.es_steps:
                            mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                            std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                            dev_losses = dev_losses[-FLAGS.es_steps:]
                            log_debug(
                                'Checking for early stopping (last %d steps) validation loss: '
                                '%f, with standard deviation: %f and mean: %f'
                                % (FLAGS.es_steps, dev_losses[-1], std_loss,
                                   mean_loss))
                            if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                               (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                                log_info(
                                    'Early stop triggered as (for last %d steps) validation loss:'
                                    ' %f with standard deviation: %f and mean: %f'
                                    % (FLAGS.es_steps, dev_losses[-1],
                                       std_loss, mean_loss))
                                break
                coord.request_stop()
        else:
            log_info('Target epoch already reached - skipped training.')
    log_debug('Session closed.')
    def update_progressbar(set_name):
        if not hasattr(update_progressbar, 'current_set_name'):
            update_progressbar.current_set_name = None

        if (update_progressbar.current_set_name != set_name
                or update_progressbar.current_job_index
                == update_progressbar.total_jobs):

            # finish prev pbar if it exists
            if hasattr(update_progressbar, 'pbar') and update_progressbar.pbar:
                update_progressbar.pbar.finish()

            update_progressbar.total_jobs = None
            update_progressbar.current_job_index = 0

            current_epoch = coord._epoch - 1
            sufix = "graph_noisySVA_CV_2layers_"
            checkpoint_stash = "/docker_files/ckpt_stash/"
            checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            checkpoint_path = checkpoint.model_checkpoint_path
            ckpt_dest_name = sufix + str(current_epoch - 118) + "_eph"
            str_to_replace = "s/" + checkpoint_path.split(
                '/')[-1] + "/" + ckpt_dest_name + "/"

            subprocess.Popen(
                ["cp", checkpoint_path + ".meta", checkpoint_stash])
            #pdb.set_trace()
            subprocess.Popen([
                "rename", str_to_replace,
                checkpoint_stash + checkpoint_path.split('/')[-1] + ".meta"
            ])

            subprocess.Popen([
                "cp", checkpoint_path + ".data-00000-of-00001",
                checkpoint_stash
            ])
            subprocess.Popen([
                "rename", str_to_replace, checkpoint_stash +
                checkpoint_path.split('/')[-1] + ".data-00000-of-00001"
            ])

            subprocess.Popen(
                ["cp", checkpoint_path + ".index", checkpoint_stash])
            subprocess.Popen([
                "rename", str_to_replace,
                checkpoint_stash + checkpoint_path.split('/')[-1] + ".index"
            ])

            #HERE

            if set_name == "train":
                log_info('Training epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_train
            else:
                log_info('Validating epoch %i...' % current_epoch)
                update_progressbar.total_jobs = coord._num_jobs_dev

            # recreate pbar
            update_progressbar.pbar = progressbar.ProgressBar(
                max_value=update_progressbar.total_jobs,
                redirect_stdout=True).start()

            update_progressbar.current_set_name = set_name

        if update_progressbar.pbar:
            update_progressbar.pbar.update(
                update_progressbar.current_job_index + 1, force=True)

        update_progressbar.current_job_index += 1
Beispiel #16
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    with tf.device('/cpu:0'):
        from tensorflow.python.framework.ops import Tensor, Operation

        tf.reset_default_graph()
        session = tf.Session(config=Config.session_config)

        inputs, outputs, _ = create_inference_graph(batch_size=1,
                                                    n_steps=FLAGS.n_steps)
        input_names = ",".join(tensor.op.name for tensor in inputs.values())
        output_names_tensors = [
            tensor.op.name for tensor in outputs.values()
            if isinstance(tensor, Tensor)
        ]
        output_names_ops = [
            tensor.name for tensor in outputs.values()
            if isinstance(tensor, Operation)
        ]
        output_names = ",".join(output_names_tensors + output_names_ops)
        input_shapes = ":".join(",".join(map(str, tensor.shape))
                                for tensor in inputs.values())

        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }

        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        checkpoint_path = checkpoint.model_checkpoint_path

        output_filename = 'output_graph.pb'
        try:
            output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

            if not os.path.isdir(FLAGS.export_dir):
                os.makedirs(FLAGS.export_dir)

            def do_graph_freeze(output_file=None,
                                output_node_names=None,
                                variables_blacklist=None):
                freeze_graph.freeze_graph_with_def_protos(
                    input_graph_def=session.graph_def,
                    input_saver_def=saver.as_saver_def(),
                    input_checkpoint=checkpoint_path,
                    output_node_names=output_node_names,
                    restore_op_name=None,
                    filename_tensor_name=None,
                    output_graph=output_file,
                    clear_devices=False,
                    variable_names_blacklist=variables_blacklist,
                    initializer_nodes='')

            do_graph_freeze(
                output_file=output_graph_path,
                output_node_names=output_names,
                variables_blacklist='previous_state_c,previous_state_h')
            log_info('Models exported at %s' % (FLAGS.export_dir))
        except RuntimeError as e:
            log_error(str(e))
Beispiel #17
0
        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data['mean_loss'] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(self, progress, data, **kwargs)

            prefix = 'Epoch {} | {:>10}'.format(epoch, 'Training' if is_train else 'Validation')
            widgets = [' | ', progressbar.widgets.Timer(),
                       ' | Steps: ', progressbar.widgets.Counter(),
                       ' | ', LossWidget()]
            suffix = ' | Dataset: {}'.format(dataset) if dataset else None
            pbar = create_progressbar(prefix=prefix, widgets=widgets, suffix=suffix).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, problem_files, step_summary = \
                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
                                    feed_dict=feed_dict)
                except tf.errors.InvalidArgumentError as err:
                    if FLAGS.augmentation_sparse_warp:
                        log_info("Ignoring sparse warp error: {}".format(err))
                        continue
                    else:
                        raise
                except tf.errors.OutOfRangeError:
                    break

                if problem_files.size > 0:
                    problem_files = [f.decode('utf8') for f in problem_files[..., 0]]
                    log_error('The following files caused an infinite (or NaN) '
                              'loss: {}'.format(','.join(problem_files)))

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session, checkpoint_path, global_step=current_step)
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count
Beispiel #18
0
def train():
    # Create training and validation datasets
    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               cache_path=FLAGS.feature_cache)

    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(train_set),
        tfv1.data.get_output_shapes(train_set),
        output_classes=tfv1.data.get_output_classes(train_set))

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_csvs = FLAGS.dev_files.split(',')
        dev_sets = [
            create_dataset([csv], batch_size=FLAGS.dev_batch_size)
            for csv in dev_csvs
        ]
        dev_init_ops = [
            iterator.make_initializer(dev_set) for dev_set in dev_sets
        ]

    # Dropout
    dropout_rates = [
        tfv1.placeholder(tf.float32, name='dropout_{}'.format(i))
        for i in range(6)
    ]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {rate: 0. for rate in dropout_rates}

    # Building the graph
    optimizer = create_optimizer()
    gradients, loss = get_tower_results(iterator, optimizer, dropout_rates)

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tfv1.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients,
                                                  global_step=global_step)

    # Summaries
    step_summaries_op = tfv1.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train':
        tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'),
                                max_queue=120),
        'dev':
        tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'),
                                max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tfv1.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')
    checkpoint_filename = 'checkpoint'

    best_dev_saver = tfv1.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')
    best_dev_filename = 'best_dev_checkpoint'

    initializer = tfv1.global_variables_initializer()

    with tfv1.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        # Loading or initializing
        loaded = False

        # Initialize training from a CuDNN RNN checkpoint
        if FLAGS.cudnn_checkpoint:
            if FLAGS.use_cudnn_rnn:
                log_error(
                    'Trying to use --cudnn_checkpoint but --use_cudnn_rnn '
                    'was specified. The --cudnn_checkpoint flag is only '
                    'needed when converting a CuDNN RNN checkpoint to '
                    'a CPU-capable graph. If your system is capable of '
                    'using CuDNN RNN, you can just specify the CuDNN RNN '
                    'checkpoint normally with --checkpoint_dir.')
                exit(1)

            log_info('Converting CuDNN RNN checkpoint from {}'.format(
                FLAGS.cudnn_checkpoint))
            ckpt = tfv1.train.load_checkpoint(FLAGS.cudnn_checkpoint)
            missing_variables = []

            # Load compatible variables from checkpoint
            for v in tfv1.global_variables():
                try:
                    v.load(ckpt.get_tensor(v.op.name), session=session)
                except tf.errors.NotFoundError:
                    missing_variables.append(v)

            # Check that the only missing variables are the Adam moment tensors
            if any('Adam' not in v.op.name for v in missing_variables):
                log_error(
                    'Tried to load a CuDNN RNN checkpoint but there were '
                    'more missing variables than just the Adam moment '
                    'tensors.')
                exit(1)

            # Initialize Adam moment tensors from scratch to allow use of CuDNN
            # RNN checkpoints.
            log_info('Initializing missing Adam moment tensors.')
            init_op = tfv1.variables_initializer(missing_variables)
            session.run(init_op)
            loaded = True

        tfv1.get_default_graph().finalize()

        if not loaded and FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, checkpoint_saver,
                                 checkpoint_filename, 'most recent')
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, best_dev_saver, best_dev_filename,
                                 'best validation')
        if not loaded:
            if FLAGS.load in ['auto', 'init']:
                log_info('Initializing variables...')
                session.run(initializer)
            else:
                log_error(
                    'Unable to load %s model from specified checkpoint dir'
                    ' - consider using load option "auto" or "init".' %
                    FLAGS.load)
                sys.exit(1)

        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(
                        self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data[
                        'mean_loss'] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(
                        self, progress, data, **kwargs)

            prefix = 'Epoch {} | {:>10}'.format(
                epoch, 'Training' if is_train else 'Validation')
            widgets = [
                ' | ',
                progressbar.widgets.Timer(), ' | Steps: ',
                progressbar.widgets.Counter(), ' | ',
                LossWidget()
            ]
            suffix = ' | Dataset: {}'.format(dataset) if dataset else None
            pbar = create_progressbar(prefix=prefix,
                                      widgets=widgets,
                                      suffix=suffix).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, step_summary = \
                        session.run([train_op, global_step, loss, step_summaries_op],
                                    feed_dict=feed_dict)
                except tf.errors.OutOfRangeError:
                    break

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time(
                ) - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=current_step)
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count

        log_info('STARTING Optimization')
        train_start_time = datetime.utcnow()
        best_dev_loss = float('inf')
        dev_losses = []
        try:
            for epoch in range(FLAGS.epochs):
                # Training
                log_progress('Training epoch %d...' % epoch)
                train_loss, _ = run_set('train', epoch, train_init_op)
                log_progress('Finished training epoch %d - loss: %f' %
                             (epoch, train_loss))
                checkpoint_saver.save(session,
                                      checkpoint_path,
                                      global_step=global_step)

                if FLAGS.dev_files:
                    # Validation
                    dev_loss = 0.0
                    total_steps = 0
                    for csv, init_op in zip(dev_csvs, dev_init_ops):
                        log_progress('Validating epoch %d on %s...' %
                                     (epoch, csv))
                        set_loss, steps = run_set('dev',
                                                  epoch,
                                                  init_op,
                                                  dataset=csv)
                        dev_loss += set_loss * steps
                        total_steps += steps
                        log_progress(
                            'Finished validating epoch %d on %s - loss: %f' %
                            (epoch, csv, set_loss))
                    dev_loss = dev_loss / total_steps

                    dev_losses.append(dev_loss)

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(
                            session,
                            best_dev_path,
                            global_step=global_step,
                            latest_filename=best_dev_filename)
                        log_info(
                            "Saved new best validating model with loss %f to: %s"
                            % (best_dev_loss, save_path))

                    # Early stopping
                    if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps:
                        mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                        std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                        dev_losses = dev_losses[-FLAGS.es_steps:]
                        log_debug(
                            'Checking for early stopping (last %d steps) validation loss: '
                            '%f, with standard deviation: %f and mean: %f' %
                            (FLAGS.es_steps, dev_losses[-1], std_loss,
                             mean_loss))
                        if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                           (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                            log_info(
                                'Early stop triggered as (for last %d steps) validation loss:'
                                ' %f with standard deviation: %f and mean: %f'
                                % (FLAGS.es_steps, dev_losses[-1], std_loss,
                                   mean_loss))
                            break
        except KeyboardInterrupt:
            pass
        log_info('FINISHED optimization in {}'.format(datetime.utcnow() -
                                                      train_start_time))
    log_debug('Session closed.')
Beispiel #19
0
def train():
    r'''
    Trains the network on a given server of a cluster.
    If no server provided, it performs single process training.
    '''

    # Reading training set
    train_index = SampleIndex()

    train_data = preprocess(FLAGS.train_files.split(','),
                            FLAGS.train_batch_size,
                            Config.n_input,
                            Config.n_context,
                            Config.alphabet,
                            hdf5_cache_path=FLAGS.train_cached_features_path)

    train_set = DataSet(train_data,
                        FLAGS.train_batch_size,
                        limit=FLAGS.limit_train,
                        next_index=train_index.inc)

    # Reading validation set
    dev_index = SampleIndex()

    dev_data = preprocess(FLAGS.dev_files.split(','),
                          FLAGS.dev_batch_size,
                          Config.n_input,
                          Config.n_context,
                          Config.alphabet,
                          hdf5_cache_path=FLAGS.dev_cached_features_path)

    dev_set = DataSet(dev_data,
                      FLAGS.dev_batch_size,
                      limit=FLAGS.limit_dev,
                      next_index=dev_index.inc)

    # Combining all sets to a multi set model feeder
    model_feeder = ModelFeeder(train_set,
                               dev_set,
                               Config.n_input,
                               Config.n_context,
                               Config.alphabet,
                               tower_feeder_count=len(
                                   Config.available_devices))

    # Dropout
    dropout_rates = [
        tf.placeholder(tf.float32, name='dropout_{}'.format(i))
        for i in range(6)
    ]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {
        dropout_rates[0]: 0.,
        dropout_rates[1]: 0.,
        dropout_rates[2]: 0.,
        dropout_rates[3]: 0.,
        dropout_rates[4]: 0.,
        dropout_rates[5]: 0.,
    }

    # Building the graph
    optimizer = create_optimizer()
    gradients, loss = get_tower_results(model_feeder, optimizer, dropout_rates)
    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)
    # global_step is automagically incremented by the optimizer
    global_step = tf.Variable(0, trainable=False, name='global_step')
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients,
                                                  global_step=global_step)

    # Summaries
    step_summaries_op = tf.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train':
        tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'),
                              max_queue=120),
        'dev':
        tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'),
                              max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')
    checkpoint_filename = 'checkpoint'

    best_dev_saver = tf.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')
    best_dev_filename = 'best_dev_checkpoint'

    initializer = tf.global_variables_initializer()

    with tf.Session(config=Config.session_config) as session:
        log_debug('Session opened.')
        tf.get_default_graph().finalize()

        # Loading or initializing
        loaded = False
        if FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, checkpoint_saver,
                                 checkpoint_filename, 'most recent epoch')
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, best_dev_saver, best_dev_filename,
                                 'best validation')
        if not loaded:
            if FLAGS.load in ['auto', 'init']:
                log_info('Initializing...')
                session.run(initializer)
            else:
                log_error(
                    'Unable to load %s model from specified checkpoint dir'
                    ' - consider using load option "auto" or "init".' %
                    FLAGS.load)
                sys.exit(1)

        # Retrieving global_step from restored model and setting training parameters accordingly
        model_feeder.set_data_set(no_dropout_feed_dict, train_set)
        step = session.run(global_step, feed_dict=no_dropout_feed_dict)
        num_gpus = len(Config.available_devices)
        steps_per_epoch = max(1, train_set.total_batches // num_gpus)
        steps_trained = step % steps_per_epoch
        current_epoch = step // steps_per_epoch
        target_epoch = current_epoch + abs(
            FLAGS.epoch) if FLAGS.epoch < 0 else FLAGS.epoch
        train_index.index = steps_trained * num_gpus

        log_debug('step: %d' % step)
        log_debug('epoch: %d' % current_epoch)
        log_debug('target epoch: %d' % target_epoch)
        log_debug('steps per epoch: %d' % steps_per_epoch)
        log_debug('batches per step (GPUs): %d' % num_gpus)
        log_debug('number of batches in train set: %d' %
                  train_set.total_batches)
        log_debug('number of batches already trained in epoch: %d' %
                  train_index.index)

        def run_set(set_name):
            data_set = getattr(model_feeder, set_name)
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict
            model_feeder.set_data_set(feed_dict, data_set)
            total_loss = 0.0
            step_summary_writer = step_summary_writers.get(set_name)
            num_steps = max(1, data_set.total_batches // num_gpus)
            checkpoint_time = time.time()
            if FLAGS.show_progressbar:
                pbar = progressbar.ProgressBar(max_value=num_steps,
                                               redirect_stdout=True).start()
            # Batch loop
            for step_index in range(steps_trained, num_steps):
                if coord.should_stop():
                    break
                _, current_step, batch_loss, step_summary = \
                    session.run([train_op, global_step, loss, step_summaries_op],
                                feed_dict=feed_dict)
                total_loss += batch_loss
                step_summary_writer.add_summary(step_summary, current_step)
                if FLAGS.show_progressbar:
                    pbar.update(step_index + 1, force=True)
                if is_train and FLAGS.checkpoint_secs > 0 and time.time(
                ) - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=current_step)
                    checkpoint_time = time.time()
            if FLAGS.show_progressbar:
                pbar.finish()
            return total_loss / num_steps

        if target_epoch > current_epoch:
            log_info('STARTING Optimization')
            best_dev_loss = float('inf')
            dev_losses = []
            coord = tf.train.Coordinator()
            with coord.stop_on_exception():
                log_debug('Starting queue runners...')
                model_feeder.start_queue_threads(session, coord=coord)
                log_debug('Queue runners started.')
                # Epoch loop
                for current_epoch in range(current_epoch, target_epoch):
                    # Training
                    if coord.should_stop():
                        break
                    log_info('Training epoch %d ...' % current_epoch)
                    train_loss = run_set('train')
                    log_info('Finished training epoch %d - loss: %f' %
                             (current_epoch, train_loss))
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=global_step)
                    steps_trained = 0
                    # Validation
                    log_info('Validating epoch %d ...' % current_epoch)
                    dev_loss = run_set('dev')
                    dev_losses.append(dev_loss)
                    log_info('Finished validating epoch %d - loss: %f' %
                             (current_epoch, dev_loss))
                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(
                            session,
                            best_dev_path,
                            latest_filename=best_dev_filename)
                        log_info(
                            "Saved new best validating model with loss %f to: %s"
                            % (best_dev_loss, save_path))
                    # Early stopping
                    if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps:
                        mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                        std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                        dev_losses = dev_losses[-FLAGS.es_steps:]
                        log_debug(
                            'Checking for early stopping (last %d steps) validation loss: '
                            '%f, with standard deviation: %f and mean: %f' %
                            (FLAGS.es_steps, dev_losses[-1], std_loss,
                             mean_loss))
                        if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                           (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                            log_info(
                                'Early stop triggered as (for last %d steps) validation loss:'
                                ' %f with standard deviation: %f and mean: %f'
                                % (FLAGS.es_steps, dev_losses[-1], std_loss,
                                   mean_loss))
                            break
                log_debug('Closing queues...')
                coord.request_stop()
                model_feeder.close_queues(session)
                log_debug('Queues closed.')
        else:
            log_info('Target epoch already reached - skipped training.')
    log_debug('Session closed.')
Beispiel #20
0
def transcribe_one(src_path, dst_path):
    transcribe_file(src_path, dst_path)
    log_info('Transcribed file "{}" to "{}"'.format(src_path, dst_path))
Beispiel #21
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    with tf.device('/cpu:0'):
        from tensorflow.python.framework.ops import Tensor, Operation

        tf.reset_default_graph()
        session = tf.Session(config=Config.session_config)

        inputs, outputs, _ = create_inference_graph(
            batch_size=FLAGS.export_batch_size,
            n_steps=FLAGS.n_steps,
            tflite=FLAGS.export_tflite)
        input_names = ",".join(tensor.op.name for tensor in inputs.values())
        output_names_tensors = [
            tensor.op.name for tensor in outputs.values()
            if isinstance(tensor, Tensor)
        ]
        output_names_ops = [
            tensor.name for tensor in outputs.values()
            if isinstance(tensor, Operation)
        ]
        output_names = ",".join(output_names_tensors + output_names_ops)
        input_shapes = ":".join(",".join(map(str, tensor.shape))
                                for tensor in inputs.values())

        if not FLAGS.export_tflite:
            mapping = {
                v.op.name: v
                for v in tf.global_variables()
                if not v.op.name.startswith('previous_state_')
            }
        else:
            # Create a saver using variables from the above newly created graph
            def fixup(name):
                if name.startswith('rnn/lstm_cell/'):
                    return name.replace('rnn/lstm_cell/', 'lstm_fused_cell/')
                return name

            mapping = {fixup(v.op.name): v for v in tf.global_variables()}

        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        checkpoint_path = checkpoint.model_checkpoint_path

        output_filename = 'output_graph.pb'
        if FLAGS.remove_export:
            if os.path.isdir(FLAGS.export_dir):
                log_info('Removing old export')
                shutil.rmtree(FLAGS.export_dir)
        try:
            output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

            if not os.path.isdir(FLAGS.export_dir):
                os.makedirs(FLAGS.export_dir)

            def do_graph_freeze(output_file=None,
                                output_node_names=None,
                                variables_blacklist=None):
                return freeze_graph.freeze_graph_with_def_protos(
                    input_graph_def=session.graph_def,
                    input_saver_def=saver.as_saver_def(),
                    input_checkpoint=checkpoint_path,
                    output_node_names=output_node_names,
                    restore_op_name=None,
                    filename_tensor_name=None,
                    output_graph=output_file,
                    clear_devices=False,
                    variable_names_blacklist=variables_blacklist,
                    initializer_nodes='')

            if not FLAGS.export_tflite:
                do_graph_freeze(
                    output_file=output_graph_path,
                    output_node_names=output_names,
                    variables_blacklist='previous_state_c,previous_state_h')
            else:
                frozen_graph = do_graph_freeze(output_node_names=output_names,
                                               variables_blacklist='')
                output_tflite_path = os.path.join(
                    FLAGS.export_dir,
                    output_filename.replace('.pb', '.tflite'))

                converter = tf.lite.TFLiteConverter(
                    frozen_graph,
                    input_tensors=inputs.values(),
                    output_tensors=outputs.values())
                converter.post_training_quantize = True
                tflite_model = converter.convert()

                with open(output_tflite_path, 'wb') as fout:
                    fout.write(tflite_model)

                log_info('Exported model for TF Lite engine as {}'.format(
                    os.path.basename(output_tflite_path)))

            log_info('Models exported at %s' % (FLAGS.export_dir))
        except RuntimeError as e:
            log_error(str(e))
Beispiel #22
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    from tensorflow.python.framework.ops import Tensor, Operation

    inputs, outputs, _ = create_inference_graph(
        batch_size=FLAGS.export_batch_size,
        n_steps=FLAGS.n_steps,
        tflite=FLAGS.export_tflite)

    graph_version = int(file_relative_read('GRAPH_VERSION').strip())
    assert graph_version > 0

    outputs['metadata_version'] = tf.constant([graph_version],
                                              name='metadata_version')
    outputs['metadata_sample_rate'] = tf.constant([FLAGS.audio_sample_rate],
                                                  name='metadata_sample_rate')
    outputs['metadata_feature_win_len'] = tf.constant(
        [FLAGS.feature_win_len], name='metadata_feature_win_len')
    outputs['metadata_feature_win_step'] = tf.constant(
        [FLAGS.feature_win_step], name='metadata_feature_win_step')
    outputs['metadata_beam_width'] = tf.constant([FLAGS.export_beam_width],
                                                 name='metadata_beam_width')
    outputs['metadata_alphabet'] = tf.constant([Config.alphabet.serialize()],
                                               name='metadata_alphabet')

    if FLAGS.export_language:
        outputs['metadata_language'] = tf.constant(
            [FLAGS.export_language.encode('utf-8')], name='metadata_language')

    # Prevent further graph changes
    tfv1.get_default_graph().finalize()

    output_names_tensors = [
        tensor.op.name for tensor in outputs.values()
        if isinstance(tensor, Tensor)
    ]
    output_names_ops = [
        op.name for op in outputs.values() if isinstance(op, Operation)
    ]
    output_names = output_names_tensors + output_names_ops

    with tf.Session() as session:
        # Restore variables from checkpoint
        if FLAGS.load == 'auto':
            method_order = ['best', 'last']
        else:
            method_order = [FLAGS.load]
        load_or_init_graph(session, method_order)

        output_filename = FLAGS.export_name + '.pb'
        if FLAGS.remove_export:
            if os.path.isdir(FLAGS.export_dir):
                log_info('Removing old export')
                shutil.rmtree(FLAGS.export_dir)

        output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

        if not os.path.isdir(FLAGS.export_dir):
            os.makedirs(FLAGS.export_dir)

        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names)

        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph, dest_nodes=output_names)

        if not FLAGS.export_tflite:
            with open(output_graph_path, 'wb') as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            output_tflite_path = os.path.join(
                FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))

            converter = tf.lite.TFLiteConverter(
                frozen_graph,
                input_tensors=inputs.values(),
                output_tensors=outputs.values())
            converter.optimizations = [tf.lite.Optimize.DEFAULT]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open(output_tflite_path, 'wb') as fout:
                fout.write(tflite_model)

        log_info('Models exported at %s' % (FLAGS.export_dir))
Beispiel #23
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    with tf.device('/cpu:0'):
        from tensorflow.python.framework.ops import Tensor, Operation

        tf.reset_default_graph()
        session = tf.Session(config=Config.session_config)

        inputs, outputs, _ = create_inference_graph(batch_size=FLAGS.export_batch_size, n_steps=FLAGS.n_steps, tflite=FLAGS.export_tflite)
        input_names = ",".join(tensor.op.name for tensor in inputs.values())
        output_names_tensors = [ tensor.op.name for tensor in outputs.values() if isinstance(tensor, Tensor) ]
        output_names_ops = [ tensor.name for tensor in outputs.values() if isinstance(tensor, Operation) ]
        output_names = ",".join(output_names_tensors + output_names_ops)
        input_shapes = ":".join(",".join(map(str, tensor.shape)) for tensor in inputs.values())

        if not FLAGS.export_tflite:
            mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        else:
            # Create a saver using variables from the above newly created graph
            def fixup(name):
                if name.startswith('rnn/lstm_cell/'):
                    return name.replace('rnn/lstm_cell/', 'lstm_fused_cell/')
                return name

            mapping = {fixup(v.op.name): v for v in tf.global_variables()}

        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        checkpoint_path = checkpoint.model_checkpoint_path

        output_filename = 'output_graph.pb'
        if FLAGS.remove_export:
            if os.path.isdir(FLAGS.export_dir):
                log_info('Removing old export')
                shutil.rmtree(FLAGS.export_dir)
        try:
            output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

            if not os.path.isdir(FLAGS.export_dir):
                os.makedirs(FLAGS.export_dir)

            def do_graph_freeze(output_file=None, output_node_names=None, variables_blacklist=None):
                return freeze_graph.freeze_graph_with_def_protos(
                    input_graph_def=session.graph_def,
                    input_saver_def=saver.as_saver_def(),
                    input_checkpoint=checkpoint_path,
                    output_node_names=output_node_names,
                    restore_op_name=None,
                    filename_tensor_name=None,
                    output_graph=output_file,
                    clear_devices=False,
                    variable_names_blacklist=variables_blacklist,
                    initializer_nodes='')

            if not FLAGS.export_tflite:
                do_graph_freeze(output_file=output_graph_path, output_node_names=output_names, variables_blacklist='previous_state_c,previous_state_h')
            else:
                frozen_graph = do_graph_freeze(output_node_names=output_names, variables_blacklist='')
                output_tflite_path = os.path.join(FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))

                converter = lite.TFLiteConverter(frozen_graph, input_tensors=inputs.values(), output_tensors=outputs.values())
                converter.post_training_quantize = True
                tflite_model = converter.convert()

                with open(output_tflite_path, 'wb') as fout:
                    fout.write(tflite_model)

                log_info('Exported model for TF Lite engine as {}'.format(os.path.basename(output_tflite_path)))

            log_info('Models exported at %s' % (FLAGS.export_dir))
        except RuntimeError as e:
            log_error(str(e))
Beispiel #24
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    from tensorflow.python.framework.ops import Tensor, Operation

    inputs, outputs, _ = create_inference_graph(batch_size=FLAGS.export_batch_size, n_steps=FLAGS.n_steps, tflite=FLAGS.export_tflite)

    graph_version = int(file_relative_read('GRAPH_VERSION').strip())
    assert graph_version > 0

    outputs['metadata_version'] = tf.constant([graph_version], name='metadata_version')
    outputs['metadata_sample_rate'] = tf.constant([FLAGS.audio_sample_rate], name='metadata_sample_rate')
    outputs['metadata_feature_win_len'] = tf.constant([FLAGS.feature_win_len], name='metadata_feature_win_len')
    outputs['metadata_feature_win_step'] = tf.constant([FLAGS.feature_win_step], name='metadata_feature_win_step')
    outputs['metadata_beam_width'] = tf.constant([FLAGS.export_beam_width], name='metadata_beam_width')
    outputs['metadata_alphabet'] = tf.constant([Config.alphabet.serialize()], name='metadata_alphabet')

    if FLAGS.export_language:
        outputs['metadata_language'] = tf.constant([FLAGS.export_language.encode('utf-8')], name='metadata_language')

    # Prevent further graph changes
    tfv1.get_default_graph().finalize()

    output_names_tensors = [tensor.op.name for tensor in outputs.values() if isinstance(tensor, Tensor)]
    output_names_ops = [op.name for op in outputs.values() if isinstance(op, Operation)]
    output_names = output_names_tensors + output_names_ops

    with tf.Session() as session:
        # Restore variables from checkpoint
        if FLAGS.load == 'auto':
            method_order = ['best', 'last']
        else:
            method_order = [FLAGS.load]
        load_or_init_graph(session, method_order)

        output_filename = FLAGS.export_file_name + '.pb'
        if FLAGS.remove_export:
            if os.path.isdir(FLAGS.export_dir):
                log_info('Removing old export')
                shutil.rmtree(FLAGS.export_dir)

        output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

        if not os.path.isdir(FLAGS.export_dir):
            os.makedirs(FLAGS.export_dir)

        frozen_graph = tfv1.graph_util.convert_variables_to_constants(
            sess=session,
            input_graph_def=tfv1.get_default_graph().as_graph_def(),
            output_node_names=output_names)

        frozen_graph = tfv1.graph_util.extract_sub_graph(
            graph_def=frozen_graph,
            dest_nodes=output_names)

        if not FLAGS.export_tflite:
            with open(output_graph_path, 'wb') as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            output_tflite_path = os.path.join(FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))

            converter = tf.lite.TFLiteConverter(frozen_graph, input_tensors=inputs.values(), output_tensors=outputs.values())
            converter.optimizations = [tf.lite.Optimize.DEFAULT]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open(output_tflite_path, 'wb') as fout:
                fout.write(tflite_model)

        log_info('Models exported at %s' % (FLAGS.export_dir))

    metadata_fname = os.path.join(FLAGS.export_dir, '{}_{}_{}.md'.format(
        FLAGS.export_author_id,
        FLAGS.export_model_name,
        FLAGS.export_model_version))

    model_runtime = 'tflite' if FLAGS.export_tflite else 'tensorflow'
    with open(metadata_fname, 'w') as f:
        f.write('---\n')
        f.write('author: {}\n'.format(FLAGS.export_author_id))
        f.write('model_name: {}\n'.format(FLAGS.export_model_name))
        f.write('model_version: {}\n'.format(FLAGS.export_model_version))
        f.write('contact_info: {}\n'.format(FLAGS.export_contact_info))
        f.write('license: {}\n'.format(FLAGS.export_license))
        f.write('language: {}\n'.format(FLAGS.export_language))
        f.write('runtime: {}\n'.format(model_runtime))
        f.write('min_ds_version: {}\n'.format(FLAGS.export_min_ds_version))
        f.write('max_ds_version: {}\n'.format(FLAGS.export_max_ds_version))
        f.write('acoustic_model_url: <replace this with a publicly available URL of the acoustic model>\n')
        f.write('scorer_url: <replace this with a publicly available URL of the scorer, if present>\n')
        f.write('---\n')
        f.write('{}\n'.format(FLAGS.export_description))

    log_info('Model metadata file saved to {}. Before submitting the exported model for publishing make sure all information in the metadata file is correct, and complete the URL fields.'.format(metadata_fname))
Beispiel #25
0
def train():
    do_cache_dataset = True

    # pylint: disable=too-many-boolean-expressions
    if (FLAGS.data_aug_features_multiplicative > 0 or
            FLAGS.data_aug_features_additive > 0 or
            FLAGS.augmentation_spec_dropout_keeprate < 1 or
            FLAGS.augmentation_freq_and_time_masking or
            FLAGS.augmentation_pitch_and_tempo_scaling or
            FLAGS.augmentation_speed_up_std > 0 or
            FLAGS.augmentation_sparse_warp):
        do_cache_dataset = False

    # Create training and validation datasets
    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               enable_cache=FLAGS.feature_cache and do_cache_dataset,
                               cache_path=FLAGS.feature_cache,
                               train_phase=True)

    iterator = tfv1.data.Iterator.from_structure(tfv1.data.get_output_types(train_set),
                                                 tfv1.data.get_output_shapes(train_set),
                                                 output_classes=tfv1.data.get_output_classes(train_set))

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_csvs = FLAGS.dev_files.split(',')
        dev_sets = [create_dataset([csv], batch_size=FLAGS.dev_batch_size, train_phase=False) for csv in dev_csvs]
        dev_init_ops = [iterator.make_initializer(dev_set) for dev_set in dev_sets]

    # The transfer learning approach here need us to supply the layers which we
    # want to exclude from the source model.
    # Say we want to exclude all layers except for the first one, we can use this:
    #
    #    drop_source_layers=['2', '3', 'lstm', '5', '6']
    #
    # If we want to use all layers from the source model except the last one, we use this:
    #
    #    drop_source_layers=['6']
    #

    if FLAGS.load == "transfer":
        drop_source_layers = ['2', '3', 'lstm', '5', '6'][-int(FLAGS.drop_source_layers):]
    else:
        drop_source_layers=None
    
    # Dropout
    dropout_rates = [tfv1.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6)]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {
        rate: 0. for rate in dropout_rates
    }

    # Building the graph
    optimizer = create_optimizer()

    # Enable mixed precision training
    if FLAGS.automatic_mixed_precision:
        log_info('Enabling automatic mixed precision training.')
        optimizer = tfv1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)

    gradients, loss, non_finite_files = get_tower_results(iterator, optimizer, dropout_rates, drop_source_layers)

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tfv1.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)

    # Summaries
    step_summaries_op = tfv1.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
        'dev': tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tfv1.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')

    best_dev_saver = tfv1.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')

    # Save flags next to checkpoints
    os.makedirs(FLAGS.checkpoint_dir, exist_ok=True)

    flags_file = os.path.join(FLAGS.checkpoint_dir, 'flags.txt')
    with open(flags_file, 'w') as fout:
        fout.write(FLAGS.flags_into_string())

    initializer = tfv1.global_variables_initializer()

    with tfv1.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        # Loading or initializing
        loaded = False

        # Initialize training from a CuDNN RNN checkpoint
        if FLAGS.cudnn_checkpoint:
            if FLAGS.use_cudnn_rnn:
                log_error('Trying to use --cudnn_checkpoint but --use_cudnn_rnn '
                          'was specified. The --cudnn_checkpoint flag is only '
                          'needed when converting a CuDNN RNN checkpoint to '
                          'a CPU-capable graph. If your system is capable of '
                          'using CuDNN RNN, you can just specify the CuDNN RNN '
                          'checkpoint normally with --checkpoint_dir.')
                sys.exit(1)

            log_info('Converting CuDNN RNN checkpoint from {}'.format(FLAGS.cudnn_checkpoint))
            ckpt = tfv1.train.load_checkpoint(FLAGS.cudnn_checkpoint)
            missing_variables = []

            # Load compatible variables from checkpoint
            for v in tfv1.global_variables():
                try:
                    v.load(ckpt.get_tensor(v.op.name), session=session)
                except tf.errors.NotFoundError:
                    missing_variables.append(v)

            # Check that the only missing variables are the Adam moment tensors
            if any('Adam' not in v.op.name for v in missing_variables):
                log_error('Tried to load a CuDNN RNN checkpoint but there were '
                          'more missing variables than just the Adam moment '
                          'tensors.')
                sys.exit(1)

            # Initialize Adam moment tensors from scratch to allow use of CuDNN
            # RNN checkpoints.
            log_info('Initializing missing Adam moment tensors.')
            init_op = tfv1.variables_initializer(missing_variables)
            session.run(init_op)
            loaded = True
			
        

        if not loaded and FLAGS.load in ['auto', 'last']:
            #tf.initialize_all_variables().run()
            tfv1.get_default_graph().finalize()			
            loaded = try_loading(session, checkpoint_saver, 'checkpoint', 'most recent')
        if not loaded and FLAGS.load in ['auto', 'best']:
            #tf.initialize_all_variables().run()
            tfv1.get_default_graph().finalize()
            loaded = try_loading(session, best_dev_saver, 'best_dev_checkpoint', 'best validation')
        if not loaded : 
            if FLAGS.load == "transfer":
                if FLAGS.source_model_checkpoint_dir:
                    print('Initializing model from', FLAGS.source_model_checkpoint_dir)
                    ckpt = tfv1.train.load_checkpoint(FLAGS.source_model_checkpoint_dir)
                    variables = list(ckpt.get_variable_to_shape_map().keys())
                    print('variable', variables)
                    print('global', tf.global_variables())				
                    # Load desired source variables
                    missing_variables2 = []				
                    for v in tf.global_variables():
                        if not any(layer in v.op.name for layer in drop_source_layers):
                            print('Loading', v.op.name)
                            try:						
                                v.load(ckpt.get_tensor(v.op.name), session=session)
                                print('OK')
                            except tf.errors.NotFoundError:
                                missing_variables2.append(v)
                                print('KO')
                            except ValueError:
                                #missing_variables2.append(v)
                                print('KO for valueError')						
                    print('missing_variables =', missing_variables2)					
                    # Initialize all variables needed for DS, but not loaded from ckpt
                    
                    init_op = tfv1.variables_initializer(
                        [v for v in tf.global_variables()
                        if any(layer in v.op.name
                                for layer in drop_source_layers)
                        ] + missing_variables2)
                    tfv1.get_default_graph().finalize()
                    session.run(init_op)
                   
			
            elif FLAGS.load in ['auto', 'init']:
                log_info('Initializing variables...')
                tfv1.get_default_graph().finalize()
                session.run(initializer)
            else:
                log_error('Unable to load %s model from specified checkpoint dir'
                        ' - consider using load option "auto" or "init".' % FLAGS.load)
                sys.exit(1)


        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data['mean_loss'] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(self, progress, data, **kwargs)

            prefix = 'Epoch {} | {:>10}'.format(epoch, 'Training' if is_train else 'Validation')
            widgets = [' | ', progressbar.widgets.Timer(),
                       ' | Steps: ', progressbar.widgets.Counter(),
                       ' | ', LossWidget()]
            suffix = ' | Dataset: {}'.format(dataset) if dataset else None
            pbar = create_progressbar(prefix=prefix, widgets=widgets, suffix=suffix).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, problem_files, step_summary = \
                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
                                    feed_dict=feed_dict)
                except tf.errors.InvalidArgumentError as err:
                    if FLAGS.augmentation_sparse_warp:
                        log_info("Ignoring sparse warp error: {}".format(err))
                        continue
                    else:
                        raise
                except tf.errors.OutOfRangeError:
                    break

                if problem_files.size > 0:
                    problem_files = [f.decode('utf8') for f in problem_files[..., 0]]
                    log_error('The following files caused an infinite (or NaN) '
                              'loss: {}'.format(','.join(problem_files)))

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session, checkpoint_path, global_step=current_step)
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count

        log_info('STARTING Optimization')
        train_start_time = datetime.utcnow()
        best_dev_loss = float('inf')
        dev_losses = []
        try:
            for epoch in range(FLAGS.epochs):
                # Training
                log_progress('Training epoch %d...' % epoch)
                train_loss, _ = run_set('train', epoch, train_init_op)
                log_progress('Finished training epoch %d - loss: %f' % (epoch, train_loss))
                checkpoint_saver.save(session, checkpoint_path, global_step=global_step)

                if FLAGS.dev_files:
                    # Validation
                    dev_loss = 0.0
                    total_steps = 0
                    for csv, init_op in zip(dev_csvs, dev_init_ops):
                        log_progress('Validating epoch %d on %s...' % (epoch, csv))
                        set_loss, steps = run_set('dev', epoch, init_op, dataset=csv)
                        dev_loss += set_loss * steps
                        total_steps += steps
                        log_progress('Finished validating epoch %d on %s - loss: %f' % (epoch, csv, set_loss))
                    dev_loss = dev_loss / total_steps

                    dev_losses.append(dev_loss)

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(session, best_dev_path, global_step=global_step, latest_filename='best_dev_checkpoint')
                        log_info("Saved new best validating model with loss %f to: %s" % (best_dev_loss, save_path))

                    # Early stopping
                    if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps:
                        mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                        std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                        dev_losses = dev_losses[-FLAGS.es_steps:]
                        log_debug('Checking for early stopping (last %d steps) validation loss: '
                                  '%f, with standard deviation: %f and mean: %f' %
                                  (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss))
                        if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                           (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                            log_info('Early stop triggered as (for last %d steps) validation loss:'
                                     ' %f with standard deviation: %f and mean: %f' %
                                     (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss))
                            break
        except KeyboardInterrupt:
            pass
        log_info('FINISHED optimization in {}'.format(datetime.utcnow() - train_start_time))
    log_debug('Session closed.')
Beispiel #26
0
def train():
    # Create training and validation datasets
    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               cache_path=FLAGS.train_cached_features_path)

    iterator = tf.data.Iterator.from_structure(train_set.output_types,
                                               train_set.output_shapes,
                                               output_classes=train_set.output_classes)

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_set = create_dataset(FLAGS.dev_files.split(','),
                                 batch_size=FLAGS.dev_batch_size,
                                 cache_path=FLAGS.dev_cached_features_path)
        dev_init_op = iterator.make_initializer(dev_set)

    # Dropout
    dropout_rates = [tf.placeholder(tf.float32, name='dropout_{}'.format(i)) for i in range(6)]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {
        rate: 0. for rate in dropout_rates
    }

    # Building the graph
    optimizer = create_optimizer()
    gradients, loss = get_tower_results(iterator, optimizer, dropout_rates)

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tf.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients, global_step=global_step)

    # Summaries
    step_summaries_op = tf.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'), max_queue=120),
        'dev': tf.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'), max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')
    checkpoint_filename = 'checkpoint'

    best_dev_saver = tf.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')
    best_dev_filename = 'best_dev_checkpoint'

    initializer = tf.global_variables_initializer()

    with tf.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        tf.get_default_graph().finalize()

        # Loading or initializing
        loaded = False
        if FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, checkpoint_saver, checkpoint_filename, 'most recent')
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, best_dev_saver, best_dev_filename, 'best validation')
        if not loaded:
            if FLAGS.load in ['auto', 'init']:
                log_info('Initializing variables...')
                session.run(initializer)
            else:
                log_error('Unable to load %s model from specified checkpoint dir'
                          ' - consider using load option "auto" or "init".' % FLAGS.load)
                sys.exit(1)

        def run_set(set_name, init_op):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data['mean_loss'] = total_loss / step_count if step_count else 0.0
                    return progressbar.widgets.FormatLabel.__call__(self, progress, data, **kwargs)

            if FLAGS.show_progressbar:
                pbar = progressbar.ProgressBar(widgets=['Epoch {}'.format(epoch),
                                                        ' | ', progressbar.widgets.Timer(),
                                                        ' | Steps: ', progressbar.widgets.Counter(),
                                                        ' | ', LossWidget()])
                pbar.start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, step_summary = \
                        session.run([train_op, global_step, loss, step_summaries_op],
                                    feed_dict=feed_dict)
                except tf.errors.OutOfRangeError:
                    break

                total_loss += batch_loss
                step_count += 1

                if FLAGS.show_progressbar:
                    pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time() - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session, checkpoint_path, global_step=current_step)
                    checkpoint_time = time.time()

            if FLAGS.show_progressbar:
                pbar.finish()

            return total_loss / step_count

        log_info('STARTING Optimization')
        best_dev_loss = float('inf')
        dev_losses = []
        try:
            for epoch in range(FLAGS.epochs):
                # Training
                if not FLAGS.show_progressbar:
                    log_info('Training epoch %d...' % epoch)
                train_loss = run_set('train', train_init_op)
                if not FLAGS.show_progressbar:
                    log_info('Finished training epoch %d - loss: %f' % (epoch, train_loss))
                checkpoint_saver.save(session, checkpoint_path, global_step=global_step)

                if FLAGS.dev_files:
                    # Validation
                    if not FLAGS.show_progressbar:
                        log_info('Validating epoch %d...' % epoch)
                    dev_loss = run_set('dev', dev_init_op)
                    if not FLAGS.show_progressbar:
                        log_info('Finished validating epoch %d - loss: %f' % (epoch, dev_loss))
                    dev_losses.append(dev_loss)

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(session, best_dev_path, global_step=global_step, latest_filename=best_dev_filename)
                        log_info("Saved new best validating model with loss %f to: %s" % (best_dev_loss, save_path))

                    # Early stopping
                    if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps:
                        mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                        std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                        dev_losses = dev_losses[-FLAGS.es_steps:]
                        log_debug('Checking for early stopping (last %d steps) validation loss: '
                                  '%f, with standard deviation: %f and mean: %f' %
                                  (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss))
                        if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                           (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                            log_info('Early stop triggered as (for last %d steps) validation loss:'
                                     ' %f with standard deviation: %f and mean: %f' %
                                     (FLAGS.es_steps, dev_losses[-1], std_loss, mean_loss))
                            break
        except KeyboardInterrupt:
            pass
    log_debug('Session closed.')
Beispiel #27
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    from tensorflow.python.framework.ops import Tensor, Operation

    inputs, outputs, _ = create_inference_graph(batch_size=FLAGS.export_batch_size, n_steps=FLAGS.n_steps, tflite=FLAGS.export_tflite)

    graph_version = int(file_relative_read('GRAPH_VERSION').strip())
    assert graph_version > 0

    outputs['metadata_version'] = tf.constant([graph_version], name='metadata_version')
    outputs['metadata_sample_rate'] = tf.constant([FLAGS.audio_sample_rate], name='metadata_sample_rate')
    outputs['metadata_feature_win_len'] = tf.constant([FLAGS.feature_win_len], name='metadata_feature_win_len')
    outputs['metadata_feature_win_step'] = tf.constant([FLAGS.feature_win_step], name='metadata_feature_win_step')
    outputs['metadata_alphabet'] = tf.constant([Config.alphabet.serialize()], name='metadata_alphabet')

    if FLAGS.export_language:
        outputs['metadata_language'] = tf.constant([FLAGS.export_language.encode('utf-8')], name='metadata_language')

    output_names_tensors = [tensor.op.name for tensor in outputs.values() if isinstance(tensor, Tensor)]
    output_names_ops = [op.name for op in outputs.values() if isinstance(op, Operation)]
    output_names = ",".join(output_names_tensors + output_names_ops)

    # Create a saver using variables from the above newly created graph
    saver = tfv1.train.Saver()

    # Restore variables from training checkpoint
    checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    checkpoint_path = checkpoint.model_checkpoint_path

    output_filename = 'output_graph.pb'
    if FLAGS.remove_export:
        if os.path.isdir(FLAGS.export_dir):
            log_info('Removing old export')
            shutil.rmtree(FLAGS.export_dir)
    try:
        output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

        if not os.path.isdir(FLAGS.export_dir):
            os.makedirs(FLAGS.export_dir)

        def do_graph_freeze(output_file=None, output_node_names=None, variables_blacklist=''):
            frozen = freeze_graph.freeze_graph_with_def_protos(
                input_graph_def=tfv1.get_default_graph().as_graph_def(),
                input_saver_def=saver.as_saver_def(),
                input_checkpoint=checkpoint_path,
                output_node_names=output_node_names,
                restore_op_name=None,
                filename_tensor_name=None,
                output_graph=output_file,
                clear_devices=False,
                variable_names_blacklist=variables_blacklist,
                initializer_nodes='')

            input_node_names = []
            return strip_unused_lib.strip_unused(
                input_graph_def=frozen,
                input_node_names=input_node_names,
                output_node_names=output_node_names.split(','),
                placeholder_type_enum=tf.float32.as_datatype_enum)

        frozen_graph = do_graph_freeze(output_node_names=output_names)

        if not FLAGS.export_tflite:
            with open(output_graph_path, 'wb') as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            output_tflite_path = os.path.join(FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))

            converter = tf.lite.TFLiteConverter(frozen_graph, input_tensors=inputs.values(), output_tensors=outputs.values())
            converter.optimizations = [ tf.lite.Optimize.DEFAULT ]
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open(output_tflite_path, 'wb') as fout:
                fout.write(tflite_model)

            log_info('Exported model for TF Lite engine as {}'.format(os.path.basename(output_tflite_path)))

        log_info('Models exported at %s' % (FLAGS.export_dir))
    except RuntimeError as e:
        log_error(str(e))
Beispiel #28
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    from tensorflow.python.framework.ops import Tensor, Operation

    inputs, outputs, _ = create_inference_graph(batch_size=FLAGS.export_batch_size, n_steps=FLAGS.n_steps, tflite=FLAGS.export_tflite)
    output_names_tensors = [tensor.op.name for tensor in outputs.values() if isinstance(tensor, Tensor)]
    output_names_ops = [op.name for op in outputs.values() if isinstance(op, Operation)]
    output_names = ",".join(output_names_tensors + output_names_ops)

    if not FLAGS.export_tflite:
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
    else:
        # Create a saver using variables from the above newly created graph
        def fixup(name):
            if name.startswith('rnn/lstm_cell/'):
                return name.replace('rnn/lstm_cell/', 'lstm_fused_cell/')
            return name

        mapping = {fixup(v.op.name): v for v in tf.global_variables()}

    saver = tf.train.Saver(mapping)

    # Restore variables from training checkpoint
    checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    checkpoint_path = checkpoint.model_checkpoint_path

    output_filename = 'output_graph.pb'
    if FLAGS.remove_export:
        if os.path.isdir(FLAGS.export_dir):
            log_info('Removing old export')
            shutil.rmtree(FLAGS.export_dir)
    try:
        output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

        if not os.path.isdir(FLAGS.export_dir):
            os.makedirs(FLAGS.export_dir)

        def do_graph_freeze(output_file=None, output_node_names=None, variables_blacklist=None):
            return freeze_graph.freeze_graph_with_def_protos(
                input_graph_def=tf.get_default_graph().as_graph_def(),
                input_saver_def=saver.as_saver_def(),
                input_checkpoint=checkpoint_path,
                output_node_names=output_node_names,
                restore_op_name=None,
                filename_tensor_name=None,
                output_graph=output_file,
                clear_devices=False,
                variable_names_blacklist=variables_blacklist,
                initializer_nodes='')

        if not FLAGS.export_tflite:
            frozen_graph = do_graph_freeze(output_node_names=output_names, variables_blacklist='previous_state_c,previous_state_h')
            frozen_graph.version = int(file_relative_read('GRAPH_VERSION').strip())

            # Add a no-op node to the graph with metadata information to be loaded by the native client
            metadata = frozen_graph.node.add()
            metadata.name = 'model_metadata'
            metadata.op = 'NoOp'
            metadata.attr['sample_rate'].i = FLAGS.audio_sample_rate
            metadata.attr['feature_win_len'].i = FLAGS.feature_win_len
            metadata.attr['feature_win_step'].i = FLAGS.feature_win_step
            if FLAGS.export_language:
                metadata.attr['language'].s = FLAGS.export_language.encode('ascii')

            with open(output_graph_path, 'wb') as fout:
                fout.write(frozen_graph.SerializeToString())
        else:
            frozen_graph = do_graph_freeze(output_node_names=output_names, variables_blacklist='')
            output_tflite_path = os.path.join(FLAGS.export_dir, output_filename.replace('.pb', '.tflite'))

            converter = tf.lite.TFLiteConverter(frozen_graph, input_tensors=inputs.values(), output_tensors=outputs.values())
            converter.post_training_quantize = True
            # AudioSpectrogram and Mfcc ops are custom but have built-in kernels in TFLite
            converter.allow_custom_ops = True
            tflite_model = converter.convert()

            with open(output_tflite_path, 'wb') as fout:
                fout.write(tflite_model)

            log_info('Exported model for TF Lite engine as {}'.format(os.path.basename(output_tflite_path)))

        log_info('Models exported at %s' % (FLAGS.export_dir))
    except RuntimeError as e:
        log_error(str(e))
Beispiel #29
0
def export():
    r'''
    Restores the trained variables into a simpler graph that will be exported for serving.
    '''
    log_info('Exporting the model...')
    with tf.device('/cpu:0'):
        from tensorflow.python.framework.ops import Tensor, Operation

        tf.reset_default_graph()
        session = tf.Session(config=Config.session_config)

        inputs, outputs, _ = create_inference_graph(batch_size=1,
                                                    n_steps=FLAGS.n_steps,
                                                    tflite=FLAGS.export_tflite)
        input_names = ",".join(tensor.op.name for tensor in inputs.values())
        output_names_tensors = [
            tensor.op.name for tensor in outputs.values()
            if isinstance(tensor, Tensor)
        ]
        output_names_ops = [
            tensor.name for tensor in outputs.values()
            if isinstance(tensor, Operation)
        ]
        output_names = ",".join(output_names_tensors + output_names_ops)
        input_shapes = ":".join(",".join(map(str, tensor.shape))
                                for tensor in inputs.values())

        if not FLAGS.export_tflite:
            mapping = {
                v.op.name: v
                for v in tf.global_variables()
                if not v.op.name.startswith('previous_state_')
            }
        else:
            # Create a saver using variables from the above newly created graph
            def fixup(name):
                if name.startswith('rnn/lstm_cell/'):
                    return name.replace('rnn/lstm_cell/', 'lstm_fused_cell/')
                return name

            mapping = {fixup(v.op.name): v for v in tf.global_variables()}

        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        checkpoint_path = checkpoint.model_checkpoint_path

        output_filename = 'output_graph.pb'
        if FLAGS.remove_export:
            if os.path.isdir(FLAGS.export_dir):
                log_info('Removing old export')
                shutil.rmtree(FLAGS.export_dir)
        try:
            output_graph_path = os.path.join(FLAGS.export_dir, output_filename)

            if not os.path.isdir(FLAGS.export_dir):
                os.makedirs(FLAGS.export_dir)

            def do_graph_freeze(output_file=None,
                                output_node_names=None,
                                variables_blacklist=None):
                freeze_graph.freeze_graph_with_def_protos(
                    input_graph_def=session.graph_def,
                    input_saver_def=saver.as_saver_def(),
                    input_checkpoint=checkpoint_path,
                    output_node_names=output_node_names,
                    restore_op_name=None,
                    filename_tensor_name=None,
                    output_graph=output_file,
                    clear_devices=False,
                    variable_names_blacklist=variables_blacklist,
                    initializer_nodes='')

            if not FLAGS.export_tflite:
                do_graph_freeze(
                    output_file=output_graph_path,
                    output_node_names=output_names,
                    variables_blacklist='previous_state_c,previous_state_h')
            else:
                temp_fd, temp_freeze = tempfile.mkstemp(dir=FLAGS.export_dir)
                os.close(temp_fd)
                do_graph_freeze(output_file=temp_freeze,
                                output_node_names=output_names,
                                variables_blacklist='')
                output_tflite_path = os.path.join(
                    FLAGS.export_dir,
                    output_filename.replace('.pb', '.tflite'))

                class TFLiteFlags():
                    def __init__(self):
                        self.graph_def_file = temp_freeze
                        self.inference_type = 'FLOAT'
                        self.input_arrays = input_names
                        self.input_shapes = input_shapes
                        self.output_arrays = output_names
                        self.output_file = output_tflite_path
                        self.output_format = 'TFLITE'

                        default_empty = [
                            'inference_input_type', 'mean_values',
                            'default_ranges_min', 'default_ranges_max',
                            'drop_control_dependency',
                            'reorder_across_fake_quant',
                            'change_concat_input_ranges', 'allow_custom_ops',
                            'converter_mode', 'post_training_quantize',
                            'dump_graphviz_dir', 'dump_graphviz_video'
                        ]
                        for e in default_empty:
                            self.__dict__[e] = None

                flags = TFLiteFlags()
                tflite_convert._convert_model(flags)
                os.unlink(temp_freeze)
                log_info('Exported model for TF Lite engine as {}'.format(
                    os.path.basename(output_tflite_path)))

            log_info('Models exported at %s' % (FLAGS.export_dir))
        except RuntimeError as e:
            log_error(str(e))
Beispiel #30
0
        # Loading or initializing
        loaded = False

        # Initialize training from a CuDNN RNN checkpoint
        if FLAGS.cudnn_checkpoint:
            if FLAGS.use_cudnn_rnn:
                log_error('Trying to use --cudnn_checkpoint but --use_cudnn_rnn '
                          'was specified. The --cudnn_checkpoint flag is only '
                          'needed when converting a CuDNN RNN checkpoint to '
                          'a CPU-capable graph. If your system is capable of '
                          'using CuDNN RNN, you can just specify the CuDNN RNN '
                          'checkpoint normally with --checkpoint_dir.')
                exit(1)

            log_info('Converting CuDNN RNN checkpoint from {}'.format(FLAGS.cudnn_checkpoint))
            ckpt = tfv1.train.load_checkpoint(FLAGS.cudnn_checkpoint)
            missing_variables = []

            # Load compatible variables from checkpoint
            for v in tfv1.global_variables():
                try:
                    v.load(ckpt.get_tensor(v.op.name), session=session)
                except tf.errors.NotFoundError:
                    missing_variables.append(v)

            # Check that the only missing variables are the Adam moment tensors
            if any('Adam' not in v.op.name for v in missing_variables):
                log_error('Tried to load a CuDNN RNN checkpoint but there were '
                          'more missing variables than just the Adam moment '
                          'tensors.')