Example 1
    def _eval_network(self,
                      dataset,
                      data_mode,
                      logs_path,
                      batch_size,
                      metrics,
                      gpu_frac,
                      workers,
                      mem_dequeing,
                      track_summaries=50,
                      steps=None):
        """ Runs and tracks a single evaluation step on the given data subset
        Args:
            Other arguments can be seen in train_network
            data_mode: Data subset to use for evaluation
            track_summaries: Steps between Tensorboard summaries.
                Only used if steps is None
            steps: Number of batches the model is evaluated on.
                If not None, a single summary is created at the end.
                If None, the model is evaluated on the whole dataset and it
                tracks a summary periodically.
        Returns
            loss: Loss produced by the batch
            metrics: Set of metrics values
        """
        # Create reader to get TFRecords
        reader = DataReader(dataset)

        with tf.Graph().as_default() as g:

            # Read data and define operations
            data, labels = reader.read_batch(batch_size,
                                             data_mode,
                                             mem_dequeing,
                                             workers,
                                             shuffle=True,
                                             train_mode=False)

            main_run = self._build_run_settings(data, labels, metrics)

            # Prepare logging for Tensorboard
            saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

            # Supervisor used only to manage the session and
            # initialize variables
            supervisor = tf.train.Supervisor(graph=g)

            # Initialize session
            session_conf = mu.get_session_config(gpu_frac, log_placement=False)
            with supervisor.managed_session(config=session_conf) as sess:

                # Load model, if existing. Otherwise start from scratch
                step_value, _ = self._initialize_model(sess,
                                                       saver=saver,
                                                       is_training=False)

                try:
                    # Let queues start dequeuing examples
                    coord, threads = mu.initialize_queues(sess)

                    # Initialize loop conditions
                    step_counter = 0
                    stop = False if steps is None else step_counter >= steps

                    while not stop:

                        # Run evaluation
                        res = main_run.test_run(sess,
                                                summary_ops,
                                                step_value,
                                                data_mode=data_mode,
                                                log=True)

                        if steps is not None and step_counter == steps - 1:
                            # Reached max steps, store summary and stop
                            main_run.manual_log(writer, step_value)
                            stop = True
                        elif steps is None \
                                and step_counter % track_summaries == 0:
                            # Periodic storage of summaries
                            mu.store_summaries(writer, step_counter,
                                               res.summary_str)

                        step_counter += 1

                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)

                except tf.errors.OutOfRangeError:
                    logger.info('Queue ran out of evaluation instances')

                return main_run.loss_average(), main_run.metrics_average()
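The loop above is the classic TF1 queue-runner evaluation pattern: build the graph, let a Supervisor own the session, then drain batches until the step budget is hit or the input queue raises OutOfRangeError. A minimal sketch of just that drain loop, with illustrative names that are not part of this codebase:

    import tensorflow as tf

    def drain_batches(sess, fetch_op, max_steps=None):
        """Run fetch_op repeatedly until max_steps or queue exhaustion."""
        outputs = []
        try:
            while max_steps is None or len(outputs) < max_steps:
                outputs.append(sess.run(fetch_op))
        except tf.errors.OutOfRangeError:
            # Raised once the input pipeline has served every example
            pass
        return outputs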
Example 2
    def eval_places(self,
                    dataset,
                    data_mode,
                    gpu_frac,
                    workers,
                    mem_dequeing,
                    steps=20):
        """ Runs and tracks a single evaluation step on the given data subset
        Args:
            Other arguments can be seen in _train_network
            data_mode: Data subset to use for evaluation
            steps: Number of examples the model is evaluated on.
        """
        # Create reader to get TFRecords
        reader = DataReader(dataset)

        with tf.Graph().as_default() as g:

            # Read data and define operations
            data, labels = reader.read_batch(1,
                                             data_mode,
                                             mem_dequeing,
                                             workers,
                                             shuffle=True,
                                             train_mode=False)

            main_run = self._build_run_settings(data, labels, [])

            # Load weights ops for warm start on training
            assign_ops = self._load_pretrained()

            # Supervisor used only to manage the session and
            # initialize variables
            supervisor = tf.train.Supervisor(graph=g)

            # Initialize session
            session_conf = mu.get_session_config(gpu_frac, log_placement=False)
            with supervisor.managed_session(config=session_conf) as sess:

                # Load model, if existing. Otherwise start from scratch
                step_value, _ = self._initialize_model(sess,
                                                       saver=mu.get_saver(),
                                                       is_training=True)

                if assign_ops:
                    logger.info('Assigning pretrained values to models ...')
                    sess.run(assign_ops)
                else:
                    raise RuntimeError(
                        "Unexpected error: no assign operations" +
                        " to load weights")

                try:
                    # Let queues start dequeuing examples
                    coord, threads = mu.initialize_queues(sess)
                    counter = 0
                    while counter < steps:
                        self.test_places_output(sess, data, main_run)
                        counter += 1

                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)

                except tf.errors.OutOfRangeError:
                    logger.info('Queue ran out of evaluation instances')
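_load_pretrained is not shown in this snippet; judging by how assign_ops is used, it maps stored weights onto graph variables. A hedged sketch of how such assign ops could be built from a name-to-array dict (an assumption, not the actual implementation):

    import tensorflow as tf

    def build_assign_ops(pretrained):
        """pretrained: dict mapping variable names to numpy arrays."""
        ops = []
        for var in tf.global_variables():
            if var.op.name in pretrained:
                # Overwrite the freshly initialized value with stored weights
                ops.append(tf.assign(var, pretrained[var.op.name]))
        return ops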
Example 3
    def _train_network(self,
                       dataset,
                       logs_path,
                       batch_size,
                       metrics,
                       track_summaries,
                       gpu_frac,
                       workers,
                       mem_dequeing,
                       track_models=None,
                       steps=None,
                       max_steps=None,
                       log_steps=10):
        """ Runs training on the network defined for the given number of steps
        and stores a checkpoint at the end.
        Args:
            See train_network for other arguments.
            logs_path: Path where to store the network stats
        Returns:
            step: Step at which training has stopped
            loss: Mean loss in the process
            metrics: Mean metrics values in the process
        """
        # Create reader to get TFRecords
        reader = DataReader(dataset)

        with tf.Graph().as_default() as g:

            # Read data and define operations
            data, labels = reader.read_batch(batch_size, DataMode.TRAINING,
                                             mem_dequeing, workers)
            main_run = self._build_run_settings(data, labels, metrics)

            # Load weights ops for warm start on training
            assign_ops = self._load_pretrained()

            # Prepare logging for Tensorboard
            saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

            # Supervisor used only to manage the session and
            # initialize variables
            supervisor = tf.train.Supervisor(graph=g)

            # Initialize session
            session_conf = mu.get_session_config(gpu_frac, log_placement=False)
            with supervisor.managed_session(config=session_conf) as sess:

                # Load model, if existing. Otherwise start from scratch
                step_value, start = self._initialize_model(sess,
                                                           saver=saver,
                                                           is_training=True)

                # Get stopping condition according to mode
                step_limit = step_value + steps \
                    if steps is not None else max_steps
                if max_steps is not None:
                    step_limit = min(step_limit, max_steps)
                stop = step_value >= step_limit

                # Assign weights only if model started from scratch
                if start and assign_ops:
                    logger.info('Assigning pretrained values to models ...')
                    sess.run(assign_ops)

                try:
                    # Let queues start dequeuing examples
                    coord, threads = mu.initialize_queues(sess)

                    while not stop:

                        # Run network
                        log_run = step_value % log_steps == 0
                        main_res = main_run.training_run(sess,
                                                         summary_ops,
                                                         log=log_run)

                        # Track summaries if needed
                        if track_summaries is not None and \
                                step_value % track_summaries == 0:
                            mu.store_summaries(writer, step_value,
                                               main_res.summary_str)

                        # Track models if needed
                        if track_models is not None \
                                and step_value % track_models == 0 \
                                and step_value != 0:
                            mu.store_checkpoint(sess, saver, step_value,
                                                logs_path)

                        # Update current step and stop condition
                        step_value = main_res.step
                        stop = step_value >= step_limit

                    # Store model at exit
                    mu.store_checkpoint(sess, saver, step_value, logs_path)

                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)

                except tf.errors.OutOfRangeError as e:
                    logger.warning(
                        'Input queue exhausted due to '
                        'unexpected reason: %s.', e)

                return (step_value, main_run.loss_average(),
                        main_run.metrics_average())
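The step-limit computation above is compact but easy to misread; the same rule written as a standalone function (equivalent logic, shown only for clarity):

    def compute_step_limit(current_step, steps=None, max_steps=None):
        # Train for `steps` more steps when given, otherwise run to max_steps
        limit = current_step + steps if steps is not None else max_steps
        # Never exceed the absolute cap when both limits are provided
        if max_steps is not None:
            limit = min(limit, max_steps)
        return limit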
Example 4
        if FLAGS.show_image:
            img_spec = DataSpec(batch_size=FLAGS.batch_size,
                                scale_size=32,
                                crop_size=32,
                                channels=3,
                                mean=[0.0, 0.0, 0.0],
                                bgr=False,
                                random_crop=False)
        else:
            img_spec = None

        dataset = settings_fn(dataset_location=FLAGS.data_location,
                              image_specs=img_spec)

        reader = DataReader(dataset)
        features, label = reader.read_folded_batch(
            batch_size=FLAGS.batch_size,
            data_mode=DataMode.TRAINING,  # Any data mode works here
            folds=[0, 1, 2],
            memory_factor=FLAGS.memory_factor,
            reader_threads=FLAGS.reader_threads,
            train_mode=False)

        # Initialize all variables
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # Define coordinator to handle all threads
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
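The snippet stops right after starting the queue runners; in the TF1 pattern it follows, a read loop and a matching shutdown would normally come next. A sketch of that continuation (assumed, not part of the original snippet):

        try:
            while not coord.should_stop():
                batch_features, batch_label = sess.run([features, label])
                # ... consume the batch here ...
        except tf.errors.OutOfRangeError:
            pass  # reader threads exhausted the input
        finally:
            # Always stop and join the runner threads, even on failure
            coord.request_stop()
            coord.join(threads)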
Example 5
def predict_fn(data_settings_fn, data_location, folder, **params):

    store_summaries = params.get('summaries', True)

    with tf.Graph().as_default() as graph:

        step = get_global_step()

        dataset = data_settings_fn(
            dataset_location=data_location,
            image_specs=image_spec_from_params(**params)
        )
        reader = DataReader(dataset)

        test_context = build_run_context(
            dataset=dataset,
            reader=reader,
            tag=DataMode.TEST,
            folds=None,
            step=step,
            **params
        )

        if store_summaries:
            writer = get_writer(graph, folder, DataMode.TEST)

        saver = tf.train.Saver()

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=None,
                save_summaries_steps=None,
                save_summaries_secs=None) as sess:

            ckpt = tf.train.get_checkpoint_state(folder)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                logger.debug(
                    'Restoring {} from {}'.format(
                        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES),
                        ckpt.model_checkpoint_path
                    )
                )
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                raise ValueError('No model found in %s' % folder)

            # Define coordinator to handle all threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)

            status, finish = RunStatus(), False

            while not finish:

                try:
                    # Track loss and accuracy until queue exhausted
                    loss, acc, l2 = test_step(sess, test_context)
                    status.update(loss, acc, l2)

                    if store_summaries:
                        summary = sess.run(
                            test_context.summary_op,
                            feed_dict={test_context.is_training_op: False}
                        )
                        writer.add_summary(summary)

                except tf.errors.OutOfRangeError:
                    logger.info('Queue exhausted; all instances read')
                    finish = True

            coord.request_stop()
            coord.join(threads)

    return {'loss': status.loss(), 'l2': status.l2(), 'error': status.error()}
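RunStatus is external to this snippet. Given how it is used (one update per batch, then loss/l2/error aggregates at the end), a minimal stand-in might look like the following; this is an assumption, not the real class:

    import numpy as np

    class RunStatus(object):
        """Accumulates per-batch metrics and reports their means."""

        def __init__(self):
            self._loss, self._acc, self._l2 = [], [], []

        def update(self, loss, acc, l2):
            self._loss.append(loss)
            self._acc.append(acc)
            self._l2.append(l2)

        def loss(self):
            return float(np.mean(self._loss))

        def l2(self):
            return float(np.mean(self._l2))

        def error(self):
            # Error as the complement of mean accuracy
            return 1.0 - float(np.mean(self._acc))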
Example 6
    def fit(self, train_folds, val_folds, max_epochs, **params):

        max_epochs = int(max_epochs)

        # Parameters with default values
        strip_length = params.get('strip_length', 5)
        progress_thresh = params.get('progress_thresh', 0.1)
        max_successive_strips = params.get('max_successive_strips', 3)
        is_layerwise = params.get('layerwise', False)

        self._initialize_training(is_layerwise, **params)

        with tf.Graph().as_default() as graph:

            step = get_global_step()

            dataset = self._settings_fn(
                dataset_location=self._data_location,
                image_specs=image_spec_from_params(**params)
            )
            reader = DataReader(dataset)

            # Get training operations
            train_context = build_run_context(
                dataset, reader, DataMode.TRAINING, train_folds, step, **params
            )
            early_stop = EarlyStop(
                'global', progress_thresh, max_successive_strips
            )

            # Get validation operations
            val_context = build_run_context(
                dataset, reader, DataMode.VALIDATION, val_folds, step, True, **params  # noqa
            )

            if self._should_save():
                self._init_writers(graph)

            saver = self._init_savers(step, **params)

            with tf.train.MonitoredTrainingSession(
                    save_checkpoint_secs=None,
                    save_summaries_steps=None,
                    save_summaries_secs=None) as sess:

                self._init_session(sess, **params)

                # Define coordinator to handle all threads
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord, sess=sess)

                while True:

                    if sess.run(step) >= max_epochs:
                        logger.debug('Max epochs %d reached' % max_epochs)
                        break

                    train_run = run_training_epoch(
                        sess, train_context, self._layer_idx
                    )
                    early_stop.epoch_update(train_run.error())

                    epoch = train_run.epoch

                    if epoch % strip_length == 0 and epoch != 0:

                        # Track training stats
                        logger.debug(
                            '[%d] Training Loss: %f, Error: %f. L2: %f'
                            % (epoch, train_run.loss(), train_run.error(), train_run.l2())  # noqa
                        )

                        # Track validation stats
                        val_run = eval_epoch(
                            sess, val_context, self._layer_idx
                        )
                        logger.debug(
                            '[%d] Validation loss: %f, Error: %f'
                            % (epoch, val_run.loss(), val_run.error())
                        )

                        if self._should_save():
                            self._epoch_summary(
                                sess,
                                train_context,
                                train_run,
                                val_context,
                                val_run,
                                epoch
                            )

                        is_best, stop, train_errors = early_stop.strip_update(
                            train_run, val_run, epoch
                        )

                        if is_best and self._should_save():
                            save_model(sess, saver, self._folder, epoch)

                        if stop and is_layerwise:

                            self._iterate_layer(epoch, train_errors)
                            early_stop.restart_errors()

                            if self._policy.cycle_ended():
                                _, l_stop, _ = self._layer_stop.strip_update(
                                    train_run, val_run, epoch
                                )

                                if l_stop:
                                    break

                        elif stop and not is_layerwise:
                            break

                best_model = early_stop.get_best()
                logger.debug('Best model found: {}'.format(best_model))

                coord.request_stop()
                coord.join(threads)

        return best_model
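EarlyStop is not shown, but its parameters (strip length, progress threshold, maximum successive failing strips) match Prechelt-style strip-based early stopping. A hedged sketch of that criterion, assuming rather than reproducing the real class:

    class StripEarlyStop(object):
        def __init__(self, progress_thresh, max_fails):
            self.progress_thresh = progress_thresh
            self.max_fails = max_fails
            self.strip_errors = []        # training errors in current strip
            self.best_val = float('inf')
            self.fails = 0

        def epoch_update(self, train_error):
            self.strip_errors.append(train_error)

        def strip_update(self, val_error):
            # Training progress over the strip: mean error relative to its
            # minimum (Prechelt's P_k, here without the usual 1000 scaling)
            errs = self.strip_errors
            progress = sum(errs) / (len(errs) * max(min(errs), 1e-12)) - 1.0
            is_best = val_error < self.best_val
            if is_best:
                self.best_val = val_error
                self.fails = 0
            else:
                self.fails += 1
            # Stop when validation keeps failing or training has plateaued
            stop = (self.fails >= self.max_fails
                    or progress < self.progress_thresh)
            self.strip_errors = []
            return is_best, stop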
Example 7
    def fit(self, max_epochs, **params):

        max_epochs = int(max_epochs)

        # Parameters with default values
        is_layerwise = params.get('switch_epochs') is not None
        switch_epochs = params.get('switch_epochs').copy() \
            if is_layerwise else None
        summary_epochs = params.get('summary_epochs', 1)

        with tf.Graph().as_default() as graph:

            step = get_global_step()

            dataset = self._settings_fn(
                dataset_location=self._data_location,
                image_specs=image_spec_from_params(**params)
            )
            reader = DataReader(dataset)

            # Get training operations
            train_flds = range(dataset.get_fold_num())
            context = build_run_context(
                dataset, reader, DataMode.TRAINING, train_flds, step, **params
            )

            # Initialize writers and summaries
            writer = tf.summary.FileWriter(self._folder, graph)

            saver = self._init_savers(step, **params)

            self._initialize_fit(is_layerwise, **params)

            with tf.train.MonitoredTrainingSession(
                    save_checkpoint_secs=None,
                    save_summaries_steps=None,
                    save_summaries_secs=None) as sess:

                self._init_session(sess, **params)

                # Define coordinator to handle all threads
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(coord=coord, sess=sess)

                while True:

                    if sess.run(step) >= max_epochs:
                        logger.debug('Max epochs %d reached' % max_epochs)
                        break

                    run = run_training_epoch(
                        sess, context, self._layer_idx
                    )

                    epoch = run.epoch

                    if epoch % summary_epochs == 0:
                        # Store histogram
                        sum_str = sess.run(
                            context.summary_op,
                            feed_dict={context.is_training_op: True}
                        )
                        writer.add_summary(sum_str, epoch)

                        # Store stats from current epoch
                        write_epoch(writer, run, epoch)

                        logger.debug(
                            '[%d] Training Loss: %f, Error: %f. L2: %f'
                            % (epoch, run.loss(), run.error(), run.l2())
                        )

                    if switch_epochs is not None and len(switch_epochs) > 0 \
                            and epoch == switch_epochs[0]:
                        self._layer_idx = self._policy.next_layer_id()
                        logger.debug(
                            'Switching to layer %d' % self._layer_idx
                        )
                        switch_epochs = switch_epochs[1:]

                logger.debug('Finished training at epoch %d' % max_epochs)
                model_path = save_model(sess, saver, self._folder, max_epochs)

                coord.request_stop()
                coord.join(threads)

        return model_path, run.loss(), run.error(), run.l2()
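For context, a hypothetical invocation of this fit (the trainer instance and the values are made up; argument names follow the snippet):

    # trainer = ...  # instance of the class that defines fit()
    # model_path, loss, error, l2 = trainer.fit(
    #     max_epochs=100,
    #     switch_epochs=[30, 60],  # switch the trained layer at these epochs
    #     summary_epochs=5,
    # )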