    def get_config(self, model_type, override_params=None):
        custom_config = load_config_files(self.config.config_files)
        # Note that the base config is always built from the Faster R-CNN
        # model class, even though `model_type` is stored in the config below.
        model_class = get_model('fasterrcnn')
        model_base_config = get_base_config(model_class)
        config = get_model_config(model_base_config, custom_config,
                                  override_params)

        config.model.type = model_type

        return config
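The call to `get_model_config` above presumably layers three sources of configuration: the model's base config, the values loaded from the config files, and the override params. A rough standalone sketch of that idea with plain dicts (the helper below is hypothetical, not Luminoth's implementation):

# Hypothetical sketch of config layering; later sources win over earlier ones.
def merge_configs(base, custom, overrides=None):
    config = dict(base)           # start from the model's base config
    config.update(custom or {})   # apply values loaded from config files
    for item in overrides or []:  # apply 'key=value' overrides last
        key, value = item.split('=', 1)
        config[key] = value
    return config

base = {'model.type': 'fasterrcnn', 'train.learning_rate': 0.0003}
custom = {'train.learning_rate': 0.001}
print(merge_configs(base, custom, ['train.debug=True']))
# {'model.type': 'fasterrcnn', 'train.learning_rate': 0.001, 'train.debug': 'True'}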
Example #3
def get_prediction(model_type, image, config_file, session=None,
                   prediction_dict=None, image_tensor=None,
                   return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image `image`.
    If a checkpoint exists in the job's directory, load it.
    The names of the classes will be obtained from the dataset directory.
    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor. Also if the `return_tf_vars` is
    True, returns the image tensor, the entire prediction of the model and
    the sesssion.
    """
    model_class = get_model(model_type)
    config = get_model_config(
        model_class.base_config, config_file, None
    )

    if session is None or prediction_dict is None or image_tensor is None:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (1, None, None, 3))
            model = model_class(model_class.base_config)
            prediction_dict = model(image_tensor)

            # Restore checkpoint
            if config.train.job_dir and config.train.run_name:
                checkpoint_path = os.path.join(
                    config.train.job_dir, config.train.run_name)
                ckpt = tf.train.get_checkpoint_state(checkpoint_path)
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError('Could not find checkpoint in {}.'.format(
                        checkpoint_path
                    ))
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                ckpt_dir = os.path.join('.', ckpt)
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, ckpt_dir)
            # A prediction without a checkpoint is only used for testing.
            else:
                init_op = tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                )
                session.run(init_op)

    classification_prediction = prediction_dict['classification_prediction']
    objects_tf = classification_prediction['objects']
    objects_labels_tf = classification_prediction['labels']
    objects_labels_prob_tf = classification_prediction['probs']
    image_resize_config = model_class.base_config.dataset.image_preprocessing

    image_array, scale_factor = resize_image(
        image, float(image_resize_config.min_size),
        float(image_resize_config.max_size)
    )

    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run([
        objects_tf, objects_labels_tf, objects_labels_prob_tf
    ], feed_dict={
        image_tensor: image_array
    })
    end_time = time.time()

    if config.dataset.dir:
        # Get the names of the classes from the dataset directory.
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        class_labels = json.load(tf.gfile.GFile(classes_file))
        objects_labels = [class_labels[obj] for obj in objects_labels]
    else:
        objects_labels = objects_labels.tolist()

    res = {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }

    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['prediction_dict'] = prediction_dict
        res['session'] = session

    return res
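A possible way to call `get_prediction`, reusing the TensorFlow variables it returns so that the graph is only built and the checkpoint only restored once (the images and the config path below are assumed to exist):

# Usage sketch; `image_a`, `image_b` and 'config.yml' are placeholders.
first = get_prediction('fasterrcnn', image_a, 'config.yml', return_tf_vars=True)
print(first['objects_labels'], first['inference_time'])

# Reuse the session, image tensor and prediction dict for the next image.
second = get_prediction(
    'fasterrcnn', image_b, 'config.yml',
    session=first['session'],
    prediction_dict=first['prediction_dict'],
    image_tensor=first['image_tensor'],
)
print(second['objects'])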
Example #4
def run(custom_config,
        model_type,
        override_params,
        target='',
        cluster_spec=None,
        is_chief=True,
        job_name=None,
        task_index=None,
        get_model_fn=get_model,
        get_dataset_fn=get_dataset):
    model_class = get_model_fn(model_type)

    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = '[{}-{}] - '.format(job_name, task_index) \
        if job_name is not None and task_index is not None else ''

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Place the ops on devices using the replica device setter, which
    # automatically puts the parameters on the `ps` servers and the ops on
    # the workers.
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config['dataset']['type']
        except KeyError:
            raise KeyError('dataset.type should be set in the custom config.')
        dataset_class = get_dataset_fn(config.dataset.type)
        dataset = dataset_class(config)
        train_dataset = dataset()

        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        # TODO: This is probably not the best place to set the rank. Why isn't
        # the rank transmitted through the queue?
        train_image.set_shape((None, None, 3))
        # Add a fake batch dimension to the train data.
        # TODO: definitely not the best place to do this.
        train_image = tf.expand_dims(train_image, 0)

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.contrib.framework.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        trainable_vars = model.get_trainable_vars()

        with tf.name_scope('gradients'):
            # Compute, clip and apply gradients
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)

            # Clip by norm. TODO: Configurable
            grads_and_vars = clip_gradients_by_norm(grads_and_vars)

        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step)

    tf.logging.info('{}Starting training for {}'.format(log_prefix, model))

    run_options = None
    if config.train.full_trace:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)

    if is_chief:
        # `load_pretrained_weights` needs to be called before defining the
        # train op, since defining the train op creates the optimizer
        # variables.
        with tf.control_dependencies([tf.global_variables_initializer()]):
            with tf.control_dependencies([model.load_pretrained_weights()]):
                init_op = tf.no_op(name='global_init_load_pretrained')
    else:
        init_op = tf.no_op()

    # Create custom Scaffold to make sure we run our own init_op when model
    # is not restored from checkpoint.
    scaffold = tf.train.Scaffold(
        # Initialize local and global variables.
        init_op=init_op,
        # Queue-related variables need a special initializer.
        local_init_op=tf.local_variables_initializer(),
        summary_op=tf.summary.merge([
            tf.summary.merge_all(),
            model.summary,
        ]))

    # Custom hooks for our session
    hooks = []
    chief_only_hooks = []

    if config.train.tf_debug:
        debug_hook = tf_debug.LocalCLIDebugHook()
        debug_hook.add_tensor_filter('has_inf_or_nan', tf_debug.has_inf_or_nan)
        hooks.extend([debug_hook])

    checkpoint_dir = None
    if not config.train.job_dir:
        tf.logging.warning(
            '`job_dir` is not defined. Checkpoints and logs will not be saved.'
        )
    elif config.train.run_name:
        # Use run_name when available
        checkpoint_dir = os.path.join(config.train.job_dir,
                                      config.train.run_name)
    else:
        checkpoint_dir = config.train.job_dir

    if config.train.display_every_steps or config.train.display_every_secs:
        if not config.train.debug:
            tf.logging.warning('ImageVisHook will not run without debug mode.')
        else:
            # ImageVis only runs on the chief.
            chief_only_hooks.append(
                ImageVisHook(prediction_dict,
                             with_rcnn=config.network.with_rcnn,
                             output_dir=checkpoint_dir,
                             every_n_steps=config.train.display_every_steps,
                             every_n_secs=config.train.display_every_secs))

    with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
    ) as sess:

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                before = time.time()
                _, train_loss, step, filename = sess.run(
                    [train_op, total_loss, global_step, train_filename],
                    options=run_options)

                # TODO: Add image summary every once in a while.

                tf.logging.info(
                    '{}step: {}, file: {}, train_loss: {}, in {:.2f}s'.format(
                        log_prefix, step, filename, train_loss,
                        time.time() - before))

        except tf.errors.OutOfRangeError:
            tf.logging.info(
                '{}Finished training after reaching the {} epoch limit.'.format(
                    log_prefix, config.train.num_epochs))

            # TODO: Print summary
        finally:
            coord.request_stop()

        # Wait for all threads to stop.
        coord.join(threads)
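For context, the session handling above follows the generic TensorFlow 1.x pattern of a custom `tf.train.Scaffold` combined with a `tf.train.MonitoredTrainingSession`. A minimal standalone sketch of that pattern, with a toy loss and a hypothetical checkpoint directory:

# Minimal Scaffold + MonitoredTrainingSession sketch (plain TF 1.x, no Luminoth).
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
weight = tf.get_variable('weight', shape=(), initializer=tf.zeros_initializer())
loss = tf.square(weight - 3.0)
tf.summary.scalar('loss', loss)
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    loss, global_step=global_step)

scaffold = tf.train.Scaffold(
    init_op=tf.global_variables_initializer(),
    local_init_op=tf.local_variables_initializer(),
)

with tf.train.MonitoredTrainingSession(
        checkpoint_dir='/tmp/toy_job',  # checkpoints and summaries go here
        scaffold=scaffold,
        hooks=[tf.train.StopAtStepHook(last_step=100)],
) as sess:
    while not sess.should_stop():
        _, step, loss_val = sess.run([train_op, global_step, loss])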
Example #5
def evaluate(model_type, dataset_split, config_file, job_dir, watch,
             from_global_step, override_params, image_vis, files_per_class):
    """
    Evaluate models using dataset.
    """
    model_cls = get_model(model_type)

    config = get_model_config(model_cls.base_config, config_file,
                              override_params)

    config.train.job_dir = job_dir or config.train.job_dir
    # Only activate debug for image visualizations.
    config.train.debug = image_vis

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training
    config.base_network.trainable = False

    model = model_cls(config)
    dataset = TFRecordDataset(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # TODO: This is probably not the best place to set the rank. Why isn't the
    # rank transmitted through the queue?
    train_image.set_shape((None, None, 3))
    # Add a fake batch dimension to the train data.
    # TODO: definitely not the best place to do this.
    train_image = tf.expand_dims(train_image, 0)

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    pred = prediction_dict['classification_prediction']
    pred_objects = pred['objects']
    pred_objects_classes = pred['labels']
    pred_objects_scores = pred['probs']

    # Retrieve *all* the losses from the model and calculate their streaming
    # means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor,
            name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write different
    # files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; checking again in a minute.')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']))
            try:
                start = time.time()
                evaluate_once(writer,
                              saver,
                              ops,
                              config.network.num_classes,
                              checkpoint,
                              metrics_scope=metrics_scope,
                              image_vis=image_vis,
                              files_per_class=files_per_class,
                              files_to_visualize=files_to_visualize)
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(time.time() -
                                                              start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely saved.
                tf.logging.info('Checkpoint {} is not ready yet. '
                                'Checking again in a minute.'.format(
                                    checkpoint['file']))
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute')
        time.sleep(60)
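The per-loss means above rely on TensorFlow's streaming metrics: `tf.metrics.mean` returns a value tensor plus an update op, the update op is run once per batch, and the value tensor then holds the mean over everything seen so far. A minimal standalone sketch of that pattern (plain TF 1.x, toy data):

# Streaming-mean sketch; the three values stand in for per-batch losses.
import tensorflow as tf

batch_loss = tf.placeholder(tf.float32, shape=())
mean_loss, update_op = tf.metrics.mean(batch_loss, name='eval_loss')

with tf.Session() as sess:
    # tf.metrics state lives in local variables, so initialize them first.
    sess.run(tf.local_variables_initializer())
    for value in [0.9, 0.7, 0.5]:  # one update per evaluated batch
        sess.run(update_op, feed_dict={batch_loss: value})
    print(sess.run(mean_loss))     # mean over all batches: 0.7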
Example #6
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):

    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Create the bucket for logs and models if it doesn't exist.
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Make sure the dataset path is a full GCS path.
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    override_params = [
        'dataset.dir={}'.format(dataset),
    ]

    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )
    # We should validate the config before submitting the job.

    # Upload the final config file to the job bucket.
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        training_inputs['workerType'] = worker_type
        training_inputs['workerCount'] = worker_count
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {'jobId': job_id, 'trainingInput': training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))
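After submission, the job's state could also be polled with the same ML Engine client; a hedged sketch, assuming the standard `projects.jobs.get` method of the API (the helper below is not part of this code base):

# Hypothetical helper: poll a Cloud ML Engine job until it reaches a terminal state.
import time

def wait_for_job(cloudml, project_id, job_id, poll_secs=60):
    name = 'projects/{}/jobs/{}'.format(project_id, job_id)
    while True:
        job = cloudml.projects().jobs().get(name=name).execute()
        state = job.get('state')
        print('Job {} state: {}'.format(job_id, state))
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return job
        time.sleep(poll_secs)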