Esempio n. 1
0
def visualize_model(
        live_ims,
        dead_ims,
        model_file,
        output_folder,
        num_channels,
        smooth_iterations=50,
        untargeted=False,
        viz='none',
        per_timepoint=True):
    """Train an SVM for your dataset on GEDI-model encodings."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')

    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate((
        np.zeros(len(live_files)),
        np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path,
        'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz'
            'Closest I could find from directory %s was %s.'
            'Download this from the link described in the README.md.'
            % (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32,
        shape=[None] + config.model_image_size,
        name='images')
    labels = tf.placeholder(
        tf.int64,
        shape=[None],
        name='labels')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(
                images,
                output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)
        activity_pattern = vgg.fc8
        if not untargeted:
            oh_labels = tf.one_hot(labels, config.output_shape)
            activity_pattern *= oh_labels
        grad_image = tf.gradients(activity_pattern, images)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    ckpt_file_array, ckpt_viz_images = [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    count = 0
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array, viz_images = [], [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(
                config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min,
                    num_channels=num_channels,
                    per_timepoint=per_timepoint),
                total=num_batches):
            feed_dict = {
                images: image_batch,
                labels: label_batch
            }
            it_grads = np.zeros((image_batch.shape))
            sc, tyh = sess.run(
                [scores, preds],
                feed_dict=feed_dict)
            for idx in range(smooth_iterations):
                feed_dict = {
                    images: add_noise(image_batch),
                    labels: label_batch
                }
                it_grad = sess.run(
                    grad_image,
                    feed_dict=feed_dict)
                it_grads += it_grad[0]
            it_grads /= smooth_iterations  # Mean across iterations
            it_grads = visualization_function(it_grads, viz)

            # Save each grad individually
            for grad_i, pred_i, file_i, label_i in zip(
                    it_grads, tyh, file_batch, label_batch):
                out_pointer = os.path.join(
                    output_folder,
                    file_i.split(os.path.sep)[-1])
                out_pointer = out_pointer.split('.')[0] + '.png'
                f = plt.figure()
                plt.imshow(grad_i)
                plt.title('Pred=%s, label=%s' % (pred_i, label_batch))
                plt.savefig(out_pointer)
                plt.close(f)

            # Plot a moisaic of the grads
            if viz == 'none':
                pos_grads = normalize(np.maximum(it_grads, 0))
                neg_grads = normalize(np.minimum(it_grads, 0))
                alpha_mosaic(
                    image_batch,
                    pos_grads,
                    'pos_batch_%s.pdf' % count,
                    title='Positive gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
                alpha_mosaic(
                    image_batch,
                    neg_grads,
                    'neg_batch_%s.pdf' % count,
                    title='Negative gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            else:
                alpha_mosaic(
                    image_batch,
                    it_grads,
                    'batch_%s.pdf' % count,
                    title='Gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            count += 1

            # Store the results
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
            viz_images += [it_grads]
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        ckpt_viz_images.append(viz_images)
        print 'Batch %d took %.1f seconds' % (
            idx, time.time() - start_time)
    sess.close()

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array,
        ckpt_viz_images=ckpt_viz_images)
Esempio n. 2
0
def train_vgg16(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(
            config.tfrecord_dir,
            config.tf_record_names['train'])
        meta_data = np.load(
            os.path.join(
                config.tfrecord_dir,
                '%s_%s' % (config.tvt_flags[0], config.max_file)))
    else:
        meta_data = np.load(
            os.path.join(
                train_dir,
                '%s_%s' % (config.tvt_flags[0], config.max_file)))

    # Prepare image normalization values
    if config.max_gedi is None:
        max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
        if max_value == 0:
            max_value = None
            print 'Derived max value is 0'
        else:
            print 'Normalizing with empirical max.'
        if 'min_array' in meta_data.keys():
            min_value = np.min(meta_data['min_array']).astype(np.float32)
            print 'Normalizing with empirical min.'
        else:
            min_value = None
            print 'Not normalizing with a min.'
    else:
        max_value = config.max_gedi
        min_value = config.min_gedi
    ratio = meta_data['ratio']
    if config.encode_time_of_death:
        tod = pd.read_csv(config.encode_time_of_death)
        tod_data = tod['dead_tp'].as_matrix()
        mask = np.isnan(tod_data).astype(int) + (
            tod['plate_well_neuron'] == 'empty').as_matrix().astype(int)
        tod_data = tod_data[mask == 0]
        tod_data = tod_data[tod_data > config.mask_timepoint_value]
        config.output_shape = len(np.unique(tod_data))
        ratio = class_weight.compute_class_weight(
            'balanced',
            np.sort(np.unique(tod_data)),
            tod_data)
        flip_ratio = False
    else:
        flip_ratio = True
    print 'Ratio is: %s' % ratio

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(
            config.tfrecord_dir,
            config.tf_record_names['val'])
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(
        config.train_checkpoint, dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries,
        config.results, out_dir]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print '-'*60
    print('Training model:' + dt_dataset)
    print '-'*60

    # Prepare data on CPU
    assert os.path.exists(train_data)
    assert os.path.exists(validation_data)
    assert os.path.exists(config.vgg16_weight_path)
    with tf.device('/cpu:0'):
        train_images, train_labels, train_gedi_images = inputs(
            train_data,
            config.train_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            train=config.data_augmentations,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_gedi=config.include_GEDI_in_tfrecords,
            return_extra_gfp=config.extra_image,
            return_GEDI_derivative=True)
        val_images, val_labels, val_gedi_images = inputs(
            validation_data,
            config.validation_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_gedi=config.include_GEDI_in_tfrecords,
            return_extra_gfp=config.extra_image,
            return_GEDI_derivative=True)
        if config.include_GEDI_in_tfrecords:
            extra_im_name = 'GEDI at current timepoint'
        else:
            extra_im_name = 'next gfp timepoint'
        tf.summary.image('train images', train_images)
        tf.summary.image('validation images', val_images)
        tf.summary.image('train %s' % extra_im_name, train_gedi_images)
        tf.summary.image('validation %s' % extra_im_name, val_gedi_images)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:
            if config.ordinal_classification is None:
                vgg_output = 2  # Sign of derivative (inf norm)
                train_labels = tf.cast(tf.sign(train_labels), tf.int32)
                val_labels = tf.cast(tf.sign(val_labels), tf.int32)
            elif config.ordinal_classification == 'regression':
                vgg_output = 1
            else:
                raise RuntimeError(
                    'config.ordinal_classification must be sign or regression.'
                    )
            vgg = vgg16.model_struct()
            train_mode = tf.get_variable(name='training', initializer=True)

            # Mask NAN images from loss
            image_nan = tf.reduce_sum(
                tf.cast(tf.is_nan(train_images), tf.float32),
                reduction_indices=[1, 2, 3])
            gedi_nan = tf.reduce_sum(
                tf.cast(tf.is_nan(train_gedi_images), tf.float32),
                reduction_indices=[1, 2, 3],
                keep_dims=True)
            image_mask = tf.cast(tf.equal(image_nan, 0.), tf.float32)
            gedi_nan = tf.cast(tf.equal(gedi_nan, 0.), tf.float32)
            train_images = tf.where(
                tf.is_nan(train_images),
                tf.zeros_like(train_images),
                train_images)
            train_gedi_images = tf.where(
                tf.is_nan(train_gedi_images),
                tf.zeros_like(train_gedi_images),
                train_gedi_images)
            train_images = tf.concat([train_images, train_images, train_images], axis=3)
            val_images = tf.concat([val_images, val_images, val_images], axis=3)
            vgg.build(
                train_images, output_shape=vgg_output,
                train_mode=train_mode, batchnorm=config.batchnorm_layers)
            # Prepare the cost function
            if config.ordinal_classification is None:
                # Encode y w/ k-hot and yhat w/ sigmoid ce. units capture dist.
                cost = softmax_cost(
                    vgg.fc8,
                    train_labels,
                    mask=image_mask)
            elif config.ordinal_classification == 'regression':
                cost = tf.nn.l2_loss(tf.squeeze(vgg.fc8) - train_labels)

            class_loss = cost
            tf.summary.scalar("cce cost", cost)

            # Weight decay
            if config.wd_layers is not None:
                _, l2_wd_layers = fine_tune_prepare_layers(
                    tf.trainable_variables(), config.wd_layers)
                l2_wd_layers = [
                    x for x in l2_wd_layers if 'biases' not in x.name]
                if len(l2_wd_layers) > 0:
                    cost += (config.wd_penalty * tf.add_n(
                        [tf.nn.l2_loss(x) for x in l2_wd_layers]))

            # Optimize
            train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost)
            if config.ordinal_classification is None:
                train_accuracy = class_accuracy(
                    vgg.prob, train_labels)  # training accuracy
            elif config.ordinal_classification == 'regression':
                train_accuracy = tf.nn.l2_loss(
                    tf.squeeze(vgg.fc8) - train_labels)
            tf.summary.scalar("training accuracy", train_accuracy)

            # Setup validation op
            if validation_data is not False:
                scope.reuse_variables()
                # Validation graph is the same as training except no batchnorm
                val_vgg = vgg16.model_struct(
                    fine_tune_layers=config.fine_tune_layers)
                val_vgg.build(val_images, output_shape=vgg_output)
                # Calculate validation accuracy
                if config.ordinal_classification is None:
                    val_accuracy = class_accuracy(val_vgg.prob, val_labels)
                elif config.ordinal_classification == 'regression':
                    val_accuracy = tf.nn.l2_loss(tf.squeeze(val_vgg.fc8) - val_labels)
                tf.summary.scalar("validation accuracy", val_accuracy)

    # Set up summaries and saver
    saver = tf.train.Saver(
        tf.global_variables(), max_to_keep=config.keep_checkpoints)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(tf.group(tf.global_variables_initializer(),
             tf.local_variables_initializer()))
    summary_dir = os.path.join(
        config.train_summaries, config.which_dataset + '_' + dt_stamp)
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Restore model if requested
    if config.restore_path is not None:
        print '-' * 60
        print 'Restoring from a previous model: %s' % config.restore_path
        print '-' * 60
        saver.restore(sess, config.restore_path)

    # Start training loop
    np.save(out_dir + 'meta_info', config)
    step, losses = 0, []  # val_max = 0

    try:
        # print response
        while not coord.should_stop():
            start_time = time.time()
            _, loss_value, train_acc = sess.run(
                [train_op, cost, train_accuracy])
            losses.append(loss_value)
            duration = time.time() - start_time
            if np.isnan(loss_value).sum():
                import ipdb;ipdb.set_trace()
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % config.validation_steps == 0:
                if validation_data is not False:
                    _, val_acc = sess.run([train_op, val_accuracy])
                else:
                    val_acc -= 1  # Store every checkpoint

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s | '
                    'Training %s = %s | Training class loss = %s | '
                    'Validation accuracy = %s | Validation %s = %s | '
                    'logdir = %s')
                print (format_str % (
                    datetime.now(), step, loss_value,
                    config.train_batch / duration, float(duration),
                    train_acc, extra_im_name, 0.,
                    0., val_acc, extra_im_name,
                    0., summary_dir))

                # Save the model checkpoint if it's the best yet
                if 1:  # val_acc >= val_max:
                    saver.save(
                        sess, os.path.join(
                            config.train_checkpoint,
                            'model_' + str(step) + '.ckpt'), global_step=step)
                    # Store the new max validation accuracy
                    # val_max = val_acc

            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Training %s = %s | Training class loss = %s')
                print (format_str % (datetime.now(), step, loss_value,
                                     config.train_batch / duration,
                                     float(duration), train_acc,
                                     extra_im_name, 0.,
                                     0.))
            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
    finally:
        coord.request_stop()
        np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses)
    coord.join(threads)
    sess.close()
Esempio n. 3
0
def test_vgg16(image_dir,
               model_file,
               output_csv='prediction_file',
               training_max=None):
    print(image_dir)
    #    tf.set_random_seed(0)
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')

    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    print('model file path', model_file_path)
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz'
            'Closest I could find from directory %s was %s.'
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)
    print('out_dir', out_dir)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    if config.model_image_size[-1] < 3:
        print('*' * 60)
        print('Warning: model is expecting a H/W/1 image. '
              'Do you mean to set the last dimension of '
              'config.model_image_size to 3?')
        print('*' * 60)

    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.prob
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print('-' * 60)
    print('Beginning evaluation')
    print('-' * 60)

    if config.validation_batch > len(combined_files):
        print('Trimming validation_batch size to %s (same as # of files).' %
              len(combined_files))
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []
        # Initialize the graph

        #        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            # Set up exemplar threading
            saver.restore(sess, c)
            start_time = time.time()
            num_batches = np.floor(
                len(combined_files) /
                float(config.validation_batch)).astype(int)
            for image_batch, file_batch in tqdm(image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    config=config,
                    training_max=training_max,
                    training_min=training_min),
                                                total=num_batches):
                feed_dict = {images: image_batch}
                sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
                dec_scores = np.append(dec_scores, sc)
                yhat = np.append(yhat, tyh)
                file_array = np.append(file_array, file_batch)
            ckpt_yhat.append(yhat)
            ckpt_scores.append(dec_scores)
            ckpt_file_array.append(file_array)
            print('Batch %d took %.1f seconds' %
                  (idx, time.time() - start_time))
#    sess.close()

# Save everything
    print('Save npz.')
    print(os.path.join(out_dir, 'validation_accuracies'))
    np.savez(os.path.join(out_dir, 'validation_accuracies'),
             ckpt_yhat=ckpt_yhat,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Also save a csv with item/guess pairs
    try:
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(np.hstack(
            (np.asarray(ckpt_file_array).reshape(-1, 1), yhat.reshape(-1, 1),
             dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
                          columns=[
                              'files', 'live_guesses', 'classifier score dead',
                              'classifier score live'
                          ])
        output_name = image_dir.split('/')[-1]
        if output_name is None or len(output_name) == 0:
            output_name = 'output'
        df.to_csv(os.path.join(out_dir, '%s.csv' % output_name))
        print('Saved csv to: %s' %
              os.path.join(out_dir, '%s.csv' % output_name))
    except:
        print('X' * 60)
        print('Could not save a spreadsheet of file info')
        print('X' * 60)

    # Plot everything
    try:
        plot_accuracies(ckpt_y, ckpt_yhat, config, ckpts,
                        os.path.join(out_dir, 'validation_accuracies.png'))
        plot_std(ckpt_y, ckpt_yhat, ckpts,
                 os.path.join(out_dir, 'validation_stds.png'))
        plot_cms(ckpt_y, ckpt_yhat, config,
                 os.path.join(out_dir, 'confusion_matrix.png'))
        plot_pr(ckpt_y, ckpt_yhat, ckpt_scores,
                os.path.join(out_dir, 'precision_recall.png'))


#        plot_cost(
#            os.path.join(out_dir, 'training_loss.npy'), ckpts,
#            os.path.join(out_dir, 'training_costs.png'))
    except:
        print('X' * 60)
        print('Could not locate the loss numpy')
        print('X' * 60)
Esempio n. 4
0
def train_model(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(config.tfrecord_dir,
                                  config.tf_record_names['train'])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))
    else:
        meta_data = np.load(
            os.path.join(train_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))

    # Prepare image normalization values
    if config.max_gedi is None:
        max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
        if max_value == 0:
            max_value = None
            print 'Derived max value is 0'
        else:
            print 'Normalizing with empirical max.'
        if 'min_array' in meta_data.keys():
            min_value = np.min(meta_data['min_array']).astype(np.float32)
            print 'Normalizing with empirical min.'
        else:
            min_value = None
            print 'Not normalizing with a min.'
    else:
        max_value = config.max_gedi
        min_value = config.min_gedi
    ratio = meta_data['ratio']
    print 'Ratio is: %s' % ratio

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir,
                                       config.tf_record_names['val'])
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(config.train_checkpoint,
                                           dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries, config.results,
        out_dir
    ]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print '-' * 60
    print('Training model:' + dt_dataset)
    print '-' * 60

    # Prepare data on CPU
    assert os.path.exists(train_data)
    assert os.path.exists(validation_data)
    assert os.path.exists(config.vgg16_weight_path)
    with tf.device('/cpu:0'):
        train_images_0, train_images_1, train_labels, train_times = inputs(
            train_data,
            config.train_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            train=config.data_augmentations,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_filename=True)
        val_images_0, val_images_1, val_labels, val_times = inputs(
            validation_data,
            config.validation_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_filename=True)
        tf.summary.image('train image frame 0', train_images_0)
        tf.summary.image('train image frame 1', train_images_1)
        tf.summary.image('validation image frame 0', val_images_0)
        tf.summary.image('validation image frame 1', val_images_1)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('gedi'):
            # Build training GEDI model for frame 0
            vgg_train_mode = tf.get_variable(name='vgg_training',
                                             initializer=False)
            gedi_model_0 = vgg16.model_struct(
                vgg16_npy_path=config.gedi_weight_path, trainable=False)
            gedi_model_0.build(prep_images_for_gedi(train_images_0),
                               output_shape=2,
                               train_mode=vgg_train_mode)
            gedi_scores_0 = gedi_model_0.fc7

        with tf.variable_scope('match'):
            # Build matching model for frame 0
            model_0 = matching_gedi.model_struct()
            model_0.build(train_images_0)

            # Build frame 0 vector
            frame_0 = tf.concat([gedi_scores_0, model_0.output], axis=-1)

            # Build output layer
            if config.matching_combine == 'concatenate':
                output_shape = [int(frame_0.get_shape()[-1]) * 2, 2]
            elif config.matching_combine == 'subtract':
                output_shape = [int(frame_0.get_shape()[-1]), 2]
            else:
                raise RuntimeError

        # Build GEDI model for frame 1
        with tf.variable_scope('gedi', reuse=True):
            gedi_model_1 = vgg16.model_struct(
                vgg16_npy_path=config.gedi_weight_path, trainable=False)
            gedi_model_1.build(prep_images_for_gedi(train_images_1),
                               output_shape=2,
                               train_mode=vgg_train_mode)
            gedi_scores_1 = gedi_model_1.fc7

        with tf.variable_scope('match', reuse=True):
            # Build matching model for frame 1
            model_1 = matching_gedi.model_struct()
            model_1.build(train_images_1)

        # Build frame 0 and frame 1 vectors
        frame_1 = tf.concat([gedi_scores_1, model_1.output], axis=-1)

        with tf.variable_scope('output'):
            # Concatenate or subtract
            if config.matching_combine == 'concatenate':
                output_scores = tf.concat([frame_0, frame_1], axis=-1)
            elif config.matching_combine == 'subtract':
                output_scores = frame_0 - frame_1
            else:
                raise NotImplementedError

            # Build output layer
            output_shape = [int(output_scores.get_shape()[-1]), 2]
            output_weights = tf.get_variable(
                name='output_weights',
                shape=output_shape,
                initializer=tf.contrib.layers.xavier_initializer(
                    uniform=False))
            output_bias = tf.get_variable(name='output_bias',
                                          initializer=tf.truncated_normal(
                                              [output_shape[-1]], .0, .001))
            decision_logits = tf.nn.bias_add(
                tf.matmul(output_scores, output_weights), output_bias)
            train_soft_decisions = tf.nn.softmax(decision_logits)
            cost = softmax_cost(decision_logits, train_labels)
            tf.summary.scalar("cce loss", cost)
            cost += tf.nn.l2_loss(output_weights)

            # Weight decay
            if config.wd_layers is not None:
                _, l2_wd_layers = fine_tune_prepare_layers(
                    tf.trainable_variables(), config.wd_layers)
                l2_wd_layers = [
                    x for x in l2_wd_layers if 'biases' not in x.name
                ]
                if len(l2_wd_layers) > 0:
                    cost += (config.wd_penalty *
                             tf.add_n([tf.nn.l2_loss(x)
                                       for x in l2_wd_layers]))

        # Optimize
        train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost)
        train_accuracy = class_accuracy(train_soft_decisions,
                                        train_labels)  # training accuracy
        tf.summary.scalar("training accuracy", train_accuracy)

        # Setup validation op
        if validation_data is not False:
            with tf.variable_scope('gedi', reuse=tf.AUTO_REUSE):  # FIX THIS
                # Validation graph is the same as training except no batchnorm
                val_gedi_model_0 = vgg16.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                val_gedi_model_0.build(prep_images_for_gedi(val_images_0),
                                       output_shape=2,
                                       train_mode=vgg_train_mode)
                val_gedi_scores_0 = val_gedi_model_0.fc7

                # Build GEDI model for frame 1
                val_gedi_model_1 = vgg16.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                val_gedi_model_1.build(prep_images_for_gedi(val_images_1),
                                       output_shape=2,
                                       train_mode=vgg_train_mode)
                val_gedi_scores_1 = val_gedi_model_1.fc7

            with tf.variable_scope('match', reuse=tf.AUTO_REUSE):
                # Build matching model for frame 0
                val_model_0 = matching_gedi.model_struct()
                val_model_0.build(val_images_0)

                # Build matching model for frame 1
                val_model_1 = matching_gedi.model_struct()
                val_model_1.build(val_images_1)

            # Build frame 0 and frame 1 vectors
            val_frame_0 = tf.concat([val_gedi_scores_0, val_model_0.output],
                                    axis=-1)
            val_frame_1 = tf.concat([val_gedi_scores_1, val_model_1.output],
                                    axis=-1)

            # Concatenate or subtract
            if config.matching_combine == 'concatenate':
                val_output_scores = tf.concat([val_frame_0, val_frame_1],
                                              axis=-1)
            elif config.matching_combine == 'subtract':
                val_output_scores = val_frame_0 - val_frame_1
            else:
                raise NotImplementedError

            with tf.variable_scope('output', reuse=tf.AUTO_REUSE):
                # Build output layer
                val_output_weights = tf.get_variable(
                    name='val_output_weights',
                    shape=output_shape,
                    trainable=False,
                    initializer=tf.contrib.layers.xavier_initializer(
                        uniform=False))
                val_output_bias = tf.get_variable(
                    name='output_bias',
                    trainable=False,
                    initializer=tf.truncated_normal([output_shape[-1]], .0,
                                                    .001))
                val_decision_logits = tf.nn.bias_add(
                    tf.matmul(val_output_scores, val_output_weights),
                    val_output_bias)
                val_soft_decisions = tf.nn.softmax(val_decision_logits)

            # Calculate validation accuracy
            val_accuracy = class_accuracy(val_soft_decisions, val_labels)
            tf.summary.scalar("validation accuracy", val_accuracy)

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=config.keep_checkpoints)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))
    summary_dir = os.path.join(config.train_summaries,
                               config.which_dataset + '_' + dt_stamp)
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Start training loop
    np.save(out_dir + 'meta_info', config)
    step, losses = 0, []  # val_max = 0
    try:
        # print response
        while not coord.should_stop():
            start_time = time.time()
            _, loss_value, train_acc, val_acc = sess.run(
                [train_op, cost, train_accuracy, val_accuracy])
            losses += [loss_value]
            duration = time.time() - start_time
            if np.isnan(loss_value).sum():
                assert not np.isnan(loss_value), 'Model loss = NaN'

            if step % config.validation_steps == 0:
                if validation_data is not False:
                    val_acc = sess.run(val_accuracy)
                else:
                    val_acc -= 1  # Store every checkpoint

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Validation accuracy = %s | '
                              'logdir = %s')
                print(format_str %
                      (datetime.now(), step,
                       loss_value, config.train_batch / duration,
                       float(duration), train_acc, val_acc, summary_dir))

                # Save the model checkpoint if it's the best yet
                if 1:  # val_acc >= val_max:
                    saver.save(sess,
                               os.path.join(config.train_checkpoint,
                                            'model_' + str(step) + '.ckpt'),
                               global_step=step)
                    # Store the new max validation accuracy
                    # val_max = val_acc

            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Training loss = %s')
                print(format_str %
                      (datetime.now(), step, loss_value, config.train_batch /
                       duration, float(duration), loss_value))
            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
    finally:
        coord.request_stop()
        np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses)
    coord.join(threads)
    sess.close()
Esempio n. 5
0
def test_vgg16(model_file,
               trained_svm,
               ims,
               dead_ims=None,
               output_csv='prediction_file',
               training_max=None,
               C=1e-3,
               k_folds=10):
    """Test an SVM you've trained on a new dataset."""
    config = GEDIconfig()
    if ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')
    if dead_ims is None:
        print 'Assuming all of your images are in the ims folder' + \
            '-- will not derive labels to calculate accuracy.'
    # if not os.path.exists(trained_svm):
    #     raise RuntimeError(
    #         'Cannot find the trained svm model. Check the path you passed.')
    try:
        clf = cPickle.load(open(trained_svm, 'rb'))
        # clf = model_dict['clf']
        # mu = model_dict['mu']
        # sd = model_dict['sd']
    except:
        raise RuntimeError('Cannot find SVM file: %s' % trained_svm)

    if dead_ims is not None:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
        combined_labels = np.concatenate(
            (np.zeros(len(live_files)), np.ones(len(dead_files))))
        combined_files = np.concatenate((live_files, dead_files))
    else:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        combined_labels = None
        combined_files = np.asarray(live_files)
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz'
            'Closest I could find from directory %s was %s.'
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                labels=combined_labels,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                                         total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
    sess.close()

    # Save everything
    new_dt_string = re.split('\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    np.savez(os.path.join(out_dir, '%s_validation_accuracies' % new_dt_string),
             ckpt_yhat=ckpt_yhat,
             ckpt_y=ckpt_y,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Run SVM
    all_scores = np.concatenate(dec_scores)
    # all_scores = (all_scores - mu) / sd
    predictions = clf.predict(all_scores)
    if dead_ims is not None:
        mean_acc = np.mean(predictions == y)
        p_value = randomization_test(y=y, yhat=predictions)
        print 'SVM performance: mean accuracy = %s%%, p = %.5f' % (mean_acc,
                                                                   p_value)
        df_col_label = 'true label'
    else:
        mean_acc, p_value = None, None
        y = np.copy(yhat)
        df_col_label = 'Dummy column (no labels supplied)'
    np.savez(os.path.join(out_dir, '%s_svm_test_data' % new_dt_string),
             yhat=yhat,
             y=y,
             scores=dec_scores,
             ckpts=ckpts,
             p_value=p_value)

    # Also save a csv with item/guess pairs
    trimmed_files = np.asarray([
        x.split(os.path.sep)[-1] for x in np.asarray(ckpt_file_array).ravel()
    ])
    yhat = np.asarray(yhat)
    df = pd.DataFrame(
        np.hstack((trimmed_files.reshape(-1, 1), yhat.reshape(-1, 1))),
        #   y.reshape(-1, 1))),
        columns=['files', 'guesses'])  # , df_col_label])
    df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
    print 'Saved csv to: %s' % out_dir
def test_vgg16(live_ims,
               dead_ims,
               model_file,
               svm_model='svm_model',
               output_csv='prediction_file',
               training_max=None,
               C=1e-3,
               k_folds=10):
    """Train an SVM for your dataset on GEDI-model encodings."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')

    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate(
        (np.zeros(len(live_files)), np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz'
            'Closest I could find from directory %s was %s.'
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                labels=combined_labels,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                                         total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
    sess.close()

    # Save everything
    np.savez(os.path.join(out_dir, 'validation_accuracies'),
             ckpt_yhat=ckpt_yhat,
             ckpt_y=ckpt_y,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Run SVM
    svm = LinearSVC(C=C, dual=False, class_weight='balanced')
    clf = make_pipeline(preprocessing.StandardScaler(), svm)
    predictions = cross_val_predict(clf,
                                    np.concatenate(dec_scores),
                                    y,
                                    cv=k_folds)
    cv_performance = metrics.accuracy_score(predictions, y)
    p_value = randomization_test(y=y, yhat=predictions)
    clf.fit(np.concatenate(dec_scores), y)
    # mu = dec_scores.mean(0)
    # sd = dec_scores.std(0)
    print '%s-fold SVM performance: accuracy = %s%% , p = %.5f' % (
        k_folds, np.mean(cv_performance * 100), p_value)
    np.savez(
        os.path.join(out_dir, 'svm_data'),
        yhat=predictions,
        y=y,
        scores=dec_scores,
        ckpts=ckpts,
        cv_performance=cv_performance,
        p_value=p_value,
        k_folds=k_folds,
        # mu=mu,
        # sd=sd,
        C=C)

    # Also save a csv with item/guess pairs
    try:
        trimmed_files = [re.split('/', x)[-1] for x in combined_files]
        trimmed_files = np.asarray(trimmed_files)
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(np.hstack(
            (trimmed_files.reshape(-1, 1), yhat.reshape(-1,
                                                        1), y.reshape(-1, 1))),
                          columns=['files', 'guesses', 'true label'])
        df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
        print 'Saved csv to: %s' % out_dir
    except:
        print 'X' * 60
        print 'Could not save a spreadsheet of file info'
        print 'X' * 60

    # save the classifier
    with open('%s.pkl' % svm_model, 'wb') as fid:
        # model_dict = {
        #     'model': clf,
        #     'mu': mu,
        #     'sd': sd
        # }
        cPickle.dump(clf, fid)
    print 'Saved svm model to: %s.pkl' % svm_model
Esempio n. 7
0
def test_vgg16(image_dir,
               model_file,
               autopsy_csv=None,
               autopsy_path=None,
               output_csv='prediction_file',
               target_layer='fc7',
               save_npy=False,
               shuffle_images=True,
               embedding_type='PCA'):
    """Testing function for pretrained vgg16."""
    assert autopsy_csv is not None, 'You must pass an autopsy file name.'
    assert autopsy_path is not None, 'You must pass an autopsy path.'

    # Load autopsy information
    autopsy_data = pd.read_csv(os.path.join(autopsy_path, autopsy_csv))

    # Load config and begin preparing data
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')

    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if shuffle_images:
        combined_files = combined_files[np.random.permutation(
            len(combined_files))]
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    meta_file_pointer = os.path.join(
        model_file.split('/model')[0], 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file.'
            'Download this from the link described in the README.md.')
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg[target_layer]
        preds = tf.argmax(vgg.prob, 1)

    # Derive pathologies from file names
    pathologies = []
    for f in combined_files:
        sf = f.split('/')[-1].split('_')
        sf = '_'.join(sf[1:4])
        it_path = autopsy_data[autopsy_data['plate_well_neuron'] ==
                               sf]['disease']
        if not len(it_path):
            it_path = 'Absent'
        else:
            it_path = it_path.as_matrix()[0]
        pathologies += [it_path]
    pathologies = np.asarray(pathologies)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_scores, ckpt_file_array = [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s.' % len(combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                            total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat += [tyh]
            file_array += [file_batch]
        ckpt_yhat.append(yhat)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
    sess.close()

    # Create and plot an embedding
    im_path_map = pathologies[:num_batches * config.validation_batch]
    dec_scores = np.concatenate(dec_scores)
    mu = dec_scores.mean(0)[None, :]
    sd = dec_scores.std(0)[None, :]
    dec_scores = (dec_scores - mu) / sd
    yhat = np.concatenate(yhat)
    file_array = np.concatenate(file_array)

    if embedding_type == 'TSNE' or embedding_type == 'tsne':
        emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
    elif embedding_type == 'PCA' or embedding_type == 'pca':
        emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
    elif embedding_type == 'spectral':
        emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
    y = emb.fit_transform(dec_scores)

    # Ouput csv
    df = pd.DataFrame(np.hstack(
        (y, im_path_map.reshape(-1, 1), file_array.reshape(-1, 1))),
                      columns=['D1', 'D2', 'pathology', 'filename'])
    out_name = os.path.join(out_dir, 'embedding.csv')
    df.to_csv(out_name)
    print 'Saved csv to: %s' % out_name

    # Create plot
    f, ax = plt.subplots()
    unique_cats = np.unique(im_path_map)
    h = []
    for idx, cat in enumerate(unique_cats):
        h += [
            plt.scatter(y[im_path_map == cat, 0],
                        y[im_path_map == cat, 1],
                        c=plt.cm.Spectral(idx * 1000))
        ]
    plt.legend(h, unique_cats)
    plt.axis('tight')
    plt.show()
    plt.savefig('embedding.png')
    plt.close(f)

    # Save everything
    if save_npy:
        np.savez(os.path.join(out_dir, 'validation_accuracies'),
                 ckpt_yhat=ckpt_yhat,
                 ckpt_scores=ckpt_scores,
                 ckpt_names=ckpts,
                 combined_files=ckpt_file_array)