Example #1
def visualize_model(
        live_ims,
        dead_ims,
        model_file,
        output_folder,
        num_channels,
        smooth_iterations=50,
        untargeted=False,
        viz='none',
        per_timepoint=True):
    """Train an SVM for your dataset on GEDI-model encodings."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')

    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate((
        np.zeros(len(live_files)),
        np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path,
        'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.'
            % (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32,
        shape=[None] + config.model_image_size,
        name='images')
    labels = tf.placeholder(
        tf.int64,
        shape=[None],
        name='labels')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(
                images,
                output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)
        activity_pattern = vgg.fc8
        if not untargeted:
            oh_labels = tf.one_hot(labels, config.output_shape)
            activity_pattern *= oh_labels
        grad_image = tf.gradients(activity_pattern, images)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    ckpt_file_array, ckpt_viz_images = [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    count = 0
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array, viz_images = [], [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(
                config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min,
                    num_channels=num_channels,
                    per_timepoint=per_timepoint),
                total=num_batches):
            feed_dict = {
                images: image_batch,
                labels: label_batch
            }
            it_grads = np.zeros((image_batch.shape))
            sc, tyh = sess.run(
                [scores, preds],
                feed_dict=feed_dict)
            for _ in range(smooth_iterations):  # do not shadow the checkpoint index (idx)
                feed_dict = {
                    images: add_noise(image_batch),
                    labels: label_batch
                }
                it_grad = sess.run(
                    grad_image,
                    feed_dict=feed_dict)
                it_grads += it_grad[0]
            it_grads /= smooth_iterations  # Mean across iterations
            it_grads = visualization_function(it_grads, viz)

            # Save each grad individually
            for grad_i, pred_i, file_i, label_i in zip(
                    it_grads, tyh, file_batch, label_batch):
                out_pointer = os.path.join(
                    output_folder,
                    file_i.split(os.path.sep)[-1])
                out_pointer = out_pointer.split('.')[0] + '.png'
                f = plt.figure()
                plt.imshow(grad_i)
                plt.title('Pred=%s, label=%s' % (pred_i, label_i))
                plt.savefig(out_pointer)
                plt.close(f)

            # Plot a mosaic of the grads
            if viz == 'none':
                pos_grads = normalize(np.maximum(it_grads, 0))
                neg_grads = normalize(np.minimum(it_grads, 0))
                alpha_mosaic(
                    image_batch,
                    pos_grads,
                    'pos_batch_%s.pdf' % count,
                    title='Positive gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
                alpha_mosaic(
                    image_batch,
                    neg_grads,
                    'neg_batch_%s.pdf' % count,
                    title='Negative gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            else:
                alpha_mosaic(
                    image_batch,
                    it_grads,
                    'batch_%s.pdf' % count,
                    title='Gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            count += 1

            # Store the results
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
            viz_images += [it_grads]
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        ckpt_viz_images.append(viz_images)
        print 'Checkpoint %d took %.1f seconds' % (
            idx, time.time() - start_time)
        sess.close()  # close each session inside the checkpoint loop

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array,
        ckpt_viz_images=ckpt_viz_images)
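
A minimal invocation sketch for the function above; the directory paths, checkpoint name, and channel count below are hypothetical placeholders, not values from the repository.

# Hypothetical usage sketch (all paths and parameters are placeholders).
visualize_model(
    live_ims='/path/to/live_images',
    dead_ims='/path/to/dead_images',
    model_file='/path/to/checkpoints/model.ckpt-1000',
    output_folder='/path/to/gradient_outputs',
    num_channels=1,
    smooth_iterations=50,
    untargeted=False,
    viz='none',
    per_timepoint=True)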
Example #2
def test_vgg16(validation_data, model_dir, label_file, selected_ckpts=-1):
    config = GEDIconfig()

    # Load metas (`tf_dir` and `image_dir` are assumed to be module-level globals)
    meta_data = np.load(os.path.join(tf_dir, 'val_maximum_value.npz'))
    max_value = np.max(meta_data['max_array']).astype(np.float32)

    # Find model checkpoints
    ckpts, ckpt_names = find_ckpts(config, model_dir)
    # ds_dt_stamp = re.split('/', ckpts[0])[-2]
    out_dir = os.path.join(config.results, 'gfp_2017_02_19_17_41_19' + '/')
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Pin the validation batch size for this run
        config.validation_batch = 64
        print '-' * 60
        print 'Loading config meta data for:%s' % out_dir
        print '-' * 60
    except:
        print '-' * 60
        print 'Using config from gedi_config.py for model:%s' % out_dir
        print '-' * 60

    sorted_index = np.argsort(np.asarray([int(x) for x in ckpt_names]))
    ckpts = ckpts[sorted_index]
    ckpt_names = ckpt_names[sorted_index]

    # CSV file
    svm_image_file = os.path.join(out_dir, 'svm_models.npz')
    if os.path.exists(svm_image_file):
        svm_image_data = np.load(svm_image_file)
        image_array = svm_image_data['image_array']
        label_vec = svm_image_data['label_vec']
        tr_label_vec = svm_image_data['tr_label_vec']
    else:
        labels = pd.read_csv(
            os.path.join(config.processed_image_patch_dir,
                         'LINCSproject_platelayout_trans.csv'))
        label_vec = []
        image_array = []
        for idx, row in tqdm(labels.iterrows(), total=len(labels)):
            path_wd = '*%s_%s*' % (row['Plate'], row['Sci_WellID'])
            path_pointer = glob(os.path.join(image_dir, path_wd))
            if len(path_pointer) > 0:
                for p in path_pointer:
                    # NOTE: image loading into image_array is omitted here,
                    # so only the labels are collected.
                    label_vec.append(row['Sci_SampleID'])
        label_vec = np.asarray(label_vec)
        le = preprocessing.LabelEncoder()
        tr_label_vec = le.fit_transform(label_vec)
        np.savez(svm_image_file,
                 image_array=image_array,
                 label_vec=label_vec,
                 tr_label_vec=tr_label_vec)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Make placeholder
    val_images = tf.placeholder(tf.float32,
                                shape=[None] + config.model_image_size)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(vgg16_npy_path=config.vgg16_weight_path,
                              fine_tune_layers=config.fine_tune_layers)
            validation_mode = tf.Variable(False, name='training')
            # No batchnorms during testing
            vgg.build(val_images,
                      output_shape=config.output_shape,
                      train_mode=validation_mode)

    # Set up saver
    svm_feature_file = os.path.join(out_dir, 'svm_scores.npz')
    if os.path.exists(svm_feature_file):
        svm_features = np.load(svm_feature_file)
        dec_scores = svm_features['dec_scores']
        label_vec = svm_features['label_vec']
    else:
        saver = tf.train.Saver(tf.global_variables())
        ckpts = [ckpts[selected_ckpts]]
        image_array = np.asarray(image_array)
        for idx, c in enumerate(ckpts):
            dec_scores = []
            # Initialize the graph
            sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            # Set up exemplar threading
            saver.restore(sess, c)
            num_batches = np.ceil(
                len(image_array) / float(config.validation_batch)).astype(int)
            batch_idx = np.arange(num_batches).repeat(
                config.validation_batch)[:len(image_array)]
            for bi in np.unique(batch_idx):
                # move this above to image processing
                batch_images = image_array[batch_idx == bi] / 255.
                start_time = time.time()
                sc = sess.run(vgg.fc7, feed_dict={val_images: batch_images})
                dec_scores.append(sc)
                print 'Batch %d took %.1f seconds' % (bi,
                                                      time.time() - start_time)

    # Save everything
    np.savez(svm_feature_file, dec_scores=dec_scores, label_vec=label_vec)

    # Build SVM
    dec_scores = np.concatenate(dec_scores[:], axis=0)
    model_array, score_array, combo_array, masked_label_array = [], [], [], []
    for combo in itertools.combinations(np.unique(label_vec), 2):
        combo_array.append(combo)
        mask = np.logical_or(label_vec == combo[0], label_vec == combo[1])
        masked_labels = label_vec[mask]
        masked_scores = dec_scores[mask, :]
        clf = SVC(kernel='linear', C=1)
        scores = cross_val_score(clf, masked_scores, masked_labels, cv=5)
        model_array.append(clf)
        score_array.append(scores)
        masked_label_array.append(masked_labels)
        print("Accuracy (%s vs %s): %0.2f (+/- %0.2f)" % (
            combo[0], combo[1], scores.mean(), scores.std() * 2))

    # Save everything
    np.savez(os.path.join(out_dir, 'svm_models'),
             combo_array=combo_array,
             model_array=model_array,
             score_array=score_array,
             masked_label_array=masked_label_array)
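
The checkpoint ordering above sorts numerically rather than lexicographically; a self-contained sketch of why that matters (the names here are made up):

import numpy as np

# Lexicographic order would put '10' before '2'; numeric order does not.
ckpt_names = np.asarray(['2', '10', '1'])
ckpts = np.asarray(['model-2', 'model-10', 'model-1'])
sorted_index = np.argsort(np.asarray([int(x) for x in ckpt_names]))
print(ckpts[sorted_index])  # ['model-1' 'model-2' 'model-10']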
Example #3
def test_vgg16(image_dir,
               model_file,
               output_csv='prediction_file',
               training_max=None):
    print(image_dir)
    # tf.set_random_seed(0)
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')

    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    print('model file path', model_file_path)
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)
    print('out_dir', out_dir)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    if config.model_image_size[-1] < 3:
        print('*' * 60)
        print('Warning: the model is expecting an H/W/1 image. '
              'Did you mean to set the last dimension of '
              'config.model_image_size to 3?')
        print('*' * 60)

    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.prob
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print('-' * 60)
    print('Beginning evaluation')
    print('-' * 60)

    if config.validation_batch > len(combined_files):
        print('Trimming validation_batch size to %s (same as # of files).' %
              len(combined_files))
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []
        # Initialize the graph
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            # Set up exemplar threading
            saver.restore(sess, c)
            start_time = time.time()
            num_batches = np.floor(
                len(combined_files) /
                float(config.validation_batch)).astype(int)
            for image_batch, file_batch in tqdm(image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    config=config,
                    training_max=training_max,
                    training_min=training_min),
                                                total=num_batches):
                feed_dict = {images: image_batch}
                sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
                dec_scores = np.append(dec_scores, sc)
                yhat = np.append(yhat, tyh)
                file_array = np.append(file_array, file_batch)
            ckpt_yhat.append(yhat)
            ckpt_scores.append(dec_scores)
            ckpt_file_array.append(file_array)
            print('Batch %d took %.1f seconds' %
                  (idx, time.time() - start_time))
    # Save everything
    print('Save npz.')
    print(os.path.join(out_dir, 'validation_accuracies'))
    np.savez(os.path.join(out_dir, 'validation_accuracies'),
             ckpt_yhat=ckpt_yhat,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Also save a csv with item/guess pairs
    try:
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(np.hstack(
            (np.asarray(ckpt_file_array).reshape(-1, 1), yhat.reshape(-1, 1),
             dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
                          columns=[
                              'files', 'live_guesses', 'classifier score dead',
                              'classifier score live'
                          ])
        output_name = image_dir.split('/')[-1]
        if output_name is None or len(output_name) == 0:
            output_name = 'output'
        df.to_csv(os.path.join(out_dir, '%s.csv' % output_name))
        print('Saved csv to: %s' %
              os.path.join(out_dir, '%s.csv' % output_name))
    except:
        print('X' * 60)
        print('Could not save a spreadsheet of file info')
        print('X' * 60)

    # Plot everything
    try:
        plot_accuracies(ckpt_y, ckpt_yhat, config, ckpts,
                        os.path.join(out_dir, 'validation_accuracies.png'))
        plot_std(ckpt_y, ckpt_yhat, ckpts,
                 os.path.join(out_dir, 'validation_stds.png'))
        plot_cms(ckpt_y, ckpt_yhat, config,
                 os.path.join(out_dir, 'confusion_matrix.png'))
        plot_pr(ckpt_y, ckpt_yhat, ckpt_scores,
                os.path.join(out_dir, 'precision_recall.png'))


        # plot_cost(
        #     os.path.join(out_dir, 'training_loss.npy'), ckpts,
        #     os.path.join(out_dir, 'training_costs.png'))
    except:
        print('X' * 60)
        print('Could not locate the loss numpy')
        print('X' * 60)
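
The csv block above reshapes a flattened score vector into (dead, live) column pairs; a runnable sketch with synthetic stand-ins for the arrays built in the loop:

import numpy as np
import pandas as pd

files = np.asarray(['im_%d.tif' % i for i in range(4)])    # synthetic file names
yhat = np.asarray([0, 1, 1, 0])                            # synthetic predictions
dec_scores = np.asarray([.9, .1, .2, .8, .3, .7, .6, .4])  # flattened score pairs

df = pd.DataFrame(
    np.hstack((files.reshape(-1, 1),
               yhat.reshape(-1, 1),
               dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
    columns=['files', 'live_guesses',
             'classifier score dead', 'classifier score live'])
print(df)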
Example #4
def test_vgg16(validation_data, model_dir, which_set, selected_ckpts):
    config = GEDIconfig()
    blur_kernel = config.hm_blur
    if validation_data is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir,
                                       config.tf_record_names[which_set])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir, 'val_%s' % config.max_file))
    else:
        meta_data = np.load('%s_maximum_value.npz' %
                            validation_data.split('.tfrecords')[0])
    label_list = os.path.join(
        config.processed_image_patch_dir,
        'list_of_' + '_'.join(x
                              for x in config.image_prefixes) + '_labels.txt')
    with open(label_list) as f:
        file_pointers = [l.rstrip('\n') for l in f.readlines()]

    # Prepare image normalization values
    try:
        max_value = np.max(meta_data['max_array']).astype(np.float32)
    except:
        max_value = np.asarray([config.max_gedi])
    try:
        min_value = np.min(meta_data['min_array']).astype(np.float32)
    except:
        min_value = np.asarray([config.min_gedi])

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_dir)[-1]
    out_dir = os.path.join(config.results, ds_dt_stamp + '/')
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Make sure this is always at 1
        config.validation_batch = 1
        print '-' * 60
        print 'Loading config meta data for:%s' % out_dir
        print '-' * 60
    except:
        print '-' * 60
        print 'Using config from gedi_config.py for model:%s' % out_dir
        print '-' * 60

    # Make output directories if they do not exist
    im_shape = config.gedi_image_size

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        val_images, val_labels = inputs(validation_data,
                                        1,
                                        im_shape,
                                        config.model_image_size[:2],
                                        max_value=max_value,
                                        min_value=min_value,
                                        num_epochs=1,
                                        normalize=config.normalize)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(vgg16_npy_path=config.vgg16_weight_path,
                              fine_tune_layers=config.fine_tune_layers)
            vgg.build(val_images, output_shape=config.output_shape)

        # Setup validation op
        preds = tf.argmax(vgg.prob, 1)
        targets = tf.cast(val_labels, dtype=tf.int64)
        grad_labels = tf.one_hot(val_labels,
                                 config.output_shape,
                                 dtype=tf.float32)
        heatmap_op = tf.gradients(vgg.fc8 * grad_labels, val_images)[0]

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())
    ckpts = [selected_ckpts]

    # Loop through each checkpoint then test the entire validation set
    print '-' * 60
    print 'Beginning evaluation on ckpt: %s' % ckpts
    print '-' * 60
    yhat, y, tn_hms, tp_hms, fn_hms, fp_hms = [], [], [], [], [], []
    tn_ims, tp_ims, fn_ims, fp_ims = [], [], [], []
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        try:
            # Initialize the graph
            sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            # Set up exemplar threading
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver.restore(sess, c)
            start_time = time.time()
            while not coord.should_stop():
                tyh, ty, thm, tim = sess.run(
                    [preds, targets, heatmap_op, val_images])
                tyh = tyh[0]
                ty = ty[0]
                tim = (tim / tim.max()).squeeze()
                yhat += [tyh]
                y += [ty]
                if tyh == ty and not tyh:  # True negative
                    tn_hms += [hm_normalize(thm)]
                    tn_ims += [tim]
                elif tyh == ty and tyh:  # True positive
                    tp_hms += [hm_normalize(thm)]
                    tp_ims += [tim]
                elif tyh != ty and not tyh:  # False negative
                    fn_hms += [hm_normalize(thm)]
                    fn_ims += [tim]
                elif tyh != ty and tyh:  # False positive
                    fp_hms += [hm_normalize(thm)]
                    fp_ims += [tim]
        except tf.errors.OutOfRangeError:
            print 'Batch %d took %.1f seconds' % (idx,
                                                  time.time() - start_time)
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()

    # Plot images -- add to a dict and incorporate file_pointers
    dir_pointer = os.path.join(config.heatmap_source_images, ds_dt_stamp)
    stem_dirs = ['tn', 'tp', 'fn', 'fp']
    dir_list = [dir_pointer]
    dir_list += [os.path.join(dir_pointer, x) for x in stem_dirs]
    [make_dir(d) for d in dir_list]
    loop_plot(tn_ims,
              tn_hms,
              'True negative',
              os.path.join(dir_pointer, 'tn'),
              blur=blur_kernel)
    loop_plot(tp_ims,
              tp_hms,
              'True positive',
              os.path.join(dir_pointer, 'tp'),
              blur=blur_kernel)
    loop_plot(fn_ims,
              fn_hms,
              'False negative',
              os.path.join(dir_pointer, 'fn'),
              blur=blur_kernel)
    loop_plot(fp_ims,
              fp_hms,
              'False positive',
              os.path.join(dir_pointer, 'fp'),
              blur=blur_kernel)
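
The while-loop above routes each prediction into true/false positive/negative buckets; the same routing, condensed into a runnable numpy sketch on synthetic predictions:

import numpy as np

preds = np.asarray([0, 1, 0, 1, 1, 0])    # synthetic predictions
targets = np.asarray([0, 1, 1, 0, 1, 0])  # synthetic labels
buckets = {'tn': 0, 'tp': 0, 'fn': 0, 'fp': 0}
for tyh, ty in zip(preds, targets):
    if tyh == ty and not tyh:       # true negative: predicted 0, actual 0
        buckets['tn'] += 1
    elif tyh == ty and tyh:         # true positive: predicted 1, actual 1
        buckets['tp'] += 1
    elif tyh != ty and not tyh:     # false negative: predicted 0, actual 1
        buckets['fn'] += 1
    else:                           # false positive: predicted 1, actual 0
        buckets['fp'] += 1
print(buckets)  # {'tn': 2, 'tp': 2, 'fn': 1, 'fp': 1}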
Example #5
def test_vgg16(validation_data, model_dir, which_set, selected_ckpts=-1):
    config = GEDIconfig()
    if validation_data is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir,
                                       config.tf_record_names[which_set])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir, 'val_%s' % config.max_file))
    else:
        meta_data = np.load('%s_maximum_value.npz' %
                            validation_data.split('.tfrecords')[0])
    label_list = os.path.join(
        config.processed_image_patch_dir,
        'list_of_' + '_'.join(x
                              for x in config.image_prefixes) + '_labels.txt')
    with open(label_list) as f:
        file_pointers = [l.rstrip('\n') for l in f.readlines()]

    # Prepare image normalization values
    try:
        max_value = np.max(meta_data['max_array']).astype(np.float32)
    except:
        max_value = np.asarray([config.max_gedi])
    try:
        min_value = np.min(meta_data['min_array']).astype(np.float32)
    except:
        min_value = np.asarray([config.min_gedi])

    # Find model checkpoints
    ckpts, ckpt_names = find_ckpts(config, model_dir)
    ds_dt_stamp = re.split('/', ckpts[0])[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Make sure this is always at 1
        config.validation_batch = 1
        print('-' * 60)
        print('Loading config meta data for:%s' % out_dir)
        print('-' * 60)
    except:
        print('-' * 60)
        print('Using config from gedi_config.py for model:%s' % out_dir)
        print('-' * 60)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        val_images, val_labels = inputs(validation_data,
                                        1,
                                        im_shape,
                                        config.model_image_size[:2],
                                        max_value=max_value,
                                        min_value=min_value,
                                        num_epochs=1,
                                        normalize=config.normalize)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(vgg16_npy_path=config.vgg16_weight_path,
                              fine_tune_layers=config.fine_tune_layers)
            vgg.build(val_images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.prob
        preds = tf.argmax(vgg.prob, 1)
        targets = tf.cast(val_labels, dtype=tf.int64)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    print('-' * 60)
    print('Beginning evaluation')
    print('-' * 60)

    if selected_ckpts is not None:
        # Select a specific ckpt
        if selected_ckpts < 0:
            ckpts = ckpts[selected_ckpts:]
        else:
            ckpts = ckpts[:selected_ckpts]

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y = [], [], []
        try:
            # Initialize the graph
            sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(tf.global_variables_initializer(),
                         tf.local_variables_initializer()))

            # Set up exemplar threading
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver.restore(sess, c)
            start_time = time.time()
            while not coord.should_stop():
                sc, tyh, ty = sess.run([scores, preds, targets])
                dec_scores = np.append(dec_scores, sc)
                yhat = np.append(yhat, tyh)
                y = np.append(y, ty)
        except tf.errors.OutOfRangeError:
            ckpt_yhat.append(yhat)
            ckpt_y.append(y)
            ckpt_scores.append(dec_scores)
            print('Iteration accuracy: %s' % np.mean(yhat == y))
            print('Iteration pvalue: %.5f' %
                  randomization_test(y=y, yhat=yhat))
            print('Batch %d took %.1f seconds' %
                  (idx, time.time() - start_time))
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()

    # Save everything
    np.savez(os.path.join(out_dir, 'validation_accuracies'),
             ckpt_yhat=ckpt_yhat,
             ckpt_y=ckpt_y,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpt_names,
             file_pointers=file_pointers)

    # Also save a csv with item/guess pairs
    try:
        trimmed_files = [re.split('/', x)[-1] for x in file_pointers]
        trimmed_files = np.asarray(trimmed_files)
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(
            np.hstack((trimmed_files.reshape(-1, 1), yhat.reshape(-1, 1),
                       dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
            columns=['files', 'guesses', 'score dead', 'score live'])
        df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
        print('Saved csv to: %s' % out_dir)
    except:
        print('X' * 60)
        print('Could not save a spreadsheet of file info')
        print('X' * 60)

    # Plot everything
    try:
        plot_accuracies(ckpt_y, ckpt_yhat, config, ckpt_names,
                        os.path.join(out_dir, 'validation_accuracies.png'))
        plot_std(ckpt_y, ckpt_yhat, ckpt_names,
                 os.path.join(out_dir, 'validation_stds.png'))
        plot_cms(ckpt_y, ckpt_yhat, config,
                 os.path.join(out_dir, 'confusion_matrix.png'))
        plot_pr(ckpt_y, ckpt_yhat, ckpt_scores,
                os.path.join(out_dir, 'precision_recall.png'))
        plot_cost(os.path.join(out_dir, 'training_loss.npy'), ckpt_names,
                  os.path.join(out_dir, 'training_costs.png'))
    except:
        print('X' * 60)
        print('Could not locate the loss numpy')
        print('X' * 60)
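
randomization_test is defined elsewhere in the repository; a plausible minimal sketch of such a test (a permutation p-value for accuracy -- this is an assumption about its behavior, not the repo's implementation):

import numpy as np

def randomization_test_sketch(y, yhat, n_permutations=10000, seed=0):
    # Assumed behavior: p-value for the observed accuracy under label permutation.
    rng = np.random.RandomState(seed)
    y, yhat = np.asarray(y), np.asarray(yhat)
    observed = np.mean(y == yhat)
    null = np.asarray([
        np.mean(rng.permutation(y) == yhat) for _ in range(n_permutations)])
    return (np.sum(null >= observed) + 1.) / (n_permutations + 1.)

print(randomization_test_sketch(
    y=[0, 0, 1, 1, 0, 1], yhat=[0, 0, 1, 1, 1, 1]))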
Example #6
def test_vgg16(model_file,
               trained_svm,
               ims,
               dead_ims=None,
               output_csv='prediction_file',
               training_max=None,
               C=1e-3,
               k_folds=10):
    """Test an SVM you've trained on a new dataset."""
    config = GEDIconfig()
    if ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')
    if dead_ims is None:
        print 'Assuming all of your images are in the ims folder ' + \
            '-- will not derive labels to calculate accuracy.'
    # if not os.path.exists(trained_svm):
    #     raise RuntimeError(
    #         'Cannot find the trained svm model. Check the path you passed.')
    try:
        clf = cPickle.load(open(trained_svm, 'rb'))
        # clf = model_dict['clf']
        # mu = model_dict['mu']
        # sd = model_dict['sd']
    except:
        raise RuntimeError('Cannot find SVM file: %s' % trained_svm)

    if dead_ims is not None:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
        combined_labels = np.concatenate(
            (np.zeros(len(live_files)), np.ones(len(dead_files))))
        combined_files = np.concatenate((live_files, dead_files))
    else:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        combined_labels = None
        combined_files = np.asarray(live_files)
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                labels=combined_labels,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                                         total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Checkpoint %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()  # close each session inside the checkpoint loop

    # Save everything
    new_dt_string = re.split(r'\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    np.savez(os.path.join(out_dir, '%s_validation_accuracies' % new_dt_string),
             ckpt_yhat=ckpt_yhat,
             ckpt_y=ckpt_y,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Run SVM
    all_scores = np.concatenate(dec_scores)
    # all_scores = (all_scores - mu) / sd
    predictions = clf.predict(all_scores)
    if dead_ims is not None:
        mean_acc = np.mean(predictions == y)
        p_value = randomization_test(y=y, yhat=predictions)
        print 'SVM performance: mean accuracy = %s%%, p = %.5f' % (
            mean_acc * 100, p_value)
        df_col_label = 'true label'
    else:
        mean_acc, p_value = None, None
        y = np.copy(yhat)
        df_col_label = 'Dummy column (no labels supplied)'
    np.savez(os.path.join(out_dir, '%s_svm_test_data' % new_dt_string),
             yhat=yhat,
             y=y,
             scores=dec_scores,
             ckpts=ckpts,
             p_value=p_value)

    # Also save a csv with item/guess pairs
    trimmed_files = np.asarray([
        x.split(os.path.sep)[-1] for x in np.asarray(ckpt_file_array).ravel()
    ])
    yhat = np.asarray(yhat)
    df = pd.DataFrame(
        np.hstack((trimmed_files.reshape(-1, 1), yhat.reshape(-1, 1))),
        #   y.reshape(-1, 1))),
        columns=['files', 'guesses'])  # , df_col_label])
    df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
    print 'Saved csv to: %s' % out_dir
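
The trained_svm file consumed above is a pickled classifier (the companion training function further down pickles its fitted pipeline); a round-trip sketch with a synthetic model:

import pickle  # cPickle in Python 2
import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X = np.vstack((rng.randn(10, 8) + 1., rng.randn(10, 8) - 1.))  # synthetic features
y = np.repeat([0, 1], 10)                                      # synthetic labels
clf = LinearSVC(C=1e-3, dual=False, class_weight='balanced').fit(X, y)

with open('svm_model.pkl', 'wb') as fid:
    pickle.dump(clf, fid)
with open('svm_model.pkl', 'rb') as fid:
    restored = pickle.load(fid)
print(np.mean(restored.predict(X) == y))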
Example #7
def test_placeholder(
        image_path,
        model_file,
        model_meta,
        out_dir,
        n_images=3,
        first_n_images=1,
        debug=True,
        margin=.1,
        autopsy_csv=None,
        C=1,
        k_folds=10,
        embedding_type='tsne',
        autopsy_model='match'):
    config = GEDIconfig()
    assert margin is not None, 'Need a margin for the loss.'
    assert image_path is not None, 'Provide a path to an image directory.'
    assert model_file is not None, 'Provide a path to the model file.'

    try:
        # Load the model's config
        config = np.load(model_meta).item()
    except:
        print 'Could not load model config, falling back to default config.'
    config.model_image_size[-1] = 1
    try:
        # Load autopsy information
        autopsy_data = pd.read_csv(autopsy_csv)
    except IOError:
        print 'Unable to load autopsy file.'
    if not hasattr(config, 'include_GEDI'):
        raise RuntimeError('You need to pass the correct meta file.')
        # The fallback defaults below were unreachable after the raise:
        # config.include_GEDI = True
        # config.l2_norm = False
        # config.dist_fun = 'pearson'
        # config.per_batch = False
        # config.output_shape = 32
        # config.margin = 0.1
    if os.path.isdir(image_path):
        combined_files = np.asarray(
            glob(os.path.join(image_path, '*%s' % config.raw_im_ext)))
    else:
        combined_files = [image_path]
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(
        config.train_checkpoint, dt_dataset)  # timestamp this run
    out_dir = os.path.join(out_dir, dt_dataset)
    dir_list = [out_dir]
    [tf_fun.make_dir(d) for d in dir_list]

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        images = []
        for idx in range(first_n_images):
            images += [tf.placeholder(
                tf.float32,
                shape=[None] + config.model_image_size,
                name='images_%s' % idx)]

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        if autopsy_model == 'match':
            from models import matching_vgg16 as model_type
            with tf.variable_scope('match'):
                # Build matching model for frame 0
                model_0 = model_type.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                frame_activity = []
                model_activity = model_0.build(
                    images[0],
                    output_shape=config.output_shape,
                    include_GEDI=config.include_GEDI)
                if config.l2_norm:
                    model_activity = [model_activity]
                frame_activity += [model_activity]
            if first_n_images > 1:
                with tf.variable_scope('match', reuse=tf.AUTO_REUSE):
                    # Build matching model for other frames
                    for idx in range(1, len(images)):
                        model_activity = model_0.build(
                            images[idx],
                            output_shape=config.output_shape,
                            include_GEDI=config.include_GEDI)
                        if config.l2_norm:
                            model_activity = tf_fun.l2_normalize(
                                model_activity)
                        frame_activity += [model_activity]
                if config.dist_fun == 'l2':
                    pos = tf_fun.l2_dist(
                        frame_activity[0],
                        frame_activity[1], axis=1)
                    neg = tf_fun.l2_dist(
                        frame_activity[0],
                        frame_activity[2], axis=1)
                elif config.dist_fun == 'pearson':
                    pos = tf_fun.pearson_dist(
                        frame_activity[0],
                        frame_activity[1],
                        axis=1)
                    neg = tf_fun.pearson_dist(
                        frame_activity[0],
                        frame_activity[2],
                        axis=1)
                model_activity = pos - neg  # Store the difference in distances
        elif autopsy_model == 'GEDI' or autopsy_model == 'gedi':
            from models import baseline_vgg16 as model_type
            model = model_type.model_struct(
                vgg16_npy_path=config.gedi_weight_path)
            model.build(
                images[0],
                output_shape=config.output_shape)
            model_activity = model.fc7
        else:
            raise NotImplementedError(autopsy_model)

    if config.validation_batch > len(combined_files):
        print (
            'Trimming validation_batch size to %s '
            '(same as # of files).' % len(combined_files))
        config.validation_batch = len(combined_files)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer()))

    # Set up exemplar threading
    if autopsy_model == 'match':
        saver.restore(sess, model_file)
    start_time = time.time()
    num_batches = np.floor(
        len(combined_files) / float(
            config.validation_batch)).astype(int)
    score_array, file_array = [], []
    for image_batch, file_batch in tqdm(
            image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                config=config,
                first_n_images=first_n_images,
                n_images=n_images),
            total=num_batches):
        for im_head in images:
            feed_dict = {
                im_head: image_batch
            }
            activity = sess.run(
                model_activity,
                feed_dict=feed_dict)
            score_array += [activity]
        file_array += [file_batch]
    print 'Image processing took %.1f seconds' % (
        time.time() - start_time)
    sess.close()
    score_array = np.concatenate(score_array, axis=0)
    score_array = score_array.reshape(-1, score_array.shape[-1])
    file_array = np.concatenate(file_array, axis=0)

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        score_array=score_array,
        file_array=file_array)

    if first_n_images == 1:
        # Derive pathologies from file names
        pathologies = []
        for f in combined_files:
            sf = f.split(os.path.sep)[-1].split('_')
            line = sf[1]
            # time_col = sf[2]
            well = sf[4]
            disease = autopsy_data[
                np.logical_and(
                    autopsy_data['line'] == line,
                    autopsy_data['wells'] == well)]['type']
            try:
                disease = disease.as_matrix()[0]
            except:
                disease = 'Not_found'
            pathologies += [disease]
        pathologies = np.asarray(pathologies)[:len(score_array)]

        mu = score_array.mean(0)
        sd = score_array.std(0)
        z_score_array = (score_array - mu) / (sd + 1e-4)
        if embedding_type == 'TSNE' or embedding_type == 'tsne':
            emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
        elif embedding_type == 'PCA' or embedding_type == 'pca':
            emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
        elif embedding_type == 'spectral':
            emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
        else:
            raise NotImplementedError(embedding_type)

        y = emb.fit_transform(score_array)

        # Do a classification analysis
        labels = np.unique(pathologies.reshape(-1, 1), return_inverse=True)[1]

        # Run SVM
        svm = LinearSVC(C=C, dual=False, class_weight='balanced')
        clf = make_pipeline(preprocessing.StandardScaler(), svm)
        predictions = cross_val_predict(clf, score_array, labels, cv=k_folds)
        cv_performance = metrics.accuracy_score(predictions, labels)
        clf.fit(score_array, labels)
        # mu = dec_scores.mean(0)
        # sd = dec_scores.std(0)
        print '%s-fold SVM performance: accuracy = %s%%' % (
            k_folds,
            np.mean(cv_performance * 100))
        np.savez(
            os.path.join(out_dir, 'svm_data'),
            yhat=score_array,
            y=labels,
            cv_performance=cv_performance,
            # mu=mu,
            # sd=sd,
            C=C)

        # Output csv
        df = pd.DataFrame(
            np.hstack((
                y,
                pathologies.reshape(-1, 1),
                file_array.reshape(-1, 1))),
            columns=['dim1', 'dim2', 'pathology', 'filename'])
        out_name = os.path.join(out_dir, 'raw_embedding.csv')
        df.to_csv(out_name)
        print 'Saved csv to: %s' % out_name

        create_figs(
            emb=emb,
            out_dir=out_dir,
            out_name=out_name,
            embedding_type=embedding_type,
            embedding_name='raw_embedding')

        # Now work on zscored data
        y = emb.fit_transform(z_score_array)

        # Output csv
        df = pd.DataFrame(
            np.hstack((
                y,
                pathologies.reshape(-1, 1),
                file_array.reshape(-1, 1))),
            columns=['dim1', 'dim2', 'pathology', 'filename'])
        out_name = os.path.join(out_dir, 'embedding.csv')
        df.to_csv(out_name)
        print 'Saved csv to: %s' % out_name

        # Create plot
        create_figs(
            emb=emb,
            out_dir=out_dir,
            out_name=out_name,
            embedding_type=embedding_type,
            embedding_name='normalized_embedding')

    else:
        # Do a classification (sign of the score)
        decisions = np.sign(score_array)
        df = pd.DataFrame(
            np.hstack((decisions, score_array)),
            columns=['Decisions', 'Scores'])
        df.to_csv(
            os.path.join(
                out_dir, 'tracking_model_scores.csv'))
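
Several functions here dispatch on embedding_type; the dispatch, condensed into a runnable sketch on synthetic features:

import numpy as np
from sklearn import manifold
from sklearn.decomposition import PCA

X = np.random.RandomState(0).randn(50, 16)  # synthetic fc7-like features
embedding_type = 'pca'
if embedding_type in ('TSNE', 'tsne'):
    emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
elif embedding_type in ('PCA', 'pca'):
    emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
elif embedding_type == 'spectral':
    emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
else:
    raise NotImplementedError(embedding_type)
print(emb.fit_transform(X).shape)  # (50, 2)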
Example #8
def test_vgg16(live_ims,
               dead_ims,
               model_file,
               svm_model='svm_model',
               output_csv='prediction_file',
               training_max=None,
               C=1e-3,
               k_folds=10):
    """Train an SVM for your dataset on GEDI-model encodings."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')

    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate(
        (np.zeros(len(live_files)), np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(model_file_path,
                                     'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' %
            (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                labels=combined_labels,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                                         total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Checkpoint %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()  # close each session inside the checkpoint loop

    # Save everything
    np.savez(os.path.join(out_dir, 'validation_accuracies'),
             ckpt_yhat=ckpt_yhat,
             ckpt_y=ckpt_y,
             ckpt_scores=ckpt_scores,
             ckpt_names=ckpts,
             combined_files=ckpt_file_array)

    # Run SVM
    svm = LinearSVC(C=C, dual=False, class_weight='balanced')
    clf = make_pipeline(preprocessing.StandardScaler(), svm)
    predictions = cross_val_predict(clf,
                                    np.concatenate(dec_scores),
                                    y,
                                    cv=k_folds)
    cv_performance = metrics.accuracy_score(predictions, y)
    p_value = randomization_test(y=y, yhat=predictions)
    clf.fit(np.concatenate(dec_scores), y)
    # mu = dec_scores.mean(0)
    # sd = dec_scores.std(0)
    print '%s-fold SVM performance: accuracy = %s%% , p = %.5f' % (
        k_folds, np.mean(cv_performance * 100), p_value)
    np.savez(
        os.path.join(out_dir, 'svm_data'),
        yhat=predictions,
        y=y,
        scores=dec_scores,
        ckpts=ckpts,
        cv_performance=cv_performance,
        p_value=p_value,
        k_folds=k_folds,
        # mu=mu,
        # sd=sd,
        C=C)

    # Also save a csv with item/guess pairs
    try:
        trimmed_files = [re.split('/', x)[-1] for x in combined_files]
        trimmed_files = np.asarray(trimmed_files)
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(np.hstack(
            (trimmed_files.reshape(-1, 1), yhat.reshape(-1,
                                                        1), y.reshape(-1, 1))),
                          columns=['files', 'guesses', 'true label'])
        df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
        print 'Saved csv to: %s' % out_dir
    except:
        print 'X' * 60
        print 'Could not save a spreadsheet of file info'
        print 'X' * 60

    # save the classifier
    with open('%s.pkl' % svm_model, 'wb') as fid:
        # model_dict = {
        #     'model': clf,
        #     'mu': mu,
        #     'sd': sd
        # }
        cPickle.dump(clf, fid)
    print 'Saved svm model to: %s.pkl' % svm_model
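
The cross-validated SVM readout above, as a self-contained sketch on synthetic encodings:

import numpy as np
from sklearn import metrics, preprocessing
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X = np.vstack((rng.randn(50, 8) + 1., rng.randn(50, 8) - 1.))  # synthetic fc7 features
y = np.repeat([0., 1.], 50)                                    # live=0, dead=1

svm = LinearSVC(C=1e-3, dual=False, class_weight='balanced')
clf = make_pipeline(preprocessing.StandardScaler(), svm)
predictions = cross_val_predict(clf, X, y, cv=10)
print('10-fold accuracy: %0.2f' % metrics.accuracy_score(y, predictions))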
Example #9
def test_vgg16(image_dir,
               model_file,
               autopsy_csv=None,
               autopsy_path=None,
               output_csv='prediction_file',
               target_layer='fc7',
               save_npy=False,
               shuffle_images=True,
               embedding_type='PCA'):
    """Testing function for pretrained vgg16."""
    assert autopsy_csv is not None, 'You must pass an autopsy file name.'
    assert autopsy_path is not None, 'You must pass an autopsy path.'

    # Load autopsy information
    autopsy_data = pd.read_csv(os.path.join(autopsy_path, autopsy_csv))

    # Load config and begin preparing data
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')

    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if shuffle_images:
        combined_files = combined_files[np.random.permutation(
            len(combined_files))]
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    config = GEDIconfig()
    meta_file_pointer = os.path.join(
        model_file.split('/model')[0], 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file.'
            'Download this from the link described in the README.md.')
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(tf.float32,
                            shape=[None] + config.model_image_size,
                            name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path,
                                     fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg[target_layer]
        preds = tf.argmax(vgg.prob, 1)

    # Derive pathologies from file names
    pathologies = []
    for f in combined_files:
        sf = f.split('/')[-1].split('_')
        sf = '_'.join(sf[1:4])
        it_path = autopsy_data[autopsy_data['plate_well_neuron'] ==
                               sf]['disease']
        if not len(it_path):
            it_path = 'Absent'
        else:
            it_path = it_path.as_matrix()[0]
        pathologies += [it_path]
    pathologies = np.asarray(pathologies)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_scores, ckpt_file_array = [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s.' % len(combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []
        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, file_batch in tqdm(image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                config=config,
                training_max=training_max,
                training_min=training_min),
                                            total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat += [tyh]
            file_array += [file_batch]
        ckpt_yhat.append(yhat)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Checkpoint %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()  # close each session inside the checkpoint loop

    # Create and plot an embedding
    im_path_map = pathologies[:num_batches * config.validation_batch]
    dec_scores = np.concatenate(dec_scores)
    mu = dec_scores.mean(0)[None, :]
    sd = dec_scores.std(0)[None, :]
    dec_scores = (dec_scores - mu) / sd
    yhat = np.concatenate(yhat)
    file_array = np.concatenate(file_array)

    if embedding_type == 'TSNE' or embedding_type == 'tsne':
        emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
    elif embedding_type == 'PCA' or embedding_type == 'pca':
        emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
    elif embedding_type == 'spectral':
        emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
    else:
        raise NotImplementedError(embedding_type)
    y = emb.fit_transform(dec_scores)

    # Output csv
    df = pd.DataFrame(np.hstack(
        (y, im_path_map.reshape(-1, 1), file_array.reshape(-1, 1))),
                      columns=['D1', 'D2', 'pathology', 'filename'])
    out_name = os.path.join(out_dir, 'embedding.csv')
    df.to_csv(out_name)
    print 'Saved csv to: %s' % out_name

    # Create plot
    f, ax = plt.subplots()
    unique_cats = np.unique(im_path_map)
    h = []
    for idx, cat in enumerate(unique_cats):
        h += [
            plt.scatter(y[im_path_map == cat, 0],
                        y[im_path_map == cat, 1],
                        color=plt.cm.Spectral(float(idx) / len(unique_cats)))
        ]
    plt.legend(h, unique_cats)
    plt.axis('tight')
    plt.savefig('embedding.png')  # save before show so the figure is not blank
    plt.show()
    plt.close(f)

    # Save everything
    if save_npy:
        np.savez(os.path.join(out_dir, 'validation_accuracies'),
                 ckpt_yhat=ckpt_yhat,
                 ckpt_scores=ckpt_scores,
                 ckpt_names=ckpts,
                 combined_files=ckpt_file_array)
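
The embedding scatter plot above, condensed into a standalone sketch on synthetic data; note that matplotlib colormaps expect floats in [0, 1], so the categories are spread across that range:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
y = rng.randn(60, 2)                          # synthetic 2-D embedding
im_path_map = np.repeat(['a', 'b', 'c'], 20)  # synthetic pathology categories

f, ax = plt.subplots()
unique_cats = np.unique(im_path_map)
h = []
for idx, cat in enumerate(unique_cats):
    h += [ax.scatter(y[im_path_map == cat, 0],
                     y[im_path_map == cat, 1],
                     color=plt.cm.Spectral(float(idx) / len(unique_cats)))]
ax.legend(h, unique_cats)
ax.axis('tight')
plt.savefig('embedding.png')  # save before any show so the figure is not blank
plt.close(f)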