def visualize_model(
        live_ims,
        dead_ims,
        model_file,
        output_folder,
        num_channels,
        smooth_iterations=50,
        untargeted=False,
        viz='none',
        per_timepoint=True):
    """Visualize smoothed gradient saliency maps from a trained GEDI model."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')
    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate((
        np.zeros(len(live_files)),
        np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path, 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' % (
                model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size, name='images')
    labels = tf.placeholder(tf.int64, shape=[None], name='labels')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

            # Setup validation op
            scores = vgg.fc7
            preds = tf.argmax(vgg.prob, 1)
            activity_pattern = vgg.fc8
            if not untargeted:
                oh_labels = tf.one_hot(labels, config.output_shape)
                activity_pattern *= oh_labels
            grad_image = tf.gradients(activity_pattern, images)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    ckpt_file_array, ckpt_viz_images = [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    count = 0
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array, viz_images = [], [], [], [], []

        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min,
                    num_channels=num_channels,
                    per_timepoint=per_timepoint),
                total=num_batches):
            feed_dict = {
                images: image_batch,
                labels: label_batch
            }
            it_grads = np.zeros((image_batch.shape))
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            for _ in range(smooth_iterations):
                feed_dict = {
                    images: add_noise(image_batch),
                    labels: label_batch
                }
                it_grad = sess.run(grad_image, feed_dict=feed_dict)
                it_grads += it_grad[0]
            it_grads /= smooth_iterations  # Mean across iterations
            it_grads = visualization_function(it_grads, viz)

            # Save each grad individually
            for grad_i, pred_i, file_i, label_i in zip(
                    it_grads, tyh, file_batch, label_batch):
                out_pointer = os.path.join(
                    output_folder, file_i.split(os.path.sep)[-1])
                out_pointer = out_pointer.split('.')[0] + '.png'
                f = plt.figure()
                plt.imshow(grad_i)
                plt.title('Pred=%s, label=%s' % (pred_i, label_i))
                plt.savefig(out_pointer)
                plt.close(f)

            # Plot a mosaic of the grads
            if viz == 'none':
                pos_grads = normalize(np.maximum(it_grads, 0))
                neg_grads = normalize(np.minimum(it_grads, 0))
                alpha_mosaic(
                    image_batch,
                    pos_grads,
                    'pos_batch_%s.pdf' % count,
                    title='Positive gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
                alpha_mosaic(
                    image_batch,
                    neg_grads,
                    'neg_batch_%s.pdf' % count,
                    title='Negative gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            else:
                alpha_mosaic(
                    image_batch,
                    it_grads,
                    'batch_%s.pdf' % count,
                    title='Gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            count += 1

            # Store the results
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
            viz_images += [it_grads]
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        ckpt_viz_images.append(viz_images)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array,
        ckpt_viz_images=ckpt_viz_images)

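# A minimal, hypothetical usage sketch for visualize_model. The paths and the
# checkpoint name below are placeholders (not files shipped with the repo), and
# the call assumes the project modules used above (GEDIconfig, vgg16,
# image_batcher, alpha_mosaic, etc.) are importable as in the rest of the file.
#
# visualize_model(
#     live_ims='/path/to/live_images',
#     dead_ims='/path/to/dead_images',
#     model_file='/path/to/trained_gedi_model/model_ckpt',
#     output_folder='/path/to/gradient_pngs',
#     num_channels=1,
#     smooth_iterations=50,
#     viz='none')
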
def test_vgg16(validation_data, model_dir, label_file, selected_ckpts=-1):
    config = GEDIconfig()

    # Load metas
    meta_data = np.load(
        os.path.join(config.tfrecord_dir, 'val_maximum_value.npz'))
    max_value = np.max(meta_data['max_array']).astype(np.float32)

    # Find model checkpoints
    ckpts, ckpt_names = find_ckpts(config, model_dir)
    # ds_dt_stamp = re.split('/', ckpts[0])[-2]
    out_dir = os.path.join(config.results, 'gfp_2017_02_19_17_41_19' + '/')
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Make sure this is always at 1
        config.validation_batch = 64
        print '-' * 60
        print 'Loading config meta data for:%s' % out_dir
        print '-' * 60
    except:
        print '-' * 60
        print 'Using config from gedi_config.py for model:%s' % out_dir
        print '-' * 60

    sorted_index = np.argsort(np.asarray([int(x) for x in ckpt_names]))
    ckpts = ckpts[sorted_index]
    ckpt_names = ckpt_names[sorted_index]

    # CSV file
    svm_image_file = os.path.join(out_dir, 'svm_models.npz')
    if os.path.exists(svm_image_file):
        svm_image_data = np.load(svm_image_file)
        image_array = svm_image_data['image_array']
        label_vec = svm_image_data['label_vec']
        tr_label_vec = svm_image_data['tr_label_vec']
    else:
        labels = pd.read_csv(
            os.path.join(
                config.processed_image_patch_dir,
                'LINCSproject_platelayout_trans.csv'))
        label_vec = []
        image_array = []
        for idx, row in tqdm(labels.iterrows(), total=len(labels)):
            path_wd = '*%s_%s*' % (row['Plate'], row['Sci_WellID'])
            # FIXME: image_dir is not defined in this function's scope.
            path_pointer = glob(os.path.join(image_dir, path_wd))
            if len(path_pointer) > 0:
                for p in path_pointer:
                    # NOTE: image loading into image_array is not implemented
                    # in this branch; only labels are collected.
                    label_vec.append(row['Sci_SampleID'])
        label_vec = np.asarray(label_vec)
        le = preprocessing.LabelEncoder()
        tr_label_vec = le.fit_transform(label_vec)
        np.savez(
            svm_image_file,
            image_array=image_array,
            label_vec=label_vec,
            tr_label_vec=tr_label_vec)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Make placeholder
    val_images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            validation_mode = tf.Variable(False, name='training')
            # No batchnorms during testing
            vgg.build(
                val_images,
                output_shape=config.output_shape,
                train_mode=validation_mode)

    # Set up saver
    svm_feature_file = os.path.join(out_dir, 'svm_scores.npz')
    if os.path.exists(svm_feature_file):
        svm_features = np.load(svm_feature_file)
        dec_scores = svm_features['dec_scores']
        label_vec = svm_features['label_vec']
    else:
        saver = tf.train.Saver(tf.global_variables())
        ckpts = [ckpts[selected_ckpts]]
        image_array = np.asarray(image_array)
        for idx, c in enumerate(ckpts):
            dec_scores = []

            # Initialize the graph
            sess = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()))

            # Set up exemplar threading
            saver.restore(sess, c)
            num_batches = np.ceil(
                len(image_array) / config.validation_batch).astype(int)
            batch_idx = np.arange(num_batches).repeat(
                num_batches)[:len(image_array)]
            for bi in np.unique(batch_idx):
                # move this above to image processing
                batch_images = image_array[batch_idx == bi] / 255.
                start_time = time.time()
                sc = sess.run(vgg.fc7, feed_dict={val_images: batch_images})
                dec_scores.append(sc)
                print 'Batch %d took %.1f seconds' % (
                    idx, time.time() - start_time)

        # Save everything
        np.savez(svm_feature_file, dec_scores=dec_scores, label_vec=label_vec)

    # Build SVM
    dec_scores = np.concatenate(dec_scores[:], axis=0)
    model_array, score_array, combo_array, masked_label_array = [], [], [], []
    for combo in itertools.combinations(np.unique(label_vec), 2):
        combo_array.append(combo)
        mask = np.logical_or(label_vec == combo[0], label_vec == combo[1])
        masked_labels = label_vec[mask]
        masked_scores = dec_scores[mask, :]
        clf = SVC(kernel='linear', C=1)
        scores = cross_val_score(clf, masked_scores, masked_labels, cv=5)
        model_array.append(clf)
        score_array.append(scores)
        masked_label_array.append(masked_labels)
        print("Accuracy: %0.2f (+/- %0.2f)" % (
            scores.mean(), scores.std() * 2))

    # Save everything
    np.savez(
        os.path.join(out_dir, 'svm_models'),
        combo_array=combo_array,
        model_array=model_array,
        score_array=score_array,
        masked_label_array=masked_label_array)

def test_vgg16(
        image_dir,
        model_file,
        output_csv='prediction_file',
        training_max=None):
    print(image_dir)
    # tf.set_random_seed(0)
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')
    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    print('model file path', model_file_path)
    meta_file_pointer = os.path.join(
        model_file_path, 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' % (
                model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)
    print('out_dir', out_dir)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    if config.model_image_size[-1] < 3:
        print('*' * 60)
        print(
            'Warning: model is expecting a H/W/1 image. '
            'Do you mean to set the last dimension of '
            'config.model_image_size to 3?')
        print('*' * 60)
    images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size, name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

            # Setup validation op
            scores = vgg.prob
            preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print('-' * 60)
    print('Beginning evaluation')
    print('-' * 60)

    if config.validation_batch > len(combined_files):
        print(
            'Trimming validation_batch size to %s (same as # of files).'
            % len(combined_files))
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []

        # Initialize the graph
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            sess.run(
                tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()))

            # Set up exemplar threading
            saver.restore(sess, c)
            start_time = time.time()
            num_batches = np.floor(
                len(combined_files) / float(
                    config.validation_batch)).astype(int)
            for image_batch, file_batch in tqdm(
                    image_batcher(
                        start=0,
                        num_batches=num_batches,
                        images=combined_files,
                        config=config,
                        training_max=training_max,
                        training_min=training_min),
                    total=num_batches):
                feed_dict = {images: image_batch}
                sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
                dec_scores = np.append(dec_scores, sc)
                yhat = np.append(yhat, tyh)
                file_array = np.append(file_array, file_batch)
            ckpt_yhat.append(yhat)
            ckpt_scores.append(dec_scores)
            ckpt_file_array.append(file_array)
            print('Batch %d took %.1f seconds' % (
                idx, time.time() - start_time))

    # Save everything
    print('Save npz.')
    print(os.path.join(out_dir, 'validation_accuracies'))
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array)

    # Also save a csv with item/guess pairs
    try:
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(
            np.hstack((
                np.asarray(ckpt_file_array).reshape(-1, 1),
                yhat.reshape(-1, 1),
                dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
            columns=[
                'files',
                'live_guesses',
                'classifier score dead',
                'classifier score live'])
        output_name = image_dir.split('/')[-1]
        if output_name is None or len(output_name) == 0:
            output_name = 'output'
        df.to_csv(os.path.join(out_dir, '%s.csv' % output_name))
        print('Saved csv to: %s' % os.path.join(
            out_dir, '%s.csv' % output_name))
    except:
        print('X' * 60)
        print('Could not save a spreadsheet of file info')
        print('X' * 60)

    # Plot everything
    try:
        plot_accuracies(
            ckpt_y, ckpt_yhat, config, ckpts,
            os.path.join(out_dir, 'validation_accuracies.png'))
        plot_std(
            ckpt_y, ckpt_yhat, ckpts,
            os.path.join(out_dir, 'validation_stds.png'))
        plot_cms(
            ckpt_y, ckpt_yhat, config,
            os.path.join(out_dir, 'confusion_matrix.png'))
        plot_pr(
            ckpt_y, ckpt_yhat, ckpt_scores,
            os.path.join(out_dir, 'precision_recall.png'))
        # plot_cost(
        #     os.path.join(out_dir, 'training_loss.npy'), ckpts,
        #     os.path.join(out_dir, 'training_costs.png'))
    except:
        print('X' * 60)
        print('Could not locate the loss numpy')
        print('X' * 60)

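# Hypothetical example call for the prediction-only test_vgg16 above; the image
# directory and checkpoint path are placeholders. Per-image guesses and class
# scores are written to <config.results>/<model timestamp>/<image_dir name>.csv.
#
# test_vgg16(
#     image_dir='/path/to/unlabeled_images',
#     model_file='/path/to/trained_gedi_model/model_ckpt',
#     output_csv='prediction_file')
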
def test_vgg16(validation_data, model_dir, which_set, selected_ckpts):
    config = GEDIconfig()
    blur_kernel = config.hm_blur
    if validation_data is None:  # Use globals
        validation_data = os.path.join(
            config.tfrecord_dir, config.tf_record_names[which_set])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir, 'val_%s' % config.max_file))
    else:
        meta_data = np.load(
            '%s_maximum_value.npz' % validation_data.split('.tfrecords')[0])
    label_list = os.path.join(
        config.processed_image_patch_dir,
        'list_of_' + '_'.join(x for x in config.image_prefixes) +
        '_labels.txt')
    with open(label_list) as f:
        file_pointers = [l.rstrip('\n') for l in f.readlines()]

    # Prepare image normalization values
    try:
        max_value = np.max(meta_data['max_array']).astype(np.float32)
    except:
        max_value = np.asarray([config.max_gedi])
    try:
        min_value = np.min(meta_data['min_array']).astype(np.float32)
    except:
        min_value = np.asarray([config.min_gedi])

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_dir)[-1]
    out_dir = os.path.join(config.results, ds_dt_stamp + '/')
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Make sure this is always at 1
        config.validation_batch = 1
        print '-' * 60
        print 'Loading config meta data for:%s' % out_dir
        print '-' * 60
    except:
        print '-' * 60
        print 'Using config from gedi_config.py for model:%s' % out_dir
        print '-' * 60

    # Make output directories if they do not exist
    im_shape = config.gedi_image_size

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        val_images, val_labels = inputs(
            validation_data,
            1,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=1,
            normalize=config.normalize)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(val_images, output_shape=config.output_shape)

        # Setup validation op
        preds = tf.argmax(vgg.prob, 1)
        targets = tf.cast(val_labels, dtype=tf.int64)
        grad_labels = tf.one_hot(
            val_labels, config.output_shape, dtype=tf.float32)
        heatmap_op = tf.gradients(vgg.fc8 * grad_labels, val_images)[0]

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())
    ckpts = [selected_ckpts]

    # Loop through each checkpoint then test the entire validation set
    print '-' * 60
    print 'Beginning evaluation on ckpt: %s' % ckpts
    print '-' * 60
    yhat, y, tn_hms, tp_hms, fn_hms, fp_hms = [], [], [], [], [], []
    tn_ims, tp_ims, fn_ims, fp_ims = [], [], [], []
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        try:
            # Initialize the graph
            sess = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()))

            # Set up exemplar threading
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver.restore(sess, c)
            start_time = time.time()
            while not coord.should_stop():
                tyh, ty, thm, tim = sess.run(
                    [preds, targets, heatmap_op, val_images])
                tyh = tyh[0]
                ty = ty[0]
                tim = (tim / tim.max()).squeeze()
                yhat += [tyh]
                y += [ty]
                if tyh == ty and not tyh:  # True negative
                    tn_hms += [hm_normalize(thm)]
                    tn_ims += [tim]
                elif tyh == ty and tyh:  # True positive
                    tp_hms += [hm_normalize(thm)]
                    tp_ims += [tim]
                elif tyh != ty and not tyh:  # False negative
                    fn_hms += [hm_normalize(thm)]
                    fn_ims += [tim]
                elif tyh != ty and tyh:  # False positive
                    fp_hms += [hm_normalize(thm)]
                    fp_ims += [tim]
        except tf.errors.OutOfRangeError:
            print 'Batch %d took %.1f seconds' % (
                idx, time.time() - start_time)
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()

    # Plot images -- add to a dict and incorporate file_pointers
    dir_pointer = os.path.join(config.heatmap_source_images, ds_dt_stamp)
    stem_dirs = ['tn', 'tp', 'fn', 'fp']
    dir_list = [dir_pointer]
    dir_list += [os.path.join(dir_pointer, x) for x in stem_dirs]
    [make_dir(d) for d in dir_list]
    loop_plot(
        tn_ims, tn_hms, 'True negative',
        os.path.join(dir_pointer, 'tn'), blur=blur_kernel)
    loop_plot(
        tp_ims, tp_hms, 'True positive',
        os.path.join(dir_pointer, 'tp'), blur=blur_kernel)
    loop_plot(
        fn_ims, fn_hms, 'False negative',
        os.path.join(dir_pointer, 'fn'), blur=blur_kernel)
    loop_plot(
        fp_ims, fp_hms, 'False positive',
        os.path.join(dir_pointer, 'fp'), blur=blur_kernel)

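# Hypothetical example call for the heatmap routine above; model_dir and
# selected_ckpts are placeholders (the timestamped checkpoint directory and a
# specific checkpoint path). Heatmap overlays are written under
# config.heatmap_source_images/<model timestamp>/{tn,tp,fn,fp}.
#
# test_vgg16(
#     validation_data=None,
#     model_dir='/path/to/checkpoints/<timestamp>',
#     which_set='val',  # assumed key into config.tf_record_names
#     selected_ckpts='/path/to/checkpoints/<timestamp>/model_ckpt')
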
def test_vgg16(validation_data, model_dir, which_set, selected_ckpts=-1):
    config = GEDIconfig()
    if validation_data is None:  # Use globals
        validation_data = os.path.join(
            config.tfrecord_dir, config.tf_record_names[which_set])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir, 'val_%s' % config.max_file))
    else:
        meta_data = np.load(
            '%s_maximum_value.npz' % validation_data.split('.tfrecords')[0])
    label_list = os.path.join(
        config.processed_image_patch_dir,
        'list_of_' + '_'.join(x for x in config.image_prefixes) +
        '_labels.txt')
    with open(label_list) as f:
        file_pointers = [l.rstrip('\n') for l in f.readlines()]

    # Prepare image normalization values
    try:
        max_value = np.max(meta_data['max_array']).astype(np.float32)
    except:
        max_value = np.asarray([config.max_gedi])
    try:
        min_value = np.min(meta_data['min_array']).astype(np.float32)
    except:
        min_value = np.asarray([config.min_gedi])

    # Find model checkpoints
    ckpts, ckpt_names = find_ckpts(config, model_dir)
    ds_dt_stamp = re.split('/', ckpts[0])[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)
    try:
        config = np.load(os.path.join(out_dir, 'meta_info.npy')).item()
        # Make sure this is always at 1
        config.validation_batch = 1
        print('-' * 60)
        print('Loading config meta data for:%s' % out_dir)
        print('-' * 60)
    except:
        print('-' * 60)
        print('Using config from gedi_config.py for model:%s' % out_dir)
        print('-' * 60)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        val_images, val_labels = inputs(
            validation_data,
            1,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=1,
            normalize=config.normalize)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.Vgg16(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(val_images, output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.prob
        preds = tf.argmax(vgg.prob, 1)
        targets = tf.cast(val_labels, dtype=tf.int64)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    print('-' * 60)
    print('Beginning evaluation')
    print('-' * 60)

    if selected_ckpts is not None:
        # Select a specific ckpt
        if selected_ckpts < 0:
            ckpts = ckpts[selected_ckpts:]
        else:
            ckpts = ckpts[:selected_ckpts]

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y = [], [], []
        try:
            # Initialize the graph
            sess = tf.Session(
                config=tf.ConfigProto(allow_soft_placement=True))
            sess.run(
                tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()))

            # Set up exemplar threading
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            saver.restore(sess, c)
            start_time = time.time()
            while not coord.should_stop():
                sc, tyh, ty = sess.run([scores, preds, targets])
                dec_scores = np.append(dec_scores, sc)
                yhat = np.append(yhat, tyh)
                y = np.append(y, ty)
        except tf.errors.OutOfRangeError:
            ckpt_yhat.append(yhat)
            ckpt_y.append(y)
            ckpt_scores.append(dec_scores)
            print('Iteration accuracy: %s' % np.mean(yhat == y))
            print('Iteration pvalue: %.5f' % randomization_test(
                y=y, yhat=yhat))
            print('Batch %d took %.1f seconds' % (
                idx, time.time() - start_time))
        finally:
            coord.request_stop()
        coord.join(threads)
        sess.close()

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpt_names,
        file_pointers=file_pointers)

    # Also save a csv with item/guess pairs
    try:
        trimmed_files = [re.split('/', x)[-1] for x in file_pointers]
        trimmed_files = np.asarray(trimmed_files)
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(
            np.hstack((
                trimmed_files.reshape(-1, 1),
                yhat.reshape(-1, 1),
                dec_scores.reshape(dec_scores.shape[0] // 2, 2))),
            columns=['files', 'guesses', 'score dead', 'score live'])
        df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
        print('Saved csv to: %s' % out_dir)
    except:
        print('X' * 60)
        print('Could not save a spreadsheet of file info')
        print('X' * 60)

    # Plot everything
    try:
        plot_accuracies(
            ckpt_y, ckpt_yhat, config, ckpt_names,
            os.path.join(out_dir, 'validation_accuracies.png'))
        plot_std(
            ckpt_y, ckpt_yhat, ckpt_names,
            os.path.join(out_dir, 'validation_stds.png'))
        plot_cms(
            ckpt_y, ckpt_yhat, config,
            os.path.join(out_dir, 'confusion_matrix.png'))
        plot_pr(
            ckpt_y, ckpt_yhat, ckpt_scores,
            os.path.join(out_dir, 'precision_recall.png'))
        plot_cost(
            os.path.join(out_dir, 'training_loss.npy'), ckpt_names,
            os.path.join(out_dir, 'training_costs.png'))
    except:
        print('X' * 60)
        print('Could not locate the loss numpy')
        print('X' * 60)

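# Hypothetical example call for the tfrecord-based validation routine above.
# Passing validation_data=None falls back to the tfrecord named in
# config.tf_record_names[which_set]; the model directory is a placeholder.
#
# test_vgg16(
#     validation_data=None,
#     model_dir='/path/to/model_checkpoints',
#     which_set='val',  # assumed key into config.tf_record_names
#     selected_ckpts=-1)
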
def test_vgg16(
        model_file,
        trained_svm,
        ims,
        dead_ims=None,
        output_csv='prediction_file',
        training_max=None,
        C=1e-3,
        k_folds=10):
    """Test an SVM you've trained on a new dataset."""
    config = GEDIconfig()
    if ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')
    if dead_ims is None:
        print 'Assuming all of your images are in the ims folder ' + \
            '-- will not derive labels to calculate accuracy.'
    # if not os.path.exists(trained_svm):
    #     raise RuntimeError(
    #         'Cannot find the trained svm model. Check the path you passed.')
    try:
        clf = cPickle.load(open(trained_svm, 'rb'))
        # clf = model_dict['clf']
        # mu = model_dict['mu']
        # sd = model_dict['sd']
    except:
        raise RuntimeError('Cannot find SVM file: %s' % trained_svm)

    if dead_ims is not None:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
        combined_labels = np.concatenate(
            (np.zeros(len(live_files)), np.ones(len(dead_files))))
        combined_files = np.concatenate((live_files, dead_files))
    else:
        live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext))
        combined_labels = None
        combined_files = np.asarray(live_files)
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path, 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' % (
                model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size, name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

            # Setup validation op
            scores = vgg.fc7
            preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []

        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min),
                total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()

    # Save everything
    new_dt_string = re.split('\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    np.savez(
        os.path.join(out_dir, '%s_validation_accuracies' % new_dt_string),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array)

    # Run SVM
    all_scores = np.concatenate(dec_scores)
    # all_scores = (all_scores - mu) / sd
    predictions = clf.predict(all_scores)
    if dead_ims is not None:
        mean_acc = np.mean(predictions == y)
        p_value = randomization_test(y=y, yhat=predictions)
        print 'SVM performance: mean accuracy = %s%%, p = %.5f' % (
            mean_acc, p_value)
        df_col_label = 'true label'
    else:
        mean_acc, p_value = None, None
        y = np.copy(yhat)
        df_col_label = 'Dummy column (no labels supplied)'
    np.savez(
        os.path.join(out_dir, '%s_svm_test_data' % new_dt_string),
        yhat=yhat,
        y=y,
        scores=dec_scores,
        ckpts=ckpts,
        p_value=p_value)

    # Also save a csv with item/guess pairs
    trimmed_files = np.asarray([
        x.split(os.path.sep)[-1]
        for x in np.asarray(ckpt_file_array).ravel()])
    yhat = np.asarray(yhat)
    df = pd.DataFrame(
        np.hstack((
            trimmed_files.reshape(-1, 1),
            yhat.reshape(-1, 1))),
        # y.reshape(-1, 1))),
        columns=['files', 'guesses'])  # , df_col_label])
    df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
    print 'Saved csv to: %s' % out_dir

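# Hypothetical example of scoring a new image folder with a previously trained
# SVM (see the companion training function further below). All paths are
# placeholders; omitting dead_ims skips accuracy/p-value computation and only
# writes out per-file guesses.
#
# test_vgg16(
#     model_file='/path/to/trained_gedi_model/model_ckpt',
#     trained_svm='/path/to/svm_model.pkl',
#     ims='/path/to/images',
#     dead_ims=None,
#     output_csv='prediction_file')
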
def test_placeholder(
        image_path,
        model_file,
        model_meta,
        out_dir,
        n_images=3,
        first_n_images=1,
        debug=True,
        margin=.1,
        autopsy_csv=None,
        C=1,
        k_folds=10,
        embedding_type='tsne',
        autopsy_model='match'):
    config = GEDIconfig()
    assert margin is not None, 'Need a margin for the loss.'
    assert image_path is not None, 'Provide a path to an image directory.'
    assert model_file is not None, 'Provide a path to the model file.'
    try:
        # Load the model's config
        config = np.load(model_meta).item()
    except:
        print 'Could not load model config, falling back to default config.'
        config.model_image_size[-1] = 1
    try:
        # Load autopsy information
        autopsy_data = pd.read_csv(autopsy_csv)
    except IOError:
        print 'Unable to load autopsy file.'
    if not hasattr(config, 'include_GEDI'):
        raise RuntimeError('You need to pass the correct meta file.')
    config.include_GEDI = True
    config.l2_norm = False
    config.dist_fun = 'pearson'
    config.per_batch = False
    config.output_shape = 32
    config.margin = 0.1
    if os.path.isdir(image_path):
        combined_files = np.asarray(
            glob(os.path.join(image_path, '*%s' % config.raw_im_ext)))
    else:
        combined_files = [image_path]
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    # Make output directories if they do not exist
    dt_stamp = re.split('\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(
        config.train_checkpoint, dt_dataset)  # timestamp this run
    out_dir = os.path.join(out_dir, dt_dataset)
    dir_list = [out_dir]
    [tf_fun.make_dir(d) for d in dir_list]

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        images = []
        for idx in range(first_n_images):
            images += [tf.placeholder(
                tf.float32,
                shape=[None] + config.model_image_size,
                name='images_%s' % idx)]

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        if autopsy_model == 'match':
            from models import matching_vgg16 as model_type
            with tf.variable_scope('match'):
                # Build matching model for frame 0
                model_0 = model_type.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                frame_activity = []
                model_activity = model_0.build(
                    images[0],
                    output_shape=config.output_shape,
                    include_GEDI=config.include_GEDI)
                if config.l2_norm:
                    model_activity = [model_activity]
                frame_activity += [model_activity]
            if first_n_images > 1:
                with tf.variable_scope('match', reuse=tf.AUTO_REUSE):
                    # Build matching model for other frames
                    for idx in range(1, len(images)):
                        model_activity = model_0.build(
                            images[idx],
                            output_shape=config.output_shape,
                            include_GEDI=config.include_GEDI)
                        if config.l2_norm:
                            model_activity = tf_fun.l2_normalize(
                                model_activity)
                        frame_activity += [model_activity]
                if config.dist_fun == 'l2':
                    pos = tf_fun.l2_dist(
                        frame_activity[0], frame_activity[1], axis=1)
                    neg = tf_fun.l2_dist(
                        frame_activity[0], frame_activity[2], axis=1)
                elif config.dist_fun == 'pearson':
                    pos = tf_fun.pearson_dist(
                        frame_activity[0], frame_activity[1], axis=1)
                    neg = tf_fun.pearson_dist(
                        frame_activity[0], frame_activity[2], axis=1)
                model_activity = pos - neg  # Store the difference in distances
        elif autopsy_model == 'GEDI' or autopsy_model == 'gedi':
            from models import baseline_vgg16 as model_type
            model = model_type.model_struct(
                vgg16_npy_path=config.gedi_weight_path)
            model.build(images[0], output_shape=config.output_shape)
            model_activity = model.fc7
        else:
            raise NotImplementedError(autopsy_model)

    if config.validation_batch > len(combined_files):
        print (
            'Trimming validation_batch size to %s '
            '(same as # of files).' % len(combined_files))
        config.validation_batch = len(combined_files)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer()))

    # Set up exemplar threading
    if autopsy_model == 'match':
        saver.restore(sess, model_file)
    start_time = time.time()
    num_batches = np.floor(
        len(combined_files) / float(config.validation_batch)).astype(int)
    score_array, file_array = [], []
    for image_batch, file_batch in tqdm(
            image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                config=config,
                first_n_images=first_n_images,
                n_images=n_images),
            total=num_batches):
        for im_head in images:
            feed_dict = {
                im_head: image_batch
            }
            activity = sess.run(model_activity, feed_dict=feed_dict)
        score_array += [activity]
        file_array += [file_batch]
    print 'Image processing %d took %.1f seconds' % (
        idx, time.time() - start_time)
    sess.close()
    score_array = np.concatenate(score_array, axis=0)
    score_array = score_array.reshape(-1, score_array.shape[-1])
    file_array = np.concatenate(file_array, axis=0)

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        score_array=score_array,
        file_array=file_array)

    if first_n_images == 1:
        # Derive pathologies from file names
        pathologies = []
        for f in combined_files:
            sf = f.split(os.path.sep)[-1].split('_')
            line = sf[1]
            # time_col = sf[2]
            well = sf[4]
            disease = autopsy_data[
                np.logical_and(
                    autopsy_data['line'] == line,
                    autopsy_data['wells'] == well)]['type']
            try:
                disease = disease.as_matrix()[0]
            except:
                disease = 'Not_found'
            pathologies += [disease]
        pathologies = np.asarray(pathologies)[:len(score_array)]

        mu = score_array.mean(0)
        sd = score_array.std(0)
        z_score_array = (score_array - mu) / (sd + 1e-4)
        if embedding_type == 'TSNE' or embedding_type == 'tsne':
            emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
        elif embedding_type == 'PCA' or embedding_type == 'pca':
            emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
        elif embedding_type == 'spectral':
            emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
        y = emb.fit_transform(score_array)

        # Do a classification analysis
        labels = np.unique(
            pathologies.reshape(-1, 1), return_inverse=True)[1]

        # Run SVM
        svm = LinearSVC(C=C, dual=False, class_weight='balanced')
        clf = make_pipeline(preprocessing.StandardScaler(), svm)
        predictions = cross_val_predict(clf, score_array, labels, cv=k_folds)
        cv_performance = metrics.accuracy_score(predictions, labels)
        clf.fit(score_array, labels)
        # mu = dec_scores.mean(0)
        # sd = dec_scores.std(0)
        print '%s-fold SVM performance: accuracy = %s%%' % (
            k_folds, np.mean(cv_performance * 100))
        np.savez(
            os.path.join(out_dir, 'svm_data'),
            yhat=score_array,
            y=labels,
            cv_performance=cv_performance,
            # mu=mu,
            # sd=sd,
            C=C)

        # Output csv
        df = pd.DataFrame(
            np.hstack((
                y,
                pathologies.reshape(-1, 1),
                file_array.reshape(-1, 1))),
            columns=['dim1', 'dim2', 'pathology', 'filename'])
        out_name = os.path.join(out_dir, 'raw_embedding.csv')
        df.to_csv(out_name)
        print 'Saved csv to: %s' % out_name
        create_figs(
            emb=emb,
            out_dir=out_dir,
            out_name=out_name,
            embedding_type=embedding_type,
            embedding_name='raw_embedding')

        # Now work on zscored data
        y = emb.fit_transform(z_score_array)

        # Output csv
        df = pd.DataFrame(
            np.hstack((
                y,
                pathologies.reshape(-1, 1),
                file_array.reshape(-1, 1))),
            columns=['dim1', 'dim2', 'pathology', 'filename'])
        out_name = os.path.join(out_dir, 'embedding.csv')
        df.to_csv(out_name)
        print 'Saved csv to: %s' % out_name

        # Create plot
        create_figs(
            emb=emb,
            out_dir=out_dir,
            out_name=out_name,
            embedding_type=embedding_type,
            embedding_name='normalized_embedding')
    else:
        # Do a classification (sign of the score)
        decisions = np.sign(score_array)
        df = pd.DataFrame(
            np.hstack((decisions, score_array)),
            columns=['Decisions', 'Scores'])
        df.to_csv(os.path.join(out_dir, 'tracking_model_scores.csv'))

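# Hypothetical example call for test_placeholder above; all paths are
# placeholders, and model_meta is assumed to point at the .npy config saved
# alongside the matching model checkpoint.
#
# test_placeholder(
#     image_path='/path/to/images',
#     model_file='/path/to/matching_model/model_ckpt',
#     model_meta='/path/to/matching_model/meta_info.npy',
#     out_dir='/path/to/output',
#     autopsy_csv='/path/to/autopsy.csv',
#     autopsy_model='match',
#     embedding_type='tsne')
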
def test_vgg16(
        live_ims,
        dead_ims,
        model_file,
        svm_model='svm_model',
        output_csv='prediction_file',
        training_max=None,
        C=1e-3,
        k_folds=10):
    """Train an SVM for your dataset on GEDI-model encodings."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')
    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate(
        (np.zeros(len(live_files)), np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path, 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.' % (
                model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    if training_max is None:
        training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size, name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

            # Setup validation op
            scores = vgg.fc7
            preds = tf.argmax(vgg.prob, 1)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array = [], [], [], []

        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min),
                total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array)

    # Run SVM
    svm = LinearSVC(C=C, dual=False, class_weight='balanced')
    clf = make_pipeline(preprocessing.StandardScaler(), svm)
    predictions = cross_val_predict(
        clf, np.concatenate(dec_scores), y, cv=k_folds)
    cv_performance = metrics.accuracy_score(predictions, y)
    p_value = randomization_test(y=y, yhat=predictions)
    clf.fit(np.concatenate(dec_scores), y)
    # mu = dec_scores.mean(0)
    # sd = dec_scores.std(0)
    print '%s-fold SVM performance: accuracy = %s%% , p = %.5f' % (
        k_folds, np.mean(cv_performance * 100), p_value)
    np.savez(
        os.path.join(out_dir, 'svm_data'),
        yhat=predictions,
        y=y,
        scores=dec_scores,
        ckpts=ckpts,
        cv_performance=cv_performance,
        p_value=p_value,
        k_folds=k_folds,
        # mu=mu,
        # sd=sd,
        C=C)

    # Also save a csv with item/guess pairs
    try:
        trimmed_files = [re.split('/', x)[-1] for x in combined_files]
        trimmed_files = np.asarray(trimmed_files)
        dec_scores = np.asarray(dec_scores)
        yhat = np.asarray(yhat)
        df = pd.DataFrame(
            np.hstack((
                trimmed_files.reshape(-1, 1),
                yhat.reshape(-1, 1),
                y.reshape(-1, 1))),
            columns=['files', 'guesses', 'true label'])
        df.to_csv(os.path.join(out_dir, 'prediction_file.csv'))
        print 'Saved csv to: %s' % out_dir
    except:
        print 'X' * 60
        print 'Could not save a spreadsheet of file info'
        print 'X' * 60

    # save the classifier
    with open('%s.pkl' % svm_model, 'wb') as fid:
        # model_dict = {
        #     'model': clf,
        #     'mu': mu,
        #     'sd': sd
        # }
        cPickle.dump(clf, fid)
    print 'Saved svm model to: %s.pkl' % svm_model

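# Hypothetical example of training the fc7-encoding SVM above; paths are
# placeholders. The fitted pipeline is pickled to '<svm_model>.pkl' and can be
# passed as trained_svm to the SVM testing function defined earlier.
#
# test_vgg16(
#     live_ims='/path/to/live_images',
#     dead_ims='/path/to/dead_images',
#     model_file='/path/to/trained_gedi_model/model_ckpt',
#     svm_model='svm_model',
#     C=1e-3,
#     k_folds=10)
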
def test_vgg16(
        image_dir,
        model_file,
        autopsy_csv=None,
        autopsy_path=None,
        output_csv='prediction_file',
        target_layer='fc7',
        save_npy=False,
        shuffle_images=True,
        embedding_type='PCA'):
    """Testing function for pretrained vgg16."""
    assert autopsy_csv is not None, 'You must pass an autopsy file name.'
    assert autopsy_path is not None, 'You must pass an autopsy path.'

    # Load autopsy information
    autopsy_data = pd.read_csv(os.path.join(autopsy_path, autopsy_csv))

    # Load config and begin preparing data
    config = GEDIconfig()
    if image_dir is None:
        raise RuntimeError(
            'You need to supply a directory path to the images.')
    combined_files = np.asarray(
        glob(os.path.join(image_dir, '*%s' % config.raw_im_ext)))
    if shuffle_images:
        combined_files = combined_files[np.random.permutation(
            len(combined_files))]
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')

    meta_file_pointer = os.path.join(
        model_file.split('/model')[0], 'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file. '
            'Download this from the link described in the README.md.')
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32, shape=[None] + config.model_image_size, name='images')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(images, output_shape=config.output_shape)

            # Setup validation op
            scores = vgg[target_layer]
            preds = tf.argmax(vgg.prob, 1)

    # Derive pathologies from file names
    pathologies = []
    for f in combined_files:
        sf = f.split('/')[-1].split('_')
        sf = '_'.join(sf[1:4])
        it_path = autopsy_data[
            autopsy_data['plate_well_neuron'] == sf]['disease']
        if not len(it_path):
            it_path = 'Absent'
        else:
            it_path = it_path.as_matrix()[0]
        pathologies += [it_path]
    pathologies = np.asarray(pathologies)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_scores, ckpt_file_array = [], [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60

    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch size to %s.' % len(combined_files)
        config.validation_batch = len(combined_files)

    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, file_array = [], [], []

        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(config.validation_batch)).astype(int)
        for image_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    config=config,
                    training_max=training_max,
                    training_min=training_min),
                total=num_batches):
            feed_dict = {images: image_batch}
            sc, tyh = sess.run([scores, preds], feed_dict=feed_dict)
            dec_scores += [sc]
            yhat += [tyh]
            file_array += [file_batch]
        ckpt_yhat.append(yhat)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time)
        sess.close()

    # Create and plot an embedding
    im_path_map = pathologies[:num_batches * config.validation_batch]
    dec_scores = np.concatenate(dec_scores)
    mu = dec_scores.mean(0)[None, :]
    sd = dec_scores.std(0)[None, :]
    dec_scores = (dec_scores - mu) / sd
    yhat = np.concatenate(yhat)
    file_array = np.concatenate(file_array)
    if embedding_type == 'TSNE' or embedding_type == 'tsne':
        emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
    elif embedding_type == 'PCA' or embedding_type == 'pca':
        emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
    elif embedding_type == 'spectral':
        emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
    y = emb.fit_transform(dec_scores)

    # Output csv
    df = pd.DataFrame(
        np.hstack((
            y,
            im_path_map.reshape(-1, 1),
            file_array.reshape(-1, 1))),
        columns=['D1', 'D2', 'pathology', 'filename'])
    out_name = os.path.join(out_dir, 'embedding.csv')
    df.to_csv(out_name)
    print 'Saved csv to: %s' % out_name

    # Create plot
    f, ax = plt.subplots()
    unique_cats = np.unique(im_path_map)
    h = []
    for idx, cat in enumerate(unique_cats):
        h += [plt.scatter(
            y[im_path_map == cat, 0],
            y[im_path_map == cat, 1],
            c=plt.cm.Spectral(idx * 1000))]
    plt.legend(h, unique_cats)
    plt.axis('tight')
    plt.show()
    plt.savefig('embedding.png')
    plt.close(f)

    # Save everything
    if save_npy:
        np.savez(
            os.path.join(out_dir, 'validation_accuracies'),
            ckpt_yhat=ckpt_yhat,
            ckpt_scores=ckpt_scores,
            ckpt_names=ckpts,
            combined_files=ckpt_file_array)

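# Hypothetical example call for the embedding/visualization variant above; the
# autopsy CSV column names ('plate_well_neuron', 'disease') come from the code,
# while the file names and paths below are placeholders.
#
# test_vgg16(
#     image_dir='/path/to/images',
#     model_file='/path/to/trained_gedi_model/model_ckpt',
#     autopsy_csv='autopsy_info.csv',
#     autopsy_path='/path/to/autopsy_dir',
#     target_layer='fc7',
#     embedding_type='PCA')
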