def visualize_model(
        live_ims,
        dead_ims,
        model_file,
        output_folder,
        num_channels,
        smooth_iterations=50,
        untargeted=False,
        viz='none',
        per_timepoint=True):
    """Visualize GEDI-model gradients (smoothed saliency maps) for live/dead images."""
    config = GEDIconfig()
    if live_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the live images.')
    if dead_ims is None:
        raise RuntimeError(
            'You need to supply a directory path to the dead images.')
    live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext))
    dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext))
    combined_labels = np.concatenate((
        np.zeros(len(live_files)),
        np.ones(len(dead_files))))
    combined_files = np.concatenate((live_files, dead_files))
    if len(combined_files) == 0:
        raise RuntimeError('Could not find any files. Check your image path.')
    config = GEDIconfig()
    model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1])
    meta_file_pointer = os.path.join(
        model_file_path,
        'train_maximum_value.npz')
    if not os.path.exists(meta_file_pointer):
        raise RuntimeError(
            'Cannot find the training data meta file: train_maximum_value.npz. '
            'Closest I could find from directory %s was %s. '
            'Download this from the link described in the README.md.'
            % (model_file_path, glob(os.path.join(model_file_path, '*.npz'))))
    meta_data = np.load(meta_file_pointer)

    # Prepare image normalization values
    training_max = np.max(meta_data['max_array']).astype(np.float32)
    training_min = np.min(meta_data['min_array']).astype(np.float32)

    # Find model checkpoints
    ds_dt_stamp = re.split('/', model_file)[-2]
    out_dir = os.path.join(config.results, ds_dt_stamp)

    # Make output directories if they do not exist
    dir_list = [config.results, out_dir]
    [make_dir(d) for d in dir_list]

    # Prepare data on CPU
    images = tf.placeholder(
        tf.float32,
        shape=[None] + config.model_image_size,
        name='images')
    labels = tf.placeholder(
        tf.int64,
        shape=[None],
        name='labels')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn'):
            vgg = vgg16.model_struct(
                vgg16_npy_path=config.vgg16_weight_path,
                fine_tune_layers=config.fine_tune_layers)
            vgg.build(
                images,
                output_shape=config.output_shape)

        # Setup validation op
        scores = vgg.fc7
        preds = tf.argmax(vgg.prob, 1)
        activity_pattern = vgg.fc8
        if not untargeted:
            oh_labels = tf.one_hot(labels, config.output_shape)
            activity_pattern *= oh_labels
        grad_image = tf.gradients(activity_pattern, images)

    # Set up saver
    saver = tf.train.Saver(tf.global_variables())

    # Loop through each checkpoint then test the entire validation set
    ckpts = [model_file]
    ckpt_yhat, ckpt_y, ckpt_scores = [], [], []
    ckpt_file_array, ckpt_viz_images = [], []
    print '-' * 60
    print 'Beginning evaluation'
    print '-' * 60
    if config.validation_batch > len(combined_files):
        print 'Trimming validation_batch to %s (same as # of files).' % len(
            combined_files)
        config.validation_batch = len(combined_files)

    count = 0
    for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'):
        dec_scores, yhat, y, file_array, viz_images = [], [], [], [], []

        # Initialize the graph
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(
            tf.group(
                tf.global_variables_initializer(),
                tf.local_variables_initializer()))

        # Set up exemplar threading
        saver.restore(sess, c)
        start_time = time.time()
        num_batches = np.floor(
            len(combined_files) / float(
                config.validation_batch)).astype(int)
        for image_batch, label_batch, file_batch in tqdm(
                image_batcher(
                    start=0,
                    num_batches=num_batches,
                    images=combined_files,
                    labels=combined_labels,
                    config=config,
                    training_max=training_max,
                    training_min=training_min,
                    num_channels=num_channels,
                    per_timepoint=per_timepoint),
                total=num_batches):
            feed_dict = {
                images: image_batch,
                labels: label_batch
            }
            it_grads = np.zeros((image_batch.shape))
            sc, tyh = sess.run(
                [scores, preds],
                feed_dict=feed_dict)
            # Average gradients over noisy copies of the batch (SmoothGrad-style).
            # Note: use '_' here so the outer checkpoint index 'idx' is not shadowed.
            for _ in range(smooth_iterations):
                feed_dict = {
                    images: add_noise(image_batch),
                    labels: label_batch
                }
                it_grad = sess.run(
                    grad_image,
                    feed_dict=feed_dict)
                it_grads += it_grad[0]
            it_grads /= smooth_iterations  # Mean across iterations
            it_grads = visualization_function(it_grads, viz)

            # Save each grad individually
            for grad_i, pred_i, file_i, label_i in zip(
                    it_grads, tyh, file_batch, label_batch):
                out_pointer = os.path.join(
                    output_folder,
                    file_i.split(os.path.sep)[-1])
                out_pointer = out_pointer.split('.')[0] + '.png'
                f = plt.figure()
                plt.imshow(grad_i)
                plt.title('Pred=%s, label=%s' % (pred_i, label_i))
                plt.savefig(out_pointer)
                plt.close(f)

            # Plot a mosaic of the grads
            if viz == 'none':
                pos_grads = normalize(np.maximum(it_grads, 0))
                neg_grads = normalize(np.minimum(it_grads, 0))
                alpha_mosaic(
                    image_batch,
                    pos_grads,
                    'pos_batch_%s.pdf' % count,
                    title='Positive gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
                alpha_mosaic(
                    image_batch,
                    neg_grads,
                    'neg_batch_%s.pdf' % count,
                    title='Negative gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            else:
                alpha_mosaic(
                    image_batch,
                    it_grads,
                    'batch_%s.pdf' % count,
                    title='Gradient overlays.',
                    rc=1,
                    cc=len(image_batch),
                    cmap=plt.cm.Reds)
            count += 1

            # Store the results
            dec_scores += [sc]
            yhat = np.append(yhat, tyh)
            y = np.append(y, label_batch)
            file_array = np.append(file_array, file_batch)
            viz_images += [it_grads]
        ckpt_yhat.append(yhat)
        ckpt_y.append(y)
        ckpt_scores.append(dec_scores)
        ckpt_file_array.append(file_array)
        ckpt_viz_images.append(viz_images)
        print 'Batch %d took %.1f seconds' % (
            idx, time.time() - start_time)
        sess.close()

    # Save everything
    np.savez(
        os.path.join(out_dir, 'validation_accuracies'),
        ckpt_yhat=ckpt_yhat,
        ckpt_y=ckpt_y,
        ckpt_scores=ckpt_scores,
        ckpt_names=ckpts,
        combined_files=ckpt_file_array,
        ckpt_viz_images=ckpt_viz_images)
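
# The SmoothGrad loop in visualize_model averages gradients over
# `smooth_iterations` noisy copies of each batch. `add_noise` is imported from
# elsewhere in this repo and its exact behavior is not shown in this section;
# the helper below is only a hedged illustration of the usual SmoothGrad
# perturbation (Gaussian noise scaled to a fraction of the image dynamic
# range), not the repo's implementation.
def _smoothgrad_noise_sketch(image_batch, noise_fraction=0.15):
    """Hypothetical stand-in for add_noise: perturb a numpy batch with Gaussian noise."""
    dynamic_range = image_batch.max() - image_batch.min()
    sigma = noise_fraction * dynamic_range
    noise = np.random.normal(loc=0., scale=sigma, size=image_batch.shape)
    return (image_batch + noise).astype(image_batch.dtype)
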
def train_vgg16(train_dir=None, validation_dir=None): config = GEDIconfig() if train_dir is None: # Use globals train_data = os.path.join( config.tfrecord_dir, config.tf_record_names['train']) meta_data = np.load( os.path.join( config.tfrecord_dir, '%s_%s' % (config.tvt_flags[0], config.max_file))) else: meta_data = np.load( os.path.join( train_dir, '%s_%s' % (config.tvt_flags[0], config.max_file))) # Prepare image normalization values if config.max_gedi is None: max_value = np.nanmax(meta_data['max_array']).astype(np.float32) if max_value == 0: max_value = None print 'Derived max value is 0' else: print 'Normalizing with empirical max.' if 'min_array' in meta_data.keys(): min_value = np.min(meta_data['min_array']).astype(np.float32) print 'Normalizing with empirical min.' else: min_value = None print 'Not normalizing with a min.' else: max_value = config.max_gedi min_value = config.min_gedi ratio = meta_data['ratio'] if config.encode_time_of_death: tod = pd.read_csv(config.encode_time_of_death) tod_data = tod['dead_tp'].as_matrix() mask = np.isnan(tod_data).astype(int) + ( tod['plate_well_neuron'] == 'empty').as_matrix().astype(int) tod_data = tod_data[mask == 0] tod_data = tod_data[tod_data > config.mask_timepoint_value] config.output_shape = len(np.unique(tod_data)) ratio = class_weight.compute_class_weight( 'balanced', np.sort(np.unique(tod_data)), tod_data) flip_ratio = False else: flip_ratio = True print 'Ratio is: %s' % ratio if validation_dir is None: # Use globals validation_data = os.path.join( config.tfrecord_dir, config.tf_record_names['val']) elif validation_dir is False: pass # Do not use validation data during training # Make output directories if they do not exist dt_stamp = re.split( '\.', str(datetime.now()))[0].\ replace(' ', '_').replace(':', '_').replace('-', '_') dt_dataset = config.which_dataset + '_' + dt_stamp + '/' config.train_checkpoint = os.path.join( config.train_checkpoint, dt_dataset) # timestamp this run out_dir = os.path.join(config.results, dt_dataset) dir_list = [ config.train_checkpoint, config.train_summaries, config.results, out_dir] [make_dir(d) for d in dir_list] # im_shape = get_image_size(config) im_shape = config.gedi_image_size print '-'*60 print('Training model:' + dt_dataset) print '-'*60 # Prepare data on CPU assert os.path.exists(train_data) assert os.path.exists(validation_data) assert os.path.exists(config.vgg16_weight_path) with tf.device('/cpu:0'): train_images, train_labels, train_gedi_images = inputs( train_data, config.train_batch, im_shape, config.model_image_size[:2], max_value=max_value, min_value=min_value, train=config.data_augmentations, num_epochs=config.epochs, normalize=config.normalize, return_gedi=config.include_GEDI_in_tfrecords, return_extra_gfp=config.extra_image, return_GEDI_derivative=True) val_images, val_labels, val_gedi_images = inputs( validation_data, config.validation_batch, im_shape, config.model_image_size[:2], max_value=max_value, min_value=min_value, num_epochs=config.epochs, normalize=config.normalize, return_gedi=config.include_GEDI_in_tfrecords, return_extra_gfp=config.extra_image, return_GEDI_derivative=True) if config.include_GEDI_in_tfrecords: extra_im_name = 'GEDI at current timepoint' else: extra_im_name = 'next gfp timepoint' tf.summary.image('train images', train_images) tf.summary.image('validation images', val_images) tf.summary.image('train %s' % extra_im_name, train_gedi_images) tf.summary.image('validation %s' % extra_im_name, val_gedi_images) # Prepare model on GPU with tf.device('/gpu:0'): 
with tf.variable_scope('cnn') as scope: if config.ordinal_classification is None: vgg_output = 2 # Sign of derivative (inf norm) train_labels = tf.cast(tf.sign(train_labels), tf.int32) val_labels = tf.cast(tf.sign(val_labels), tf.int32) elif config.ordinal_classification == 'regression': vgg_output = 1 else: raise RuntimeError( 'config.ordinal_classification must be sign or regression.' ) vgg = vgg16.model_struct() train_mode = tf.get_variable(name='training', initializer=True) # Mask NAN images from loss image_nan = tf.reduce_sum( tf.cast(tf.is_nan(train_images), tf.float32), reduction_indices=[1, 2, 3]) gedi_nan = tf.reduce_sum( tf.cast(tf.is_nan(train_gedi_images), tf.float32), reduction_indices=[1, 2, 3], keep_dims=True) image_mask = tf.cast(tf.equal(image_nan, 0.), tf.float32) gedi_nan = tf.cast(tf.equal(gedi_nan, 0.), tf.float32) train_images = tf.where( tf.is_nan(train_images), tf.zeros_like(train_images), train_images) train_gedi_images = tf.where( tf.is_nan(train_gedi_images), tf.zeros_like(train_gedi_images), train_gedi_images) train_images = tf.concat([train_images, train_images, train_images], axis=3) val_images = tf.concat([val_images, val_images, val_images], axis=3) vgg.build( train_images, output_shape=vgg_output, train_mode=train_mode, batchnorm=config.batchnorm_layers) # Prepare the cost function if config.ordinal_classification is None: # Encode y w/ k-hot and yhat w/ sigmoid ce. units capture dist. cost = softmax_cost( vgg.fc8, train_labels, mask=image_mask) elif config.ordinal_classification == 'regression': cost = tf.nn.l2_loss(tf.squeeze(vgg.fc8) - train_labels) class_loss = cost tf.summary.scalar("cce cost", cost) # Weight decay if config.wd_layers is not None: _, l2_wd_layers = fine_tune_prepare_layers( tf.trainable_variables(), config.wd_layers) l2_wd_layers = [ x for x in l2_wd_layers if 'biases' not in x.name] if len(l2_wd_layers) > 0: cost += (config.wd_penalty * tf.add_n( [tf.nn.l2_loss(x) for x in l2_wd_layers])) # Optimize train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost) if config.ordinal_classification is None: train_accuracy = class_accuracy( vgg.prob, train_labels) # training accuracy elif config.ordinal_classification == 'regression': train_accuracy = tf.nn.l2_loss( tf.squeeze(vgg.fc8) - train_labels) tf.summary.scalar("training accuracy", train_accuracy) # Setup validation op if validation_data is not False: scope.reuse_variables() # Validation graph is the same as training except no batchnorm val_vgg = vgg16.model_struct( fine_tune_layers=config.fine_tune_layers) val_vgg.build(val_images, output_shape=vgg_output) # Calculate validation accuracy if config.ordinal_classification is None: val_accuracy = class_accuracy(val_vgg.prob, val_labels) elif config.ordinal_classification == 'regression': val_accuracy = tf.nn.l2_loss(tf.squeeze(val_vgg.fc8) - val_labels) tf.summary.scalar("validation accuracy", val_accuracy) # Set up summaries and saver saver = tf.train.Saver( tf.global_variables(), max_to_keep=config.keep_checkpoints) summary_op = tf.summary.merge_all() # Initialize the graph sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # Need to initialize both of these if supplying num_epochs to inputs sess.run(tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) summary_dir = os.path.join( config.train_summaries, config.which_dataset + '_' + dt_stamp) summary_writer = tf.summary.FileWriter(summary_dir, sess.graph) # Set up exemplar threading coord = tf.train.Coordinator() threads = 
tf.train.start_queue_runners(sess=sess, coord=coord) # Restore model if requested if config.restore_path is not None: print '-' * 60 print 'Restoring from a previous model: %s' % config.restore_path print '-' * 60 saver.restore(sess, config.restore_path) # Start training loop np.save(out_dir + 'meta_info', config) step, losses = 0, [] # val_max = 0 try: # print response while not coord.should_stop(): start_time = time.time() _, loss_value, train_acc = sess.run( [train_op, cost, train_accuracy]) losses.append(loss_value) duration = time.time() - start_time if np.isnan(loss_value).sum(): import ipdb;ipdb.set_trace() assert not np.isnan(loss_value), 'Model diverged with loss = NaN' if step % config.validation_steps == 0: if validation_data is not False: _, val_acc = sess.run([train_op, val_accuracy]) else: val_acc -= 1 # Store every checkpoint # Summaries summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Training status and validation accuracy format_str = ( '%s: step %d, loss = %.2f (%.1f examples/sec; ' '%.3f sec/batch) | Training accuracy = %s | ' 'Training %s = %s | Training class loss = %s | ' 'Validation accuracy = %s | Validation %s = %s | ' 'logdir = %s') print (format_str % ( datetime.now(), step, loss_value, config.train_batch / duration, float(duration), train_acc, extra_im_name, 0., 0., val_acc, extra_im_name, 0., summary_dir)) # Save the model checkpoint if it's the best yet if 1: # val_acc >= val_max: saver.save( sess, os.path.join( config.train_checkpoint, 'model_' + str(step) + '.ckpt'), global_step=step) # Store the new max validation accuracy # val_max = val_acc else: # Training status format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; ' '%.3f sec/batch) | Training accuracy = %s | ' 'Training %s = %s | Training class loss = %s') print (format_str % (datetime.now(), step, loss_value, config.train_batch / duration, float(duration), train_acc, extra_im_name, 0., 0.)) # End iteration step += 1 except tf.errors.OutOfRangeError: print('Done training for %d epochs, %d steps.' % (config.epochs, step)) finally: coord.request_stop() np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses) coord.join(threads) sess.close()
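
# train_vgg16 masks NaN-containing images out of the loss via
# softmax_cost(vgg.fc8, train_labels, mask=image_mask). softmax_cost is
# defined elsewhere in the repo; the sketch below only illustrates one way a
# masked sparse cross-entropy could look in TF1-style code, as an assumption
# rather than the repo's actual implementation.
def _masked_softmax_cost_sketch(logits, labels, mask=None):
    """Hypothetical masked sparse softmax cross-entropy over a batch."""
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    if mask is not None:
        ce *= mask  # Zero-out examples flagged as invalid (e.g. NaN images)
        return tf.reduce_sum(ce) / tf.maximum(tf.reduce_sum(mask), 1.)
    return tf.reduce_mean(ce)
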
def test_vgg16(image_dir, model_file, output_csv='prediction_file', training_max=None): print(image_dir) # tf.set_random_seed(0) config = GEDIconfig() if image_dir is None: raise RuntimeError( 'You need to supply a directory path to the images.') combined_files = np.asarray( glob(os.path.join(image_dir, '*%s' % config.raw_im_ext))) if len(combined_files) == 0: raise RuntimeError('Could not find any files. Check your image path.') config = GEDIconfig() model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1]) print('model file path', model_file_path) meta_file_pointer = os.path.join(model_file_path, 'train_maximum_value.npz') if not os.path.exists(meta_file_pointer): raise RuntimeError( 'Cannot find the training data meta file: train_maximum_value.npz' 'Closest I could find from directory %s was %s.' 'Download this from the link described in the README.md.' % (model_file_path, glob(os.path.join(model_file_path, '*.npz')))) meta_data = np.load(meta_file_pointer) # Prepare image normalization values if training_max is None: training_max = np.max(meta_data['max_array']).astype(np.float32) training_min = np.min(meta_data['min_array']).astype(np.float32) # Find model checkpoints ds_dt_stamp = re.split('/', model_file)[-2] out_dir = os.path.join(config.results, ds_dt_stamp) print('out_dir', out_dir) # Make output directories if they do not exist dir_list = [config.results, out_dir] [make_dir(d) for d in dir_list] # Prepare data on CPU if config.model_image_size[-1] < 3: print('*' * 60) print('Warning: model is expecting a H/W/1 image. ' 'Do you mean to set the last dimension of ' 'config.model_image_size to 3?') print('*' * 60) images = tf.placeholder(tf.float32, shape=[None] + config.model_image_size, name='images') # Prepare model on GPU with tf.device('/gpu:0'): with tf.variable_scope('cnn'): vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path, fine_tune_layers=config.fine_tune_layers) vgg.build(images, output_shape=config.output_shape) # Setup validation op scores = vgg.prob preds = tf.argmax(vgg.prob, 1) # Set up saver saver = tf.train.Saver(tf.global_variables()) # Loop through each checkpoint then test the entire validation set ckpts = [model_file] ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], [] print('-' * 60) print('Beginning evaluation') print('-' * 60) if config.validation_batch > len(combined_files): print('Trimming validation_batch size to %s (same as # of files).' 
% len(combined_files)) config.validation_batch = len(combined_files) for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'): dec_scores, yhat, file_array = [], [], [] # Initialize the graph # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with tf.Session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: sess.run( tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) # Set up exemplar threading saver.restore(sess, c) start_time = time.time() num_batches = np.floor( len(combined_files) / float(config.validation_batch)).astype(int) for image_batch, file_batch in tqdm(image_batcher( start=0, num_batches=num_batches, images=combined_files, config=config, training_max=training_max, training_min=training_min), total=num_batches): feed_dict = {images: image_batch} sc, tyh = sess.run([scores, preds], feed_dict=feed_dict) dec_scores = np.append(dec_scores, sc) yhat = np.append(yhat, tyh) file_array = np.append(file_array, file_batch) ckpt_yhat.append(yhat) ckpt_scores.append(dec_scores) ckpt_file_array.append(file_array) print('Batch %d took %.1f seconds' % (idx, time.time() - start_time)) # sess.close() # Save everything print('Save npz.') print(os.path.join(out_dir, 'validation_accuracies')) np.savez(os.path.join(out_dir, 'validation_accuracies'), ckpt_yhat=ckpt_yhat, ckpt_scores=ckpt_scores, ckpt_names=ckpts, combined_files=ckpt_file_array) # Also save a csv with item/guess pairs try: dec_scores = np.asarray(dec_scores) yhat = np.asarray(yhat) df = pd.DataFrame(np.hstack( (np.asarray(ckpt_file_array).reshape(-1, 1), yhat.reshape(-1, 1), dec_scores.reshape(dec_scores.shape[0] // 2, 2))), columns=[ 'files', 'live_guesses', 'classifier score dead', 'classifier score live' ]) output_name = image_dir.split('/')[-1] if output_name is None or len(output_name) == 0: output_name = 'output' df.to_csv(os.path.join(out_dir, '%s.csv' % output_name)) print('Saved csv to: %s' % os.path.join(out_dir, '%s.csv' % output_name)) except: print('X' * 60) print('Could not save a spreadsheet of file info') print('X' * 60) # Plot everything try: plot_accuracies(ckpt_y, ckpt_yhat, config, ckpts, os.path.join(out_dir, 'validation_accuracies.png')) plot_std(ckpt_y, ckpt_yhat, ckpts, os.path.join(out_dir, 'validation_stds.png')) plot_cms(ckpt_y, ckpt_yhat, config, os.path.join(out_dir, 'confusion_matrix.png')) plot_pr(ckpt_y, ckpt_yhat, ckpt_scores, os.path.join(out_dir, 'precision_recall.png')) # plot_cost( # os.path.join(out_dir, 'training_loss.npy'), ckpts, # os.path.join(out_dir, 'training_costs.png')) except: print('X' * 60) print('Could not locate the loss numpy') print('X' * 60)
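
# image_batcher (imported elsewhere in this repo) yields image batches and
# their filenames, normalized with the training min/max loaded from
# train_maximum_value.npz. Its file loading is not shown here; the helper
# below only restates the min/max normalization implied by training_max and
# training_min on an already-loaded numpy batch, as an illustrative assumption.
def _normalize_batch_sketch(image_batch, training_max, training_min):
    """Hypothetical rescaling of a batch to the training dynamic range."""
    return (image_batch - training_min) / (training_max - training_min)
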
def train_model(train_dir=None, validation_dir=None): config = GEDIconfig() if train_dir is None: # Use globals train_data = os.path.join(config.tfrecord_dir, config.tf_record_names['train']) meta_data = np.load( os.path.join(config.tfrecord_dir, '%s_%s' % (config.tvt_flags[0], config.max_file))) else: meta_data = np.load( os.path.join(train_dir, '%s_%s' % (config.tvt_flags[0], config.max_file))) # Prepare image normalization values if config.max_gedi is None: max_value = np.nanmax(meta_data['max_array']).astype(np.float32) if max_value == 0: max_value = None print 'Derived max value is 0' else: print 'Normalizing with empirical max.' if 'min_array' in meta_data.keys(): min_value = np.min(meta_data['min_array']).astype(np.float32) print 'Normalizing with empirical min.' else: min_value = None print 'Not normalizing with a min.' else: max_value = config.max_gedi min_value = config.min_gedi ratio = meta_data['ratio'] print 'Ratio is: %s' % ratio if validation_dir is None: # Use globals validation_data = os.path.join(config.tfrecord_dir, config.tf_record_names['val']) elif validation_dir is False: pass # Do not use validation data during training # Make output directories if they do not exist dt_stamp = re.split( '\.', str(datetime.now()))[0].\ replace(' ', '_').replace(':', '_').replace('-', '_') dt_dataset = config.which_dataset + '_' + dt_stamp + '/' config.train_checkpoint = os.path.join(config.train_checkpoint, dt_dataset) # timestamp this run out_dir = os.path.join(config.results, dt_dataset) dir_list = [ config.train_checkpoint, config.train_summaries, config.results, out_dir ] [make_dir(d) for d in dir_list] # im_shape = get_image_size(config) im_shape = config.gedi_image_size print '-' * 60 print('Training model:' + dt_dataset) print '-' * 60 # Prepare data on CPU assert os.path.exists(train_data) assert os.path.exists(validation_data) assert os.path.exists(config.vgg16_weight_path) with tf.device('/cpu:0'): train_images_0, train_images_1, train_labels, train_times = inputs( train_data, config.train_batch, im_shape, config.model_image_size[:2], max_value=max_value, min_value=min_value, train=config.data_augmentations, num_epochs=config.epochs, normalize=config.normalize, return_filename=True) val_images_0, val_images_1, val_labels, val_times = inputs( validation_data, config.validation_batch, im_shape, config.model_image_size[:2], max_value=max_value, min_value=min_value, num_epochs=config.epochs, normalize=config.normalize, return_filename=True) tf.summary.image('train image frame 0', train_images_0) tf.summary.image('train image frame 1', train_images_1) tf.summary.image('validation image frame 0', val_images_0) tf.summary.image('validation image frame 1', val_images_1) # Prepare model on GPU with tf.device('/gpu:0'): with tf.variable_scope('gedi'): # Build training GEDI model for frame 0 vgg_train_mode = tf.get_variable(name='vgg_training', initializer=False) gedi_model_0 = vgg16.model_struct( vgg16_npy_path=config.gedi_weight_path, trainable=False) gedi_model_0.build(prep_images_for_gedi(train_images_0), output_shape=2, train_mode=vgg_train_mode) gedi_scores_0 = gedi_model_0.fc7 with tf.variable_scope('match'): # Build matching model for frame 0 model_0 = matching_gedi.model_struct() model_0.build(train_images_0) # Build frame 0 vector frame_0 = tf.concat([gedi_scores_0, model_0.output], axis=-1) # Build output layer if config.matching_combine == 'concatenate': output_shape = [int(frame_0.get_shape()[-1]) * 2, 2] elif config.matching_combine == 'subtract': output_shape = 
[int(frame_0.get_shape()[-1]), 2] else: raise RuntimeError # Build GEDI model for frame 1 with tf.variable_scope('gedi', reuse=True): gedi_model_1 = vgg16.model_struct( vgg16_npy_path=config.gedi_weight_path, trainable=False) gedi_model_1.build(prep_images_for_gedi(train_images_1), output_shape=2, train_mode=vgg_train_mode) gedi_scores_1 = gedi_model_1.fc7 with tf.variable_scope('match', reuse=True): # Build matching model for frame 1 model_1 = matching_gedi.model_struct() model_1.build(train_images_1) # Build frame 0 and frame 1 vectors frame_1 = tf.concat([gedi_scores_1, model_1.output], axis=-1) with tf.variable_scope('output'): # Concatenate or subtract if config.matching_combine == 'concatenate': output_scores = tf.concat([frame_0, frame_1], axis=-1) elif config.matching_combine == 'subtract': output_scores = frame_0 - frame_1 else: raise NotImplementedError # Build output layer output_shape = [int(output_scores.get_shape()[-1]), 2] output_weights = tf.get_variable( name='output_weights', shape=output_shape, initializer=tf.contrib.layers.xavier_initializer( uniform=False)) output_bias = tf.get_variable(name='output_bias', initializer=tf.truncated_normal( [output_shape[-1]], .0, .001)) decision_logits = tf.nn.bias_add( tf.matmul(output_scores, output_weights), output_bias) train_soft_decisions = tf.nn.softmax(decision_logits) cost = softmax_cost(decision_logits, train_labels) tf.summary.scalar("cce loss", cost) cost += tf.nn.l2_loss(output_weights) # Weight decay if config.wd_layers is not None: _, l2_wd_layers = fine_tune_prepare_layers( tf.trainable_variables(), config.wd_layers) l2_wd_layers = [ x for x in l2_wd_layers if 'biases' not in x.name ] if len(l2_wd_layers) > 0: cost += (config.wd_penalty * tf.add_n([tf.nn.l2_loss(x) for x in l2_wd_layers])) # Optimize train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost) train_accuracy = class_accuracy(train_soft_decisions, train_labels) # training accuracy tf.summary.scalar("training accuracy", train_accuracy) # Setup validation op if validation_data is not False: with tf.variable_scope('gedi', reuse=tf.AUTO_REUSE): # FIX THIS # Validation graph is the same as training except no batchnorm val_gedi_model_0 = vgg16.model_struct( vgg16_npy_path=config.gedi_weight_path) val_gedi_model_0.build(prep_images_for_gedi(val_images_0), output_shape=2, train_mode=vgg_train_mode) val_gedi_scores_0 = val_gedi_model_0.fc7 # Build GEDI model for frame 1 val_gedi_model_1 = vgg16.model_struct( vgg16_npy_path=config.gedi_weight_path) val_gedi_model_1.build(prep_images_for_gedi(val_images_1), output_shape=2, train_mode=vgg_train_mode) val_gedi_scores_1 = val_gedi_model_1.fc7 with tf.variable_scope('match', reuse=tf.AUTO_REUSE): # Build matching model for frame 0 val_model_0 = matching_gedi.model_struct() val_model_0.build(val_images_0) # Build matching model for frame 1 val_model_1 = matching_gedi.model_struct() val_model_1.build(val_images_1) # Build frame 0 and frame 1 vectors val_frame_0 = tf.concat([val_gedi_scores_0, val_model_0.output], axis=-1) val_frame_1 = tf.concat([val_gedi_scores_1, val_model_1.output], axis=-1) # Concatenate or subtract if config.matching_combine == 'concatenate': val_output_scores = tf.concat([val_frame_0, val_frame_1], axis=-1) elif config.matching_combine == 'subtract': val_output_scores = val_frame_0 - val_frame_1 else: raise NotImplementedError with tf.variable_scope('output', reuse=tf.AUTO_REUSE): # Build output layer val_output_weights = tf.get_variable( name='val_output_weights', shape=output_shape, 
trainable=False, initializer=tf.contrib.layers.xavier_initializer( uniform=False)) val_output_bias = tf.get_variable( name='output_bias', trainable=False, initializer=tf.truncated_normal([output_shape[-1]], .0, .001)) val_decision_logits = tf.nn.bias_add( tf.matmul(val_output_scores, val_output_weights), val_output_bias) val_soft_decisions = tf.nn.softmax(val_decision_logits) # Calculate validation accuracy val_accuracy = class_accuracy(val_soft_decisions, val_labels) tf.summary.scalar("validation accuracy", val_accuracy) # Set up summaries and saver saver = tf.train.Saver(tf.global_variables(), max_to_keep=config.keep_checkpoints) summary_op = tf.summary.merge_all() # Initialize the graph sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # Need to initialize both of these if supplying num_epochs to inputs sess.run( tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) summary_dir = os.path.join(config.train_summaries, config.which_dataset + '_' + dt_stamp) summary_writer = tf.summary.FileWriter(summary_dir, sess.graph) # Set up exemplar threading coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) # Start training loop np.save(out_dir + 'meta_info', config) step, losses = 0, [] # val_max = 0 try: # print response while not coord.should_stop(): start_time = time.time() _, loss_value, train_acc, val_acc = sess.run( [train_op, cost, train_accuracy, val_accuracy]) losses += [loss_value] duration = time.time() - start_time if np.isnan(loss_value).sum(): assert not np.isnan(loss_value), 'Model loss = NaN' if step % config.validation_steps == 0: if validation_data is not False: val_acc = sess.run(val_accuracy) else: val_acc -= 1 # Store every checkpoint # Summaries summary_str = sess.run(summary_op) summary_writer.add_summary(summary_str, step) # Training status and validation accuracy format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; ' '%.3f sec/batch) | Training accuracy = %s | ' 'Validation accuracy = %s | ' 'logdir = %s') print(format_str % (datetime.now(), step, loss_value, config.train_batch / duration, float(duration), train_acc, val_acc, summary_dir)) # Save the model checkpoint if it's the best yet if 1: # val_acc >= val_max: saver.save(sess, os.path.join(config.train_checkpoint, 'model_' + str(step) + '.ckpt'), global_step=step) # Store the new max validation accuracy # val_max = val_acc else: # Training status format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; ' '%.3f sec/batch) | Training accuracy = %s | ' 'Training loss = %s') print(format_str % (datetime.now(), step, loss_value, config.train_batch / duration, float(duration), loss_value)) # End iteration step += 1 except tf.errors.OutOfRangeError: print('Done training for %d epochs, %d steps.' % (config.epochs, step)) finally: coord.request_stop() np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses) coord.join(threads) sess.close()
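
# train_model combines the per-frame feature vectors (GEDI fc7 plus the
# matching-model output) by concatenation or subtraction before a single
# linear readout with softmax. The helper below restates that readout in
# isolation using tf.layers.dense; variable names and the dense layer are
# illustrative only and are not the repo's explicit get_variable readout.
def _matching_readout_sketch(frame_0, frame_1, combine='concatenate'):
    """Hypothetical frame-pair readout: merge features, then 2-way decision logits."""
    if combine == 'concatenate':
        merged = tf.concat([frame_0, frame_1], axis=-1)
    elif combine == 'subtract':
        merged = frame_0 - frame_1
    else:
        raise NotImplementedError(combine)
    logits = tf.layers.dense(merged, 2, activation=None)
    return tf.nn.softmax(logits), logits
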
def test_vgg16(model_file, trained_svm, ims, dead_ims=None, output_csv='prediction_file', training_max=None, C=1e-3, k_folds=10): """Test an SVM you've trained on a new dataset.""" config = GEDIconfig() if ims is None: raise RuntimeError( 'You need to supply a directory path to the images.') if dead_ims is None: print 'Assuming all of your images are in the ims folder' + \ '-- will not derive labels to calculate accuracy.' # if not os.path.exists(trained_svm): # raise RuntimeError( # 'Cannot find the trained svm model. Check the path you passed.') try: clf = cPickle.load(open(trained_svm, 'rb')) # clf = model_dict['clf'] # mu = model_dict['mu'] # sd = model_dict['sd'] except: raise RuntimeError('Cannot find SVM file: %s' % trained_svm) if dead_ims is not None: live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext)) dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext)) combined_labels = np.concatenate( (np.zeros(len(live_files)), np.ones(len(dead_files)))) combined_files = np.concatenate((live_files, dead_files)) else: live_files = glob(os.path.join(ims, '*%s' % config.raw_im_ext)) combined_labels = None combined_files = np.asarray(live_files) if len(combined_files) == 0: raise RuntimeError('Could not find any files. Check your image path.') config = GEDIconfig() model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1]) meta_file_pointer = os.path.join(model_file_path, 'train_maximum_value.npz') if not os.path.exists(meta_file_pointer): raise RuntimeError( 'Cannot find the training data meta file: train_maximum_value.npz' 'Closest I could find from directory %s was %s.' 'Download this from the link described in the README.md.' % (model_file_path, glob(os.path.join(model_file_path, '*.npz')))) meta_data = np.load(meta_file_pointer) # Prepare image normalization values if training_max is None: training_max = np.max(meta_data['max_array']).astype(np.float32) training_min = np.min(meta_data['min_array']).astype(np.float32) # Find model checkpoints ds_dt_stamp = re.split('/', model_file)[-2] out_dir = os.path.join(config.results, ds_dt_stamp) # Make output directories if they do not exist dir_list = [config.results, out_dir] [make_dir(d) for d in dir_list] # Prepare data on CPU images = tf.placeholder(tf.float32, shape=[None] + config.model_image_size, name='images') # Prepare model on GPU with tf.device('/gpu:0'): with tf.variable_scope('cnn'): vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path, fine_tune_layers=config.fine_tune_layers) vgg.build(images, output_shape=config.output_shape) # Setup validation op scores = vgg.fc7 preds = tf.argmax(vgg.prob, 1) # Set up saver saver = tf.train.Saver(tf.global_variables()) # Loop through each checkpoint then test the entire validation set ckpts = [model_file] ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], [] print '-' * 60 print 'Beginning evaluation' print '-' * 60 if config.validation_batch > len(combined_files): print 'Trimming validation_batch size to %s (same as # of files).' 
% len( combined_files) config.validation_batch = len(combined_files) for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'): dec_scores, yhat, y, file_array = [], [], [], [] # Initialize the graph sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run( tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) # Set up exemplar threading saver.restore(sess, c) start_time = time.time() num_batches = np.floor( len(combined_files) / float(config.validation_batch)).astype(int) for image_batch, label_batch, file_batch in tqdm(image_batcher( start=0, num_batches=num_batches, images=combined_files, labels=combined_labels, config=config, training_max=training_max, training_min=training_min), total=num_batches): feed_dict = {images: image_batch} sc, tyh = sess.run([scores, preds], feed_dict=feed_dict) dec_scores += [sc] yhat = np.append(yhat, tyh) y = np.append(y, label_batch) file_array = np.append(file_array, file_batch) ckpt_yhat.append(yhat) ckpt_y.append(y) ckpt_scores.append(dec_scores) ckpt_file_array.append(file_array) print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time) sess.close() # Save everything new_dt_string = re.split('\.', str(datetime.now()))[0].\ replace(' ', '_').replace(':', '_').replace('-', '_') np.savez(os.path.join(out_dir, '%s_validation_accuracies' % new_dt_string), ckpt_yhat=ckpt_yhat, ckpt_y=ckpt_y, ckpt_scores=ckpt_scores, ckpt_names=ckpts, combined_files=ckpt_file_array) # Run SVM all_scores = np.concatenate(dec_scores) # all_scores = (all_scores - mu) / sd predictions = clf.predict(all_scores) if dead_ims is not None: mean_acc = np.mean(predictions == y) p_value = randomization_test(y=y, yhat=predictions) print 'SVM performance: mean accuracy = %s%%, p = %.5f' % (mean_acc, p_value) df_col_label = 'true label' else: mean_acc, p_value = None, None y = np.copy(yhat) df_col_label = 'Dummy column (no labels supplied)' np.savez(os.path.join(out_dir, '%s_svm_test_data' % new_dt_string), yhat=yhat, y=y, scores=dec_scores, ckpts=ckpts, p_value=p_value) # Also save a csv with item/guess pairs trimmed_files = np.asarray([ x.split(os.path.sep)[-1] for x in np.asarray(ckpt_file_array).ravel() ]) yhat = np.asarray(yhat) df = pd.DataFrame( np.hstack((trimmed_files.reshape(-1, 1), yhat.reshape(-1, 1))), # y.reshape(-1, 1))), columns=['files', 'guesses']) # , df_col_label]) df.to_csv(os.path.join(out_dir, 'prediction_file.csv')) print 'Saved csv to: %s' % out_dir
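
# randomization_test (imported elsewhere) returns a p-value for the observed
# SVM accuracy. The sketch below shows a generic label-permutation test of
# that kind; it illustrates the idea only and is not the repo's implementation.
def _permutation_test_sketch(y, yhat, iterations=10000):
    """Hypothetical permutation test: fraction of shuffles matching or beating observed accuracy."""
    observed = np.mean(y == yhat)
    null = np.zeros(iterations)
    for i in range(iterations):
        null[i] = np.mean(np.random.permutation(y) == yhat)
    return np.mean(null >= observed)
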
def test_vgg16(live_ims, dead_ims, model_file, svm_model='svm_model', output_csv='prediction_file', training_max=None, C=1e-3, k_folds=10): """Train an SVM for your dataset on GEDI-model encodings.""" config = GEDIconfig() if live_ims is None: raise RuntimeError( 'You need to supply a directory path to the live images.') if dead_ims is None: raise RuntimeError( 'You need to supply a directory path to the dead images.') live_files = glob(os.path.join(live_ims, '*%s' % config.raw_im_ext)) dead_files = glob(os.path.join(dead_ims, '*%s' % config.raw_im_ext)) combined_labels = np.concatenate( (np.zeros(len(live_files)), np.ones(len(dead_files)))) combined_files = np.concatenate((live_files, dead_files)) if len(combined_files) == 0: raise RuntimeError('Could not find any files. Check your image path.') config = GEDIconfig() model_file_path = os.path.sep.join(model_file.split(os.path.sep)[:-1]) meta_file_pointer = os.path.join(model_file_path, 'train_maximum_value.npz') if not os.path.exists(meta_file_pointer): raise RuntimeError( 'Cannot find the training data meta file: train_maximum_value.npz' 'Closest I could find from directory %s was %s.' 'Download this from the link described in the README.md.' % (model_file_path, glob(os.path.join(model_file_path, '*.npz')))) meta_data = np.load(meta_file_pointer) # Prepare image normalization values if training_max is None: training_max = np.max(meta_data['max_array']).astype(np.float32) training_min = np.min(meta_data['min_array']).astype(np.float32) # Find model checkpoints ds_dt_stamp = re.split('/', model_file)[-2] out_dir = os.path.join(config.results, ds_dt_stamp) # Make output directories if they do not exist dir_list = [config.results, out_dir] [make_dir(d) for d in dir_list] # Prepare data on CPU images = tf.placeholder(tf.float32, shape=[None] + config.model_image_size, name='images') # Prepare model on GPU with tf.device('/gpu:0'): with tf.variable_scope('cnn'): vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path, fine_tune_layers=config.fine_tune_layers) vgg.build(images, output_shape=config.output_shape) # Setup validation op scores = vgg.fc7 preds = tf.argmax(vgg.prob, 1) # Set up saver saver = tf.train.Saver(tf.global_variables()) # Loop through each checkpoint then test the entire validation set ckpts = [model_file] ckpt_yhat, ckpt_y, ckpt_scores, ckpt_file_array = [], [], [], [] print '-' * 60 print 'Beginning evaluation' print '-' * 60 if config.validation_batch > len(combined_files): print 'Trimming validation_batch size to %s (same as # of files).' 
% len( combined_files) config.validation_batch = len(combined_files) for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'): dec_scores, yhat, y, file_array = [], [], [], [] # Initialize the graph sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run( tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) # Set up exemplar threading saver.restore(sess, c) start_time = time.time() num_batches = np.floor( len(combined_files) / float(config.validation_batch)).astype(int) for image_batch, label_batch, file_batch in tqdm(image_batcher( start=0, num_batches=num_batches, images=combined_files, labels=combined_labels, config=config, training_max=training_max, training_min=training_min), total=num_batches): feed_dict = {images: image_batch} sc, tyh = sess.run([scores, preds], feed_dict=feed_dict) dec_scores += [sc] yhat = np.append(yhat, tyh) y = np.append(y, label_batch) file_array = np.append(file_array, file_batch) ckpt_yhat.append(yhat) ckpt_y.append(y) ckpt_scores.append(dec_scores) ckpt_file_array.append(file_array) print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time) sess.close() # Save everything np.savez(os.path.join(out_dir, 'validation_accuracies'), ckpt_yhat=ckpt_yhat, ckpt_y=ckpt_y, ckpt_scores=ckpt_scores, ckpt_names=ckpts, combined_files=ckpt_file_array) # Run SVM svm = LinearSVC(C=C, dual=False, class_weight='balanced') clf = make_pipeline(preprocessing.StandardScaler(), svm) predictions = cross_val_predict(clf, np.concatenate(dec_scores), y, cv=k_folds) cv_performance = metrics.accuracy_score(predictions, y) p_value = randomization_test(y=y, yhat=predictions) clf.fit(np.concatenate(dec_scores), y) # mu = dec_scores.mean(0) # sd = dec_scores.std(0) print '%s-fold SVM performance: accuracy = %s%% , p = %.5f' % ( k_folds, np.mean(cv_performance * 100), p_value) np.savez( os.path.join(out_dir, 'svm_data'), yhat=predictions, y=y, scores=dec_scores, ckpts=ckpts, cv_performance=cv_performance, p_value=p_value, k_folds=k_folds, # mu=mu, # sd=sd, C=C) # Also save a csv with item/guess pairs try: trimmed_files = [re.split('/', x)[-1] for x in combined_files] trimmed_files = np.asarray(trimmed_files) dec_scores = np.asarray(dec_scores) yhat = np.asarray(yhat) df = pd.DataFrame(np.hstack( (trimmed_files.reshape(-1, 1), yhat.reshape(-1, 1), y.reshape(-1, 1))), columns=['files', 'guesses', 'true label']) df.to_csv(os.path.join(out_dir, 'prediction_file.csv')) print 'Saved csv to: %s' % out_dir except: print 'X' * 60 print 'Could not save a spreadsheet of file info' print 'X' * 60 # save the classifier with open('%s.pkl' % svm_model, 'wb') as fid: # model_dict = { # 'model': clf, # 'mu': mu, # 'sd': sd # } cPickle.dump(clf, fid) print 'Saved svm model to: %s.pkl' % svm_model
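
# The SVM fit above standardizes fc7 encodings and trains a class-balanced
# LinearSVC, reporting k-fold cross-validated accuracy via cross_val_predict.
# The helper below reruns that sklearn recipe on random stand-in data to make
# the plumbing explicit; the toy shapes are assumptions and this is not part
# of the GEDI pipeline itself.
def _svm_recipe_sketch(C=1e-3, k_folds=10):
    """Toy StandardScaler + LinearSVC pipeline with cross-validated predictions."""
    rng = np.random.RandomState(0)
    X = rng.randn(200, 4096)       # stand-in for fc7 encodings
    y = rng.randint(0, 2, 200)     # stand-in for live/dead labels
    clf = make_pipeline(
        preprocessing.StandardScaler(),
        LinearSVC(C=C, dual=False, class_weight='balanced'))
    preds = cross_val_predict(clf, X, y, cv=k_folds)
    return metrics.accuracy_score(y, preds)
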
def test_vgg16(image_dir, model_file, autopsy_csv=None, autopsy_path=None, output_csv='prediction_file', target_layer='fc7', save_npy=False, shuffle_images=True, embedding_type='PCA'): """Testing function for pretrained vgg16.""" assert autopsy_csv is not None, 'You must pass an autopsy file name.' assert autopsy_path is not None, 'You must pass an autopsy path.' # Load autopsy information autopsy_data = pd.read_csv(os.path.join(autopsy_path, autopsy_csv)) # Load config and begin preparing data config = GEDIconfig() if image_dir is None: raise RuntimeError( 'You need to supply a directory path to the images.') combined_files = np.asarray( glob(os.path.join(image_dir, '*%s' % config.raw_im_ext))) if shuffle_images: combined_files = combined_files[np.random.permutation( len(combined_files))] if len(combined_files) == 0: raise RuntimeError('Could not find any files. Check your image path.') config = GEDIconfig() meta_file_pointer = os.path.join( model_file.split('/model')[0], 'train_maximum_value.npz') if not os.path.exists(meta_file_pointer): raise RuntimeError( 'Cannot find the training data meta file.' 'Download this from the link described in the README.md.') meta_data = np.load(meta_file_pointer) # Prepare image normalization values training_max = np.max(meta_data['max_array']).astype(np.float32) training_min = np.min(meta_data['min_array']).astype(np.float32) # Find model checkpoints ds_dt_stamp = re.split('/', model_file)[-2] out_dir = os.path.join(config.results, ds_dt_stamp) # Make output directories if they do not exist dir_list = [config.results, out_dir] [make_dir(d) for d in dir_list] # Prepare data on CPU images = tf.placeholder(tf.float32, shape=[None] + config.model_image_size, name='images') # Prepare model on GPU with tf.device('/gpu:0'): with tf.variable_scope('cnn'): vgg = vgg16.model_struct(vgg16_npy_path=config.vgg16_weight_path, fine_tune_layers=config.fine_tune_layers) vgg.build(images, output_shape=config.output_shape) # Setup validation op scores = vgg[target_layer] preds = tf.argmax(vgg.prob, 1) # Derive pathologies from file names pathologies = [] for f in combined_files: sf = f.split('/')[-1].split('_') sf = '_'.join(sf[1:4]) it_path = autopsy_data[autopsy_data['plate_well_neuron'] == sf]['disease'] if not len(it_path): it_path = 'Absent' else: it_path = it_path.as_matrix()[0] pathologies += [it_path] pathologies = np.asarray(pathologies) # Set up saver saver = tf.train.Saver(tf.global_variables()) # Loop through each checkpoint then test the entire validation set ckpts = [model_file] ckpt_yhat, ckpt_scores, ckpt_file_array = [], [], [] print '-' * 60 print 'Beginning evaluation' print '-' * 60 if config.validation_batch > len(combined_files): print 'Trimming validation_batch size to %s.' 
% len(combined_files) config.validation_batch = len(combined_files) for idx, c in tqdm(enumerate(ckpts), desc='Running checkpoints'): dec_scores, yhat, file_array = [], [], [] # Initialize the graph sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) sess.run( tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())) # Set up exemplar threading saver.restore(sess, c) start_time = time.time() num_batches = np.floor( len(combined_files) / float(config.validation_batch)).astype(int) for image_batch, file_batch in tqdm(image_batcher( start=0, num_batches=num_batches, images=combined_files, config=config, training_max=training_max, training_min=training_min), total=num_batches): feed_dict = {images: image_batch} sc, tyh = sess.run([scores, preds], feed_dict=feed_dict) dec_scores += [sc] yhat += [tyh] file_array += [file_batch] ckpt_yhat.append(yhat) ckpt_scores.append(dec_scores) ckpt_file_array.append(file_array) print 'Batch %d took %.1f seconds' % (idx, time.time() - start_time) sess.close() # Create and plot an embedding im_path_map = pathologies[:num_batches * config.validation_batch] dec_scores = np.concatenate(dec_scores) mu = dec_scores.mean(0)[None, :] sd = dec_scores.std(0)[None, :] dec_scores = (dec_scores - mu) / sd yhat = np.concatenate(yhat) file_array = np.concatenate(file_array) if embedding_type == 'TSNE' or embedding_type == 'tsne': emb = manifold.TSNE(n_components=2, init='pca', random_state=0) elif embedding_type == 'PCA' or embedding_type == 'pca': emb = PCA(n_components=2, svd_solver='randomized', random_state=0) elif embedding_type == 'spectral': emb = manifold.SpectralEmbedding(n_components=2, random_state=0) y = emb.fit_transform(dec_scores) # Ouput csv df = pd.DataFrame(np.hstack( (y, im_path_map.reshape(-1, 1), file_array.reshape(-1, 1))), columns=['D1', 'D2', 'pathology', 'filename']) out_name = os.path.join(out_dir, 'embedding.csv') df.to_csv(out_name) print 'Saved csv to: %s' % out_name # Create plot f, ax = plt.subplots() unique_cats = np.unique(im_path_map) h = [] for idx, cat in enumerate(unique_cats): h += [ plt.scatter(y[im_path_map == cat, 0], y[im_path_map == cat, 1], c=plt.cm.Spectral(idx * 1000)) ] plt.legend(h, unique_cats) plt.axis('tight') plt.show() plt.savefig('embedding.png') plt.close(f) # Save everything if save_npy: np.savez(os.path.join(out_dir, 'validation_accuracies'), ckpt_yhat=ckpt_yhat, ckpt_scores=ckpt_scores, ckpt_names=ckpts, combined_files=ckpt_file_array)
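
# The embedding step above z-scores the fc7 features and projects them to 2-D
# with PCA, t-SNE, or spectral embedding before plotting by pathology. The
# helper below restates that choice in isolation; it mirrors the estimator
# settings used above but is only an illustration, with a small epsilon added
# to the standard deviation as an assumption to avoid division by zero.
def _embed_scores_sketch(dec_scores, embedding_type='PCA'):
    """Hypothetical 2-D embedding of standardized encoder features."""
    scores = (dec_scores - dec_scores.mean(0)) / (dec_scores.std(0) + 1e-8)
    if embedding_type.lower() == 'tsne':
        emb = manifold.TSNE(n_components=2, init='pca', random_state=0)
    elif embedding_type.lower() == 'pca':
        emb = PCA(n_components=2, svd_solver='randomized', random_state=0)
    elif embedding_type == 'spectral':
        emb = manifold.SpectralEmbedding(n_components=2, random_state=0)
    else:
        raise NotImplementedError(embedding_type)
    return emb.fit_transform(scores)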