def save_progress(
        config, weight_dict, it_val_dict, exp_label, step, directories,
        sess, saver, val_check, val_score, val_loss, val_perf, train_score,
        train_loss, timer, num_params, log, use_db, summary_op,
        summary_writer, save_activities, save_gradients, save_checkpoints):
    """Save progress and important data.

    Updates the best-validation bookkeeping, optionally dumps weights,
    activities, gradients and a checkpoint (all gated on a new best
    validation result), pushes performance to the DB, and writes
    tensorboard summaries.

    Returns:
        val_perf with the new best validation loss recorded (if any).
    """
    # `val_check` is a (possibly empty) index array; non-empty means the
    # current validation result is a new best, which triggers the saves.
    is_best = len(val_check) > 0
    if is_best:
        # Record the new best validation loss at the flagged index.
        val_perf[val_check[0]] = val_loss

    # Optional saves, all gated on a new best validation score.
    if config.save_weights and is_best:
        it_weights = {k: it_val_dict[k] for k in weight_dict.keys()}
        py_utils.save_npys(
            data=it_weights,
            model_name='%s_%s' % (exp_label, step),
            output_string=directories['weights'])
    if save_activities and is_best:
        py_utils.save_npys(
            data=it_val_dict,
            model_name='%s_%s' % (exp_label, step),
            output_string=directories['weights'])
    # ckpt_path is always computed: the DB update below records it even
    # when no checkpoint is written this step.
    ckpt_path = os.path.join(
        directories['checkpoints'],
        'model_%s.ckpt' % step)
    if save_checkpoints and is_best:
        log.info('Saving checkpoint to: %s' % ckpt_path)
        saver.save(sess, ckpt_path, global_step=step)
    # BUG FIX: the original reassigned `val_check = val_check[0]` here
    # (IndexError on an empty array; TypeError on the subsequent
    # len(val_check)) and redundantly re-applied the val_perf update.
    if save_gradients and is_best:
        np.savez(
            os.path.join(config.results, '%s_val_gradients' % exp_label),
            **it_val_dict)
    if use_db:
        db.update_performance(
            experiment_id=config._id,
            experiment=config.experiment,
            train_score=float(train_score),
            train_loss=float(train_loss),
            val_score=float(val_score),
            val_loss=float(val_loss),
            step=step,
            num_params=int(num_params),
            ckpt_path=ckpt_path,
            results_path=config.results,
            summary_path=directories['summaries'])

    # Summaries
    summary_str = sess.run(summary_op)
    summary_writer.add_summary(summary_str, step)
    return val_perf
def save_progress(
        config, weight_dict, it_val_dict, exp_label, step, directories,
        sess, saver, data_structure, val_acc, val_lo, train_acc, train_loss,
        timesteps, log, summary_op, summary_writer, save_activities,
        save_checkpoints):
    """Save progress and important data.

    Optionally dumps weights/activities/checkpoint, then best-effort
    records train/val stats on `data_structure` and writes tensorboard
    summaries. All recording failures are logged, never raised.
    """
    if config.save_weights:
        it_weights = {k: it_val_dict[k] for k in weight_dict.keys()}
        py_utils.save_npys(
            data=it_weights,
            model_name='%s_%s' % (exp_label, step),
            output_string=directories['weights'])
    if save_activities:
        py_utils.save_npys(
            data=it_val_dict,
            model_name='%s_%s' % (exp_label, step),
            output_string=directories['weights'])
    if save_checkpoints:
        ckpt_path = os.path.join(
            directories['checkpoints'],
            'model_%s.ckpt' % step)
        saver.save(sess, ckpt_path, global_step=step)
    # Best-effort persistence: a failed DB/data-structure write should not
    # kill the training loop, so each step is wrapped and logged.
    try:
        data_structure.update_validation(
            validation_accuracy=val_acc,
            validation_loss=val_lo,
            validation_step=step)
        data_structure.save()
    except Exception as e:
        log.warning('Failed to save validation info: %s' % e)
    try:
        data_structure.update_training(
            train_accuracy=train_acc,
            train_loss=train_loss,
            train_step=timesteps)
        data_structure.save()
    except Exception as e:
        log.warning('Failed to save training info: %s' % e)

    # Summaries
    try:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
    except Exception as e:
        # FIX: was a bare `print` that discarded the exception; log it
        # like the sibling handlers above.
        log.warning('Failed to update summaries: %s' % e)
def main(experiment_name, list_experiments=False, gpu_device='/gpu:0'):
    """Create a tensorflow worker to run experiments in your DB.

    Pulls an experiment config from the DB (or lists them), builds
    train/validation input pipelines and twin models sharing weights
    under the 'cnn' variable scope, then hands off to
    training.training_loop and saves its outputs.
    """
    if list_experiments:
        # List-only mode: print the registered experiments and exit.
        exps = db.list_experiments()
        print '_' * 30
        print 'Initialized experiments:'
        print '_' * 30
        for l in exps:
            print l.values()[0]
        print '_' * 30
        print 'You can add to the DB with: '\
            'python prepare_experiments.py --experiment=%s' % \
            exps[0].values()[0]
        return
    if experiment_name is None:
        print 'No experiment specified. Pulling one out of the DB.'
        experiment_name = db.get_experiment_name()

    # Prepare to run the model
    config = Config()
    condition_label = '%s_%s' % (experiment_name, py_utils.get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config, exp_params = process_DB_exps(
        experiment_name=experiment_name,
        log=log,
        config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(
        model_dir=config.dataset_info,
        dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name
    # NOTE(review): fold selection relies on dict key ordering
    # (keys()[1] = train, keys()[0] = val, presumably) — fragile; the
    # original TODO below flags this too.
    train_data, train_means = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=dataset_module.folds.keys()[1],  # TODO: SEARCH FOR INDEX.
        log=log)
    val_data, val_means = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=dataset_module.folds.keys()[0],
        log=log)

    # Initialize output folders
    dir_list = {
        'checkpoints': os.path.join(config.checkpoints, condition_label),
        'summaries': os.path.join(config.summaries, condition_label),
        'condition_evaluations': os.path.join(
            config.condition_evaluations, condition_label),
        'experiment_evaluations': os.path.join(  # DEPRECIATED
            config.experiment_evaluations, experiment_label),
        'visualization': os.path.join(config.visualizations, condition_label),
        'weights': os.path.join(
            config.condition_evaluations, condition_label, 'weights')
    }
    [py_utils.make_dir(v) for v in dir_list.values()]

    # Prepare data loaders on the cpu
    config.data_augmentations = py_utils.flatten_list(
        config.data_augmentations,
        log)
    with tf.device('/cpu:0'):
        train_images, train_labels = data_loader.inputs(
            dataset=train_data,
            batch_size=config.batch_size,
            model_input_image_size=dataset_module.model_input_image_size,
            tf_dict=dataset_module.tf_dict,
            data_augmentations=config.data_augmentations,
            num_epochs=config.epochs,
            tf_reader_settings=dataset_module.tf_reader,
            shuffle=config.shuffle)
        val_images, val_labels = data_loader.inputs(
            dataset=val_data,
            batch_size=config.batch_size,
            model_input_image_size=dataset_module.model_input_image_size,
            tf_dict=dataset_module.tf_dict,
            data_augmentations=config.data_augmentations,
            num_epochs=config.epochs,
            tf_reader_settings=dataset_module.tf_reader,
            shuffle=config.shuffle)
    log.info('Created tfrecord dataloader tensors.')

    # Load model specification
    struct_name = config.model_struct.split(os.path.sep)[-1]
    try:
        model_dict = py_utils.import_module(
            dataset=struct_name,
            model_dir=os.path.join(
                'models',
                'structs',
                experiment_name).replace(os.path.sep, '.'))
    except IOError:
        # NOTE(review): this only prints — model_dict stays undefined and
        # the next statement raises NameError. Consider re-raising here.
        print 'Could not find the model structure: %s' % experiment_name

    # Inject model_dict with hyperparameters if requested
    model_dict.layer_structure = hp_opt_utils.inject_model_with_hps(
        layer_structure=model_dict.layer_structure,
        exp_params=exp_params)

    # Prepare model on GPU
    with tf.device(gpu_device):
        with tf.variable_scope('cnn') as scope:
            # Training model
            if len(dataset_module.output_size) > 1:
                log.warning('Found > 1 dimension for your output size.'
                            'Converting to a scalar.')
                dataset_module.output_size = np.prod(
                    dataset_module.output_size)

            if hasattr(model_dict, 'output_structure'):
                # Use specified output layer
                output_structure = model_dict.output_structure
            else:
                output_structure = None
            model = model_utils.model_class(
                mean=train_means,
                training=True,
                output_size=dataset_module.output_size)
            train_scores, model_summary = model.build(
                data=train_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            log.info('Built training model.')
            log.debug(
                json.dumps(model_summary, indent=4),
                verbose=0)
            print_model_architecture(model_summary)

            # Prepare the loss function
            train_loss, _ = loss_utils.loss_interpreter(
                logits=train_scores,
                labels=train_labels,
                loss_type=config.loss_function,
                dataset_module=dataset_module)

            # Add weight decay if requested
            if len(model.regularizations) > 0:
                train_loss = loss_utils.wd_loss(
                    regularizations=model.regularizations,
                    loss=train_loss,
                    wd_penalty=config.regularization_strength)
            train_op = loss_utils.optimizer_interpreter(
                loss=train_loss,
                lr=config.lr,
                optimizer=config.optimizer,
                constraints=config.optimizer_constraints,
                model=model)
            log.info('Built training loss function.')
            train_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,
                pred=train_scores,
                labels=train_labels)  # training accuracy
            # Image summaries only make sense for <=3 channel inputs.
            if int(train_images.get_shape()[-1]) <= 3:
                tf.summary.image('train images', train_images)
            tf.summary.scalar('training loss', train_loss)
            tf.summary.scalar('training accuracy', train_accuracy)
            log.info('Added training summaries.')

            # Validation model
            # Reuse the training weights for the validation tower.
            scope.reuse_variables()
            val_model = model_utils.model_class(
                mean=val_means,
                training=True,
                output_size=dataset_module.output_size)
            val_scores, _ = val_model.build(  # Ignore summary
                data=val_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            log.info('Built validation model.')

            val_loss, _ = loss_utils.loss_interpreter(
                logits=val_scores,
                labels=val_labels,
                loss_type=config.loss_function,
                dataset_module=dataset_module)
            val_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,
                pred=val_scores,
                labels=val_labels)  # training accuracy
            if int(train_images.get_shape()[-1]) <= 3:
                tf.summary.image('val images', val_images)
            tf.summary.scalar('validation loss', val_loss)
            tf.summary.scalar('validation accuracy', val_accuracy)
            log.info('Added validation summaries.')

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer()))
    summary_writer = tf.summary.FileWriter(dir_list['summaries'], sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Create dictionaries of important training and validation information
    train_dict = {
        'train_loss': train_loss,
        'train_accuracy': train_accuracy,
        'train_images': train_images,
        'train_labels': train_labels,
        'train_op': train_op,
        'train_scores': train_scores
    }
    val_dict = {
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
        'val_images': val_images,
        'val_labels': val_labels,
        'val_scores': val_scores,
    }

    # Start training loop
    np.save(
        os.path.join(
            dir_list['condition_evaluations'],
            'training_config_file'),
        config)
    log.info('Starting training')
    output_dict = training.training_loop(
        config=config,
        db=db,
        coord=coord,
        sess=sess,
        summary_op=summary_op,
        summary_writer=summary_writer,
        saver=saver,
        threads=threads,
        summary_dir=dir_list['summaries'],
        checkpoint_dir=dir_list['checkpoints'],
        weight_dir=dir_list['weights'],
        train_dict=train_dict,
        val_dict=val_dict,
        train_model=model,
        val_model=val_model,
        exp_params=exp_params)

    log.info('Finished training.')
    model_name = config.model_struct.replace('/', '_')
    py_utils.save_npys(
        data=output_dict,
        model_name=model_name,
        output_string=dir_list['experiment_evaluations'])
def evaluation_loop( config, db, coord, sess, summary_op, summary_writer, saver, threads, summary_dir, checkpoint_dir, weight_dir, train_dict, val_dict, train_model, val_model, exp_params, placeholder_data=None, performance_metric='validation_loss', aggregator='max'): """Run the model training loop.""" step = 0 train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {} val_scores, val_aux, val_labels = {}, {}, {} train_images, val_images = {}, {} train_scores, train_labels = {}, {} train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()]) val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()]) # Restore model saver.restore(sess, config.load_and_evaluate_ckpt) # Start evaluation if config.save_weights: weight_dict = { k[0]: v for k, v in train_model.var_dict.iteritems() if k[1] == 0} val_dict = dict( val_dict, **weight_dict) if placeholder_data is not None: num_batches = len(placeholder_data['label_data']) // config.batch_size batch_index = np.arange(num_batches).repeat(config.batch_size) for idx in np.arange(num_batches): batch_images = placeholder_data['image_data'][batch_index == idx] batch_labels = placeholder_data['label_data'][batch_index == idx] import ipdb;ipdb.set_trace() batch_images = batch_images.reshape( placeholder_data['val_image_shape']) batch_labels = batch_labels.reshape( placeholder_data['val_label_shape']) feed_dict = { placeholder_data['val_images']: batch_images, placeholder_data['val_labels']: batch_labels, } it_vars = sess.run(val_dict.values(), feed_dict=feed_dict) it_dict = {k: v for k, v in zip( val_dict.keys(), it_vars)} a = 2 else: try: while not coord.should_stop(): start_time = time.time() if 1: # step % config.validation_iters == 0: it_val_scores, it_val_labels, it_val_aux = [], [], [] for num_vals in range(config.num_validation_evals): # Validation accuracy as the average of n batches val_vars = sess.run(val_dict.values()) it_val_dict = {k: v for k, v in zip( val_dict.keys(), val_vars)} it_val_labels += 
[it_val_dict['val_labels']] it_val_scores += [it_val_dict['val_scores']] if val_aux_check: iva = { itk: itv for itk, itv in it_val_dict.iteritems() if 'aux_score' in itk} it_val_aux += [iva] val_scores[step] = np.concatenate(it_val_scores) val_labels[step] = np.concatenate(it_val_labels) val_aux[step] = it_val_aux val_images[step] = it_val_dict['val_images'] # Save the model checkpoint if it's the best yet it_weights = { k: it_val_dict[k] for k in weight_dict.keys()} py_utils.save_npys( data=it_weights, model_name='%s_%s' % ( config.experiment_name, step), output_string=weight_dir) # End iteration step += 1 except tf.errors.OutOfRangeError: print 'Done with evaluation for %d epochs, %d steps.' % ( config.epochs, step) print 'Saved to: %s' % checkpoint_dir finally: coord.request_stop() coord.join(threads) sess.close() # Package images into a dictionary image_dict = { # 'train_images': train_images, 'val_images': val_images, # 'train_scores': train_scores, # 'train_labels': train_labels, 'val_scores': val_scores, 'val_labels': val_labels } py_utils.save_npys( data=image_dict, model_name='%s_%s' % ( config.experiment_name, step), output_string=weight_dir) return val_labels, val_scores
def main(
        experiment_name,
        list_experiments=False,
        load_and_evaluate_ckpt=None,
        placeholder_data=None,
        grad_images=False,
        gpu_device='/gpu:0'):
    """Create a tensorflow worker to run experiments in your DB.

    Builds train/validation input pipelines (tfrecord queues or, when
    placeholder_data is given, feed-dict placeholders), twin models
    sharing weights under the 'cnn' scope, optional input-gradient
    tensors, then dispatches to either evaluation.evaluation_loop
    (when load_and_evaluate_ckpt is set) or training.training_loop.
    """
    if list_experiments:
        # List-only mode: print the registered experiments and exit.
        exps = db.list_experiments()
        print '_' * 30
        print 'Initialized experiments:'
        print '_' * 30
        for l in exps:
            print l.values()[0]
        print '_' * 30
        if len(exps) == 0:
            print 'No experiments found.'
        else:
            print 'You can add to the DB with: '\
                'python prepare_experiments.py --experiment=%s' % \
                exps[0].values()[0]
        return

    if experiment_name is None:
        print 'No experiment specified. Pulling one out of the DB.'
        experiment_name = db.get_experiment_name()

    # Prepare to run the model
    config = Config()
    condition_label = '%s_%s' % (experiment_name, py_utils.get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    assert experiment_name is not None, 'Empty experiment name.'
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config.load_and_evaluate_ckpt = load_and_evaluate_ckpt
    if load_and_evaluate_ckpt is not None:
        # Remove the train operation and add a ckpt pointer
        from ops import evaluation
    config, exp_params = process_DB_exps(
        experiment_name=experiment_name,
        log=log,
        config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(
        model_dir=config.dataset_info,
        dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name
    # Pick the fold whose key mentions 'train'; fall back to literal 'train'.
    train_key = [k for k in dataset_module.folds.keys() if 'train' in k]
    if not len(train_key):
        train_key = 'train'
    else:
        train_key = train_key[0]
    train_data, train_means_image, train_means_label = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=train_key,
        log=log)
    # Same for validation; note the fallback is also 'train' here.
    val_key = [k for k in dataset_module.folds.keys() if 'val' in k]
    if not len(val_key):
        val_key = 'train'
    else:
        val_key = val_key[0]
    val_data, val_means_image, val_means_label = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=val_key,
        log=log)

    # Initialize output folders
    dir_list = {
        'checkpoints': os.path.join(
            config.checkpoints, condition_label),
        'summaries': os.path.join(
            config.summaries, condition_label),
        'condition_evaluations': os.path.join(
            config.condition_evaluations, condition_label),
        'experiment_evaluations': os.path.join(  # DEPRECIATED
            config.experiment_evaluations, experiment_label),
        'visualization': os.path.join(
            config.visualizations, condition_label),
        'weights': os.path.join(
            config.condition_evaluations, condition_label, 'weights')
    }
    [py_utils.make_dir(v) for v in dir_list.values()]

    # Prepare data loaders on the cpu
    if all(isinstance(i, list) for i in config.data_augmentations):
        if config.data_augmentations:
            config.data_augmentations = py_utils.flatten_list(
                config.data_augmentations,
                log)
    if load_and_evaluate_ckpt is not None:
        # Evaluation mode: a single pass, no shuffling.
        config.epochs = 1
        config.train_shuffle = False
        config.val_shuffle = False
    with tf.device('/cpu:0'):
        if placeholder_data:
            # Build feed-dict placeholders instead of tfrecord queues.
            placeholder_shape = placeholder_data['train_image_shape']
            placeholder_dtype = placeholder_data['train_image_dtype']
            original_train_images = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='train_images')
            placeholder_shape = placeholder_data['train_label_shape']
            placeholder_dtype = placeholder_data['train_label_dtype']
            original_train_labels = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='train_labels')
            placeholder_shape = placeholder_data['val_image_shape']
            placeholder_dtype = placeholder_data['val_image_dtype']
            original_val_images = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='val_images')
            placeholder_shape = placeholder_data['val_label_shape']
            placeholder_dtype = placeholder_data['val_label_dtype']
            original_val_labels = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='val_labels')

            # Apply augmentations
            (
                train_images,
                train_labels
            ) = data_loader.placeholder_image_augmentations(
                images=original_train_images,
                model_input_image_size=dataset_module.model_input_image_size,
                labels=original_train_labels,
                data_augmentations=config.data_augmentations,
                batch_size=config.batch_size)
            (
                val_images,
                val_labels
            ) = data_loader.placeholder_image_augmentations(
                images=original_val_images,
                model_input_image_size=dataset_module.model_input_image_size,
                labels=original_val_labels,
                data_augmentations=config.data_augmentations,
                batch_size=config.batch_size)

            # Store in the placeholder dict
            placeholder_data['train_images'] = original_train_images
            placeholder_data['train_labels'] = original_train_labels
            placeholder_data['val_images'] = original_val_images
            placeholder_data['val_labels'] = original_val_labels
        else:
            train_images, train_labels = data_loader.inputs(
                dataset=train_data,
                batch_size=config.batch_size,
                model_input_image_size=dataset_module.model_input_image_size,
                tf_dict=dataset_module.tf_dict,
                data_augmentations=config.data_augmentations,
                num_epochs=config.epochs,
                tf_reader_settings=dataset_module.tf_reader,
                shuffle=config.shuffle_train,
                resize_output=config.resize_output)
            if hasattr(config, 'val_augmentations'):
                val_augmentations = config.val_augmentations
            else:
                val_augmentations = config.data_augmentations
            val_images, val_labels = data_loader.inputs(
                dataset=val_data,
                batch_size=config.batch_size,
                model_input_image_size=dataset_module.model_input_image_size,
                tf_dict=dataset_module.tf_dict,
                data_augmentations=val_augmentations,
                num_epochs=config.epochs,
                tf_reader_settings=dataset_module.tf_reader,
                shuffle=config.shuffle_val,
                resize_output=config.resize_output)
    log.info('Created tfrecord dataloader tensors.')

    # Load model specification
    struct_name = config.model_struct.split(os.path.sep)[-1]
    try:
        model_dict = py_utils.import_module(
            dataset=struct_name,
            model_dir=os.path.join(
                'models',
                'structs',
                experiment_name).replace(os.path.sep, '.')
            )
    except IOError:
        # NOTE(review): this only prints — model_dict stays undefined and
        # the next statement raises NameError. Consider re-raising.
        print 'Could not find the model structure: %s in folder %s' % (
            struct_name,
            experiment_name)

    # Inject model_dict with hyperparameters if requested
    model_dict.layer_structure = hp_opt_utils.inject_model_with_hps(
        layer_structure=model_dict.layer_structure,
        exp_params=exp_params)

    # Prepare variables for the models
    # NOTE(review): this only scalarizes when output_size has exactly 2
    # dims (== 2), unlike v1's `> 1` — confirm that 3+-dim outputs are
    # intentionally left alone.
    if len(dataset_module.output_size) == 2:
        log.warning(
            'Found > 1 dimension for your output size.'
            'Converting to a scalar.')
        dataset_module.output_size = np.prod(
            dataset_module.output_size)

    if hasattr(model_dict, 'output_structure'):
        # Use specified output layer
        output_structure = model_dict.output_structure
    else:
        output_structure = None

    # Correct number of output neurons if needed
    # NOTE(review): output_structure may be None here, in which case
    # `output_structure[-1]` raises TypeError when dataloader_override
    # is set — presumably the override is only used with an explicit
    # output structure; verify.
    if config.dataloader_override and\
            'weights' in output_structure[-1].keys():
        output_neurons = output_structure[-1]['weights'][0]
        size_check = output_neurons != dataset_module.output_size
        fc_check = output_structure[-1]['layers'][0] == 'fc'
        if size_check and fc_check:
            output_structure[-1]['weights'][0] = dataset_module.output_size
            log.warning('Adjusted output neurons from %s to %s.' % (
                output_neurons,
                dataset_module.output_size))

    # Prepare model on GPU
    if not hasattr(dataset_module, 'input_normalization'):
        dataset_module.input_normalization = None
    with tf.device(gpu_device):
        with tf.variable_scope('cnn') as scope:
            # Training model
            model = model_utils.model_class(
                mean=train_means_image,
                training=True,
                output_size=dataset_module.output_size,
                input_normalization=dataset_module.input_normalization)
            train_scores, model_summary, _ = model.build(
                data=train_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            if grad_images:
                # Gradient of the target-class score w.r.t. the input
                # (saliency-style input gradients).
                oh_dims = int(train_scores.get_shape()[-1])
                target_scores = tf.one_hot(train_labels, oh_dims) * train_scores
                train_gradients = tf.gradients(target_scores, train_images)[0]
            log.info('Built training model.')
            log.debug(
                json.dumps(model_summary, indent=4),
                verbose=0)
            print_model_architecture(model_summary)

            # Normalize labels on GPU if needed
            if 'normalize_labels' in exp_params.keys():
                if exp_params['normalize_labels'] == 'zscore':
                    # Val labels use TRAIN statistics here (deliberate
                    # train-stat normalization, presumably).
                    train_labels -= train_means_label['mean']
                    train_labels /= train_means_label['std']
                    val_labels -= train_means_label['mean']
                    val_labels /= train_means_label['std']
                    log.info('Z-scoring labels.')
                elif exp_params['normalize_labels'] == 'mean':
                    train_labels -= train_means_label['mean']
                    val_labels -= val_means_label['mean']
                    log.info('Mean-centering labels.')

            # Check the shapes of labels and scores
            # NOTE(review): val_scores/val_labels are referenced here but
            # the validation tower is built BELOW — if this branch fires,
            # val_scores is undefined (NameError). Also the elif repeats
            # the if's exact condition, so its branch is unreachable.
            if not isinstance(train_scores, list):
                if len(
                        train_scores.get_shape()) != len(
                        train_labels.get_shape()):
                    train_shape = train_scores.get_shape().as_list()
                    label_shape = train_labels.get_shape().as_list()
                    val_shape = val_scores.get_shape().as_list()
                    val_label_shape = val_labels.get_shape().as_list()

                    if len(
                        train_shape) == 2 and len(
                            label_shape) == 1 and train_shape[-1] == 1:
                        train_labels = tf.expand_dims(train_labels, axis=-1)
                        val_labels = tf.expand_dims(val_labels, axis=-1)
                    elif len(
                        train_shape) == 2 and len(
                            label_shape) == 1 and train_shape[-1] == 1:
                        train_scores = tf.expand_dims(train_scores, axis=-1)
                        val_scores = tf.expand_dims(val_scores, axis=-1)

            # Prepare the loss function
            train_loss, _ = loss_utils.loss_interpreter(
                logits=train_scores,  # TODO
                labels=train_labels,
                loss_type=config.loss_function,
                weights=config.loss_weights,
                dataset_module=dataset_module)

            # Add loss tensorboard tracking
            if isinstance(train_loss, list):
                for lidx, tl in enumerate(train_loss):
                    tf.summary.scalar('training_loss_%s' % lidx, tl)
                train_loss = tf.add_n(train_loss)
            else:
                tf.summary.scalar('training_loss', train_loss)

            # Add weight decay if requested
            if len(model.regularizations) > 0:
                train_loss = loss_utils.wd_loss(
                    regularizations=model.regularizations,
                    loss=train_loss,
                    wd_penalty=config.regularization_strength)
            assert config.lr is not None, 'No learning rate.'  # TODO: Make a QC function
            # lr > 1 is interpreted as "number of training examples":
            # build a decaying schedule and force momentum.
            if config.lr > 1:
                old_lr = config.lr
                config.lr = loss_utils.create_lr_schedule(
                    train_batch=config.batch_size,
                    num_training=config.lr)
                config.optimizer = 'momentum'
                log.info('Forcing momentum classifier.')
            else:
                old_lr = None
            train_op = loss_utils.optimizer_interpreter(
                loss=train_loss,
                lr=config.lr,
                optimizer=config.optimizer,
                constraints=config.optimizer_constraints,
                model=model)
            log.info('Built training loss function.')

            # Add a score for the training set
            train_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,  # TODO: Attach to exp cnfg
                pred=train_scores,  # TODO
                labels=train_labels)

            # Add aux scores if requested
            train_aux = {}
            if hasattr(dataset_module, 'aux_scores'):
                for m in dataset_module.aux_scores:
                    train_aux[m] = eval_metrics.metric_interpreter(
                        metric=m,
                        pred=train_scores,
                        labels=train_labels)  # [0]  # TODO: Fix for multiloss

            # Prepare remaining tensorboard summaries
            if config.tensorboard_images:
                if len(train_images.get_shape()) == 4:
                    tf_fun.image_summaries(train_images, tag='Training images')
                if (np.asarray(
                        train_labels.get_shape().as_list()) > 1).sum() > 2:
                    tf_fun.image_summaries(
                        train_labels,
                        tag='Training_targets')
                    tf_fun.image_summaries(
                        train_scores,
                        tag='Training_predictions')
            if isinstance(train_accuracy, list):
                for tidx, ta in enumerate(train_accuracy):
                    tf.summary.scalar('training_accuracy_%s' % tidx, ta)
            else:
                tf.summary.scalar('training_accuracy', train_accuracy)
            if config.pr_curve:
                if isinstance(train_scores, list):
                    for pidx, train_score in enumerate(train_scores):
                        train_label = train_labels[:, pidx]
                        pr_summary.op(
                            tag='training_pr_%s' % pidx,
                            predictions=tf.cast(
                                tf.argmax(
                                    train_score,
                                    axis=-1),
                                tf.float32),
                            labels=tf.cast(train_label, tf.bool),
                            display_name='training_precision_recall_%s' % pidx)
                else:
                    pr_summary.op(
                        tag='training_pr',
                        predictions=tf.cast(
                            tf.argmax(
                                train_scores,
                                axis=-1),
                            tf.float32),
                        labels=tf.cast(train_labels, tf.bool),
                        display_name='training_precision_recall')
            log.info('Added training summaries.')

        with tf.variable_scope('cnn', tf.AUTO_REUSE) as scope:
            # Validation model
            scope.reuse_variables()
            val_model = model_utils.model_class(
                mean=train_means_image,  # Normalize with train data
                training=False,
                output_size=dataset_module.output_size,
                input_normalization=dataset_module.input_normalization)
            val_scores, _, _ = val_model.build(  # Ignore summary
                data=val_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            if grad_images:
                oh_dims = int(val_scores.get_shape()[-1])
                target_scores = tf.one_hot(val_labels, oh_dims) * val_scores
                val_gradients = tf.gradients(target_scores, val_images)[0]
            log.info('Built validation model.')

            # Check the shapes of labels and scores
            val_loss, _ = loss_utils.loss_interpreter(
                logits=val_scores,
                labels=val_labels,
                loss_type=config.loss_function,
                weights=config.loss_weights,
                dataset_module=dataset_module)

            # Add loss tensorboard tracking
            if isinstance(val_loss, list):
                for lidx, tl in enumerate(val_loss):
                    tf.summary.scalar('validation_loss_%s' % lidx, tl)
                val_loss = tf.add_n(val_loss)
            else:
                tf.summary.scalar('validation_loss', val_loss)

            # Add a score for the validation set
            val_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,  # TODO
                pred=val_scores,
                labels=val_labels)

            # Add aux scores if requested
            val_aux = {}
            if hasattr(dataset_module, 'aux_scores'):
                for m in dataset_module.aux_scores:
                    val_aux[m] = eval_metrics.metric_interpreter(
                        metric=m,
                        pred=val_scores,
                        labels=val_labels)  # [0]  # TODO: Fix for multiloss

            # Prepare tensorboard summaries
            if config.tensorboard_images:
                if len(val_images.get_shape()) == 4:
                    tf_fun.image_summaries(
                        val_images,
                        tag='Validation')
                if (np.asarray(
                        val_labels.get_shape().as_list()) > 1).sum() > 2:
                    tf_fun.image_summaries(
                        val_labels,
                        tag='Validation_targets')
                    tf_fun.image_summaries(
                        val_scores,
                        tag='Validation_predictions')
            if isinstance(val_accuracy, list):
                for vidx, va in enumerate(val_accuracy):
                    tf.summary.scalar('validation_accuracy_%s' % vidx, va)
            else:
                tf.summary.scalar('validation_accuracy', val_accuracy)
            if config.pr_curve:
                if isinstance(val_scores, list):
                    for pidx, val_score in enumerate(val_scores):
                        val_label = val_labels[:, pidx]
                        pr_summary.op(
                            tag='validation_pr_%s' % pidx,
                            predictions=tf.cast(
                                tf.argmax(
                                    val_score,
                                    axis=-1),
                                tf.float32),
                            labels=tf.cast(val_label, tf.bool),
                            display_name='validation_precision_recall_%s' % pidx)
                else:
                    pr_summary.op(
                        tag='validation_pr',
                        predictions=tf.cast(
                            tf.argmax(
                                val_scores,
                                axis=-1),
                            tf.float32),
                        labels=tf.cast(val_labels, tf.bool),
                        display_name='validation_precision_recall')
            log.info('Added validation summaries.')

    # Set up summaries and saver
    if not hasattr(config, 'max_to_keep'):
        config.max_to_keep = None
    saver = tf.train.Saver(
        var_list=tf.global_variables(),
        max_to_keep=config.max_to_keep)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer())
        )
    summary_writer = tf.summary.FileWriter(dir_list['summaries'], sess.graph)

    # Set up exemplar threading
    if placeholder_data:
        # Placeholder feeding needs no queue runners.
        coord, threads = None, None
    else:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Create dictionaries of important training and validation information
    train_dict = {
        'train_loss': train_loss,
        'train_images': train_images,
        'train_labels': train_labels,
        'train_op': train_op,
        'train_scores': train_scores
    }
    val_dict = {
        'val_loss': val_loss,
        'val_images': val_images,
        'val_labels': val_labels,
        'val_scores': val_scores,
    }
    if grad_images:
        train_dict['train_gradients'] = train_gradients
        val_dict['val_gradients'] = val_gradients
    if isinstance(train_accuracy, list):
        for tidx, (ta, va) in enumerate(zip(train_accuracy, val_accuracy)):
            train_dict['train_accuracy_%s' % tidx] = ta
            val_dict['val_accuracy_%s' % tidx] = va
    else:
        train_dict['train_accuracy_0'] = train_accuracy
        val_dict['val_accuracy_0'] = val_accuracy

    if load_and_evaluate_ckpt is not None:
        # Remove the train operation and add a ckpt pointer
        del train_dict['train_op']

    # NOTE(review): checks 'aux_score' (singular) but iterates
    # 'aux_scores' (plural) — if only the plural attribute exists this
    # block never runs; confirm which is intended.
    if hasattr(dataset_module, 'aux_score'):
        # Attach auxillary scores to tensor dicts
        for m in dataset_module.aux_scores:
            train_dict['train_aux_%s' % m] = train_aux[m]
            val_dict['val_aux_%s' % m] = val_aux[m]

    # Start training loop
    if old_lr is not None:
        # Restore the user-facing lr value before persisting the config.
        config.lr = old_lr
    np.save(
        os.path.join(
            dir_list['condition_evaluations'],
            'training_config_file'),
        config)
    log.info('Starting training')
    if load_and_evaluate_ckpt is not None:
        return evaluation.evaluation_loop(
            config=config,
            db=db,
            coord=coord,
            sess=sess,
            summary_op=summary_op,
            summary_writer=summary_writer,
            saver=saver,
            threads=threads,
            summary_dir=dir_list['summaries'],
            checkpoint_dir=dir_list['checkpoints'],
            weight_dir=dir_list['weights'],
            train_dict=train_dict,
            val_dict=val_dict,
            train_model=model,
            val_model=val_model,
            exp_params=exp_params,
            placeholder_data=placeholder_data)
    else:
        output_dict = training.training_loop(
            config=config,
            db=db,
            coord=coord,
            sess=sess,
            summary_op=summary_op,
            summary_writer=summary_writer,
            saver=saver,
            threads=threads,
            summary_dir=dir_list['summaries'],
            checkpoint_dir=dir_list['checkpoints'],
            weight_dir=dir_list['weights'],
            train_dict=train_dict,
            val_dict=val_dict,
            train_model=model,
            val_model=val_model,
            exp_params=exp_params)

    log.info('Finished training.')
    model_name = config.model_struct.replace('/', '_')
    if output_dict is not None:
        py_utils.save_npys(
            data=output_dict,
            model_name=model_name,
            output_string=dir_list['experiment_evaluations'])
def evaluation_loop(
        config,
        db,
        coord,
        sess,
        summary_op,
        summary_writer,
        saver,
        threads,
        summary_dir,
        checkpoint_dir,
        weight_dir,
        train_dict,
        val_dict,
        train_model,
        val_model,
        exp_params,
        performance_metric='validation_loss',
        aggregator='max'):
    """Restore a checkpoint and run the model evaluation loop.

    Repeatedly evaluates the tensors in ``train_dict`` (the caller removed
    the train op, so these are forward passes only), and every
    ``config.validation_iters`` steps averages validation metrics over
    ``config.num_validation_evals`` batches. Weight tensors (optional) and
    a final image/score/label bundle are saved as npy files in
    ``weight_dir``. Runs until the input queues raise OutOfRangeError.

    Returns:
        None. All results are written to disk.
    """
    step = 0
    train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {}
    val_losses, val_accs, val_scores, val_aux, val_labels = {}, {}, {}, {}, {}
    train_images, val_images = {}, {}
    train_scores, train_labels = {}, {}
    # Aux scores are optional; detect their presence once up front.
    train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()])
    val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()])

    # Restore model weights from the requested checkpoint.
    saver.restore(sess, config.load_and_evaluate_ckpt)

    # Optionally fetch model weights alongside the validation tensors.
    # FIX: weight_dict was previously only bound inside the ``if`` branch
    # but read unconditionally in the validation block below, raising
    # NameError whenever config.save_weights was False. Initialize it
    # empty so the weight-dump loop is a no-op in that case.
    weight_dict = {}
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in train_model.var_dict.items() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            # NOTE(review): the caller builds accuracy keys as
            # 'train_accuracy_%d' -- confirm a plain 'train_accuracy'
            # entry exists in train_dict, otherwise this is a KeyError.
            train_accs[step] = it_train_dict['train_accuracy']
            train_images[step] = it_train_dict['train_images']
            train_labels[step] = it_train_dict['train_labels']
            train_scores[step] = it_train_dict['train_scores']
            timesteps[step] = duration
            if train_aux_check:
                # Loop through to find aux scores for this step.
                it_train_aux = {
                    itk: itv
                    for itk, itv in it_train_dict.items()
                    if 'aux_score' in itk
                }
                train_aux[step] = it_train_aux
            assert not np.isnan(
                it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores, it_val_labels, it_val_aux = [], [], []
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches.
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(
                        it_val_acc, it_val_dict['val_accuracy'])
                    it_val_loss = np.append(
                        it_val_loss, it_val_dict['val_loss'])
                    it_val_labels += [it_val_dict['val_labels']]
                    it_val_scores += [it_val_dict['val_scores']]
                    if val_aux_check:
                        iva = {
                            itk: itv
                            for itk, itv in it_val_dict.items()
                            if 'aux_score' in itk
                        }
                        it_val_aux += [iva]
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels
                val_aux[step] = it_val_aux
                val_images[step] = it_val_dict['val_images']

                # Dump the fetched weight tensors for this step.
                # FIX: guarded on save_weights so the weight keys are
                # actually present in it_val_dict.
                if config.save_weights:
                    it_weights = {
                        k: it_val_dict[k] for k in weight_dict.keys()
                    }
                    py_utils.save_npys(
                        data=it_weights,
                        model_name='%s_%s' % (config.experiment_name, step),
                        output_string=weight_dir)
            # End iteration
            step += 1
    except tf.errors.OutOfRangeError:
        # Single-argument print() is valid in both Python 2 and 3.
        print('Done with evaluation for %d epochs, %d steps.' % (
            config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
        coord.join(threads)
        sess.close()

    # Package images into a dictionary and save for later inspection.
    image_dict = {
        'train_images': train_images,
        'val_images': val_images,
        'train_scores': train_scores,
        'train_labels': train_labels,
        'val_scores': val_scores,
        'val_labels': val_labels
    }
    py_utils.save_npys(
        data=image_dict,
        model_name='%s_%s' % (config.experiment_name, step),
        output_string=weight_dir)
def training_loop(
        config,
        db,
        coord,
        sess,
        summary_op,
        summary_writer,
        saver,
        threads,
        summary_dir,
        checkpoint_dir,
        weight_dir,
        train_dict,
        val_dict,
        train_model,
        val_model,
        exp_params):
    """Run the model training loop.

    Executes the train op each step, and every ``config.validation_iters``
    steps averages validation metrics over ``config.num_validation_evals``
    batches, writes summaries, checkpoints the model, and reports
    performance to the DB. Runs until the queues raise OutOfRangeError or
    early stopping triggers.

    Returns:
        dict: histories keyed by step -- train/val losses, accuracies,
        scores, labels, and per-step durations.
    """
    step, time_elapsed = 0, 0
    # FIX: force_save was unbound when top_n_validation > 0 and the first
    # validation showed no improvement; ckpt_path was unbound at the
    # db.update_performance call until the first checkpoint was written.
    force_save = False
    ckpt_path = None
    train_losses, train_accs, timesteps = {}, {}, {}
    val_losses, val_accs, val_scores, val_labels = {}, {}, {}, {}
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in val_model.var_dict.items() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            train_accs[step] = it_train_dict['train_accuracy']
            timesteps[step] = duration
            assert not np.isnan(
                it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores = np.asarray([])
                it_val_labels = np.asarray([])
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches.
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(
                        it_val_acc, it_val_dict['val_accuracy'])
                    it_val_loss = np.append(
                        it_val_loss, it_val_dict['val_loss'])
                    # FIX: scores/labels were appended onto it_val_loss,
                    # overwriting previously accumulated scores/labels on
                    # every batch. Append onto their own accumulators.
                    it_val_scores = np.append(
                        it_val_scores, it_val_dict['val_scores'])
                    it_val_labels = np.append(
                        it_val_labels, it_val_dict['val_labels'])
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s | '
                    'Validation accuracy = %s | logdir = %s')
                print(format_str % (
                    datetime.now(),
                    step,
                    it_train_dict['train_loss'],
                    config.batch_size / duration,
                    float(duration),
                    it_train_dict['train_accuracy'],
                    val_acc,
                    summary_dir))

                # Save the model checkpoint if it's the best yet.
                if config.top_n_validation > 0:
                    # FIX: val_acc was compared against the val_accs dict
                    # itself (TypeError); compare against the stored
                    # accuracy values instead.
                    stored = np.asarray(list(val_accs.values()))
                    if np.sum(val_acc > stored) > 0:
                        force_save = True
                else:
                    force_save = True
                if force_save:
                    ckpt_path = os.path.join(
                        checkpoint_dir, 'model_' + str(step) + '.ckpt')
                    saver.save(sess, ckpt_path, global_step=step)
                    print('Saved checkpoint to: %s' % ckpt_path)
                    force_save = False
                time_elapsed += float(duration)
                # NOTE(review): val_acc (an accuracy) is reported as
                # "validation_loss" -- confirm this is intentional.
                db.update_performance(
                    experiment_id=config._id,
                    experiment_name=config.experiment_name,
                    summary_dir=summary_dir,
                    ckpt_file=ckpt_path,
                    training_loss=float(it_train_dict['train_loss']),
                    validation_loss=float(val_acc),
                    time_elapsed=time_elapsed,
                    training_step=step)
                if config.save_weights:
                    it_weights = {
                        k: it_val_dict[k] for k in weight_dict.keys()
                    }
                    py_utils.save_npys(
                        data=it_weights,
                        model_name='%s_%s' % (config.experiment_name, step),
                        output_string=weight_dir)
                if config.early_stop:
                    # Early stopping inspects the val-accuracy history in
                    # step order.
                    keys = np.sort([int(k) for k in val_accs.keys()])
                    sorted_vals = np.asarray([val_accs[k] for k in keys])
                    if check_early_stop(sorted_vals):
                        print('Triggered an early stop.')
                        break
            else:
                # Training status
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s')
                print(format_str % (
                    datetime.now(),
                    step,
                    it_train_dict['train_loss'],
                    config.batch_size / duration,
                    float(duration),
                    it_train_dict['train_accuracy']))
            # End iteration
            step += 1
    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (
            config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
        coord.join(threads)
        sess.close()

    # If using hp optimization, store performance here.
    if exp_params['hp_current_iteration'] is not None:
        exp_params['hp_current_iteration'] += 1

    # Package output variables into a dictionary.
    # FIX: the keys were shifted by one slot ('train_accs' held val_losses,
    # 'timesteps' held train_accs, 'val_losses' held val_accs, and
    # 'val_accs' held timesteps); map each history to its own key.
    output_dict = {
        'train_losses': train_losses,
        'train_accs': train_accs,
        'timesteps': timesteps,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'val_scores': val_scores,
        'val_labels': val_labels,
    }
    return output_dict
def training_loop(
        config,
        db,
        coord,
        sess,
        summary_op,
        summary_writer,
        saver,
        threads,
        summary_dir,
        checkpoint_dir,
        weight_dir,
        train_dict,
        val_dict,
        train_model,
        val_model,
        exp_params,
        performance_metric='validation_loss',
        aggregator='max'):
    """Run the model training loop (aux-score + online-hp variant).

    Like the plain training loop but also collects any 'aux_score'
    tensors found in the train/val dicts and, after training, feeds
    performance back into the DB for online hyperparameter optimization.

    Returns:
        dict: step-keyed histories of losses, accuracies, scores, labels,
        aux scores, and per-step durations.
    """
    step, time_elapsed = 0, 0
    # FIX: force_save was unbound when top_n_validation > 0 and the first
    # validation showed no improvement; ckpt_path was unbound at the
    # db.update_performance call until the first checkpoint was written.
    force_save = False
    ckpt_path = None
    train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {}
    val_losses, val_accs, val_scores, val_aux, val_labels = {}, {}, {}, {}, {}
    # Aux scores are optional; detect their presence once up front.
    train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()])
    val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()])
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in val_model.var_dict.items() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            train_accs[step] = it_train_dict['train_accuracy']
            timesteps[step] = duration
            if train_aux_check:
                # Loop through to find aux scores for this step.
                it_train_aux = {
                    itk: itv
                    for itk, itv in it_train_dict.items()
                    if 'aux_score' in itk
                }
                train_aux[step] = it_train_aux
            assert not np.isnan(
                it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores, it_val_labels, it_val_aux = [], [], []
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches.
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(
                        it_val_acc, it_val_dict['val_accuracy'])
                    it_val_loss = np.append(
                        it_val_loss, it_val_dict['val_loss'])
                    it_val_labels += [it_val_dict['val_labels']]
                    it_val_scores += [it_val_dict['val_scores']]
                    if val_aux_check:
                        iva = {
                            itk: itv
                            for itk, itv in it_val_dict.items()
                            if 'aux_score' in itk
                        }
                        it_val_aux += [iva]
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels
                val_aux[step] = it_val_aux

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s | '
                    'Validation accuracy = %s | logdir = %s')
                print(format_str % (
                    datetime.now(),
                    step,
                    it_train_dict['train_loss'],
                    config.batch_size / duration,
                    float(duration),
                    it_train_dict['train_accuracy'],
                    val_acc,
                    summary_dir))

                # Save the model checkpoint if it's the best yet.
                if config.top_n_validation > 0:
                    # FIX: val_acc was compared against the val_accs dict
                    # itself (TypeError); compare against the stored
                    # accuracy values instead.
                    stored = np.asarray(list(val_accs.values()))
                    if np.sum(val_acc > stored) > 0:
                        force_save = True
                else:
                    force_save = True
                if force_save:
                    ckpt_path = os.path.join(
                        checkpoint_dir, 'model_' + str(step) + '.ckpt')
                    saver.save(sess, ckpt_path, global_step=step)
                    print('Saved checkpoint to: %s' % ckpt_path)
                    force_save = False
                time_elapsed += float(duration)
                # NOTE(review): val_acc (an accuracy) is reported as
                # "validation_loss" -- confirm this is intentional.
                db.update_performance(
                    experiment_id=config._id,
                    experiment_name=config.experiment_name,
                    summary_dir=summary_dir,
                    ckpt_file=ckpt_path,
                    training_loss=float(it_train_dict['train_loss']),
                    validation_loss=float(val_acc),
                    time_elapsed=time_elapsed,
                    training_step=step)
                if config.save_weights:
                    it_weights = {
                        k: it_val_dict[k] for k in weight_dict.keys()
                    }
                    py_utils.save_npys(
                        data=it_weights,
                        model_name='%s_%s' % (config.experiment_name, step),
                        output_string=weight_dir)
                if config.early_stop:
                    # Early stopping inspects the val-accuracy history in
                    # step order.
                    keys = np.sort([int(k) for k in val_accs.keys()])
                    sorted_vals = np.asarray([val_accs[k] for k in keys])
                    if check_early_stop(sorted_vals):
                        print('Triggered an early stop.')
                        break
            else:
                # Training status
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s')
                print(format_str % (
                    datetime.now(),
                    step,
                    it_train_dict['train_loss'],
                    config.batch_size / duration,
                    float(duration),
                    it_train_dict['train_accuracy']))
            # End iteration
            step += 1
    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (
            config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
        coord.join(threads)
        sess.close()

    # If using online hp optimization, update the database with performance.
    if exp_params['hp_current_iteration'] is not None:
        # If we have not exceeded the maximum online hp optimizations:
        if exp_params['hp_current_iteration'] < exp_params['hp_max_studies']:
            # Database lookup to get all performance for this hp-thread.
            performance_history = db.query_hp_hist(
                exp_params=exp_params,
                performance_metric=performance_metric,
                aggregator=aggregator)
            # Call on online optimization tools.
            exp_params = hp_opt_utils.hp_optim_interpreter(
                performance_history=performance_history,
                aggregator=aggregator)
            # Prepare parameters for DB.
            pk = prepare_experiments.protected_keys()
            exp_params = prepare_experiments.prepare_hp_params(
                parameter_dict=exp_params, pk=pk)
            # Iterate the count.
            exp_params['hp_current_iteration'] += 1
            # NOTE(review): basestring is Python-2-only; replace with str
            # when this file is ported to Python 3.
            for k, v in exp_params.items():
                if isinstance(v, basestring) and 'null' in v:
                    exp_params[k] = None
            # Update the database with the new hyperparameters.
            db.update_online_experiment(
                exp_combos=[exp_params],
                experiment_link=exp_params['experiment_link'])

    # Package output variables into a dictionary.
    output_dict = {
        'train_losses': train_losses,
        'train_accs': train_accs,
        'train_aux': train_aux,
        'timesteps': timesteps,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'val_scores': val_scores,
        'val_labels': val_labels,
        'val_aux': val_aux,
    }
    return output_dict
def main(experiment_name, list_experiments=False):
    """Create a tensorflow worker to run experiments in your DB.

    Args:
        experiment_name: key into ``experiments.experiments()`` selecting
            the experiment configuration to run.
        list_experiments: if True, just print the experiments registered
            in the DB and return without training.
    """
    if list_experiments:
        # List-only mode: print registered experiments and exit.
        exps = db.list_experiments()
        print('_' * 30)
        print('Initialized experiments:')
        print('_' * 30)
        for l in exps:
            print(l.values()[0])
        print('_' * 30)
        return

    # Prepare to run the model.
    config = Config()
    condition_label = '%s_%s' % (experiment_name, get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config = process_DB_exps(
        experiment_name=experiment_name,
        log=log,
        config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(
        model_dir=config.dataset_info,
        dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name

    # Prepare data loaders on the cpu.
    with tf.device('/cpu:0'):
        # Test issues with data loading? Try placeholders instead.
        train_images = tf.placeholder(
            tf.float32,
            name='train_images',
            shape=[config.batch_size] + dataset_module.im_size)
        train_labels = tf.placeholder(
            tf.int64,
            name='train_labels',
            shape=[config.batch_size])
        val_images = tf.placeholder(
            tf.float32,
            name='val_images',
            shape=[config.batch_size] + dataset_module.im_size)
        val_labels = tf.placeholder(
            tf.int64,
            name='val_labels',
            shape=[config.batch_size])
        log.info('Created tfrecord dataloader tensors.')

    # Prepare model on GPU.
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:
            # Training model
            if len(dataset_module.output_size) > 1:
                log.warning('Found > 1 dimension for your output size.'
                            'Converting to a scalar.')
                dataset_module.output_size = np.prod(
                    dataset_module.output_size)

            # Click weighting: a single linear readout on flattened images.
            flat_ims = tf.reshape(
                train_images,
                [config.batch_size, np.prod(dataset_module.im_size)])
            W = tf.get_variable(
                name='W',
                initializer=tf.truncated_normal_initializer(stddev=0.1),
                shape=[
                    np.prod(dataset_module.im_size),
                    dataset_module.output_size
                ])
            b = tf.get_variable(
                name='b',
                initializer=tf.truncated_normal_initializer(stddev=0.1),
                shape=[dataset_module.output_size])
            output_scores = tf.matmul(flat_ims, W) + b

            # Prepare the loss function.
            train_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=train_labels,
                    logits=output_scores))
            train_op = tf.train.GradientDescentOptimizer(
                config.lr).minimize(train_loss)
            log.info('Built training loss function.')

    # Set up summaries and saver.
    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    # Initialize the graph.
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs.
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer()))

    # Start training loop.
    step, time_elapsed = 0, 0
    train_losses, train_accs, val_losses, val_accs, timesteps = \
        {}, {}, {}, {}, {}
    files, labels = dataset_module.get_data()
    combined_files = files['train']
    combined_labels = labels['train']
    batch_size = config.batch_size
    num_batches = len(combined_files) // batch_size
    for image_batch, label_batch, _ in tqdm(
            image_batcher(
                start=0,
                num_batches=num_batches,
                images=combined_files,
                labels=combined_labels,
                batch_size=batch_size),
            total=num_batches):
        feed_dict = {
            train_images: image_batch.astype(np.float32),
            train_labels: np.asarray(label_batch).astype(int)
        }
        # FIX: removed leftover ipdb.set_trace() debugger breakpoint that
        # halted every training run here.
        start_time = time.time()
        _, loss_value = sess.run(
            [train_op, train_loss],
            feed_dict=feed_dict)
        # FIX: record the per-step loss/duration into the history dicts
        # declared above; the save/plot section previously referenced
        # undefined names (tr_loss, tr_accs) and crashed with NameError.
        train_losses[step] = loss_value
        timesteps[step] = time.time() - start_time
        step += 1
    # FIX: moved this log after the loop; it previously fired before
    # training started.
    log.info('Finished training.')

    files_to_save = {
        'training_loss': train_losses,
        'validation_loss': val_losses,
        'training_acc': train_accs,
        'validation_acc': val_accs,
        'timesteps': timesteps
    }
    model_name = config.model_struct.replace('/', '_')
    # NOTE(review): dir_list is not defined anywhere in this function --
    # it must come from module scope or be created here (see the
    # directory-preparation helpers used elsewhere in this project).
    # TODO: confirm and bind dir_list before this point.
    py_utils.save_npys(
        data=files_to_save,
        model_name=model_name,
        output_string=dir_list['experiment_evaluations'])

    # Compare this condition w/ all others.
    plotting.plot_data(
        train_loss=train_losses,
        val_loss=val_losses,
        model_name=model_name,
        timesteps=timesteps,
        config=config,
        output=os.path.join(dir_list['condition_evaluations'], 'loss'),
        output_ext='.pdf',
        data_type='loss')
    plotting.plot_data(
        tr_accs=train_accs,
        val_accs=val_accs,
        model_name=model_name,
        timesteps=timesteps,
        config=config,
        output=os.path.join(dir_list['condition_evaluations'], 'acc'),
        output_ext='.pdf',
        data_type='acc')
    log.info('Completed plots.')