def train(model, input_dims, output_dims, seq_length, size, num_gpus, dataset, experiment_name, load_model, num_vids, n_epochs, split, base_data_path, f_name, learning_rate_init, wd, save_freq, clip_length, video_offset, clip_offset, num_clips, clip_stride, batch_size, loss_type, metrics_dir, loaded_checkpoint, verbose, opt_choice, gpu_list, grad_clip_value, preproc_method, random_init, shuffle_seed, preproc_debugging, reverse): """ Training function used to train or fine-tune a chosen model Args: :model: tf-activity-recognition framework model object :input_dims: Number of frames used in input :output_dims: Integer number of classes in current dataset :seq_length: Length of output sequence expected from LSTM :size: List detailing height and width of frame :num_gpus: Number of gpus to use when training :dataset: Name of dataset being processed :experiment_name: Name of current experiment :load_model: Boolean variable indicating whether to load from a checkpoint or not :num_vids: Number of videos to be used for training :n_epochs: Total number of epochs to train :split: Split of dataset being used :base_data_path: Full path to root directory containing datasets :f_name: Specific video directory within a chosen split of a dataset :learning_rate_init: Initializer for learning rate :wd: Weight decay :save_freq: Frequency, in epochs, with which to save :clip_length: Length of clips to cut video into, -1 indicates using the entire video as one clip') :video_offset: String indicating where to begin selecting video clips (provided clipOffset is None) :clip_offset: "none" or "random" indicating where to begin selecting video clips :num_clips: Number of clips to break video into :clip_stride: Number of frames that overlap between clips, 0 indicates no overlap and negative values indicate a gap of frames between clips :batch_size: Number of clips to load into the model each step. :loss_type: String declaring loss type associated with a chosen model :metrics_dir: Name of subdirectory within the experiment to store metrics. Unique directory names allow for parallel testing :loaded_checkpoint: Specify the exact checkpoint of saved model to be loaded for further training/testing :verbose: Boolean to indicate if all print statement should be procesed or not :opt_choice: String indicating optimizer selected :gpu_list: List of GPU IDs to be used :grad_clip_value: Float value at which to clip normalized gradients :lr_boundaries: List of epoch boundaries at which lr will be updated :lr_values: List of lr multipliers to learning_rate_init at boundaries mentioned in lr_boundaries :preproc_method: The preprocessing method to use, default, cvr, rr, sr, or any other custom preprocessing :random_init: Randomly initialize model weights, not loading from any files (deafult False) :preproc_debugging: Boolean indicating whether to load videos and clips in a queue or to load them directly for debugging (Default 0) :reverse: Boolean indicating whether reverse videos and classify them as a new action class. Returns: Does not return anything """ with tf.name_scope("my_scope") as scope: # Initializers for checkpoint and global step variable ckpt = None gs_init = 0 ################################### Checkpoint loading block ####################################################### # Load pre-trained/saved model to continue training (or fine-tune) if load_model: try: ckpt, gs_init, learning_rate_init = load_checkpoint( model.name, dataset, experiment_name, loaded_checkpoint, preproc_method) if verbose: print 'A better checkpoint is found. The global_step value is: ' + str( gs_init) except: if verbose: print "Failed loading checkpoint requested. Please check." exit() # END TRY else: ckpt = model.load_default_weights() # END IF ###################################################################################################################### # Initialize model variables global_step = tf.Variable(gs_init, name='global_step', trainable=False) number_of_videos = tf.Variable(num_vids, name='number_of_videos', trainable=False) number_of_epochs = tf.Variable(n_epochs, name='number_of_epochs', trainable=False) video_step = tf.Variable(1.0, name='video_step', trainable=False) istraining = True reuse_variables = None # TF session setup config = tf.ConfigProto( allow_soft_placement=True ) #, gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)) sess = tf.Session(config=config) init = tf.global_variables_initializer() # Variables get randomly initialized into tf graph sess.run(init) tower_losses = [] tower_grads = [] tower_slogits = [] data_path = os.path.join(base_data_path, 'tfrecords_' + dataset, 'Split' + str(split), f_name) # Setup tensors for models # input_data_tensor - [batchSize, inputDims, height, width, channels] input_data_tensor, labels_tensor, names_tensor = load_dataset( model, num_gpus, batch_size, output_dims, input_dims, seq_length, size, data_path, dataset, istraining, clip_length, video_offset, clip_offset, num_clips, clip_stride, video_step, preproc_debugging, shuffle_seed, verbose, reverse=reverse) ############### TO DO: FIX THIS ASAP ######################## if ((batch_size == 1) and (num_clips == 1)): sess.run(tf.assign_add(video_step, -2)) else: sess.run(tf.assign_add(video_step, -1)) # END IF ############################################################ learning_rate = tf.Variable(learning_rate_init, name='learning_rate', trainable=False) # Define optimizer (Current selection is only momentum optimizer) if opt_choice == 'gd': optimizer = lambda lr: tf.train.GradientDescentOptimizer(lr) elif opt_choice == 'adam': optimizer = lambda lr: tf.train.AdamOptimizer(lr) else: optimizer = lambda lr: tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9) # END IF """ Multi-GPU setup: 1) Associate gpu device to specific model replica 2) Setup tower name scope for variables """ ################# GPU list check block #################### assert ((len(gpu_list) == num_gpus) or (len(gpu_list) == 0)) if len(gpu_list) == 0: gpu_list = [str(x) for x in range(num_gpus)] # END IF ########################################################### ################################################## Setup TF graph block ###################################################### for gpu_idx in range(num_gpus): with tf.device('/gpu:' + str(gpu_list[gpu_idx])): with tf.name_scope('%s_%d' % ('tower', int(gpu_list[gpu_idx]))) as scope: with tf.variable_scope(tf.get_variable_scope(), reuse=reuse_variables): returned_layers = model.inference( input_data_tensor[gpu_idx * batch_size:gpu_idx * batch_size + batch_size, :, :, :, :], istraining, input_dims, output_dims, seq_length, scope, return_layer=['logits'], weight_decay=wd) logits = tf.cast(returned_layers[0], tf.float32) # Calculating Softmax for probability outcomes : Can be modified, make function internal to model slogits = tf.nn.softmax(logits) # END WITH reuse_variables = True """ Within GPU mini-batch: 1) Calculate loss, 2) Initialize optimizer with required learning rate and 3) Compute gradients 4) Aggregate losses, gradients and logits """ total_loss = model.loss( logits, labels_tensor[gpu_idx * batch_size:gpu_idx * batch_size + batch_size, :], loss_type) opt = optimizer(learning_rate) gradients = opt.compute_gradients( total_loss, vars_.trainable_variables()) tower_losses.append(total_loss) tower_grads.append(gradients) tower_slogits.append(slogits) # END WITH # END WITH # END FOR """ After: 1) Computing gradients and losses need to be stored and averaged 2) Clip gradients by norm to required value 3) Apply mean gradient updates """ gradients = _average_gradients(tower_grads) gradients, variables = zip(*gradients) clipped_gradients, _ = clip_ops.clip_by_global_norm( gradients, grad_clip_value) gradients = list(zip(clipped_gradients, variables)) grad_updates = opt.apply_gradients(gradients, global_step=global_step, name="train") train_op = grad_updates ############################################################################################################################################ if save_bool: ######################### Logger Setup block ###################################### # Logging setup initialization (Naming format: Date, month, hour, minute, second) log_name = ( "exp_train_%s_%s_%s" % (time.strftime("%d_%m_%H_%M_%S"), dataset, experiment_name)) make_dir('results') make_dir(os.path.join('results', model.name)) make_dir(os.path.join('results', model.name, dataset)) make_dir( os.path.join('results', model.name, dataset, preproc_method)) make_dir( os.path.join('results', model.name, dataset, preproc_method, experiment_name)) make_dir( os.path.join('results', model.name, dataset, preproc_method, experiment_name, 'checkpoints')) curr_logger = Logger( os.path.join('logs', model.name, dataset, preproc_method, metrics_dir, log_name)) #################################################################################### # END IF init = tf.global_variables_initializer() coord = tf.train.Coordinator() threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord) # Variables get randomly initialized into tf graph sess.run(init) # Check that weights were loaded or random initializations are requested if ((ckpt == None) or (random_init)): print "Caution: Model weights are not being loaded, using random initialization." else: # Model variables initialized from previous saved models initialize_from_dict(sess, ckpt, model.name) # END IF del ckpt # Initialize tracking variables previous_vid_name = "" videos_loaded = 0 tot_count = 0 acc = 0 epoch_count = 0 tot_load_time = 0.0 tot_train_time = 0.0 last_loss = None losses = [] total_pred = [] save_data = [] total_params = [] losses_tracker = [] # Timing test setup time_init = time.time() batch_count = 0 epoch_acc = 0 l_r = learning_rate_init ########################################## Training loop block ################################################################ # Loop epoch number of time over the training set while videos_loaded < n_epochs * num_vids: # Variable to update during epoch intervals if (epoch_count + 1) * num_vids <= videos_loaded < ( epoch_count + 1) * num_vids + num_gpus * batch_size: batch_count = 0 epoch_acc = 0 if epoch_count % save_freq == 0 and tot_count > 0: if save_bool: if verbose: print "Saving..." save_checkpoint(sess, model.name, dataset, experiment_name, preproc_method, l_r, global_step.eval(session=sess)) # END IF epoch_count += 1 # END IF time_pre_train = time.time() ######################################### Running TF training session block ################################## _, loss_train, predictions, gs, labels, vid_names, l_r, track_vars = sess.run( [ train_op, tower_losses, tower_slogits, global_step, labels_tensor, names_tensor, learning_rate, model.get_track_variables() ]) ################################################################################################################ if verbose: print vid_names for name in vid_names: if name != previous_vid_name: videos_loaded += 1 previous_vid_name = name tot_count += 1 ######## Adaptive Learning Rate Control Block ############################ losses_tracker.append(np.mean(loss_train)) if videos_loaded % 10 == 0 and videos_loaded > 0: if last_loss is None: last_loss = sum(losses_tracker) / 10 else: difference_loss = last_loss - sum(losses_tracker) / 10 last_loss = sum(losses_tracker) / 10 if abs(difference_loss) < 0.001: learning_rate /= 10 # END IF # END IF if len(losses_tracker) == 10: losses_tracker = [] # END IF # END IF ########################################################################### # Transpose the extracted layers such that the mean is taken across the gpus and over any matrix with more than 1 dimension params_array = [] for key in track_vars.keys(): curr_params = np.array(track_vars[key]) if len(curr_params.shape) > 1: indices = np.arange(len(curr_params.shape)) + 1 indices[-1] = 0 curr_params = curr_params.transpose(indices) params_array.append( np.mean(curr_params, axis=tuple(range(len(curr_params.shape))[1:]))) else: params_array.append([np.mean(curr_params)]) # END IF # END FOR #################### Training accuracy computation block ############### # Compute training epoch accuracy for gpu_pred_idx in range(len(predictions)): for batch_idx in range(predictions[gpu_pred_idx].shape[0]): pred = np.mean(predictions[gpu_pred_idx][batch_idx], 0).argmax() if pred == labels[gpu_pred_idx * batch_size + batch_idx][0]: epoch_acc += 1 # END IF batch_count += 1 # END FOR # END FOR ###################### Add variables to be tracked to logger ############# time_post_train = time.time() tot_train_time += time_post_train - time_pre_train if verbose: print 'train_time: ', time_post_train - time_pre_train print 'step, loss: ', gs, loss_train print 'labels: ', labels # END IF if save_bool: curr_logger.add_scalar_value('train/train_time', time_post_train - time_pre_train, step=gs) curr_logger.add_scalar_value('train/loss', float(np.mean(loss_train)), step=gs) curr_logger.add_scalar_value('train/epoch_acc', epoch_acc / float(batch_count), step=gs) for layer in range(len(params_array)): for p in range(len(params_array[layer])): curr_logger.add_scalar_value( 'tracked_training_variables/' + str(track_vars.keys()[layer] + '_' + str(p)), float(params_array[layer][p]), step=gs) # END FOR # END FOR total_params.append(params_array) curr_logger.add_scalar_value( 'tracked_training_variables/learning_rate', float(l_r), step=gs) # END IF # END WHILE ######################################################################################################################################################### if save_bool: if verbose: print "Saving..." # END IF save_checkpoint(sess, model.name, dataset, experiment_name, preproc_method, l_r, gs) coord.request_stop() coord.join(threads) # END IF if verbose: print "Tot train time: ", tot_train_time print "Tot time: ", time.time() - time_init # END WITH if save_bool: # Save tracked parameterization variables as a numpy file if len(total_params) != 0: total_params = np.array(total_params).flatten() make_dir( os.path.join('results', model.name, dataset, preproc_method, experiment_name, metrics_dir)) if os.path.isfile( os.path.join('results', model.name, dataset, preproc_method, experiment_name, metrics_dir, 'train_params_' + dataset + '.npy')): loaded_params = np.load( os.path.join('results', model.name, dataset, preproc_method, experiment_name, metrics_dir, 'train_params_' + dataset + '.npy')) total_params = np.concatenate( [loaded_params, total_params]) # END IF np.save( os.path.join('results', model.name, dataset, preproc_method, experiment_name, metrics_dir, 'train_params_' + dataset + '.npy'), total_params)