def train_cross_modality_hasher(X_train, Y_train, X_validate, Y_validate,
                                num_filters, filter_size, ds,
                                hidden_layer_sizes, alpha_XY, m_XY, n_bits=16,
                                dropout=False, learning_rate=.001,
                                momentum=.0, batch_size=50,
                                sequence_length=100, epoch_size=100,
                                initial_patience=1000,
                                improvement_threshold=0.99,
                                patience_increase=10, max_iter=100000):
    '''
    Utility function for training a siamese net for cross-modality hashing.
    So many parameters.
    Assumes X_train[n] should be mapped close to Y_train[m] only when n == m.

    The number of convolutional/pooling layers is inferred from the length of
    the entries in the num_filters, filter_size, ds dicts (all of which should
    have the same length).  The number of hidden layers is inferred from the
    length of the entries of the hidden_layer_sizes dict.  A final dense
    output layer is also included.

    :parameters:
        - X_train, Y_train, X_validate, Y_validate : list of np.ndarray
            List of train/validate sequences from the X/Y modality,
            each of shape=(n_channels, n_time_steps, n_features)
        - num_filters : dict of list-like
            Number of filters in each convolutional layer for the X/Y network
        - filter_size : dict of list-like
            Size of the filters in each convolutional layer for the X/Y
            network
        - ds : dict of list-like
            Pooling (downsampling) size in each pooling layer for the X/Y
            network
        - hidden_layer_sizes : dict of list-like
            Size of each hidden layer in the X/Y network
        - alpha_XY : float
            Scaling parameter for the cross-modality negative example cost
        - m_XY : int
            Cross-modality negative example threshold
        - n_bits : int
            Number of bits in the output representation
        - dropout : bool
            Whether to use dropout between the hidden layers
        - learning_rate : float
            SGD learning rate
        - momentum : float
            SGD momentum
        - batch_size : int
            Mini-batch size
        - sequence_length : int
            Size of extracted sequences
        - epoch_size : int
            Number of mini-batches per epoch
        - initial_patience : int
            Always train on at least this many batches
        - improvement_threshold : float
            Validation cost must decrease by this factor to increase patience
        - patience_increase : int
            How many more epochs to wait each time patience is increased
        - max_iter : int
            Maximum number of batches to train on

    :returns:
        - epoch : iterator
            Results for each epoch are yielded
    '''
    # First neural net, for X modality
    X_p_input = T.tensor4('X_p_input')
    X_n_input = T.tensor4('X_n_input')
    # For eval
    X_input = T.tensor4('X_input')
    # Second neural net, for Y modality
    Y_p_input = T.tensor4('Y_p_input')
    Y_n_input = T.tensor4('Y_n_input')
    Y_input = T.tensor4('Y_input')
    # Create networks
    layers = {
        'X': hashing_utils.build_network(
            (None, X_train[0].shape[0], sequence_length, X_train[0].shape[2]),
            num_filters['X'], filter_size['X'], ds['X'],
            hidden_layer_sizes['X'], dropout, n_bits),
        'Y': hashing_utils.build_network(
            (None, Y_train[0].shape[0], sequence_length, Y_train[0].shape[2]),
            num_filters['Y'], filter_size['Y'], ds['Y'],
            hidden_layer_sizes['Y'], dropout, n_bits)}

    # Compute mean(max(0, m - ||a - b||_2)^2)
    def hinge_cost(m, a, b):
        dist = m - T.sqrt(T.sum((a - b)**2, axis=1))
        return T.mean((dist*(dist > 0))**2)

    def hasher_cost(deterministic):
        X_p_output = lasagne.layers.get_output(
            layers['X'][-1], X_p_input, deterministic=deterministic)
        X_n_output = lasagne.layers.get_output(
            layers['X'][-1], X_n_input, deterministic=deterministic)
        Y_p_output = lasagne.layers.get_output(
            layers['Y'][-1], Y_p_input, deterministic=deterministic)
        Y_n_output = lasagne.layers.get_output(
            layers['Y'][-1], Y_n_input, deterministic=deterministic)
        # Unthresholded, unscaled cost of positive examples across modalities
        cost_p = T.mean((X_p_output - Y_p_output)**2)
        # Thresholded, scaled cost of cross-modality negative examples
        cost_n = alpha_XY*hinge_cost(m_XY, X_n_output, Y_n_output)
        # Sum positive and negative costs for overall cost
        cost = cost_p + cost_n
        return cost

    # Combine all parameters from both networks
    params = (lasagne.layers.get_all_params(layers['X'][-1])
              + lasagne.layers.get_all_params(layers['Y'][-1]))
    # Compute RMSProp gradient descent updates
    updates = lasagne.updates.rmsprop(
        hasher_cost(False), params, learning_rate, momentum)
    # Function for training the network
    train = theano.function(
        [X_p_input, X_n_input, Y_p_input, Y_n_input],
        hasher_cost(False), updates=updates)
    # Compute cost without training
    cost = theano.function(
        [X_p_input, X_n_input, Y_p_input, Y_n_input], hasher_cost(True))
    # Start with infinite validate cost; we will always increase patience once
    current_validate_cost = np.inf
    patience = initial_patience
    # Functions for computing the neural net output on the train and val sets
    X_output = theano.function([X_input], lasagne.layers.get_output(
        layers['X'][-1], X_input, deterministic=True))
    Y_output = theano.function([Y_input], lasagne.layers.get_output(
        layers['Y'][-1], Y_input, deterministic=True))
    # Extract sample seqs from the validation set (only need to do this once)
    X_validate, Y_validate = hashing_utils.sample_sequences(
        X_validate, Y_validate, sequence_length)
    # Create fixed negative example validation set
    X_validate_n = X_validate[np.random.permutation(X_validate.shape[0])]
    Y_validate_n = Y_validate[np.random.permutation(Y_validate.shape[0])]
    X_validate_shuffle = np.random.permutation(X_output(X_validate).shape[0])
    data_iterator = hashing_utils.get_next_batch(
        X_train, Y_train, batch_size, sequence_length, max_iter)
    # We will accumulate the mean train cost over each epoch
    train_cost = 0
    for n, (X_p, Y_p, X_n, Y_n) in enumerate(data_iterator):
        # Occasionally Theano was raising a MemoryError, this fails gracefully
        try:
            train_cost += train(X_p, X_n, Y_p, Y_n)
        except MemoryError:
            return
        # Stop training if a NaN is encountered
        if not np.isfinite(train_cost):
            print('Bad training cost {} at iteration {}'.format(train_cost, n))
            break
        # Validate the net after each epoch
        if n and (not n % epoch_size):
            epoch_result = collections.OrderedDict()
            epoch_result['iteration'] = n
            # Compute average training cost over the epoch
            epoch_result['train_cost'] = train_cost / float(epoch_size)
            # Reset training cost mean accumulation
            train_cost = 0
            # Also compute validate cost
            epoch_result['validate_cost'] = cost(
                X_validate, X_validate_n, Y_validate, Y_validate_n)
            # Compute statistics on validation set
            X_val_output = X_output(X_validate)
            Y_val_output = Y_output(Y_validate)
            in_dist, in_mean, in_std = hashing_utils.statistics(
                X_val_output > 0, Y_val_output > 0)
            out_dist, out_mean, out_std = hashing_utils.statistics(
                X_val_output[X_validate_shuffle] > 0, Y_val_output > 0)
            epoch_result['validate_accuracy'] = in_dist[0]
            epoch_result['validate_in_class_distance_mean'] = in_mean
            epoch_result['validate_in_class_distance_std'] = in_std
            epoch_result['validate_collisions'] = out_dist[0]
            epoch_result['validate_out_of_class_distance_mean'] = out_mean
            epoch_result['validate_out_of_class_distance_std'] = out_std
            X_entropy = hashing_utils.hash_entropy(X_val_output > 0)
            epoch_result['validate_hash_entropy_X'] = X_entropy
            Y_entropy = hashing_utils.hash_entropy(Y_val_output > 0)
            epoch_result['validate_hash_entropy_Y'] = Y_entropy
            # Objective is the negative Bhattacharyya coefficient of the
            # in-class and out-of-class distance distributions; we should try
            # to maximize it.  When either in_dist[0] or out_dist[0] is very
            # small, it's not really valid.
            if out_dist[0] > 1e-5 and in_dist[0] > 1e-2:
                bhatt_coeff = -np.sum(np.sqrt(in_dist*out_dist))
                epoch_result['validate_objective'] = bhatt_coeff
            else:
                epoch_result['validate_objective'] = -1
            if epoch_result['validate_cost'] < current_validate_cost:
                patience_cost = improvement_threshold*current_validate_cost
                if epoch_result['validate_cost'] < patience_cost:
                    patience += epoch_size*patience_increase
                current_validate_cost = epoch_result['validate_cost']
            # Yield scores and statistics for this epoch
            X_params = lasagne.layers.get_all_param_values(layers['X'][-1])
            Y_params = lasagne.layers.get_all_param_values(layers['Y'][-1])
            yield (epoch_result, X_params, Y_params)
            if n > patience:
                break
    return
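

# For reference, a minimal NumPy sketch (not used by the trainer above) of the
# two cost terms combined in hasher_cost: positive cross-modality pairs pay
# the mean squared difference between their embeddings, and negative pairs pay
# a squared hinge that is zero once a pair is at least m_XY apart.  The
# function name and arguments here are illustrative only.
def example_pair_costs(x_p, y_p, x_n, y_n, alpha_XY, m_XY):
    '''Mirror hasher_cost on np.ndarrays of shape (n_examples, n_bits).'''
    # Positive-pair cost: mean squared embedding difference
    cost_p = np.mean((x_p - y_p)**2)
    # Negative-pair cost: squared hinge on the Euclidean distance,
    # scaled by alpha_XY
    dist_n = m_XY - np.sqrt(np.sum((x_n - y_n)**2, axis=1))
    cost_n = alpha_XY*np.mean((dist_n*(dist_n > 0))**2)
    return cost_p + cost_n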
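

# A rough usage sketch: train_cross_modality_hasher is a generator, so
# iterating over it runs training and yields per-epoch statistics along with
# the current parameter values of both networks.  All data and hyperparameter
# values below are hypothetical placeholders (the exact formats expected by
# hashing_utils.build_network are assumed); substitute real train/validate
# sequences for each modality.
if __name__ == '__main__':
    # Placeholder data: 10 paired sequences, 1 channel, 500 steps, 48 features
    X_train = [np.random.rand(1, 500, 48).astype(np.float32)
               for _ in range(10)]
    Y_train = [np.random.rand(1, 500, 48).astype(np.float32)
               for _ in range(10)]
    X_validate = [np.random.rand(1, 500, 48).astype(np.float32)
                  for _ in range(10)]
    Y_validate = [np.random.rand(1, 500, 48).astype(np.float32)
                  for _ in range(10)]
    best_objective, best_params = -np.inf, None
    epochs = train_cross_modality_hasher(
        X_train, Y_train, X_validate, Y_validate,
        num_filters={'X': [16, 32], 'Y': [16, 32]},
        filter_size={'X': [(5, 12), (3, 3)], 'Y': [(5, 12), (3, 3)]},
        ds={'X': [(2, 2), (2, 2)], 'Y': [(2, 2), (2, 2)]},
        hidden_layer_sizes={'X': [1024], 'Y': [1024]},
        alpha_XY=1., m_XY=8, n_bits=16)
    for epoch_result, X_params, Y_params in epochs:
        print('Iteration {}: validate objective {}'.format(
            epoch_result['iteration'], epoch_result['validate_objective']))
        # Keep the parameters from the epoch with the best validation objective
        if epoch_result['validate_objective'] > best_objective:
            best_objective = epoch_result['validate_objective']
            best_params = (X_params, Y_params)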