# Iterate over batch start indices
    for batch_idx in range(0, len(Y_files), batch_size):
        # Store the npy files corresponding to this batch
        npy_mapping += Y_files[batch_idx:batch_idx + batch_size]
        # Load this batch of .npy files into a list; the index guard below
        # handles the final, possibly smaller batch
        batch = [np.load(Y_files[n])
                 for n in range(batch_idx, batch_idx + batch_size)
                 if n < len(Y_files)]
        # Compute mean/std for each sequence in this batch
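        # (columns 0..D-1 of statistics_Y hold per-feature means, columns
        # D..2*D-1 hold per-feature standard deviations)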
        for n, Y in enumerate(batch):
            statistics_Y[batch_idx + n, :D] = np.mean(Y, axis=0)
            statistics_Y[batch_idx + n, D:] = np.std(Y, axis=0)
        # Normalize batch by training set mean/std
        batch = [(Y - Y_mean)/Y_std for Y in batch]
        # Randomly sample subsequences of this batch
        batch = utils.sample_sequences(batch, max_length_Y)
        # Embed the batch and store the embedding in the matrix
        embedded_Y[batch_idx:batch_idx + batch_size] = embed_Y(batch)
        # Report percent done and time
        print "\r{:.3f}% in {}s".format(
            (100.*(batch_idx + batch_size))/len(Y_files),
            time.time() - now),
        now = time.time()
        sys.stdout.flush()
    print 'Done.'
    # Write out the embedding matrix, statistics matrix, and file mapping
    np.save('data/msd_embedded.npy', embedded_Y)
    np.save('data/msd_statistics.npy', statistics_Y)
    with open('data/msd_embedded_mapping.pkl', 'wb') as f:
        pickle.dump(npy_mapping, f)
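    # Reading these artifacts back is straightforward (a usage sketch, not
    # part of the original script):
    #   embedded_Y = np.load('data/msd_embedded.npy')
    #   statistics_Y = np.load('data/msd_statistics.npy')
    #   with open('data/msd_embedded_mapping.pkl', 'rb') as f:
    #       npy_mapping = pickle.load(f)
    # npy_mapping[n] is the source .npy file for row n of both matrices.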
Example #2
def train(data, sample_size_X, sample_size_Y, conv_layer_specs,
          dense_layer_specs, alpha_XY, m_XY, optimizer=lasagne.updates.rmsprop,
          batch_size=20, epoch_size=100, initial_patience=1000,
          improvement_threshold=0.99, patience_increase=5, max_iter=100000):
    ''' Utility function for training a siamese net for cross-modality hashing
    Assumes data['X_train'][n] should be mapped close to data['Y_train'][m]
    only when n == m

    :parameters:
        - data : dict of list of np.ndarray
            Training/validation sequences in X/Y modality
            Should contain keys X_train, Y_train, X_validate, Y_validate
            Sequence matrix shape=(n_sequences, n_time_steps, n_features)
        - sample_size_X, sample_size_Y : int
            Sampled sequence length for X/Y modalities
        - conv_layer_specs, dense_layer_specs : list of dict
            List of dicts, where each dict corresponds to keyword arguments
            for each subsequent layer.  Note that
            dense_layer_specs[-1]['num_units'] should be the output
            dimensionality of the network.
        - alpha_XY : float
            Scaling parameter for cross-modality negative example cost
        - m_XY : int
            Cross-modality negative example threshold
        - optimizer : function
            Function which takes a Theano expression and parameters and
            computes parameter updates to minimize the Theano expression (for
            example, something from lasagne.updates).
        - batch_size : int
            Mini-batch size
        - epoch_size : int
            Number of mini-batches per epoch
        - initial_patience : int
            Always train on at least this many batches
        - improvement_threshold : float
            Validation cost must decrease by this factor to increase patience
        - patience_increase : int
            How many more epochs should we wait when we increase patience
        - max_iter : int
            Maximum number of batches to train on

    :returns:
        - epoch : iterator
            Results for each epoch are yielded
    '''
    # Create networks
    layers = {
        'X': utils.build_network(
            (None, None, data['X_train'][0].shape[-1]), conv_layer_specs,
            dense_layer_specs),
        'Y': utils.build_network(
            (None, None, data['Y_train'][0].shape[-1]), conv_layer_specs,
            dense_layer_specs)}
    # Inputs to X modality neural nets
    X_p_input = T.tensor3('X_p_input')
    X_n_input = T.tensor3('X_n_input')
    # Inputs to Y modality neural nets
    Y_p_input = T.tensor3('Y_p_input')
    Y_n_input = T.tensor3('Y_n_input')
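    # (each input is a 3-tensor of shape (n_sequences, n_time_steps,
    # n_features), as described in the docstring)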

    # Compute the mean over examples of max(0, m - ||a - b||_2)^2
    def hinge_cost(m, a, b):
        dist = m - T.sqrt(T.sum((a - b)**2, axis=1))
        return T.mean((dist*(dist > 0))**2)

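    # Overall objective: mean squared distance between matched (positive)
    # cross-modality embeddings, plus alpha_XY times the hinge cost of the
    # mismatched (negative) cross-modality embeddings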
    def hasher_cost(deterministic):
        X_p_output = lasagne.layers.get_output(
            layers['X']['out'],
            {layers['X']['in']: X_p_input},
            deterministic=deterministic)
        X_n_output = lasagne.layers.get_output(
            layers['X']['out'],
            {layers['X']['in']: X_n_input},
            deterministic=deterministic)
        Y_p_output = lasagne.layers.get_output(
            layers['Y']['out'],
            {layers['Y']['in']: Y_p_input},
            deterministic=deterministic)
        Y_n_output = lasagne.layers.get_output(
            layers['Y']['out'],
            {layers['Y']['in']: Y_n_input},
            deterministic=deterministic)
        # Unthresholded, unscaled cost of positive examples across modalities
        cost_p = T.mean(T.sum((X_p_output - Y_p_output)**2, axis=1))
        # Thresholded, scaled cost of cross-modality negative examples
        cost_n = alpha_XY*hinge_cost(m_XY, X_n_output, Y_n_output)
        # Sum positive and negative costs for overall cost
        cost = cost_p + cost_n
        return cost

    # Combine all parameters from both networks
    params = (lasagne.layers.get_all_params(layers['X']['out'])
              + lasagne.layers.get_all_params(layers['Y']['out']))
    # Compute parameter updates with the supplied optimizer (RMSProp by default)
    updates = optimizer(hasher_cost(False), params)
    # Function for training the network
    train = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                            hasher_cost(False), updates=updates)

    # Compute cost without training
    cost = theano.function([X_p_input, X_n_input, Y_p_input, Y_n_input],
                           hasher_cost(True))

    # Compute output without training
    X_output = theano.function(
        [layers['X']['in'].input_var],
        lasagne.layers.get_output(layers['X']['out'], deterministic=True))
    Y_output = theano.function(
        [layers['Y']['in'].input_var],
        lasagne.layers.get_output(layers['Y']['out'], deterministic=True))

    # Start with an infinite validation cost; the first validation will always
    # improve on it, so patience is increased at least once
    current_validate_cost = np.inf
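    # `patience` is the total number of iterations we are willing to train
    # for; it is extended below whenever the validation cost improves enough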
    patience = initial_patience

    # Create sampled sequences for validation
    X_validate = utils.sample_sequences(
        data['X_validate'], sample_size_X)
    Y_validate = utils.sample_sequences(
        data['Y_validate'], sample_size_Y)
    # Create fixed negative example validation set
    X_validate_shuffle = np.random.permutation(X_validate.shape[0])
    Y_validate_shuffle = X_validate_shuffle[
        utils.random_derangement(X_validate.shape[0])]
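    # (the derangement has no fixed points, so a matched X/Y pair is never
    # used as its own negative example)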
    # Create iterator to sample sequences from training data
    data_iterator = utils.get_next_batch(
        data['X_train'], data['Y_train'], sample_size_X, sample_size_Y,
        batch_size, max_iter)
    # We will accumulate the mean train cost over each epoch
    train_cost = 0

    for n, (X_p, Y_p, X_n, Y_n) in enumerate(data_iterator):
        # Occasionally Theano raised a MemoryError; fail gracefully if it happens
        try:
            train_cost += train(X_p, X_n, Y_p, Y_n)
        except MemoryError as e:
            print "MemoryError: {}".format(e)
            return
        # Stop training if the cost is no longer finite (NaN or inf)
        if not np.isfinite(train_cost):
            print 'Bad training cost {} at iteration {}'.format(train_cost, n)
            break
        # Validate the net after each epoch
        if n and (not n % epoch_size):
            epoch_result = collections.OrderedDict()
            epoch_result['iteration'] = n
            # Compute average training cost over the epoch
            epoch_result['train_cost'] = train_cost / float(epoch_size)
            # Reset training cost mean accumulation
            train_cost = 0

            # We need to accumulate the validation cost and network output over
            # batches to avoid MemoryErrors
            epoch_result['validate_cost'] = 0
            validate_batches = 0
            X_val_output = []
            Y_val_output = []
            for batch_idx in range(0, X_validate.shape[0], batch_size):
                # Extract slice from validation set for this batch
                batch_slice = slice(batch_idx, batch_idx + batch_size)
                # Compute and accumulate cost
                epoch_result['validate_cost'] += cost(
                    X_validate[batch_slice],
                    X_validate[X_validate_shuffle][batch_slice],
                    Y_validate[batch_slice],
                    Y_validate[Y_validate_shuffle][batch_slice])
                # Keep track of # of batches for normalization
                validate_batches += 1
                # Compute network output and accumulate result
                X_val_output.append(X_output(X_validate[batch_slice]))
                Y_val_output.append(Y_output(Y_validate[batch_slice]))
            # Normalize cost by number of batches and store
            epoch_result['validate_cost'] /= float(validate_batches)
            # Concatenate per-batch output to tensors
            X_val_output = np.concatenate(X_val_output, axis=0)
            Y_val_output = np.concatenate(Y_val_output, axis=0)
            # Compute in-class and out-of-class distances
            in_dists = np.mean((X_val_output - Y_val_output)**2, axis=1)
            out_dists = np.mean((X_val_output[X_validate_shuffle] -
                                Y_val_output[Y_validate_shuffle])**2, axis=1)
            # Objective is the Bhattacharyya coefficient of in-class and
            # out-of-class distances
            epoch_result['validate_objective'] = utils.bhatt_coeff(
                in_dists, out_dists)

            # Test whether this validate cost is the new smallest
            if epoch_result['validate_cost'] < current_validate_cost:
                # To increase patience, the cost must fall below
                # improvement_threshold*(previous lowest validation cost)
                patience_cost = improvement_threshold*current_validate_cost
                if epoch_result['validate_cost'] < patience_cost:
                    # Increase patience by the supplied amount
                    patience += epoch_size*patience_increase
                # Even if we didn't increase patience, update lowest valid cost
                current_validate_cost = epoch_result['validate_cost']
            # Store patience after this epoch
            epoch_result['patience'] = patience

            # Yield scores and statistics for this epoch
            X_params = lasagne.layers.get_all_param_values(layers['X']['out'])
            Y_params = lasagne.layers.get_all_param_values(layers['Y']['out'])
            yield (epoch_result, X_params, Y_params)

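            # Stop once we have run out of patience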
            if n > patience:
                break

    return
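

# A minimal usage sketch, not part of the original code.  It assumes the
# `utils` module used above is importable, and it guesses the layer-spec keys
# that utils.build_network expects (only the final 'num_units' is guaranteed
# by the docstring).  Treating the Bhattacharyya coefficient (distribution
# overlap) as a quantity to minimize is also an assumption.
if __name__ == '__main__':
    floatX = theano.config.floatX
    # Toy data: 50 random sequence pairs per split, 100 time steps,
    # 20-dimensional X features and 30-dimensional Y features
    data = {name: [np.random.rand(100, dim).astype(floatX)
                   for _ in range(50)]
            for name, dim in [('X_train', 20), ('Y_train', 30),
                              ('X_validate', 20), ('Y_validate', 30)]}
    # Hypothetical layer specifications
    conv_layer_specs = [{'num_filters': 16, 'filter_size': (5, 5)}]
    dense_layer_specs = [{'num_units': 128}, {'num_units': 16}]
    best_objective = np.inf
    best_params = None
    for epoch_result, X_params, Y_params in train(
            data, sample_size_X=50, sample_size_Y=50,
            conv_layer_specs=conv_layer_specs,
            dense_layer_specs=dense_layer_specs, alpha_XY=1., m_XY=4,
            max_iter=1000):
        print "Iteration {}: validate cost {:.4f}, objective {:.4f}".format(
            epoch_result['iteration'], epoch_result['validate_cost'],
            epoch_result['validate_objective'])
        # Keep the parameters with the smallest in/out-of-class overlap
        if epoch_result['validate_objective'] < best_objective:
            best_objective = epoch_result['validate_objective']
            best_params = (X_params, Y_params)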