def extract_cnn_features(cnn_model, layer, features_dir):
    # rescaling-only ImageDataGenerator; no augmentation at feature-extraction time
    datagen = ImageDataGenerator(rescale=1. / 255)

    base_model = cnn_model.load(weights=cnn_model.weights)

    target_size = (cnn_model.img_height, cnn_model.img_width)

    layers_by_name = {l.name: l for l in base_model.layers}
    outputs = layers_by_name[layer].output
    model = Model(inputs=base_model.input, outputs=outputs)

    users = IO.load_annotations(ntcir.filepaths)

    for user_id, user in users.iteritems():
        for date, day in user.iteritems():
            for image in day.images:
                img = load_image(datagen, image.path, target_size)
                image.features = model.predict(img).copy()

    features_filepath = os.path.join(features_dir,
                                     "features." + cnn_model.name + ".pkl")
    with open(features_filepath, 'wb') as f:
        pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

    del model
    if K.backend() == 'tensorflow':
        K.clear_session()
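# Usage sketch for the extractor above, under assumptions: the vgg16_config
# object and load_vgg16 helper are hypothetical stand-ins for whatever model
# wrapper the project actually defines. Only the attributes the function
# reads are needed: name, weights, img_height, img_width, and a
# load(weights=...) callable returning a Keras model.
from easydict import EasyDict as edict
from keras.applications.vgg16 import VGG16

def load_vgg16(weights=None):
    # Stock VGG16 topology; optionally restore fine-tuned weights from HDF5
    model = VGG16(weights=None)
    if weights:
        model.load_weights(weights)
    return model

vgg16_config = edict({
    'name': 'vgg16',
    'weights': 'weights/weights.vgg16.hdf5',  # assumed path
    'img_height': 224,
    'img_width': 224,
    'load': load_vgg16,
})

# 'fc2' is a layer name in the stock VGG16 classifier head
extract_cnn_features(vgg16_config, 'fc2', 'features')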
def extract_cnn_features(cnn_model, layers, features_dir, start_fold=1, end_fold=10):
    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:

        # rescaling-only ImageDataGenerator; no augmentation at feature-extraction time
        datagen = ImageDataGenerator(rescale=1. / 255)

        init_weights = cnn_model.weights.format(fold)
        base_model = cnn_model.load(weights=init_weights)

        target_size = (cnn_model.img_height, cnn_model.img_width)

        layers_by_name = {l.name: l for l in base_model.layers}
        outputs = [layers_by_name[l].output for l in layers]
        model = Model(inputs=base_model.input, outputs=outputs)

        users = IO.load_annotations(ntcir.filepaths)
        for user_id, user in users.iteritems():
            for date, day in user.iteritems():
                for image in day.images:
                    img = load_image(datagen, image.path, target_size)

                    predictions = model.predict(img)
                    # predict() returns a bare array when there is one output
                    if len(layers) == 1:
                        predictions = [predictions]

                    image.features = {l: predictions[i].copy() for i, l in enumerate(layers)}

        features_filepath = os.path.join(features_dir, "features." + cnn_model.name + ".fold_" + fold + ".pkl")
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
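# Usage sketch for the per-fold, multi-layer variant above. It assumes the
# weights attribute is a template with a fold placeholder, since the
# function .format()s it once per fold.
vgg16_config.weights = 'weights/weights.vgg16.fold_{}.hdf5'
extract_cnn_features(vgg16_config, ['fc1', 'fc2'], 'features',
                     start_fold=1, end_fold=5)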
Example #3
def train(features_filepath, weights_dir, rf_model, start_fold=1, end_fold=1, timestep=5, progress_percent=0.05, iccv_epic=True, features_size=4096, cores=None):
    np.random.seed(42)

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    if not start_fold:
        start_fold = current_fold(weights_dir, rf_model.name)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            user_features = pickle.load(f)

        if iccv_epic:
            train_split = ntcir.read_split('datasets/ntcir/training_split.txt')
        else:
            train_split = ntcir.get_split_fold(sorted_users, int(fold) - 1)

        training_batches = ntcir.get_training_batches(train_split, sequences, timestep=timestep)

        num_features = timestep * features_size
        num_training_batches = len(training_batches)

        # Extract features of the images
        features = np.zeros((num_training_batches, num_features))
        targets = np.zeros(num_training_batches)

        if progress_percent:
            training_progress_percent = int(num_training_batches * progress_percent)
            print "Creating training matrix for fold {}".format(fold)

        for i, batch in enumerate(training_batches):
            day = user_features[batch.user_id][batch.date]
            for j, ind in enumerate(batch.indices):
                image = day.images[ind]
                start_ind = j * features_size
                end_ind = (j + 1) * features_size
                features[i, start_ind:end_ind] = image.features

            last_ind = batch.indices[-1]
            targets[i] = day.images[last_ind].label

            if progress_percent and (i + 1) % training_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % ((i + 1) / num_training_batches * 100, i + 1, num_training_batches))
        gc.collect()
        if not cores:
            cores = multiprocessing.cpu_count()
        random_forest = RandomForestClassifier(n_estimators=rf_model.num_estimators, n_jobs=cores)
        random_forest.fit(features, targets)

        weights_filepath = os.path.join(weights_dir, "weights." + rf_model.name + ".fold_" + fold + ".pkl")
        with open(weights_filepath, 'wb') as f:
            pickle.dump(random_forest, f, pickle.HIGHEST_PROTOCOL)
    return random_forest
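# Usage sketch: train one fold of the random forest on the pickled fc2
# features. The rf_config object is hypothetical; the function only reads
# name and num_estimators from it. Note that only the forest fitted on the
# last processed fold is returned.
rf_config = edict({'name': 'vgg16_rf', 'num_estimators': 500})
rf = train('features/features.vgg16.fold_{}.pkl', 'weights', rf_config,
           start_fold=1, end_fold=1, timestep=5, features_size=4096)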
def train(features_filepath, weights_dir, sgd_params, base_model, start_fold=None, end_fold=5, timestep=10,
          batch_size=1, iccv_epic=False):
    np.random.seed(42)

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(weights_dir, base_model.name)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            train_split = ntcir.read_split('datasets/ntcir/training_split.txt')
            test_split = ntcir.read_split('datasets/ntcir/validation_split.txt')
        else:
            train_split = ntcir.get_split_fold(sorted_users, int(fold) - 1)
            test_split = ntcir.get_split_fold(sorted_users, int(fold) - 1, False)

        training_batches = ntcir.get_training_batches(train_split, sequences, timestep=timestep)
        test_batches = ntcir.get_batches(test_split, sequences, timestep=timestep)

        K.set_learning_phase(1)

        model = base_model.load(feature_vector_length=base_model.feature_vector_length, timestep=timestep)
        sgd = SGD(lr=sgd_params.lr, decay=sgd_params.decay, momentum=sgd_params.momentum, nesterov=sgd_params.nesterov)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

        steps_per_epoch = int(len(training_batches) / batch_size)
        train_generator = generate_batch(features, training_batches, base_model.feature_vector_length, batch_size,
                                         timestep, steps_per_epoch)
        validation_steps = int(len(test_batches) / batch_size)
        validation_generator = generate_batch(features, test_batches, base_model.feature_vector_length, batch_size,
                                              timestep, validation_steps)

        # checkpoint
        base_model_weights = "weights." + base_model.name + ".fold_" + fold + ".epoch_{epoch:02d}." + backend + ".hdf5"
        weights_filepath = os.path.join(weights_dir, base_model_weights)
        checkpoint = ModelCheckpoint(weights_filepath, monitor='val_acc', verbose=1, save_best_only=False)
        history = HistoryLog()

        # fine-tune the model
        model.fit_generator(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=10,
            callbacks=[checkpoint, history],
            validation_data=validation_generator,
            validation_steps=validation_steps)

        ts = time()
        timestamp = datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')

        loss_filepath = os.path.join(weights_dir,
                                     "{}.fold_{}.loss.{}.log".format(base_model.name, fold, timestamp))
        history.log_training_loss(loss_filepath)

        epoch_filepath = os.path.join(weights_dir,
                                      "{}.fold_{}.epoch.{}.log".format(base_model.name, fold, timestamp))
        history.log_epoch(epoch_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
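# Usage sketch for the LSTM trainer above, with a minimal stand-in builder.
# The real architecture is defined elsewhere in the project, so treat this
# model (and every name below) as an assumption.
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed

def load_lstm(feature_vector_length=4096, timestep=10, weights=None,
              num_classes=21):
    # Many-to-many LSTM over precomputed CNN features: one softmax per step,
    # matching the categorical_crossentropy loss compiled in train()
    model = Sequential()
    model.add(LSTM(256, return_sequences=True,
                   input_shape=(timestep, feature_vector_length)))
    model.add(TimeDistributed(Dense(num_classes, activation='softmax')))
    if weights:
        model.load_weights(weights)
    return model

lstm_config = edict({'name': 'vgg16_lstm', 'feature_vector_length': 4096,
                     'load': load_lstm})
sgd_config = edict({'lr': 1e-3, 'decay': 5e-4, 'momentum': 0.9,
                    'nesterov': True})
train('features/features.vgg16.fold_{}.pkl', 'weights', sgd_config,
      lstm_config, start_fold=1, end_fold=5, timestep=10)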
Example #5
import numpy as np
import ntcir
import ntcir.IO as IO
import os
import os.path as osp
import itertools
import utils
import shutil
from collections import defaultdict
from easydict import EasyDict as edict


# In[2]:


users = IO.load_annotations(ntcir.filepaths)
sorted_users = ntcir.utils.sort(users)
categories = IO.load_categories(ntcir.filepaths)
users_ids = sorted(users.keys())

days = defaultdict(lambda: defaultdict(ntcir.Day))
for user in sorted_users:
    for day in user.days:
        days[user.id_][day.date] = day

splits = edict({'train': 0, 'validation': 1, 'test': 2})


# # Classification dataset split

# In[ ]:
Example #6
def test(features_filepath,
         results_dir,
         rf_model,
         start_fold=1,
         end_fold=1,
         timestep=5,
         progress_percent=0.05,
         iccv_epic=True,
         features_size=4096):
    np.random.seed(42)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            user_features = pickle.load(f)

        weights = rf_model.weights.format(fold)
        rf = load_random_forest(weights)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users,
                                              int(fold) - 1, False)
        test_batches = ntcir.get_training_batches(test_split,
                                                  sequences,
                                                  timestep=timestep)

        num_features = timestep * features_size
        num_test_batches = len(test_batches)

        features = np.zeros((num_test_batches, num_features))
        img_paths = list()
        labels = list()
        for i, batch in enumerate(test_batches):
            day = user_features[batch.user_id][batch.date]
            for j, ind in enumerate(batch.indices):
                image = day.images[ind]
                start_ind = j * features_size
                end_ind = (j + 1) * features_size
                features[i, start_ind:end_ind] = image.features

            last_ind = batch.indices[-1]
            img_paths.append(day.images[last_ind].path)
            labels.append(day.images[last_ind].label)

        predictions = rf.predict(features)

        results = list()
        for i in range(num_test_batches):
            results.append((img_paths[i], labels[i], predictions[i]))

        results_fname = "{}.fold_{}.{}.csv".format(rf_model.name, fold,
                                                   backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(results, results_filepath)
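# Usage sketch: evaluate the forest trained above. rf_config.weights is an
# assumed per-fold template consumed by load_random_forest().
rf_config.weights = 'weights/weights.vgg16_rf.fold_{}.pkl'
test('features/features.vgg16.fold_{}.pkl', 'results', rf_config,
     start_fold=1, end_fold=1, timestep=5)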
def extract_castro_features(cnn_model,
                            data_dir,
                            features_dir,
                            start_fold=1,
                            end_fold=5,
                            num_categories=21,
                            num_bins=10,
                            progress_percent=.05):

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'
    target_size = (cnn_model.img_height, cnn_model.img_width)
    datagen = ImageDataGenerator(rescale=1. / 255)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        weights = cnn_model.weights.format(fold)
        model = cnn_model.load(weights=weights)

        users = IO.load_annotations(ntcir.filepaths)

        ind_by_img_path = dict()
        for user_id, days in users.iteritems():
            for date, day in days.iteritems():
                for ind, image in enumerate(day.images):
                    relative_path = '/'.join(image.path.split('/')[-3:])
                    ind_by_img_path[relative_path] = ind

        test_dir = os.path.join(data_dir, fold, 'test')
        train_dir = os.path.join(data_dir, fold, 'train')
        validation_dir = os.path.join(data_dir, fold, 'validation')

        if os.path.isdir(validation_dir):
            images = read_fold_dir(train_dir) + read_fold_dir(
                test_dir) + read_fold_dir(validation_dir)
        else:
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir)

        num_images = len(images)
        images_progress_percent = int(num_images * progress_percent)

        print('Extracting temporal features on fold {} for {}'.format(
            fold, cnn_model.name))

        for i, (label, img_path) in enumerate(images):
            # Look up the annotated image record first: its metadata (hour,
            # minute, weekday) feeds the descriptor built below
            rpath = os.path.realpath(img_path)
            user_id, date, filename = rpath.split('/')[-3:]
            relative_path = '/'.join([user_id, date, filename])

            img_ind = ind_by_img_path[relative_path]
            image = users[user_id][date].images[img_ind]

            img = load_image(datagen, img_path, target_size)

            features = np.zeros(num_categories + 3 * num_bins + 3)
            features[:num_categories] = model.predict(img)
            features[num_categories] = image.hour
            features[num_categories + 1] = image.minute
            features[num_categories + 2] = image.weekday
            features[num_categories + 3:] = get_histogram(image.path, num_bins)
            image.features = features

            if progress_percent and (i + 1) % images_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" %
                      ((i + 1) / num_images * 100, i + 1, num_images))

        features_filepath = "features.{}.fold_{}.{}.pkl".format(
            rf_model.name, fold, backend)
        features_filepath = os.path.join(features_dir, features_filepath)
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
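# For reference, the Castro-style descriptor assembled above concatenates the
# CNN softmax scores, three time covariates (hour, minute, weekday) and
# per-channel colour histograms; with the defaults its length works out to:
num_categories, num_bins = 21, 10
feature_length = num_categories + 3 + 3 * num_bins  # 21 + 3 + 30 = 54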
def extract_rf_features(data_dir,
                        features_dir,
                        cnn_model,
                        rf_model,
                        start_fold=1,
                        end_fold=5,
                        progress_percent=.1):

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'
    target_size = (cnn_model.img_height, cnn_model.img_width)
    datagen = ImageDataGenerator(rescale=1. / 255)

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        weights = cnn_model.weights.format(fold)
        base_model = cnn_model.load(weights=weights)

        layers_by_name = {l.name: l for l in base_model.layers}
        outputs = [layers_by_name[rf_model.layer].output]
        model = Model(inputs=base_model.input, outputs=outputs)

        weights = rf_model.weights.format(fold)
        rf = load_random_forest(weights)

        users = IO.load_annotations(ntcir.filepaths)

        ind_by_img_path = dict()
        for user_id, days in users.iteritems():
            for date, day in days.iteritems():
                for ind, image in enumerate(day.images):
                    relative_path = '/'.join(image.path.split('/')[-3:])
                    ind_by_img_path[relative_path] = ind

        test_dir = os.path.join(data_dir, fold, 'test')
        train_dir = os.path.join(data_dir, fold, 'train')
        validation_dir = os.path.join(data_dir, fold, 'validation')

        if os.path.isdir(validation_dir):
            images = read_fold_dir(train_dir) + read_fold_dir(
                test_dir) + read_fold_dir(validation_dir)
        else:
            images = read_fold_dir(train_dir) + read_fold_dir(test_dir)

        num_images = len(images)
        images_progress_percent = int(num_images * progress_percent)

        print('Extracting temporal features on fold {} for {} + RF on layer {}'.format(
            fold, cnn_model.name, rf_model.layer))

        for i, (label, img_path) in enumerate(images):

            img = load_image(datagen, img_path, target_size)

            predictions = model.predict(img)

            # Single-output model: take the layer activations and convert
            # them into class probabilities with the random forest
            features = predictions[0].copy()
            probability = rf.predict_proba([features])[0]

            rpath = os.path.realpath(img_path)
            user_id, date, filename = rpath.split('/')[-3:]

            relative_path = '/'.join([user_id, date, filename])

            img_ind = ind_by_img_path[relative_path]
            image = users[user_id][date].images[img_ind]
            image.features = probability.copy()

            if progress_percent and (i + 1) % images_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" %
                      ((i + 1) / num_images * 100, i + 1, num_images))

        features_filepath = "features.{}.fold_{}.{}.pkl".format(
            rf_model.name, fold, backend)
        features_filepath = os.path.join(features_dir, features_filepath)
        with open(features_filepath, 'wb') as f:
            pickle.dump(users, f, pickle.HIGHEST_PROTOCOL)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
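# Usage sketch: re-encode every image as the forest's class-probability
# vector. rf_config.layer names the CNN layer whose activations feed the
# forest; the <fold>/{train,validation,test} layout under data_dir (and the
# data_dir path itself) are assumptions.
rf_config.layer = 'fc2'
extract_rf_features('datasets/ntcir/folds', 'features', vgg16_config,
                    rf_config)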
def test(features_filepath,
         results_dir,
         base_model,
         start_fold,
         end_fold,
         timestep,
         iccv_epic=False):
    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(results_dir, base_model.name + '.fold')

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users,
                                              int(fold) - 1, False)
        test_batches = ntcir.get_batches(test_split,
                                         sequences,
                                         timestep=timestep,
                                         include_last=True)

        K.set_learning_phase(0)

        weights = base_model.best_weights.format(fold)
        model = base_model.load(
            feature_vector_length=base_model.feature_vector_length,
            weights=weights,
            timestep=timestep)

        frames = list()
        groundtruth = list()
        predictions = list()
        for i, batch in enumerate(test_batches):
            x, y = load_batch(
                features,
                batch,
                feature_vector_length=base_model.feature_vector_length,
                batch_size=1,
                timestep=timestep)

            prediction = model.predict_on_batch(x)
            prediction = np.argmax(prediction, axis=2).squeeze()[0:batch.size]

            predictions.extend(prediction)
            groundtruth.extend(np.argmax(y, axis=2).squeeze()[0:batch.size])

            for ind in batch.indices:
                image = features[batch.user_id][batch.date].images[ind]
                frames.append(image.path)

        results_fname = "{}.fold_{}.{}.csv".format(base_model.name, fold,
                                                   backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(frames, groundtruth, predictions, results_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
def test(features_filepath,
         results_dir,
         base_model,
         start_fold,
         end_fold,
         timestep,
         iccv_epic=False,
         progress_percent=0.05):
    users = IO.load_annotations(ntcir.filepaths)
    sorted_users = ntcir.utils.sort(users)

    num_frames_per_day = 2880
    sequences = ntcir.get_sequences(sorted_users, num_frames_per_day)

    backend = 'tf' if K.backend() == 'tensorflow' else 'th'

    if not start_fold:
        start_fold = current_fold(results_dir, base_model.name + '.fold')

    folds = [str(fold).zfill(2) for fold in range(start_fold, end_fold + 1)]
    for fold in folds:
        with open(features_filepath.format(fold), 'rb') as f:
            features = pickle.load(f)

        if iccv_epic:
            test_split = ntcir.read_split('datasets/ntcir/test_split.txt')
        else:
            test_split = ntcir.get_split_fold(sorted_users,
                                              int(fold) - 1, False)
        test_batches = ntcir.get_training_batches(test_split,
                                                  sequences,
                                                  timestep=timestep)

        K.set_learning_phase(0)

        weights = base_model.best_weights.format(fold)
        model = base_model.load(
            feature_vector_length=base_model.feature_vector_length,
            weights=weights,
            timestep=timestep)

        num_test_batches = len(test_batches)

        if progress_percent:
            test_progress_percent = int(num_test_batches * progress_percent)
            print "Testing fold {}".format(fold)

        results = list()
        for i, batch in enumerate(test_batches):
            x, y = load_batch(
                features,
                batch,
                feature_vector_length=base_model.feature_vector_length,
                batch_size=1,
                timestep=timestep)

            prediction = model.predict_on_batch(x)
            prediction = np.argmax(prediction, axis=2).squeeze()[-1]

            ind = batch.indices[-1]
            image = features[batch.user_id][batch.date].images[ind]

            results.append((image.path, image.label, prediction))
            if progress_percent and (i + 1) % test_progress_percent == 0:
                print("Progress %3.2f%% (%d/%d)" % (
                    (i + 1) / num_test_batches * 100, i + 1, num_test_batches))

        results_fname = "{}.many2one.fold_{}.{}.csv".format(
            base_model.name, fold, backend)
        results_filepath = os.path.join(results_dir, results_fname)
        write_results(results, results_filepath)

        del model
        if K.backend() == 'tensorflow':
            K.clear_session()
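# Usage sketch covering the two LSTM test variants above: the first scores
# every frame in each batch (many-to-many), this one only the last frame
# (many-to-one). best_weights is an assumed per-fold template pointing at
# the checkpoint chosen after training.
lstm_config.best_weights = 'weights/weights.vgg16_lstm.fold_{}.best.hdf5'
test('features/features.vgg16.fold_{}.pkl', 'results', lstm_config,
     start_fold=1, end_fold=5, timestep=10)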