Example #1
def make_submission_vgg19(name, name_ext, dropout_p, penultimate_size):
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    _, _, _, _, _, te_names = create_embeddings(name)
    batch_size = 32

    datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input)
    gen_te = datagen.flow_from_directory(directory=data_info['dir_te'],
                                         target_size=(HEIGHT, WIDTH),
                                         class_mode=None,
                                         batch_size=batch_size,
                                         shuffle=False)

    model_file = join(MODELS_DIR, MODEL_FILE.format(name, name_ext))
    model = VGG19(weights='imagenet',
                  include_top=False,
                  input_shape=(HEIGHT, WIDTH, 3))
    top_classifier = _top_classifier(l2_reg=0,
                                     dropout_p=dropout_p,
                                     input_shape=(9, 9, 512),
                                     penultimate_size=penultimate_size)
    model = Model(inputs=model.input, outputs=top_classifier(model.output))
    model.load_weights(model_file)

    probs_pred = model.predict_generator(generator=gen_te,
                                         steps=ceil(data_info['num_te'] /
                                                    batch_size))

    submission_file = 'vgg19_fine_tuned_{:s}.csv'.format(name)
    create_submission_file(image_names=te_names,
                           probs=probs_pred,
                           file_name=join(SUBMISSIONS_DIR, submission_file))
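
A hypothetical invocation (argument values are illustrative only; `name` and `name_ext` must match a fine-tuned model saved under MODELS_DIR):

make_submission_vgg19(name='', name_ext='_best', dropout_p=0.5,
                      penultimate_size=256)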
Example #2
def _create_submission_file_avg_cnns():
    data_info = load_organized_data_info(IMGS_DIM_1D)
    model = _get_model('/home/chris/painters/models/')
    X_test, img_filenames = _average_embedded_test_data(model, data_info)
    features_lookup = {n: f for n, f in zip(img_filenames, X_test)}
    _create_submission_file(BATCH_SIZE, features_lookup,
                            _calculate_batch_prediction_dot)
Example #3
def _average_embedded_test_data(model, data_info):
    X_test = None
    dir_te, num_te = data_info['dir_te'], data_info['num_te']
    dir_tr = data_info['dir_tr']
    gen = testing_generator(dir_tr=dir_tr)
    gen_test = init_directory_generator(gen,
                                        dir_te,
                                        BATCH_SIZE,
                                        class_mode='sparse',
                                        shuffle_=False)

    num_batch_per_epoch = num_te // BATCH_SIZE
    last_batch_size = num_te - (num_batch_per_epoch * BATCH_SIZE)
    # an extra, truncated batch is needed only when num_te is not an
    # exact multiple of BATCH_SIZE
    num_batches = num_batch_per_epoch + (1 if last_batch_size else 0)

    for i in range(num_batches):
        X_batch, _ = next(gen_test)

        if last_batch_size and i == num_batch_per_epoch:
            X_batch = X_batch[:last_batch_size]

        if X_test is None:
            X_test = model.predict(X_batch)
        else:
            X_test = np.vstack((X_test, model.predict(X_batch)))

    # gen_test.filenames is ordered the same as X_test
    # (image file names with corresponding features).
    img_filenames = [basename(p) for p in gen_test.filenames]
    return X_test, img_filenames
Example #4
def clean(imgs_dim=299, name=''):
    """Deletes all resized images datasests (i.e. train, val, test) and the
    info file.
    """
    data_info = load_organized_data_info(imgs_dim, name)
    rmtree(data_info['dir_tr'])
    rmtree(data_info['dir_val'])
    rmtree(data_info['dir_te'])
    remove(organized_data_info_file(imgs_dim, name))
Example #5
    def __init__(self, hparams):
        super().__init__(hparams)

        self.kernel_size = pedl.get_hyperparameter("kernel_size")
        self.dropout = pedl.get_hyperparameter("dropout")
        self.pool_size = pedl.get_hyperparameter("pool_size")
        self.l2_reg = pedl.get_hyperparameter("l2_reg")
        self.lr = pedl.get_hyperparameter("lr")
        self.my_batch_size = pedl.get_hyperparameter("batch_size")
        self.data_info = load_organized_data_info(IMGS_DIM_1D)
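
For reference, a hypothetical hyperparameter configuration that these pedl.get_hyperparameter() lookups would resolve (values are illustrative, not the author's):

hparams = {
    "kernel_size": 3,   # convolution kernel size
    "dropout": 0.5,     # dropout probability
    "pool_size": 2,     # max-pooling window
    "l2_reg": 1e-4,     # L2 weight penalty
    "lr": 1e-3,         # learning rate
    "batch_size": 32,
}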
Example #6
def _train_model():
    # Use index 0 of IMGS_DIM_3D (rather than 2) because the dimension
    # order was reversed for the TensorFlow backend.
    data_info = load_organized_data_info(IMGS_DIM_3D[0])
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']

    # Adapted from https://github.com/keras-team/keras/issues/8649 (flagged in
    # repo issue 6): with multi_gpu_model, checkpoint the template model
    # rather than the parallel wrapper. Currently saves every epoch; this
    # could be restricted to epochs with the best accuracy.
    class MyCbk(Callback):
        def __init__(self, model):
            super().__init__()
            self.model_to_save = model

        def on_epoch_end(self, epoch, logs=None):
            self.model_to_save.save('model_at_epoch_%d.h5' % epoch)

    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)

    # An earlier variant built a single-tower model on the CPU, wrapped it
    # with make_parallel(model, gpu_count) when more than one GPU was
    # available, then compiled and fitted with a ModelCheckpoint callback;
    # it was superseded by the multi_gpu_model approach below.

    model = _cnn(IMGS_DIM_3D)

    #model.load_weights("/home/nkim/art_cnn/models/cnn.h5")
    #print("Model weights have been updated!")

    # Data-parallel training across 2 GPUs; the weights live in the template
    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model = compile_model(parallel_model)
    cbk = MyCbk(model)  # checkpoints the template model, not the wrapper

    parallel_model.fit_generator(
        generator=gen_tr,
        epochs=MAX_EPOCHS,
        # steps are counted in batches, not samples
        steps_per_epoch=data_info['num_tr'] // BATCH_SIZE,
        validation_data=gen_val,
        validation_steps=data_info['num_val'] // BATCH_SIZE,
        callbacks=[cbk],
        verbose=1)
Example #7
def make_data_loaders(experiment_config, hparams):
    # multi_crop improves training, but was not used for the author's submission
    data_info = load_organized_data_info(IMGS_DIM_3D[1], multi_crop=True)
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']

    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)

    gen_tr = KerasDataAdapter(gen_tr, workers=16, use_multiprocessing=True)
    gen_val = KerasDataAdapter(gen_val, workers=16, use_multiprocessing=True)

    return (gen_tr, gen_val)
Example #8
def stack(group):
    name, width, height = group['name'], group['width'], group['height']
    group_uid, models = group['uid'], group['models']

    meta_model_file = join(
        MODELS_DIR,
        'stacking_meta_model_group_{:d}_{:s}.pickle'.format(group_uid, name))
    meta_model_fitted = isfile(meta_model_file)

    data_info = load_organized_data_info(imgs_dim=width, name=name)

    if not meta_model_fitted:
        preds_val = np.empty((data_info['num_val'], 0))
    preds_te = np.empty((data_info['num_te'], 0))
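    # each base model appends its class-probability columns to these matrices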

    for model_name, preprocess_func in models:
        model_path = join(MODELS_DIR, model_name)

        if not meta_model_fitted:
            model_preds_val = _make_predictions(
                height=height,
                width=width,
                model_path=model_path,
                preprocess_func=preprocess_func,
                data_info=data_info,
                dir_id='val')

        model_preds_te = _make_predictions(height=height,
                                           width=width,
                                           model_path=model_path,
                                           preprocess_func=preprocess_func,
                                           data_info=data_info,
                                           dir_id='te')

        if not meta_model_fitted:
            preds_val = np.hstack((preds_val, model_preds_val))
        preds_te = np.hstack((preds_te, model_preds_te))

    _, _, _, y_val, _, te_names = create_embeddings(name=name)

    if meta_model_fitted:
        meta_model = load(meta_model_file)
    else:
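        # C=1e10 effectively disables L2 regularization for the meta-model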
        meta_model = LogisticRegression(C=1e10)
        meta_model.fit(preds_val, y_val)
        dump(meta_model, meta_model_file)

    te_pred = meta_model.predict_proba(preds_te)
    return te_names, te_pred
Example #9
def train(model_file, reduce_lr_factor=1e-1, num_freeze_layers=0, epochs=10,
          name='', reg='l2', reg_strength=0.0, dropout=0.5,
          early_stopping=False):
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    _, X_tr, Y_tr = _get_tagged_images(
        data_info['dir_tr'], truncate_to_id=True)
    _, X_val, Y_val = _get_tagged_images(
        data_info['dir_val'], truncate_to_id=True)

    def _image_generator(generator, data, labels):
        return generator.flow(
            data, labels,
            batch_size=32,
            shuffle=True,
        )

    model = _cnn(
        model_file=model_file,
        reg=reg,
        reg_strength=reg_strength,
        dropout_p=dropout
    )
    # model has 134 layers; freeze before compiling so the flags take effect
    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False

    model.compile(loss='mean_squared_error', optimizer='adam')

    callbacks = [
        ReduceLROnPlateau(factor=reduce_lr_factor),
        ModelCheckpoint(_model_file_name(
            name, reg, reg_strength, dropout
        ), save_best_only=True),
        TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True),
    ]
    if early_stopping:  # a truthy int doubles as the patience (in epochs)
        callbacks.append(EarlyStopping(
            monitor='val_loss', min_delta=1, patience=early_stopping
        ))

    generator = ImageDataGenerator()
    model.fit_generator(
        generator=_image_generator(generator, X_tr, Y_tr),
        # steps are counted in batches (batch_size=32), not samples
        steps_per_epoch=ceil(len(X_tr) / 32),
        epochs=epochs,
        callbacks=callbacks,
        validation_data=_image_generator(generator, X_val, Y_val),
        validation_steps=ceil(len(X_val) / 32),
    )
Example #10
def _softmax_dot():
    data_info = load_organized_data_info(IMGS_DIM_1D)
    X_avg, y_val = _average_embedded_val_data(data_info)

    batches_val = _create_pairs_generator(X_avg,
                                          y_val,
                                          lambda u, v: [u, v],
                                          num_groups=32,
                                          batch_size=1000000)

    y_pred, y_true = np.array([]), np.array([])
    for X, y in batches_val:
        y_pred = np.hstack((y_pred, pairs_dot(X)))
        y_true = np.hstack((y_true, y))

    print("Validation AUC: {:.4f}".format(roc_auc_score(y_true, y_pred)))
Example #11
def _train_model():
    data_info = load_organized_data_info(IMGS_DIM_3D[1])
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']

    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)
    model = _cnn(IMGS_DIM_3D)

    # Keras 1-style arguments (nb_epoch, samples_per_epoch, nb_val_samples);
    # Keras 2 renamed these to epochs, steps_per_epoch, validation_steps
    model.fit_generator(
        generator=gen_tr,
        nb_epoch=MAX_EPOCHS,
        samples_per_epoch=data_info['num_tr'],
        validation_data=gen_val,
        nb_val_samples=data_info['num_val'],
        callbacks=[ModelCheckpoint(CNN_MODEL_FILE, save_best_only=True)],
        verbose=2)
Example #12
def _create_embedded_test_set(layer, model_path, test_set_file):
    data_info = load_organized_data_info(IMGS_DIM_1D)
    dir_te, num_te = data_info['dir_te'], data_info['num_te']
    dir_tr = data_info['dir_tr']

    model = LAYER_RESULT_FUNCS[layer](model_path)
    gen = testing_generator(dir_tr=dir_tr)

    X_te, names = _create_embedded_data_from_dir(model,
                                                 gen,
                                                 dir_te,
                                                 num_te,
                                                 LAYER_SIZES[layer],
                                                 is_test_set=True)

    _save_np_compressed_data(test_set_file, X_te, names)
    return X_te, names
Example #13
def _create_embedded_train_val_split(layer, model_path, train_val_split_file):
    data_info = load_organized_data_info(IMGS_DIM_1D)
    dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
    dir_val, num_val = data_info['dir_val'], data_info['num_val']

    model = LAYER_RESULT_FUNCS[layer](model_path)
    gen = testing_generator(dir_tr=dir_tr)

    X_tr, y_tr, names_tr = _create_embedded_data_from_dir(
        model, gen, dir_tr, num_tr, LAYER_SIZES[layer])

    X_val, y_val, names_val = _create_embedded_data_from_dir(
        model, gen, dir_val, num_val, LAYER_SIZES[layer])

    _save_np_compressed_data(train_val_split_file, X_tr, y_tr, names_tr, X_val,
                             y_val, names_val)

    return X_tr, y_tr, names_tr, X_val, y_val, names_val
Example #14
def _train_model():
    data_info = load_organized_data_info(IMGS_DIM_3D[1])
    dir_tr = data_info['dir_tr']
    dir_val = data_info['dir_val']

    gen_tr, gen_val = train_val_dirs_generators(BATCH_SIZE, dir_tr, dir_val)
    model = _cnn(IMGS_DIM_3D)

    model.fit_generator(
        generator=gen_tr,
        epochs=MAX_EPOCHS,
        steps_per_epoch=300,  # fixed batch count per epoch (subsamples the training set)
        validation_data=gen_val,
        validation_steps=math.ceil(data_info['num_val'] / BATCH_SIZE),
        validation_freq=10,
        callbacks=[ModelCheckpoint(CNN_MODEL_FILE, save_best_only=True)],
        workers=16,
        use_multiprocessing=True,
        verbose=1)
Example #15
def resume_training(model_file, name='', reduce_lr_factor=1e-1,
                    num_freeze_layers=0, epochs=10, reg='l2', reg_strength=0.0,
                    dropout=0.5):
    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    _, X_tr, Y_tr = _get_tagged_images(
        data_info['dir_tr'], truncate_to_id=True)
    _, X_val, Y_val = _get_tagged_images(
        data_info['dir_val'], truncate_to_id=True)

    def _image_generator(generator, data, labels):
        return generator.flow(
            data, labels,
            batch_size=32,
            shuffle=True,
        )
    model = load_model(model_file)

    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False
    # recompile so the frozen layers take effect (same settings as train())
    model.compile(loss='mean_squared_error', optimizer='adam')

    callbacks = [
        ReduceLROnPlateau(factor=reduce_lr_factor),
        ModelCheckpoint(_model_file_name(
            name, reg, reg_strength, dropout
        ), save_best_only=True),
        TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True),
    ]
    generator = ImageDataGenerator()
    model.fit_generator(
        generator=_image_generator(generator, X_tr, Y_tr),
        # steps are counted in batches (batch_size=32), not samples
        steps_per_epoch=ceil(len(X_tr) / 32),
        epochs=epochs,
        callbacks=callbacks,
        validation_data=_image_generator(generator, X_val, Y_val),
        validation_steps=ceil(len(X_val) / 32),
    )
Example #16
def create_embeddings(name):
    """Returns vgg16 embeddings (outputs of the last conv layer).

    Returns
    -------
    tuple
        X_tr (n_samples, 9, 9, 512)
        y_tr (n_samples,)
        X_val (n_samples, 9, 9, 512)
        y_val (n_samples,)
        X_te (n_samples, 9, 9, 512)
        te_names (n_samples,)
    """
    embeddings_file = join(DATA_DIR, EMBEDDINGS_FILE.format(name))

    if isfile(embeddings_file):
        d = np.load(embeddings_file)
        return (d['X_tr'], d['y_tr'], d['X_val'], d['y_val'], d['X_te'],
                d['te_names'])

    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    datagen = ImageDataGenerator(preprocessing_function=preprocess_single_input)
    batch_size = 32

    def dir_datagen(dir_):
        return datagen.flow_from_directory(
            directory=dir_,
            target_size=(HEIGHT, WIDTH),
            class_mode=None,
            batch_size=batch_size,
            shuffle=False
        )

    model = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(HEIGHT, WIDTH, 3)
    )

    def embed(dir_, num, data_is_labeled):
        X = model.predict_generator(
            generator=dir_datagen(dir_),
            steps=ceil(num / batch_size)
        )

        if data_is_labeled:
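            # with shuffle=False, flow_from_directory yields classes in
            # alphabetical order, so labels can be rebuilt from per-class counts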
            num_per_cls = num_examples_per_class_in_dir(dir_)
            y_0 = np.zeros(num_per_cls['Type_1'])
            y_1 = np.zeros(num_per_cls['Type_2']) + 1
            y_2 = np.zeros(num_per_cls['Type_3']) + 2
            y = np.hstack((y_0, y_1, y_2))
            return X, y

        # unlabeled (test) dataset
        names = [x for x in listdir(join(dir_, 'all')) if x.endswith('.jpg')]
        return X, np.array(names)

    dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
    X_tr, y_tr = embed(dir_tr, num_tr, data_is_labeled=True)

    dir_val, num_val = data_info['dir_val'], data_info['num_val']
    X_val, y_val = embed(dir_val, num_val, data_is_labeled=True)

    dir_te, num_te = data_info['dir_te'], data_info['num_te']
    X_te, te_names = embed(dir_te, num_te, data_is_labeled=False)

    np.savez_compressed(
        file=embeddings_file,
        X_tr=X_tr,
        y_tr=y_tr,
        X_val=X_val,
        y_val=y_val,
        X_te=X_te,
        te_names=te_names
    )

    print("Embedded data shapes:")
    print("X_tr {0}".format(X_tr.shape))
    print("y_tr {0}".format(y_tr.shape))
    print("X_val {0}".format(X_val.shape))
    print("y_val {0}".format(y_val.shape))
    print("X_te {0}".format(X_te.shape))
    print("te_names {0}".format(te_names.shape))
    return X_tr, y_tr, X_val, y_val, X_te, te_names
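
A minimal sketch of consuming these cached embeddings downstream (hypothetical: assumes scikit-learn is available and simply flattens the (9, 9, 512) feature maps):

from sklearn.linear_model import LogisticRegression

X_tr, y_tr, X_val, y_val, X_te, te_names = create_embeddings(name='')
clf = LogisticRegression()
clf.fit(X_tr.reshape(len(X_tr), -1), y_tr)
print('validation accuracy:',
      clf.score(X_val.reshape(len(X_val), -1), y_val))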
Example #17
def fine_tune(name, name_ext, lr=1e-4, reduce_lr_factor=0.1,
              reduce_lr_patience=3, epochs=10, batch_size=32, l2_reg=0,
              dropout_p=0.5, num_freeze_layers=0, save_best_only=True,
              loss_stop_val=0.00001, penultimate_size=256):

    data_info = load_organized_data_info(imgs_dim=HEIGHT, name=name)
    tr_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input,
        rotation_range=180,
        vertical_flip=True,
        horizontal_flip=True,
        # width_shift_range=0.1,
        # height_shift_range=0.1,
        # zoom_range=0.1,
        # shear_range=0.3,
        # fill_mode='reflect'
    )
    val_datagen = ImageDataGenerator(
        preprocessing_function=preprocess_single_input
    )

    def dir_datagen(dir_, gen):
        return gen.flow_from_directory(
            directory=dir_,
            target_size=(HEIGHT, WIDTH),
            class_mode='categorical',
            batch_size=batch_size,
            shuffle=True
        )

    dir_tr, num_tr = data_info['dir_tr'], data_info['num_tr']
    dir_val, num_val = data_info['dir_val'], data_info['num_val']

    top_classifier_file = join(
        MODELS_DIR,
        TOP_CLASSIFIER_FILE.format(name, penultimate_size)
    )
    model_file = join(MODELS_DIR, MODEL_FILE.format(name, name_ext))

    model = VGG16(
        weights='imagenet',
        include_top=False,
        input_shape=(HEIGHT, WIDTH, 3)
    )
    top_classifier = _top_classifier(
        l2_reg=l2_reg,
        dropout_p=dropout_p,
        input_shape=(9, 9, 512),
        penultimate_size=penultimate_size
    )
    top_classifier.load_weights(top_classifier_file)
    model = Model(inputs=model.input, outputs=top_classifier(model.output))
    # model has 20 layers; freeze before compiling so the flags take effect
    for layer in model.layers[:num_freeze_layers]:
        layer.trainable = False

    model.compile(Adam(lr=lr), loss='categorical_crossentropy')

    log_dir = join(EXPERIMENTS_DIR, 'vgg16_fine_tuned_{:s}'.format(name))
    callbacks = [
        EarlyStoppingByLoss(monitor='loss', value=loss_stop_val),
        ReduceLROnPlateau(factor=reduce_lr_factor, patience=reduce_lr_patience),
        ModelCheckpoint(model_file, save_best_only=save_best_only),
        TensorBoard(
            log_dir=log_dir,
            write_graph=False
        )
    ]

    model.fit_generator(
        generator=dir_datagen(dir_tr, tr_datagen),
        steps_per_epoch=ceil(num_tr / batch_size),
        epochs=epochs,
        validation_data=dir_datagen(dir_val, val_datagen),
        validation_steps=ceil(num_val / batch_size),
        callbacks=callbacks
    )
Example #18
def _add_test_info_to_organized_data_info(imgs_dim, name, num_test_samples,
                                          new_dir_te):
    data_info = load_organized_data_info(imgs_dim, name)
    data_info['dir_te'] = dirname(new_dir_te)
    data_info['num_te'] = num_test_samples
    save_organized_data_info(data_info, imgs_dim, name)
Example #19
def _create_submission_file_avg_cnns():
    data_info = load_organized_data_info(IMGS_DIM_1D)
    X_avg, names = _average_embedded_test_data(data_info)
    features_lookup = {n: f for n, f in zip(names, X_avg)}
    _create_submission_file(BATCH_SIZE, features_lookup,
                            _calculate_batch_prediction_dot)
Example #20
def _append_num_te_to_organized_data_info(num_test_samples, multi_crop):
    data_info = load_organized_data_info(IMGS_DIM_2D[0], multi_crop=multi_crop)
    data_info['num_te'] = num_test_samples
    save_organized_data_info(data_info, IMGS_DIM_2D[0], multi_crop=multi_crop)