Example #1
import os
import sys

import numpy as np
import pandas as pd
from sklearn.utils import resample


def load_data(base_dir, min_samples, images_per_class=None):

    train = build_files_dataframe(os.path.join(base_dir, 'train'))
    print(train.head())
    train = prune_file_list(train,
                            label_col='label',
                            min_samples=min_samples)
    train = train.sample(frac=1).reset_index(drop=True)

    # Record the classes that survive pruning; anything
    # outside this set is dropped from the dev and test sets.
    classes = np.unique(train['label'])

    # images_per_class is either None or an integer giving
    # the target number of samples per class.
    if images_per_class:

        counts = {}
        for c in classes:
            counts[c] = len(train[train['label'] == c])

        if images_per_class > max(counts.values()):
            print((
                "[FATAL] The requested number of images per class is larger "
                "than the number of samples in the majority class."
            ))
            sys.exit(1)

        train_dataframes = []
        for c in classes:
            # Resample each class (with replacement) to the
            # target number of samples.
            train_dataframes.append(
                resample(train[train['label'] == c],
                         replace=True,
                         n_samples=images_per_class))

        # Replace the training dataframe with the concatenated
        # per-class resamples.
        train = pd.concat(train_dataframes).reset_index(drop=True)

    # Load the dev set and keep only the classes
    # present in the training set.
    dev = build_files_dataframe(os.path.join(base_dir, 'dev'))
    print(dev.head())
    dev = dev.sample(frac=1).reset_index(drop=True)
    return_cols = list(dev.columns)
    dev['keep'] = dev['label'].apply(lambda x: x in classes)
    dev = dev[dev['keep']]

    # Load the test set and apply the same class filter.
    test = build_files_dataframe(os.path.join(base_dir, 'test'))
    print(test.head())
    test = test.sample(frac=1).reset_index(drop=True)
    return_cols = list(test.columns)
    test['keep'] = test['label'].apply(lambda x: x in classes)
    test = test[test['keep']]

    return train, dev[return_cols], test[return_cols]
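
All three examples rely on two helpers that are not shown: build_files_dataframe and prune_file_list. The sketch below is an assumption about their behavior, inferred from how they are called, and presumes a <split>/<label>/<image> directory layout; the real helpers may differ.

import os
import pandas as pd

def build_files_dataframe(split_dir):
    # Walk <split_dir>/<label>/* and collect one (file, label)
    # row per image.  The directory layout is an assumption.
    rows = []
    for label in sorted(os.listdir(split_dir)):
        label_dir = os.path.join(split_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for fname in os.listdir(label_dir):
            rows.append({'file': os.path.join(label_dir, fname),
                         'label': label})
    return pd.DataFrame(rows)

def prune_file_list(data, label_col='label', min_samples=1):
    # Drop every class with fewer than min_samples rows.
    counts = data[label_col].value_counts()
    keep = counts[counts >= min_samples].index
    return data[data[label_col].isin(keep)].reset_index(drop=True)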
Example #2
def load_dataframes(data_dir, min_samples):

    train = build_files_dataframe(os.path.join(data_dir, 'train'))
    train = prune_file_list(train, 'label', min_samples)

    dev = build_files_dataframe(os.path.join(data_dir, 'dev'))
    dev_cols = list(dev.columns)
    classes = np.unique(train['label'])
    dev['keep'] = dev['label'].apply(lambda x: x in classes)
    dev = dev[dev['keep']]

    train = train.sample(frac=1).reset_index(drop=True)
    dev = dev.sample(frac=1).reset_index(drop=True)
    return train, dev[dev_cols]
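
For reference, a call site for load_dataframes might look like this; the directory name and threshold below are illustrative, not taken from the source.

train, dev = load_dataframes(data_dir='data', min_samples=50)
print('{} train rows, {} dev rows'.format(len(train), len(dev)))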
Example #3
    ap.add_argument('--backbone', required=True, type=str)
    ap.add_argument('--pooling', required=True, type=str)
    ap.add_argument('--output_dir', required=True, type=str)
    ap.add_argument('--min_samples', required=True, type=int)
    ap.add_argument('--cores', required=True, type=int)
    ap.add_argument('--save_features', action='store_true')
    return ap.parse_args()


if __name__ == "__main__":

    args = get_args()

    # Build the training file list and drop the classes
    # with too few examples.
    train = build_files_dataframe(os.path.join(args.base_dir, 'train'))
    train = prune_file_list(data=train,
                            label_col='label',
                            min_samples=args.min_samples)
    n_classes = train['label'].nunique()
    print("We have {} classes.".format(n_classes))

    # Setup output
    create_directory(args.output_dir, recursive=True)

    # Build the model and import the matching pre-processing
    # function; each backbone ships with its own.  They may be
    # equivalent under the hood, since all are trained on
    # ImageNet (something to look into).
    model, preprocess_input = model_factory(args.backbone, args.pooling)
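
model_factory is also not shown. Below is a minimal sketch of what it could look like, assuming Keras applications with ImageNet weights; the backbone-to-class mapping is an assumption for illustration, not the author's implementation.

from tensorflow.keras import applications

def model_factory(backbone, pooling):
    # Map a backbone name to a headless Keras application and its
    # matching preprocess_input.  Assumed mapping, for illustration.
    factories = {
        'resnet50': (applications.ResNet50,
                     applications.resnet50.preprocess_input),
        'vgg16': (applications.VGG16,
                  applications.vgg16.preprocess_input),
    }
    build, preprocess_input = factories[backbone]
    model = build(weights='imagenet', include_top=False, pooling=pooling)
    return model, preprocess_input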