Example #1
def create_subset_data(face_encoding_dir, src_dataset_dir, dst_dataset_dir):
    src_dst_copy_paths = []
    mids = utils.get_dir_names(face_encoding_dir)
    for i, mid in enumerate(mids):
        file_path = os.path.join(face_encoding_dir, mid)
        img_fencoding_map = utils.load_json(file_path)

        file_names, face_encodings = [], []
        for fname, fencoding in img_fencoding_map.items():
            file_names.append(fname)
            face_encodings.append(np.array(fencoding))

        similarities = [(fname, sim) for fname, sim in zip(
            file_names, calculate_similarity(face_encodings))]
        similarities.sort(key=lambda x: x[1], reverse=True)

        print("{}/{} Calculate similarity of mid {} done".format(
            i + 1, len(mids), mid))

        # Keep only the images with the highest similarity scores
        if len(similarities) > 0:
            num_remain_images = math.ceil(similarities[0][1] * 100)
            for fname, _ in similarities[:num_remain_images]:
                src_path = os.path.join(src_dataset_dir, mid, fname)
                dst_path = os.path.join(dst_dataset_dir, mid, fname)

                src_dst_copy_paths.append((src_path, dst_path))
        else:
            utils.make_dirs(os.path.join(dst_dataset_dir, mid))

    num_success = utils.copy_files(src_dst_copy_paths)
    print("Create subset data (size = {}) from {} to {} done".format(
        num_success, src_dataset_dir, dst_dataset_dir))
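The calculate_similarity helper used above is not part of this listing. A minimal sketch of one plausible implementation, assuming it returns one score per encoding (the mean cosine similarity of that encoding to the other encodings of the same identity), which would explain why similarities[0][1] * 100 is taken as an image count:

import numpy as np


def calculate_similarity(face_encodings):
    # Hypothetical sketch: for each encoding, return its mean cosine
    # similarity to every other encoding of the same identity
    encodings = np.asarray(face_encodings, dtype=np.float64)
    n = len(encodings)
    if n < 2:
        return [1.0] * n
    # Normalize each encoding to unit length (avoid division by zero)
    norms = np.linalg.norm(encodings, axis=1, keepdims=True)
    normalized = encodings / np.clip(norms, 1e-12, None)
    # Pairwise cosine similarity matrix
    sim_matrix = normalized @ normalized.T
    # Exclude self-similarity (the diagonal, which is 1) from each row's mean
    return ((sim_matrix.sum(axis=1) - 1.0) / (n - 1)).tolist()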
Example #2
def save_face_encoding(dataset_dir="../Temp/Dataset/Original",
                       save_dir="../Temp/Dataset/Process"):
    start_time = time.time()
    save_dir = os.path.join(save_dir, "face_encodings")
    total_files = 0

    dirs = utils.get_dir_names(parent_dir=dataset_dir)
    total_dirs = len(dirs)
    for i, dir in enumerate(dirs):
        fencoding_of_dir = _get_face_encodings(os.path.join(dataset_dir, dir))
        fencoding_map = {
            fname: fencoding
            for fname, fencoding in fencoding_of_dir
        }
        total_files += len(fencoding_map)

        save_path = os.path.join(save_dir, dir)
        utils.save_json(fencoding_map, save_path)
        print("Calculate and Save {}/{} face encoding dir done".format(
            i + 1, total_dirs))

    exec_time = time.time() - start_time
    print("\nCalculate face encodings of {} dirs and {} files in dir {} done".
          format(total_dirs, total_files, dataset_dir))
    print("Save face encoding to dir {} done".format(save_dir))
    print("Time : {:.2f} seconds".format(exec_time))
def copy_subset_args():

    ap = argparse.ArgumentParser()
    ap.add_argument("--src_dataset_dir", required=True)
    ap.add_argument("--dst_dataset_dir", required=True)
    ap.add_argument("--min_imgs_per_class", "-mipc", default="50")
    ap.add_argument("--max_classes", default="500")

    args = vars(ap.parse_args())
    src_dataset_dir = args["src_dataset_dir"]
    dst_dataset_dir = args["dst_dataset_dir"]
    min_imgs_per_class = int(args["min_imgs_per_class"])
    max_classes = int(args["max_classes"])

    num_classes = 0
    for class_name in utils.get_dir_names(src_dataset_dir):
        file_names = utils.get_file_names(os.path.join(src_dataset_dir, class_name))
        if len(file_names) >= min_imgs_per_class:
            num_classes += 1
            src_dst_paths = [(os.path.join(src_dataset_dir, class_name, src_name),
                              os.path.join(dst_dataset_dir, class_name, src_name))
                             for src_name in file_names]
            utils.copy_files(src_dst_paths)

            print("\nCopy {}/{} classes done".format(num_classes, max_classes))
            if num_classes >= max_classes:
                break
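The save_face_encoding function above depends on a _get_face_encodings helper that is not shown. A minimal sketch, assuming it uses the face_recognition package and returns (file_name, encoding) pairs with the encoding as a plain list, so utils.save_json can serialize it and Example #1 can rebuild it with np.array:

import os

import face_recognition


def _get_face_encodings(image_dir):
    # Hypothetical sketch: encode the first detected face of every image
    # in image_dir, skipping files where no face is found
    pairs = []
    for fname in sorted(os.listdir(image_dir)):
        if not fname.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        image = face_recognition.load_image_file(os.path.join(image_dir, fname))
        encodings = face_recognition.face_encodings(image)
        if encodings:
            # Keep the encoding as a list so it can be dumped to JSON
            pairs.append((fname, encodings[0].tolist()))
    return pairs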
Example #4
def generate_batch(dataset_dir, batch_size=64, image_size=160):

    seq = iaa.Sequential([
        iaa.Scale({"height": image_size, "width": image_size}),
        iaa.Fliplr(0.5, random_state=7),
        iaa.Affine(scale={"x": (0.9, 1.1), "y": (0.9, 1.1)}, rotate=(-15, 15))
    ])

    random.seed(7)
    mid_names = utils.get_dir_names(dataset_dir)
    random.shuffle(mid_names)

    map_mid_idx = {mid: 0 for mid in mid_names}
    map_mid_fpaths = {mid: utils.get_file_paths(os.path.join(dataset_dir, mid))
                      for mid in mid_names}

    mid_idx = 0
    while True:
        x_batch1, x_batch2, y_batch = [], [], []
        for _ in range(batch_size // 2):
            mid_name = mid_names[mid_idx]
            # Generate same class pair
            fpaths = map_mid_fpaths.get(mid_name)
            path_idx = map_mid_idx.get(mid_name)

            img1 = cv2.imread(fpaths[path_idx])
            # seq.show_grid(img1, cols=8, rows=8)
            img2 = cv2.imread(fpaths[(path_idx + 1) % len(fpaths)])

            x_batch1.append(seq.augment_image(img1))
            x_batch2.append(seq.augment_image(img2))
            y_batch.append(1)

            path_idx = (path_idx + 1) % len(fpaths)
            map_mid_idx.update({mid_name: path_idx})
            if path_idx == 0:
                random.shuffle(fpaths)

            # Generate different class pair
            next_mid_name = mid_names[(mid_idx + 1) % len(mid_names)]
            fpaths = map_mid_fpaths.get(next_mid_name)
            img2 = cv2.imread(random.choice(fpaths))

            x_batch1.append(seq.augment_image(img1))
            x_batch2.append(seq.augment_image(img2))
            y_batch.append(0)

            mid_idx = (mid_idx + 1) % len(mid_names)
            if mid_idx == 0:
                random.shuffle(mid_names)

        yield [np.array(x_batch1), np.array(x_batch2)], np.array(y_batch)
Example #5
    def _init_data(self):
        self.mid_name_map = project_utils.load_mid_name_map(self.mid_name_path)

        mids_train = utils.get_dir_names(self.training_data_dir)
        self.mid_class_map, self.class_mid_map = {}, {}
        for i, mid in enumerate(mids_train):
            self.mid_class_map.update({mid: i})
            self.class_mid_map.update({i: mid})

        self.num_classes = len(mids_train)

        if self.mode == "train":
            eda_save_dir = os.path.join(self.experiment_dir, "EDA_Result")
            calculate_class_distribution(self.training_data_dir,
                                         save_dir=eda_save_dir)
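The calculate_class_distribution call above refers to an EDA helper that is not included in this listing. A minimal sketch, assuming it only counts images per identity directory and writes the counts to a CSV under save_dir (the output file name is an assumption):

import csv
import os


def calculate_class_distribution(dataset_dir, save_dir):
    # Hypothetical sketch: count images per class directory and save
    # the distribution as a CSV for later inspection
    counts = {
        class_name: len(os.listdir(os.path.join(dataset_dir, class_name)))
        for class_name in sorted(os.listdir(dataset_dir))
        if os.path.isdir(os.path.join(dataset_dir, class_name))
    }
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, "class_distribution.csv")  # assumed name
    with open(save_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["class", "num_images"])
        writer.writerows(sorted(counts.items(), key=lambda x: x[1], reverse=True))
    return counts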
Example #6
def split_dataset(src_dataset_dir,
                  dst_dataset_dir,
                  test_size=0.1,
                  valid_size=0.0):
    start_time = time.time()

    train, valid, test = [], [], []
    for dir in utils.get_dir_names(src_dataset_dir):
        fnames = utils.get_file_names(os.path.join(src_dataset_dir, dir))
        num_fnames = len(fnames)
        num_test = int(math.ceil(test_size * num_fnames))
        num_valid = int(math.ceil(valid_size * num_fnames))
        num_train = num_fnames - num_test - num_valid

        random.shuffle(fnames)
        dir_fnames = [(dir, fname) for fname in fnames]
        train.extend(dir_fnames[:num_train])
        valid.extend(dir_fnames[num_train:num_train + num_valid])
        # Slice from an explicit start index: dir_fnames[-num_test:] would
        # return the whole list when num_test == 0
        test.extend(dir_fnames[num_train + num_valid:])

    train_dir = os.path.join(dst_dataset_dir, "Train")
    valid_dir = os.path.join(dst_dataset_dir, "Valid")
    test_dir = os.path.join(dst_dataset_dir, "Test")

    # Save new split dataset
    lst = [(train_dir, train), (valid_dir, valid), (test_dir, test)]
    src_dst_paths = []
    for dst_parent_dir, dir_fnames in lst:
        for dir, fname in dir_fnames:
            src_path = os.path.join(src_dataset_dir, dir, fname)
            dst_path = os.path.join(dst_parent_dir, dir, fname)
            src_dst_paths.append((src_path, dst_path))

    utils.copy_files(src_dst_paths)

    exec_time = time.time() - start_time

    print("\nSplit dataset from {} (size = {}) to :".format(
        src_dataset_dir, len(src_dst_paths)))
    print("---- {} (size = {})".format(train_dir, len(train)))
    print("---- {} (size = {})".format(valid_dir, len(valid)))
    print("---- {} (size = {})".format(test_dir, len(test)))
    print("Time : {:.2f} seconds".format(exec_time))
Example #7
    def _load_face_encodings(self):
        start_time = time.time()
        X_train, y_train = [], []
        idx_fname_map = {}
        mids_train = utils.get_dir_names(self.training_data_dir)
        for mid in mids_train:
            self.face_encoding_map.update({mid: {}})

            mid_dir = os.path.join(self.training_data_dir, mid)
            file_names = utils.get_file_names(mid_dir)

            # Fall back to an empty map when no saved encodings exist for this mid
            fencoding_map_of_mid = load_face_encoding(
                self.face_encoding_dir, file_names=[mid]).get(mid, {})
            # print("face_encoding_map of {} : {}".format(mid, list(fencoding_map_of_mid.keys())))
            num_calculated_files = 0
            for file_name in file_names:
                fencoding = fencoding_map_of_mid.get(file_name)
                if fencoding is None:
                    # Calculate face encoding of this file name
                    fencoding = get_face_encodings(
                        image_path=os.path.join(mid_dir, file_name))
                    if len(fencoding) > 0:
                        num_calculated_files += 1
                        fencoding_map_of_mid.update({file_name: fencoding})

                if len(fencoding) > 0:
                    self.face_encoding_map[mid].update({file_name: fencoding})
                    idx_fname_map.update({len(X_train): (mid, file_name)})
                    X_train.append(fencoding)
                    y_train.append(self.mid_class_map.get(mid))

            # Save the face encodings if any new ones were just calculated
            if num_calculated_files > 0:
                utils.save_json(fencoding_map_of_mid,
                                os.path.join(self.face_encoding_dir, mid))

        self.X_train, self.y_train = np.array(X_train), np.array(y_train)
        self.idx_fname_map = idx_fname_map

        exec_time = time.time() - start_time
        print("{}:: Load face encoding done. Time : {:.2f} seconds".format(
            self.class_name, exec_time))
Example #8
    def train(self):
        start_time = time.time()

        # Setup generator
        if self.is_siamese:
            train_generator = generate_batch(dataset_dir=self.train_dir,
                                             batch_size=self.batch_size,
                                             image_size=self.image_size)
            valid_generator = generate_batch(dataset_dir=self.valid_dir,
                                             batch_size=self.batch_size,
                                             image_size=self.image_size)

            self.num_classes = len(utils.get_dir_names(self.train_dir))

        else:
            train_datagen = ImageDataGenerator(
                rescale=1. / 255,
                rotation_range=20,
                width_shift_range=0.2,
                height_shift_range=0.2,
                horizontal_flip=True,
            )

            valid_datagen = ImageDataGenerator(rescale=1. / 255)

            train_generator = train_datagen.flow_from_directory(
                directory=self.train_dir,
                target_size=(self.image_size, self.image_size),
                batch_size=self.batch_size,
            )

            valid_generator = valid_datagen.flow_from_directory(
                directory=self.valid_dir,
                target_size=(self.image_size, self.image_size),
                batch_size=self.batch_size,
            )
            self.num_classes = len(train_generator.class_indices)

        # Select the optimizer class (defaults to Adam)
        optimizer = RMSprop if self.optimizer == "RMSProp" else Adam
        model = None
        # Check training from scratch or continue training
        if self.model_path is not None:
            model = load_model(self.model_path)
        else:
            if self.model_name == "VGG16":
                model_base = VGG16(include_top=False,
                                   input_shape=self.input_shape)
            elif self.model_name == "ResNet50":
                model_base = ResNet50(include_top=False,
                                      input_shape=self.input_shape)
            elif self.model_name == "DenseNet121":
                model_base = DenseNet121(include_top=False,
                                         input_shape=self.input_shape)
            elif self.model_name == "InceptionV3":
                model_base = InceptionV3(include_top=False,
                                         input_shape=self.input_shape)
            elif self.model_name == "InceptionResNetV2":
                model_base = InceptionResNetV2(include_top=False,
                                               input_shape=self.input_shape)
            elif self.model_name == "Xception":
                model_base = Xception(include_top=False,
                                      input_shape=self.input_shape)
            elif self.model_name == "Scratch":
                model_base = Sequential()
                model_base.add(
                    Conv2D(32,
                           kernel_size=(3, 3),
                           activation="relu",
                           input_shape=self.input_shape))
                model_base.add(
                    Conv2D(32, kernel_size=(3, 3), activation="relu"))
                model_base.add(MaxPool2D())
                model_base.add(
                    Conv2D(64, kernel_size=(3, 3), activation="relu"))
                model_base.add(
                    Conv2D(64, kernel_size=(3, 3), activation="relu"))
                model_base.add(MaxPool2D())
                model_base.add(
                    Conv2D(128, kernel_size=(3, 3), activation="relu"))
                model_base.add(
                    Conv2D(128, kernel_size=(3, 3), activation="relu"))
                model_base.add(
                    Conv2D(128, kernel_size=(3, 3), activation="relu"))
                model_base.add(MaxPool2D())
                model_base.add(
                    Conv2D(256, kernel_size=(3, 3), activation="relu"))
                model_base.add(
                    Conv2D(256, kernel_size=(3, 3), activation="relu"))
                model_base.add(
                    Conv2D(256, kernel_size=(3, 3), activation="relu"))
                model_base.add(MaxPool2D())
                self.num_trainable_layer = len(model_base.layers)
            else:
                print("Model name {} is not valid ".format(self.model_name))
                return 0

            # Freeze all but the last num_trainable_layer layers
            for layer in model_base.layers[:-self.num_trainable_layer]:
                layer.trainable = False

            # Show the trainable status of each layer
            print("\nAll layers of {} ".format(self.model_name))
            for layer in model_base.layers:
                print("Layer : {} - Trainable : {}".format(
                    layer, layer.trainable))

            model = Sequential()
            model.add(model_base)
            model.add(Flatten())
            # model.add(Dense(50, activation="relu"))
            # model.add(Dropout(0.25))
            model.add(Dense(self.num_classes, activation="softmax"))

            # Compile model ("accuracy" so the logged metric key matches the
            # "val_accuracy" monitored by the checkpoint below)
            model.compile(loss="categorical_crossentropy",
                          metrics=["accuracy"],
                          optimizer=optimizer(lr=self.lr))

        if self.is_siamese:
            model = get_siamese_model(model)
            model.compile(loss=contrastive_loss,
                          metrics=[accuracy],
                          optimizer=optimizer(lr=self.lr))

        print("\nFinal model summary")
        model.summary()

        # classes = [_ for _ in range(self.num_classes)]
        # for c in train_generator.class_indices:
        #     classes[train_generator.class_indices[c]] = c
        #
        # model.classes = classes

        # Define callbacks
        save_model_dir = os.path.join(self.save_dir,
                                      "Model_{}".format(self.model_name))
        utils.make_dirs(save_model_dir)
        # loss_path = os.path.join(save_model_dir, "epochs_{epoch:02d}-val_loss_{val_loss:.2f}.h5")
        # loss_checkpoint = ModelCheckpoint(
        #     filepath=loss_path,
        #     monitor="val_loss",
        #     verbose=1,
        #     save_best_only=True
        # )

        acc_path = os.path.join(
            save_model_dir, "epochs_{epoch:02d}-val_acc_{val_accuracy:.2f}.h5")
        acc_checkpoint = ModelCheckpoint(filepath=acc_path,
                                         monitor="val_accuracy",
                                         verbose=1,
                                         save_best_only=True)
        callbacks = [acc_checkpoint]

        # Train model
        print("Start train model from {} ...".format("{} pretrained".format(
            self.model_name) if self.model_path is None else self.model_path))

        if self.is_siamese:
            history = model.fit_generator(
                generator=train_generator,
                steps_per_epoch=self.num_classes / self.batch_size,
                epochs=self.num_epochs,
                validation_data=valid_generator,
                validation_steps=self.num_classes / self.batch_size,
                callbacks=callbacks)
        else:
            history = model.fit_generator(
                generator=train_generator,
                steps_per_epoch=train_generator.samples /
                train_generator.batch_size,
                epochs=self.num_epochs,
                validation_data=valid_generator,
                validation_steps=valid_generator.samples /
                valid_generator.batch_size,
                callbacks=callbacks)

        # Save model
        save_path = os.path.join(save_model_dir, "final_model.h5")
        model.save(save_path)

        # Save history
        acc, val_acc = history.history["accuracy"], history.history["val_accuracy"]
        loss, val_loss = history.history["loss"], history.history["val_loss"]
        train_stats = dict(Loss=loss,
                           Valid_Loss=val_loss,
                           Accuracy=acc,
                           Valid_Accuracy=val_acc)
        df = pd.DataFrame(train_stats)
        save_path = os.path.join(self.save_dir, "History.csv")
        utils.save_csv(df, save_path)

        exec_time = time.time() - start_time
        print("\nTrain model {} done. Time : {:.2f} seconds".format(
            "{} pretrained".format(self.model_name)
            if self.model_path is None else self.model_path, exec_time))
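The siamese branch of train assumes get_siamese_model, contrastive_loss, and accuracy helpers that are not shown here. A minimal sketch of the usual formulation (two weight-sharing branches joined by their Euclidean distance, trained with the Hadsell-style contrastive loss); the margin, the 0.5 threshold, and the keras import path are assumptions:

import keras.backend as K
from keras.layers import Input, Lambda
from keras.models import Model


def euclidean_distance(tensors):
    # Euclidean distance between the two branch embeddings
    x, y = tensors
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))


def contrastive_loss(y_true, y_pred, margin=1.0):
    # Hadsell et al. contrastive loss: pull same-class pairs (y = 1) together,
    # push different-class pairs (y = 0) at least `margin` apart
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0.0))
    return K.mean(y_true * square_pred + (1.0 - y_true) * margin_square)


def accuracy(y_true, y_pred):
    # A pair is predicted "same" when its distance falls below 0.5 (assumed threshold)
    return K.mean(K.equal(y_true, K.cast(y_pred < 0.5, y_true.dtype)))


def get_siamese_model(base_model):
    # Hypothetical sketch: run both inputs through the same base_model and
    # output the Euclidean distance between their embeddings
    input_shape = base_model.input_shape[1:]
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    distance = Lambda(euclidean_distance)([base_model(input_a),
                                           base_model(input_b)])
    return Model(inputs=[input_a, input_b], outputs=distance)

This matches the pair labels produced by generate_batch in Example #4, where 1 marks a same-identity pair and 0 a different-identity pair.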