def train():
    """Train the classifier end to end.

    Pipeline: load the three JSON splits, fit a word tokenizer and PMI word
    features on all splits, preprocess into Keras-ready arrays, build the
    model with pretrained embeddings, fit with best-val_acc checkpointing,
    and print test loss/accuracy.

    Relies on module-level names defined elsewhere in this file/project:
    TRAIN_DATA_PATH, VALIDATION_DATA_PATH, TEST_DATA_PATH, DEBUG,
    WORD_EMBEDDINGS_DIM, read_data_from_json, print_data_stats,
    dataset_similarity, all_sentences, WordFeatures, preprocess_data,
    build_embeddings_matrix, define_model, Tokenizer, ModelCheckpoint,
    plot_model, np.
    """
    # Optional experiments, all off by default (behavior unchanged).
    check_similarity = False   # print train/val and train/test overlap
    equalize_labels = False    # force all label classes to equal frequency
    oversample_labels = False  # oversample minority classes to fixed counts

    train_data = read_data_from_json(TRAIN_DATA_PATH)
    val_data = read_data_from_json(VALIDATION_DATA_PATH)
    test_data = read_data_from_json(TEST_DATA_PATH)

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Validation")
        print_data_stats(test_data, "Test")
        if check_similarity:
            # Historical measurements: val ~0.5714%, test ~2.112% overlap.
            print(dataset_similarity(val_data, train_data))
            print(dataset_similarity(test_data, train_data))

    # Tokenize data (rudimentary tokenizer). Fit on every split so all
    # words seen at evaluation time get an id.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) + all_sentences(val_data) +
        all_sentences(test_data))

    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict). PMI features are
    # unsupervised, so training them on all splits leaks no labels.
    wf = WordFeatures()
    wf.train_PMI(train_data + val_data + test_data)

    train_data, train_labels, _ = preprocess_data(train_data, tokenizer, wf,
                                                  "train")
    val_data, val_labels, _ = preprocess_data(val_data, tokenizer, wf,
                                              "validation")
    test_data, test_labels, _ = preprocess_data(test_data, tokenizer, wf,
                                                "test")

    # Equalize training data labels to the same frequency.
    if equalize_labels:
        from utils import equalize
        train_data, train_labels = equalize(train_data, train_labels)
        if DEBUG:
            print("Train data has been equalized. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))
    if oversample_labels:
        from utils import oversample_dataset
        train_data, train_labels = oversample_dataset(train_data, train_labels,
                                                      [6000, 8000])
        if DEBUG:
            print("Train data has been oversampled. New freq: {}.".format(
                np.asarray(np.sum(train_labels, axis=0), dtype=np.int32)))

    embeddings_matrix = build_embeddings_matrix(tokenizer)

    # NOTE(review): Keras embedding layers conventionally take
    # len(tokenizer.word_index) + 1 (index 0 is reserved); confirm
    # define_model compensates for this.
    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "train",
                         WORD_EMBEDDINGS_DIM)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)

    # Keep only the weights of the best validation-accuracy epoch.
    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=0,
                                 mode='max',
                                 save_best_only=True,
                                 save_weights_only=True)
    model.fit(train_data,
              train_labels,
              batch_size=4000,
              epochs=450,
              verbose=2,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint])
    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
# Example #2
 def split_data(self):
     """Split ``self.labeled_i`` into labeled and unlabeled index sets.

     Holds out a ``self.percent_unlabeled`` fraction of the indices as
     unlabeled, stratified on the 'material' column so both splits keep
     the same class proportions, then passes both splits through
     ``equalize`` (project helper — presumably balances class
     frequencies; confirm against its definition in utils).
     """
     # stratify uses the material labels of exactly the rows being split.
     self.labeled_i, self.unlabeled_i = equalize(*train_test_split(
         self.labeled_i,
         test_size=self.percent_unlabeled,
         stratify=self.df['material'].iloc[self.labeled_i]))
# Example #3
    def detect_face_in_frame(a_frame):
        """Return the single face detected in *a_frame*, or ``None``.

        The frame is equalized before detection, and a face is returned
        only when the classifier finds exactly one.
        """
        # watch out for detect_multiscale and the size of capture!
        detections = face_classifier.detect_multiscale(equalize(a_frame))
        if len(detections) == 1:
            return detections[0]
        return None
# Example #4
 def split_data(self):
     """Split ``self.labeled_i`` into labeled and unlabeled index sets.

     Holds out ``self.num_unlabeled`` samples as unlabeled, stratified on
     ``self.labels`` (converted via ``.numpy()`` — presumably a torch/TF
     tensor; sklearn needs an array), then passes both splits through
     ``equalize`` (project helper — verify its balancing semantics).
     """
     self.labeled_i, self.unlabeled_i = equalize(
         *train_test_split(self.labeled_i,
                           test_size=self.num_unlabeled,
                           stratify=self.labels.numpy()))
# Example #5
    def detect_face_in_frame(self, a_frame):
        """Return the single face detected in *a_frame*, or ``None``.

        The frame is converted to grayscale and equalized before running
        the classifier; only an unambiguous (exactly-one) detection is
        returned.
        """
        prepared = equalize(convert_to_gray(a_frame))
        detections = self.face_classifier.detect_multiscale(prepared)
        if len(detections) != 1:
            return None
        return detections[0]