def train():
    """Train the model end-to-end: load the three data splits, tokenize,
    featurize, fit with validation-accuracy checkpointing, and report
    loss/accuracy on the test split.

    Side effects: writes 'model.png' (architecture plot) and best-weight
    checkpoints under 'models/'.
    """
    train_data = read_data_from_json(TRAIN_DATA_PATH)
    val_data = read_data_from_json(VALIDATION_DATA_PATH)
    test_data = read_data_from_json(TEST_DATA_PATH)

    if DEBUG:
        print_data_stats(train_data, "Train")
        print_data_stats(val_data, "Validation")
        print_data_stats(test_data, "Test")
    # NOTE(review): dataset_similarity(val, train) was measured at ~0.57%
    # and dataset_similarity(test, train) at ~2.1% in earlier runs; the
    # dead `if False:` check has been removed.

    # Tokenize data (rudimentary tokenizer). Fit on all three splits so the
    # word index covers every word the model can encounter.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(
        all_sentences(train_data) + all_sentences(val_data) +
        all_sentences(test_data))
    if DEBUG:
        print("Tokenizer found {} words.".format(len(tokenizer.word_counts)))
        print("")

    # Convert to Keras input arrays (or dict). PMI features are trained on
    # the union of all splits, matching the tokenizer's vocabulary coverage.
    wf = WordFeatures()
    wf.train_PMI(train_data + val_data + test_data)
    train_data, train_labels, _ = preprocess_data(
        train_data, tokenizer, wf, "train")
    val_data, val_labels, _ = preprocess_data(
        val_data, tokenizer, wf, "validation")
    test_data, test_labels, _ = preprocess_data(
        test_data, tokenizer, wf, "test")

    embeddings_matrix = build_embeddings_matrix(tokenizer)
    num_words = len(tokenizer.word_counts)
    model = define_model(num_words, embeddings_matrix, "train",
                         WORD_EMBEDDINGS_DIM)
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)

    # Keep only the best weights seen so far, ranked by validation accuracy.
    filepath = "models/" + "model.{val_acc:.3f}-{epoch:03d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 mode='max', save_best_only=True,
                                 save_weights_only=True)
    model.fit(train_data, train_labels,
              batch_size=4000, epochs=450, verbose=2,
              validation_data=(val_data, val_labels),
              callbacks=[checkpoint])

    score = model.evaluate(test_data, test_labels, verbose=0)
    if score:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
def split_data(self):
    """Partition the labeled indices into labeled/unlabeled pools.

    The split is stratified on the 'material' column of self.df (restricted
    to the current labeled indices), then both pools are passed through
    equalize() before being stored back on the instance.
    """
    strata = self.df['material'].iloc[self.labeled_i]
    pools = train_test_split(self.labeled_i,
                             test_size=self.percent_unlabeled,
                             stratify=strata)
    self.labeled_i, self.unlabeled_i = equalize(*pools)
def detect_face_in_frame(a_frame):
    """Equalize the frame and return the single detected face.

    Returns the detection only when exactly one face is found; otherwise
    returns None (zero or multiple detections are both rejected).
    """
    prepared = equalize(a_frame)
    # NOTE: detect_multiscale is sensitive to the capture size — watch out!
    detections = face_classifier.detect_multiscale(prepared)
    if len(detections) != 1:
        return None
    return detections[0]
def split_data(self):
    """Partition the labeled indices into labeled/unlabeled pools.

    The split holds out self.num_unlabeled items, stratified on the label
    tensor (converted to a NumPy array), then both pools are equalized
    before being stored back on the instance.
    """
    strata = self.labels.numpy()
    pools = train_test_split(self.labeled_i,
                             test_size=self.num_unlabeled,
                             stratify=strata)
    self.labeled_i, self.unlabeled_i = equalize(*pools)
def detect_face_in_frame(self, a_frame):
    """Gray-convert and equalize the frame, then return the single face.

    Returns the detection only when exactly one face is found; otherwise
    returns None (zero or multiple detections are both rejected).
    """
    gray = convert_to_gray(a_frame)
    gray = equalize(gray)
    detections = self.face_classifier.detect_multiscale(gray)
    if len(detections) != 1:
        return None
    return detections[0]