def __run_balanced_batch_generator(self):
    self.generator, self.steps_per_epoch = balanced_batch_generator(
        self.__reshape_data_to_original(), self.y,
        sampler=RandomOverSampler(),
        batch_size=self.batch_size,
        keep_sparse=True)
def test_balanced_batch_generator_function(sampler, sample_weight):
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator, steps_per_epoch = balanced_batch_generator(
        X, y, sample_weight=sample_weight, sampler=sampler,
        batch_size=10, random_state=42)
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=10)
def classify(self, total_epoch_count=30, warmup_epoch_count=10):
    # , X, type: str, classifier: str, test_prop: float, res: None, res_method: None):
    if self.type == "binary":
        self.train_y[np.where(self.train_y == 1)] = 0
        self.train_y[np.where(self.train_y == 2)] = 1
        self.test_y[np.where(self.test_y == 1)] = 0
        self.test_y[np.where(self.test_y == 2)] = 1

    # log_dir = ".log/movie_reviews/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%s")
    # tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)
    training_generator, steps_per_epoch = balanced_batch_generator(
        self.train_x, self.train_y, batch_size=48, random_state=100)

    # model.fit(x=(data.train_x, data.train_x_token_types), y=data.train_y,
    self.model.fit(
        training_generator,
        epochs=total_epoch_count,
        steps_per_epoch=steps_per_epoch,
        # validation_split=0.1,
        callbacks=[
            # keras.callbacks.LearningRateScheduler(time_decay, verbose=1),
            # lrate,
            self.create_learning_rate_scheduler(
                max_learn_rate=1e-5,
                end_learn_rate=5e-8,
                warmup_epoch_count=warmup_epoch_count,
                total_epoch_count=total_epoch_count),
            # keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
            # tensorboard_callback,
        ])
    self.model.save_weights('./movie_reviews.h5', overwrite=True)

    Y_pred_probabilities = self.model.predict(self.test_x)
    Y_pred = np.argmax(Y_pred_probabilities, axis=-1)
    self.pred_y = Y_pred

    # Accuracy percentage (scale before rounding to avoid floating-point noise)
    print(f"Accuracy is {round(accuracy_score(self.test_y, Y_pred) * 100, 2)}%")
    # Classification report (sklearn expects y_true first, then y_pred)
    print(classification_report(self.test_y, Y_pred))
    # Matthews correlation coefficient
    print(f"Matthews correlation coefficient is {matthews_corrcoef(self.test_y, Y_pred)}")
    # Plots of Confusion Matrix and ROC Curve
    plot_confusion_matrix(self.test_y, Y_pred, figsize=(10, 10))
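The create_learning_rate_scheduler helper called above is not shown in this snippet. A minimal sketch of what it could look like, assuming a linear warm-up for warmup_epoch_count epochs followed by exponential decay towards end_learn_rate (this body is a reconstruction of a common BERT fine-tuning pattern, not the original):

import math
from tensorflow.keras.callbacks import LearningRateScheduler

def create_learning_rate_scheduler(max_learn_rate=1e-5,
                                   end_learn_rate=5e-8,
                                   warmup_epoch_count=10,
                                   total_epoch_count=30):
    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            # Ramp up linearly from 0 to max_learn_rate.
            lr = (max_learn_rate / warmup_epoch_count) * (epoch + 1)
        else:
            # Decay exponentially towards end_learn_rate.
            lr = max_learn_rate * math.exp(
                math.log(end_learn_rate / max_learn_rate)
                * (epoch - warmup_epoch_count + 1)
                / (total_epoch_count - warmup_epoch_count + 1))
        return float(lr)

    return LearningRateScheduler(lr_scheduler, verbose=1)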
def test_balanced_batch_generator_function_sparse(keep_sparse):
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10,
        random_state=42)
    for idx in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
def test_balanced_batch_generator_function_sparse(is_sparse):
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, sparse=is_sparse, batch_size=10,
        random_state=42)
    for idx in range(steps_per_epoch):
        X_batch, y_batch = next(training_generator)
        if is_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
def __init__(self, x, y, datagen, batch_size=32):
    self.datagen = datagen
    self.batch_size = batch_size
    self._shape = x.shape
    datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1), y,
        sampler=RandomOverSampler(),
        batch_size=self.batch_size,
        keep_sparse=True)
def __init__(self, x, y, datagen, batch_size=32):
    self.datagen = datagen
    self.batch_size = min(batch_size, x.shape[0])
    # datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1), y,
        sampler=RandomOverSampler(random_state=42),
        batch_size=self.batch_size,
        keep_sparse=True)
    self._shape = (self.steps_per_epoch * batch_size, *x.shape[1:])
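The two __init__ variants above are only half of a Keras data wrapper; to feed model.fit the surrounding class also needs __len__ and __getitem__. A minimal sketch, assuming the class (here called BalancedDataGenerator, a hypothetical name) subclasses keras.utils.Sequence and that datagen is a Keras ImageDataGenerator:

from tensorflow import keras

class BalancedDataGenerator(keras.utils.Sequence):
    # ... __init__ as in either variant above ...

    def __len__(self):
        # Number of balanced batches per epoch.
        return self.steps_per_epoch

    def __getitem__(self, idx):
        # The index is unused: each call draws the next balanced batch.
        x_batch, y_batch = next(self.gen)
        # Undo the flattening applied before balanced_batch_generator,
        # then push the batch through the ImageDataGenerator.
        x_batch = x_batch.reshape(-1, *self._shape[1:])
        return next(self.datagen.flow(x_batch, y_batch,
                                      batch_size=self.batch_size))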
def train_model(self, model, X_train, X_test, y_train, y_test):
    """Train a model to predict the presence or absence of a species.

    Various instance variables define how the model trains, such as
    batch size, random seed and number of epochs.

    :param model: Keras Model object. Initialized model ready for training.
    :param X_train: Array. Contains training data.
    :param X_test: Array. Contains testing data.
    :param y_train: Array. Contains training (ground truth) labels.
    :param y_test: Array. Contains testing (ground truth) labels.

    :return: Tuple containing: float 'AUC', a performance metric between
        0 and 1 (0 = 100% wrong, 1 = 100% right); Keras model 'model',
        a model with an architecture identical to the input 'model' but
        with trained weights.
    """
    training_generator, steps_per_epoch = balanced_batch_generator(
        X_train, y_train, sampler=NearMiss(), batch_size=self.batch,
        random_state=self.random_seed)
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=self.epoch, verbose=0)

    score = model.evaluate(X_test, y_test, verbose=0)
    predictions = model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test[:, 1], predictions[:, 1])
    len_tpr = int(len(tpr) / 2)
    self.test_loss.append(score[0])
    self.test_acc.append(score[1])
    self.test_AUC.append(roc_auc_score(y_test[:, 1], predictions[:, 1]))
    self.test_tpr.append(tpr[len_tpr])
    AUC = roc_auc_score(y_test[:, 1], predictions[:, 1])

    # Bootstrap a 90% confidence interval for the AUC.
    n_bootstraps = 1000
    y_pred = predictions[:, 1]
    y_true = y_test[:, 1]
    bootstrapped_scores = []
    rng = np.random.RandomState(self.random_seed)
    for i in range(n_bootstraps):
        # randint's upper bound is exclusive, so len(y_pred) lets every
        # index be drawn.
        indices = rng.randint(0, len(y_pred), len(y_pred))
        if len(np.unique(y_true[indices])) < 2:
            # AUC is undefined when a resample contains only one class.
            continue
        score = roc_auc_score(y_true[indices], y_pred[indices])
        bootstrapped_scores.append(score)
    sorted_scores = np.array(bootstrapped_scores)
    sorted_scores.sort()
    ci_lower = sorted_scores[int(0.05 * len(sorted_scores))]
    ci_upper = sorted_scores[int(0.95 * len(sorted_scores))]
    self.test_lci.append(ci_lower)
    self.test_uci.append(ci_upper)
    return AUC, model
def test_balanced_batch_generator_function_sparse(data, keep_sparse):
    X, y = data
    training_generator, steps_per_epoch = balanced_batch_generator(
        sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10,
        random_state=42)
    for _ in range(steps_per_epoch):
        X_batch, _ = next(training_generator)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
def __init__(self, x, y, datagen, batch_size=32, crop_length=224, resize=300,
             balance=True):
    self.datagen = datagen
    self.batch_size = batch_size
    self._shape = x.shape
    self.crop_length = crop_length
    self.resize = resize
    datagen.fit(x)
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1), y,
        sampler=RandomOverSampler() if balance else None,
        batch_size=self.batch_size)
def test_balanced_batch_generator_function_no_return_indices():
    with pytest.raises(ValueError, match='needs to return the indices'):
        balanced_batch_generator(
            X, y, sampler=ClusterCentroids(), batch_size=10, random_state=42)
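ClusterCentroids fails here because it synthesizes new samples (cluster centroids) instead of selecting existing rows, so it cannot report which original indices a batch came from. A sampler that selects rows, such as RandomUnderSampler, works; a minimal sketch (the toy X and y are illustrative, and the imblearn.keras entry point is assumed):

import numpy as np
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import RandomUnderSampler

X = np.random.rand(100, 5)
y = np.array([0] * 90 + [1] * 10)  # imbalanced labels

# RandomUnderSampler keeps a subset of the original rows, so it can
# expose the sample indices that balanced_batch_generator requires.
training_generator, steps_per_epoch = balanced_batch_generator(
    X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42)
X_batch, y_batch = next(training_generator)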
def data_loader(self):
    '''
    This can also be used to evaluate the input variables with an ANOVA F-test.
    '''
    ## read data from files
    logging.debug("Loading data")
    config_base_dir = os.path.join(self.__dataload_config["base_dir"],
                                   self.__dataload_config["plot_config"])

    # create the model directory
    utc_seconds = str(datetime.datetime.now().timestamp()).split(".")[0]
    logging.info(utc_seconds)
    self.__model_dir = os.path.join(config_base_dir,
                                    self.__dataload_config["cut"],
                                    "steps", utc_seconds)
    os.makedirs(self.__model_dir, exist_ok=True)

    # load numpy
    samples_dir = os.path.join(config_base_dir,
                               self.__dataload_config["cut"],
                               "samples",
                               self.__dataload_config["samples_version"])
    signal = pickle.load(
        open(os.path.join(samples_dir, "for_training/signal_balanced.pkl"),
             "rb"))
    bkg = pickle.load(
        open(os.path.join(samples_dir, "for_training/background_balanced.pkl"),
             "rb"))

    # Keep only the first "input_dim" columns
    self.__dataload_config["cols"] = self.__dataload_config[
        "cols"][:self.__model_config["input_dim"]]
    logging.debug(self.__dataload_config["cols"])

    ## create numpy arrays
    X_sig = signal[self.__dataload_config["cols"]].values
    X_bkg = bkg[self.__dataload_config["cols"]].values
    Y_sig = np.ones(len(X_sig))
    Y_bkg = np.zeros(len(X_bkg))
    W_sig = (signal["weight_norm"]).values
    W_bkg = (bkg["weight_norm"]).values
    Wnn_sig = (signal["weight_"]).values
    Wnn_bkg = (bkg["weight_"]).values
    X = np.vstack([X_sig, X_bkg])
    Y = np.hstack([Y_sig, Y_bkg])
    W = np.hstack([W_sig, W_bkg])
    Wnn = np.hstack([Wnn_sig, Wnn_bkg])

    ## fit and persist the scaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pickle.dump(scaler, open(f"{self.__model_dir}/scaler_model.pkl", "wb"))

    ## stratified splits keep the class composition equal across sets
    X_train, X_test, y_train, y_test, W_train, W_test, Wnn_train, Wnn_test = \
        train_test_split(X_scaled, Y, W, Wnn,
                         test_size=self.__dataload_config["test_ratio"],
                         random_state=42, stratify=Y)
    X_train, X_val, y_train, y_val, W_train, W_val, Wnn_train, Wnn_val = \
        train_test_split(X_train, y_train, W_train, Wnn_train,
                         test_size=self.__dataload_config["val_ratio"],
                         random_state=42, stratify=y_train)

    data_split = {
        "X_train": X_train, "X_test": X_test, "X_val": X_val,
        "y_train": y_train, "y_test": y_test, "y_val": y_val,
        "W_train": W_train, "W_test": W_test, "W_val": W_val,
        "Wnn_train": Wnn_train, "Wnn_test": Wnn_test, "Wnn_val": Wnn_val,
    }
    # for dataset_name, dataset in data_split.items():
    #     print(dataset_name + " " + str(float(dataset.nbytes) / (1024. * 1024.)))

    ## Oversampling
    training_generator, steps_per_epoch_train = balanced_batch_generator(
        X_train, y_train, W_train,
        batch_size=self.__model_config["batch_size"],
        sampler=RandomOverSampler())
    validation_generator, steps_per_epoch_val = balanced_batch_generator(
        X_val, y_val, W_val,
        batch_size=self.__model_config["batch_size"],
        sampler=RandomOverSampler())

    generators = {
        "training_generator": training_generator,
        "steps_per_epoch_train": steps_per_epoch_train,
        "validation_generator": validation_generator,
        "steps_per_epoch_val": steps_per_epoch_val,
    }
    return data_split, generators
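The generators dict returned above plugs straight into Keras; a minimal usage sketch (the model object and epoch count are assumptions, not part of the original):

model.fit_generator(
    generator=generators["training_generator"],
    steps_per_epoch=generators["steps_per_epoch_train"],
    validation_data=generators["validation_generator"],
    validation_steps=generators["steps_per_epoch_val"],
    epochs=50)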
def test_balanced_batch_generator_function_no_return_indices(data):
    with pytest.raises(ValueError, match='needs to have an attribute'):
        balanced_batch_generator(
            *data, sampler=ClusterCentroids(), batch_size=10, random_state=42)
def train(_train_data, test_data, embedding_matrix, batch_size=BATCH_SIZE,
          epoch_count=100, max_length=MAX_LENGTH, model_name="model",
          learning_rate=LR):
    random.shuffle(_train_data)
    model_filepath = f"../model/model_{model_name}.h5"
    result_dir = f"../result/{model_name}"
    ml.save_hyparameters(result_dir, model_name, epoch_count, batch_size,
                         learning_rate)

    train_data = []
    validation_data = []
    validation_data_rate = 0.25  # fraction held out for validation
    # data grouped by category
    categorized_data = {category_dict[category]: [] for category in CATEGORIES}
    for target_value, input_value in _train_data:
        categorized_data[target_value].append((target_value, input_value))

    # make the category composition equal across train/validation/test
    for category_id in categorized_data:
        data_len = len(categorized_data[category_id])
        validation_boundary = int(data_len * validation_data_rate)
        validation_data += categorized_data[category_id][0:validation_boundary]
        train_data += categorized_data[category_id][validation_boundary:]

    test_inputs = []
    test_targets = []
    train_inputs = []
    train_targets = []
    validation_inputs = []
    validation_targets = []
    for target_value, input_value in test_data:
        test_inputs.append(input_value)
        # one-hot array: 1 only at the correct label
        test_targets.append(
            [1 if i == target_value else 0 for i in range(len(category_dict))])
    for target_value, input_value in validation_data:
        validation_inputs.append(input_value)
        validation_targets.append(
            [1 if i == target_value else 0 for i in range(len(category_dict))])
    for target_value, input_value in train_data:
        train_inputs.append(input_value)
        train_targets.append(
            [1 if i == target_value else 0 for i in range(len(category_dict))])

    # print the number of samples per category
    count_by_categories = {
        key: np.concatenate([
            np.argmax(test_targets, 1),
            np.argmax(validation_targets, 1),
            np.argmax(train_targets, 1)
        ]).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    test_count_by_categories = {
        key: np.argmax(test_targets, 1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    validation_count_by_categories = {
        key: np.argmax(validation_targets, 1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    train_count_by_categories = {
        key: np.argmax(train_targets, 1).tolist().count(category_dict[key])
        for key in CATEGORIES
    }
    print("ALL: ", count_by_categories)
    print("TRAIN: ", train_count_by_categories)
    print("VALIDATION: ", validation_count_by_categories)
    print("TEST: ", test_count_by_categories)

    test_inputs = np.array(test_inputs)
    test_targets = np.array(test_targets)
    train_inputs = np.array(train_inputs)
    train_targets = np.array(train_targets)
    validation_inputs = np.array(validation_inputs)
    validation_targets = np.array(validation_targets)

    # bootstrap sampling
    train_inputs, train_targets = resample(train_inputs, train_targets,
                                           n_samples=len(train_inputs))

    # vocabulary size and embedding dimension
    num_words, word_vec_size = embedding_matrix.shape

    # build the model
    model = Sequential([
        Embedding(num_words, word_vec_size, weights=[embedding_matrix],
                  input_length=max_length, trainable=False, mask_zero=True),
        Conv1D(16, 5, activation='relu'),
        AveragePooling1D(7),
        Dropout(0.5),
        Conv1D(16, 5, activation='relu'),
        GlobalAveragePooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(len(CATEGORIES), activation='softmax')
    ])
    model.summary()

    # keep the Embedding layer frozen
    model.layers[0].trainable = False
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate),
                  metrics=['accuracy'])
    # model.compile(loss='categorical_crossentropy',
    #               optimizer=keras.optimizers.rmsprop(learning_rate),
    #               metrics=['accuracy'])

    # checkpoint settings
    checkpoint = ModelCheckpoint(
        filepath=model_filepath,
        monitor='val_loss',
        save_best_only=True,
        period=1,
    )

    # gradually decay the learning rate
    # start = learning_rate
    # stop = learning_rate * 0.1
    # learning_rates = np.linspace(start, stop, epoch_count)

    # counter the class imbalance in the data
    training_generator, steps_per_epoch = balanced_batch_generator(
        train_inputs, train_targets, batch_size=batch_size, random_state=42)
    # validation_generator, validation_steps = balanced_batch_generator(
    #     validation_inputs, validation_targets, batch_size=batch_size,
    #     random_state=42)

    # training
    history = model.fit_generator(
        generator=training_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epoch_count,
        verbose=1,
        validation_data=(validation_inputs, validation_targets),
        # validation_data=validation_generator,
        # validation_steps=validation_steps,
        # callbacks=[checkpoint,
        #            LearningRateScheduler(lambda epoch: learning_rates[epoch])],
        callbacks=[checkpoint],
        shuffle=True)

    # reload the model that achieved the best validation loss
    model = load_model(model_filepath)
    ml.model_evaluate(model, test_inputs, test_targets, result_dir)
    ml.classification_evaluate(model, test_inputs, test_targets, result_dir)
    ml.visualize_model(model, result_dir)
    ml.save_history(history, result_dir)
    return model

# Model Fitting
baseline = MiniVGGNet('baseline', IMAGE_DIMS, n_classes, INIT_LR, EPOCHS, BS)
# takes some time to run, depending on hardware (resume here)
baseline.fit(X_train, y_train_bin, X_test, y_test_bin)

ros = MiniVGGNet('baseline_ros', IMAGE_DIMS, n_classes, INIT_LR, EPOCHS, BS)
# ValueError: Found array with dim 4. Estimator expected <= 2.
ros_generator, steps_per_epoch_ros = balanced_batch_generator(
    X_train, y_train_bin, sampler=RandomOverSampler(), batch_size=BS,
    random_state=0)
ros.fit_generator(X_train, y_train_bin, X_test, y_test_bin, ros_generator,
                  steps_per_epoch_ros)

img_datagen = MiniVGGNet('baseline_datagen', IMAGE_DIMS, n_classes, INIT_LR,
                         EPOCHS, BS)
img_data_generator = ImageDataGenerator(
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest")