def test_balanced_batch_generator_class_sparse(keep_sparse):
    training_generator = BalancedBatchGenerator(
        sparse.csr_matrix(X),
        y,
        batch_size=10,
        keep_sparse=keep_sparse,
        random_state=42,
    )
    for idx in range(len(training_generator)):
        X_batch, y_batch = training_generator.__getitem__(idx)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)

def test_balanced_batch_generator_class_sparse(is_sparse):
    training_generator = BalancedBatchGenerator(
        sparse.csr_matrix(X),
        y,
        batch_size=10,
        sparse=is_sparse,
        random_state=42,
    )
    for idx in range(len(training_generator)):
        X_batch, y_batch = training_generator.__getitem__(idx)
        if is_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)

def __init__(self, x, y, datagen, batch_size=32):
    self.datagen = datagen
    self.batch_size = batch_size
    self._shape = x.shape
    datagen.fit(x)
    # balanced_batch_generator (imblearn.keras) returns (generator, steps_per_epoch);
    # the images are flattened so the sampler can operate on a 2D array.
    self.gen, self.steps_per_epoch = balanced_batch_generator(
        x.reshape(x.shape[0], -1),
        y,
        sampler=RandomOverSampler(),
        batch_size=self.batch_size,
        keep_sparse=True,
    )

def test_balanced_batch_generator_class_sparse(data, keep_sparse):
    X, y = data
    training_generator = BalancedBatchGenerator(
        sparse.csr_matrix(X),
        y,
        batch_size=10,
        keep_sparse=keep_sparse,
        random_state=42,
    )
    for idx in range(len(training_generator)):
        X_batch, _ = training_generator.__getitem__(idx)
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)

def fit_with_undersampling(self, train_data, train_labels, valid_data, valid_labels):
    train_data = self.encode_sequences(train_data)
    valid_data = self.encode_sequences(valid_data)
    print('encode:', train_data.shape)
    training_generator = BalancedBatchGenerator(train_data, train_labels,
                                                batch_size=self.batch_size,
                                                random_state=42)
    early_stopping = EarlyStopping(monitor='val_acc', patience=50, verbose=1, mode='max')
    check_point = ModelCheckpoint(self.best_model_path, monitor='val_acc', mode='max',
                                  verbose=1, save_best_only=True)
    callbacks_list = [check_point, early_stopping]
    self.model.fit_generator(generator=training_generator,
                             epochs=self.max_epoches,
                             verbose=2,
                             callbacks=callbacks_list,
                             validation_data=(valid_data, valid_labels))

def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train, y_train,
                                                batch_size=1000,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=5, verbose=1)
    y_pred = model.predict_proba(X_test, batch_size=1000)
    return roc_auc_score(y_test, y_pred)

def train(model, X, y, X_val, y_val, checkpoint, num_epochs=20):
    sgd = SGD(lr=0.01)
    model.compile(optimizer=sgd, loss='mean_squared_error', metrics=[accuracy])
    training_generator = BalancedBatchGenerator(X, y,
                                                sampler=RandomOverSampler(),
                                                batch_size=64,
                                                random_state=42)
    model.fit_generator(generator=training_generator,
                        epochs=num_epochs,
                        validation_data=(X_val, y_val))
    model.save_weights(checkpoint)

def test_balanced_batch_generator_class(sampler, sample_weight):
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator = BalancedBatchGenerator(X, y,
                                                sample_weight=sample_weight,
                                                sampler=sampler,
                                                batch_size=10,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=10)

def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train, y_train,
                                                batch_size=1000,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=200, verbose=0)
    y_pred = model.predict(X_test, batch_size=1000)
    y_pred = tf.argmax(y_pred, axis=-1).numpy()
    y_test = tf.argmax(y_test, axis=-1).numpy()
    return balanced_accuracy_score(y_test, y_pred)

def create_model(x_train_sequence, y_train, x_test_sequence, y_test):
    verbose = 1
    max_sequence_length = 110
    vocab_size = 3000
    embedding_dim = {{choice([32, 64, 128, 256, 512])}}
    lstm = {{choice([32, 64, 128, 256, 512])}}
    num_epochs = {{choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}}
    dropout = {{uniform(0, 1)}}
    recurrent_dropout = {{uniform(0, 1)}}
    alpha = {{uniform(0, 3)}}
    batch_size = {{choice([32, 64, 128, 256])}}

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim,
                        input_length=max_sequence_length, mask_zero=True))
    model.add(LSTM(lstm, recurrent_dropout=recurrent_dropout, return_sequences=False))
    model.add(ELU(alpha=alpha))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=["binary_accuracy"])
    model.summary()

    # Fit the model and evaluate
    # result = model.fit(x_train_sequence, y_train, batch_size=batch_size,
    #                    validation_data=(x_test_sequence, y_test), verbose=verbose,
    #                    shuffle=True, epochs=num_epochs)
    generator = BalancedBatchGenerator(x_train_sequence, y_train,
                                       sampler=NearMiss(),
                                       batch_size=batch_size,
                                       random_state=1)
    result = model.fit_generator(generator=generator, epochs=num_epochs, verbose=verbose)

    # Get the highest validation accuracy of the training epochs
    validation_acc = np.amax(result.history["binary_accuracy"])
    print('Best validation acc of epoch:', validation_acc)
    print('Embedding_dim: ', embedding_dim)
    print('Number of neurons: ', lstm)
    print('Epochs: ', num_epochs)
    print('Dropout: ', dropout)
    print('Recurrent Dropout: ', recurrent_dropout)
    print('Batch Size: ', batch_size)
    # print('Alpha: ', alpha)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}

class BalancedDataGenerator(Sequence):
    """ImageDataGenerator + RandomOversampling"""

    def __init__(self, x, y, datagen, batch_size=32):
        self.datagen = datagen
        self.batch_size = batch_size
        self._shape = x.shape
        datagen.fit(x)
        # balanced_batch_generator (imblearn.keras) returns (generator, steps_per_epoch);
        # the images are flattened so the sampler can operate on a 2D array.
        self.gen, self.steps_per_epoch = balanced_batch_generator(
            x.reshape(x.shape[0], -1),
            y,
            sampler=RandomOverSampler(),
            batch_size=self.batch_size,
            keep_sparse=True,
        )

    def __len__(self):
        return self._shape[0] // self.batch_size

    def __getitem__(self, idx):
        x_batch, y_batch = self.gen.__next__()
        # restore the original image shape before augmenting
        x_batch = x_batch.reshape(-1, *self._shape[1:])
        return self.datagen.flow(x_batch, y_batch, batch_size=self.batch_size).next()

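# A minimal usage sketch for the BalancedDataGenerator wrapper above, assuming a
# compiled Keras CNN `model`, an image array `x_images` of shape (n_samples, h, w, c)
# and matching `labels`; these variable names are illustrative, not from the
# original code.
from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(horizontal_flip=True)  # any augmentation configuration
balanced_gen = BalancedDataGenerator(x_images, labels, datagen, batch_size=32)
model.fit_generator(generator=balanced_gen,
                    steps_per_epoch=balanced_gen.steps_per_epoch,
                    epochs=10)
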
# dropout?
model.add(Dense(128, activation='relu'))
model.add(Dense(y_count, activation='softmax'))

# compile the keras model
sgd = optimizers.adam(lr=0.02)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the keras model on the training split of this fold only
x_train, x_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
training_generator = BalancedBatchGenerator(x_train, y_train,
                                            sampler=NearMiss(),
                                            batch_size=8,
                                            random_state=42)
model.fit_generator(generator=training_generator, epochs=32, verbose=1)

cvscores.append(model.evaluate(x_test, y_test))
print('Model evaluation ', cvscores[-1])
print('\n')
cfm = confusion_matrix(np.argmax(y_test, axis=1),
                       model.predict_classes(x_test),
                       labels=[i for i in range(y_count)])
cfm = pd.DataFrame(cfm, col, col)
print(cfm)
print('\n')

print('mean accuracy is at: %s' % np.mean(list(zip(*cvscores))[1]))

def test_balanced_batch_generator_class_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        BalancedBatchGenerator(*data, sampler=ClusterCentroids(), batch_size=10)

def test_balanced_batch_generator_class_no_return_indices():
    with pytest.raises(ValueError, match='needs to return the indices'):
        BalancedBatchGenerator(X, y, sampler=ClusterCentroids(), batch_size=10)

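# The ValueError in the two tests above is raised because BalancedBatchGenerator
# slices X and y by the indices the sampler selected, so the sampler must expose
# them (sample_indices_ in recent imbalanced-learn, return_indices in older
# releases). ClusterCentroids synthesizes new centroid samples and has no such
# indices. A sketch with a sampler that does qualify, assuming the same X and y
# fixtures as the tests:
from imblearn.under_sampling import RandomUnderSampler

training_generator = BalancedBatchGenerator(X, y,
                                            sampler=RandomUnderSampler(random_state=42),
                                            batch_size=10,
                                            random_state=42)
X_batch, y_batch = training_generator[0]  # a balanced batch drawn from the original data
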
create_dir('log')
log_fname = 'log/balanced_log.csv'
if not os.path.exists(log_fname):
    with open(log_fname, 'w+') as f:
        writer = csv.writer(f)
        writer.writerow(['model_name', 'loss'])

# upsample minority classes ######################################
model_name = 'upsample_random'
create_dir(f'log/{model_name}')
sampler = RandomOverSampler(random_state=42)
training_generator = BalancedBatchGenerator(X_train, y_train_onehot,
                                            sampler=sampler,
                                            batch_size=batchsize,
                                            random_state=42)
train(training_generator, X_test, y_test, X_val, y_val, model_name,
      batchsize, log_fname)

# downsample majority classes #####################################
model_name = 'downsample-random'
create_dir(f'log/{model_name}')
sampler = RandomUnderSampler(random_state=42)
training_generator = BalancedBatchGenerator(X_train, y_train_onehot,
                                            sampler=sampler,
                                            batch_size=batchsize,
                                            random_state=42)
train(training_generator, X_test, y_test, X_val, y_val, model_name,
      batchsize, log_fname)

# input layer
model = Sequential()
model.add(Dense(output_dim=256, input_dim=83, activation="relu"))
# model.add(Dense(output_dim=32, activation="relu", kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(output_dim=256, activation="relu"))
model.add(Dense(output_dim=256, activation="relu"))
model.add(Dense(output_dim=1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
print(model.summary())

# fit the keras model on the dataset
from imblearn.keras import BalancedBatchGenerator
training_generator = BalancedBatchGenerator(X_train, y_train,
                                            sampler=RandomUnderSampler(),
                                            batch_size=16,
                                            random_state=0)
# model.fit(X_train, y_train, epochs=10, batch_size=512, verbose=2)
callback_history = model.fit_generator(generator=training_generator,
                                       epochs=15,
                                       verbose=2,
                                       validation_data=(X_test, y_test))

# # evaluate the keras model
# _, accuracy = model.evaluate(X, y)
# print('Accuracy: %.2f' % (accuracy * 100))

# # DT
# DT = tree.DecisionTreeClassifier()
# dataDT = DT.fit(X_train, y_train)
# print('finish modeling')
# predict
# test_y_predicted = dataDT.predict(X_test)