@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_class_sparse(keep_sparse):
    training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y,
                                                batch_size=10,
                                                keep_sparse=keep_sparse,
                                                random_state=42)
    for idx in range(len(training_generator)):
        X_batch, y_batch = training_generator[idx]
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
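For reference, a minimal self-contained sketch of the API these tests exercise, assuming a recent tensorflow.keras (whose fit accepts a Sequence directly); the toy data and model are illustrative, not from the source:

from sklearn.datasets import make_classification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.keras import BalancedBatchGenerator

# Imbalanced two-class toy problem (90% / 10%).
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=42)
model = Sequential([Dense(8, input_dim=X.shape[1], activation='relu'),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer='adam', loss='binary_crossentropy')
# Each epoch draws class-balanced batches; with no sampler given,
# BalancedBatchGenerator defaults to random under-sampling.
generator = BalancedBatchGenerator(X, y, batch_size=32, random_state=42)
model.fit(generator, epochs=2)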
Example #4
@pytest.mark.parametrize("keep_sparse", [True, False])
def test_balanced_batch_generator_class_sparse(data, keep_sparse):
    X, y = data
    training_generator = BalancedBatchGenerator(
        sparse.csr_matrix(X),
        y,
        batch_size=10,
        keep_sparse=keep_sparse,
        random_state=42,
    )
    for idx in range(len(training_generator)):
        X_batch, _ = training_generator[idx]
        if keep_sparse:
            assert sparse.issparse(X_batch)
        else:
            assert not sparse.issparse(X_batch)
Example #5
    def fit_with_undersampling(self, train_data, train_labels, valid_data,
                               valid_labels):

        train_data = self.encode_sequences(train_data)
        valid_data = self.encode_sequences(valid_data)

        print('encode:', train_data.shape)

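        # No sampler is passed, so BalancedBatchGenerator falls back to its
        # default of random under-sampling, matching the method name.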
        training_generator = BalancedBatchGenerator(train_data,
                                                    train_labels,
                                                    batch_size=self.batch_size,
                                                    random_state=42)

        early_stopping = EarlyStopping(monitor='val_acc',
                                       patience=50,
                                       verbose=1,
                                       mode='max')
        check_point = ModelCheckpoint(self.best_model_path,
                                      monitor='val_acc',
                                      mode='max',
                                      verbose=1,
                                      save_best_only=True)
        callbacks_list = [check_point, early_stopping]

        self.model.fit_generator(generator=training_generator,
                                 epochs=self.max_epoches,
                                 verbose=2,
                                 callbacks=callbacks_list,
                                 validation_data=(valid_data, valid_labels))
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train, y_train,
                                                batch_size=1000,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=5, verbose=1)
    y_pred = model.predict_proba(X_test, batch_size=1000)
    return roc_auc_score(y_test, y_pred)
def train(model, X, y, X_val, y_val, checkpoint, num_epochs=20):
    sgd = SGD(lr=0.01)
    model.compile(optimizer=sgd,
                  loss='mean_squared_error',
                  metrics=['accuracy'])
    training_generator = BalancedBatchGenerator(X, y,
                                                sampler=RandomOverSampler(),
                                                batch_size=64,
                                                random_state=42)
    model.fit_generator(generator=training_generator,
                        epochs=num_epochs,
                        validation_data=(X_val, y_val))
    model.save_weights(checkpoint)
def test_balanced_batch_generator_class(sampler, sample_weight):
    model = _build_keras_model(y.shape[1], X.shape[1])
    training_generator = BalancedBatchGenerator(X, y,
                                                sample_weight=sample_weight,
                                                sampler=sampler,
                                                batch_size=10,
                                                random_state=42)
    model.fit_generator(generator=training_generator,
                        epochs=10)
Example #9
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    model = make_model(X_train.shape[1])
    training_generator = BalancedBatchGenerator(X_train,
                                                y_train,
                                                batch_size=1000,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=200, verbose=0)
    y_pred = model.predict(X_test, batch_size=1000)
    y_pred = tf.argmax(y_pred, axis=-1).numpy()
    y_test = tf.argmax(y_test, axis=-1).numpy()
    return balanced_accuracy_score(y_test, y_pred)
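make_model is not shown in this listing; a hypothetical minimal stub consistent with the call sites above (n_features in, softmax over one-hot labels out, since tf.argmax is applied to the predictions), assuming the tensorflow.keras Sequential and Dense imports:

def make_model(n_features, n_classes=2):
    # Hypothetical stub, not the author's actual implementation.
    model = Sequential()
    model.add(Dense(64, input_dim=n_features, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model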
def create_model(x_train_sequence, y_train, x_test_sequence, y_test):
    verbose = 1
    max_sequence_length = 110
    vocab_size = 3000
    embedding_dim = {{choice([32, 64, 128, 256, 512])}}
    lstm = {{choice([32, 64, 128, 256, 512])}}
    num_epochs = {{choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])}}
    dropout = {{uniform(0, 1)}}
    recurrent_dropout = {{uniform(0, 1)}}
    alpha = {{uniform(0, 3)}}
    batch_size = {{choice([32, 64, 128, 256])}}
    model = Sequential()
    model.add(
        Embedding(vocab_size,
                  embedding_dim,
                  input_length=max_sequence_length,
                  mask_zero=True))
    model.add(
        LSTM(lstm, recurrent_dropout=recurrent_dropout,
             return_sequences=False))
    model.add(ELU(alpha=alpha))
    model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=["binary_accuracy"])
    model.summary()

    # Fit the model and evaluate
    #result = model.fit(x_train_sequence, y_train, batch_size=batch_size,
    #                    validation_data=(x_test_sequence, y_test), verbose=verbose, shuffle=True, epochs=num_epochs)

    generator = BalancedBatchGenerator(x_train_sequence,
                                       y_train,
                                       sampler=NearMiss(),
                                       batch_size=batch_size,
                                       random_state=1)

    result = model.fit_generator(generator=generator,
                                 epochs=num_epochs,
                                 verbose=verbose)

    # fit_generator above receives no validation data, so this is the
    # highest *training* binary accuracy across epochs
    validation_acc = np.amax(result.history["binary_accuracy"])
    print('Best binary accuracy of epoch:', validation_acc)
    print('Embedding_dim: ', embedding_dim)
    print('Number of neurons: ', lstm)
    print('Epochs: ', num_epochs)
    print('Dropout: ', dropout)
    print('Recurrent Dropout: ', recurrent_dropout)
    print('Batch Size: ', batch_size)
    #print('Alpha: ', alpha)
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': model}
Example #11
class BalancedDataGenerator(Sequence):
    """ImageDataGenerator + RandomOverSampler, wrapped as a Keras Sequence."""
    def __init__(self, x, y, datagen, batch_size=32):
        self.datagen = datagen
        self.batch_size = batch_size
        self._shape = x.shape
        datagen.fit(x)
        # balanced_batch_generator (the function from imblearn.keras, not the
        # BalancedBatchGenerator class) returns a plain generator plus its
        # number of steps per epoch.
        self.gen, self.steps_per_epoch = balanced_batch_generator(
            x.reshape(x.shape[0], -1),
            y,
            sampler=RandomOverSampler(),
            batch_size=self.batch_size,
            keep_sparse=True)

    def __len__(self):
        return self._shape[0] // self.batch_size

    def __getitem__(self, idx):
        x_batch, y_batch = next(self.gen)
        x_batch = x_batch.reshape(-1, *self._shape[1:])
        return next(self.datagen.flow(x_batch, y_batch,
                                      batch_size=self.batch_size))
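A hedged usage sketch for this wrapper; x_images, y_labels, and model are assumed to be defined elsewhere, and the augmentation settings are illustrative:

from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rotation_range=10, horizontal_flip=True)
balanced_gen = BalancedDataGenerator(x_images, y_labels, datagen, batch_size=32)
model.fit_generator(balanced_gen,
                    steps_per_epoch=balanced_gen.steps_per_epoch,
                    epochs=10)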
    # dropout?
    model.add(Dense(128, activation='relu'))
    model.add(Dense(y_count, activation='softmax'))

    # compile the keras model, passing the configured optimizer so the
    # lr=0.02 setting actually takes effect
    adam = optimizers.Adam(lr=0.02)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    # fit the keras model on this fold's training split only
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    training_generator = BalancedBatchGenerator(x_train,
                                                y_train,
                                                sampler=NearMiss(),
                                                batch_size=8,
                                                random_state=42)
    model.fit_generator(generator=training_generator, epochs=32, verbose=1)
    cvscores.append(model.evaluate(x_test, y_test))
    print('Model evaluation ', cvscores[-1])
    print('\n')
    cfm = confusion_matrix(np.argmax(y_test, axis=1),
                           model.predict_classes(x_test),
                           labels=[i for i in range(y_count)])
    cfm = pd.DataFrame(cfm, col, col)
    print(cfm)

print('\n')

print('mean accuracy is at: %s' % np.mean(list(zip(*cvscores))[1]))
Example #13
def test_balanced_batch_generator_class_no_return_indices(data):
    with pytest.raises(ValueError, match="needs to have an attribute"):
        BalancedBatchGenerator(*data,
                               sampler=ClusterCentroids(),
                               batch_size=10)
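ClusterCentroids is rejected here because it synthesizes new centroid samples rather than selecting existing rows, so it exposes no sample indices for the generator to draw batches from. For contrast, a hedged sketch reusing the same data fixture with an index-returning sampler:

# Accepted: RandomUnderSampler keeps original rows and records their indices.
BalancedBatchGenerator(*data, sampler=RandomUnderSampler(), batch_size=10)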
Example #14
    create_dir('log')
    log_fname = 'log/balanced_log.csv'

    if not os.path.exists(log_fname):
        with open(log_fname, 'w+') as f:
            writer = csv.writer(f)
            writer.writerow(['model_name', 'loss'])

    # upsample minority classes  ######################################
    model_name = 'upsample_random'
    create_dir(f'log/{model_name}')
    sampler = RandomOverSampler(random_state=42)
    training_generator = BalancedBatchGenerator(X_train,
                                                y_train_onehot,
                                                sampler=sampler,
                                                batch_size=batchsize,
                                                random_state=42)
    train(training_generator, X_test, y_test, X_val, y_val, model_name,
          batchsize, log_fname)

    # downsample majority classes #####################################
    model_name = 'downsample-random'
    create_dir(f'log/{model_name}')
    sampler = RandomUnderSampler(random_state=42)
    training_generator = BalancedBatchGenerator(X_train,
                                                y_train_onehot,
                                                sampler=sampler,
                                                batch_size=batchsize,
                                                random_state=42)
    train(training_generator, X_test, y_test, X_val, y_val, model_name,
          batchsize, log_fname)
Example #16
# input layer


model = Sequential()
model.add(Dense(256, input_dim=83, activation="relu"))
# model.add(Dense(32, activation="relu", kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(256, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
print(model.summary())
# # fit the keras model on the dataset
from imblearn.keras import BalancedBatchGenerator
from imblearn.under_sampling import RandomUnderSampler

training_generator = BalancedBatchGenerator(
    X_train, y_train, sampler=RandomUnderSampler(), batch_size=16, random_state=0)
# model.fit(X_train, y_train, epochs=10, batch_size=512, verbose=2)
callback_history = model.fit_generator(generator=training_generator,
                                       epochs=15, verbose=2, validation_data=(X_test, y_test))
# # evaluate the keras model
# _, accuracy = model.evaluate(X, y)
# print('Accuracy: %.2f' % (accuracy * 100))


# # DT
# DT = tree.DecisionTreeClassifier()
# dataDT = DT.fit(X_train, y_train)
# print('finish modeling')

# predict
# test_y_predicted = dataDT.predict(X_test)