Example #1
def Neural_Network_Selection(normal_count, anomaly_count, data_dir):
    undersample, _ = preprocessing.create_datasets(data_dir, normal_count,
                                                   anomaly_count)
    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = undersample
    hidden_layers = [1, 2, 3, 4, 5]
    hidden_layer_neurons = [50, 100, 200, 300, 500]
    results_matrix_validation = np.zeros((5, 5))
    dense_index = -1

    for neurons in hidden_layer_neurons:
        layer_index = -1
        dense_index += 1
        for layers in hidden_layers:
            model = Sequential()
            model.add(Dense(neurons, input_dim=30, activation='relu'))
            layer_index += 1
            for _layers in range(layers):
                model.add(Dense(neurons, activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            # Compile model
            model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            # Fit the model
            model.fit(X_train_undersample,
                      y_train_undersample,
                      epochs=200,
                      batch_size=10,
                      verbose=0)
            acc = (model.evaluate(X_test_undersample, y_test_undersample)[1])
            results_matrix_validation[dense_index][layer_index] = acc

    result = np.where(
        results_matrix_validation == np.amax(results_matrix_validation))
    return (hidden_layer_neurons[result[0][0]], hidden_layers[result[1][0]])
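
# Hedged usage sketch, not part of the original snippet: rebuild the best
# configuration found by the grid search above. The counts and data_dir
# passed in are placeholders chosen by the caller.
def build_selected_network(normal_count, anomaly_count, data_dir):
    neurons, layers = Neural_Network_Selection(normal_count, anomaly_count,
                                               data_dir)
    model = Sequential()
    model.add(Dense(neurons, input_dim=30, activation='relu'))
    for _ in range(layers):
        model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
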
def create_nonlinear_SVC(data_dir, normal_count, anomaly_count):
    undersample, _ = preprocessing.create_datasets(data_dir, normal_count,
                                                   anomaly_count)
    train_x, test_x, train_y, test_y = undersample
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    degree = [1, 3, 6, 12]
    C = [1, 10, 20]

    recall_val = 0
    opt_kernel = None
    opt_degree = None
    opt_C = None
    for kernel in kernels:
        for deg in degree:
            for c in C:
                clf = SVC(kernel=kernel, degree=deg, C=c)
                clf.fit(train_x, train_y)
                recall_state = recall_score(test_y,
                                            clf.predict(test_x),
                                            pos_label=1)
                if recall_val < recall_state:
                    recall_val = recall_state
                    opt_kernel, opt_degree, opt_C = (kernel, deg, c)
                #print('Recall Score: {} | kernel {}, degree {}, C {}'.format(recall_state, kernel, deg, c))
    return (opt_kernel, opt_degree, opt_C)
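
# Hedged usage sketch (an assumption, not in the original file): refit an SVC
# with the hyperparameters selected above and report its recall on the
# held-out undersampled split.
def fit_selected_SVC(data_dir, normal_count, anomaly_count):
    kernel, degree, C_val = create_nonlinear_SVC(data_dir, normal_count,
                                                 anomaly_count)
    undersample, _ = preprocessing.create_datasets(data_dir, normal_count,
                                                   anomaly_count)
    train_x, test_x, train_y, test_y = undersample
    clf = SVC(kernel=kernel, degree=degree, C=C_val)
    clf.fit(train_x, train_y)
    return clf, recall_score(test_y, clf.predict(test_x), pos_label=1)
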
def PCA_(undersample_amount, data_dir, plot):
    train_x, test_x, train_y, test_y = preprocessing.create_datasets(
        data_dir, undersample_amount, undersample_amount)[0]
    pca = PCA(n_components=2)
    finalDf = plot_PCA(train_x, train_y, pca, plot)
    pca = PCA(n_components=10)
    principalComponents = pca.fit_transform(train_x)
    variance_explained = pca.explained_variance_ratio_
    #print('First 10 Principal Components Variance Explained As Follows Respectively: {}'.format(variance_explained))

    # the first two principal components consistently (for every stratified split sample) explain about 75% or more of the variance,
    # while the first component alone explains about 65% of it
    return (finalDf, test_x, test_y)
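
# Hedged illustration supporting the comment above: the share of variance
# captured by the leading principal components can be checked directly with
# the cumulative sum of explained_variance_ratio_. This helper is a sketch,
# not part of the original module.
def cumulative_variance(train_x, n_components=10):
    pca = PCA(n_components=n_components)
    pca.fit(train_x)
    return np.cumsum(pca.explained_variance_ratio_)
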
Example #4
import nmf1
import nmf2
from complement import *
from preprocessing import create_datasets
from metric import evaluate
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':

    dataset1, dataset2, dataset3, dataset4, dataset5 = create_datasets()
    #print(dataset1)
    print('************datasets were created!!***************')

    R_train1 = np.array(dataset1[0])
    R_test1 = np.array(dataset1[1])
    R_train2 = np.array(dataset2[0])
    R_test2 = np.array(dataset2[1])
    R_train3 = np.array(dataset3[0])
    R_test3 = np.array(dataset3[1])
    R_train4 = np.array(dataset4[0])
    R_test4 = np.array(dataset4[1])
    R_train5 = np.array(dataset5[0])
    R_test5 = np.array(dataset5[1])
    #print(R_train1,R_test1)
    K = 19 # number of latent features
    # initialize P and Q with random values
    P = np.random.rand(R_train1.shape[0], K)
    Q = np.random.rand(R_train1.shape[1], K)
    print(R_test1.shape[0], R_test1.shape[1])
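
    # Hedged illustration (not taken from the nmf1/nmf2 modules, whose
    # internals are not shown here): with P (users x K) and Q (items x K)
    # initialized above, the rating matrix is approximated as R ~ P @ Q.T.
    # A few Lee-Seung style multiplicative updates for the non-negative
    # factorization would look like this:
    eps = 1e-9  # guard against division by zero
    for _ in range(10):  # illustrative number of iterations
        Q *= (R_train1.T @ P) / (Q @ (P.T @ P) + eps)
        P *= (R_train1 @ Q) / (P @ (Q.T @ Q) + eps)
    print('reconstruction error:', np.linalg.norm(R_train1 - P @ Q.T))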
def Logistic_Regression_Selection(sample_times, undersample_amount, data_dir):
    c_bank = [0.001, 0.01, 0.1, 1, 10]
    penalties = ['l1', 'l2']
    results_matrix_train = np.zeros((len(c_bank), len(penalties)))
    results_matrix_validation = np.zeros((len(c_bank), len(penalties)))
    results_matrix_large_test = np.zeros((len(c_bank), len(penalties)))

    for sample_count in range(sample_times):
        undersample, test_set = preprocessing.create_datasets(
            data_dir, undersample_amount, undersample_amount)
        X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = undersample
        X_test, y_test = test_set
        for c in c_bank:
            for regulizer in penalties:
                # 'liblinear' supports both the 'l1' and 'l2' penalties
                log_reg = LogisticRegression(C=c, penalty=regulizer,
                                             solver='liblinear')
                log_reg.fit(X_train_undersample,
                            y_train_undersample.values.ravel())
                # train set
                y_pred_undersample = log_reg.predict(X_train_undersample)
                recall_train = np.round(recall_score(
                    y_train_undersample.values, y_pred_undersample),
                                        decimals=4)
                # validation set
                y_pred_undersample = log_reg.predict(X_test_undersample)
                recall_test = np.round(recall_score(y_test_undersample.values,
                                                    y_pred_undersample),
                                       decimals=4)
                # large test
                y_pred_undersample = log_reg.predict(X_test)
                recall_large_test = np.round(recall_score(
                    y_test.values, y_pred_undersample),
                                             decimals=4)
                #print("------------------------------------")
                #print('Sample Number {}: C-value {}, Regularizer {}, has Training Recall: {}'.format(sample_count, c, regulizer, recall_train))
                #print('Sample Number {}: C-value {}, Regularizer {}, has Validation Recall: {}'.format(sample_count, c, regulizer, recall_test))
                #print('Sample Number {}: C-value {}, Regularizer {}, has Validation Recall: {}'.format(sample_count, c, regulizer, recall_large_test))
                #print("------------------------------------")
                results_matrix_train[c_bank.index(c)][penalties.index(
                    regulizer)] += recall_train
                results_matrix_validation[c_bank.index(c)][penalties.index(
                    regulizer)] += recall_test
                results_matrix_large_test[c_bank.index(c)][penalties.index(
                    regulizer)] += recall_large_test

    results_matrix_train = results_matrix_train / sample_times
    results_matrix_validation = results_matrix_validation / sample_times
    results_matrix_large_test = results_matrix_large_test / sample_times
    final_c = []
    final_reg = []

    result = np.where(results_matrix_train == np.amax(results_matrix_train))
    #print("------------------------------------")
    #print("Best Average Training Logistic Regression Recall: {}".format(np.amax(results_matrix_train)))
    #print("With C value: {}".format(c_bank[result[0][0]]))
    #print("With Penalty type: {}".format(penalties[result[1][0]]))
    #print("------------------------------------")
    final_c.append(c_bank[result[0][0]])
    final_reg.append(penalties[result[1][0]])

    result = np.where(
        results_matrix_validation == np.amax(results_matrix_validation))
    #print("------------------------------------")
    #print("Best Average Validation Logistic Regression Recall: {}".format(np.amax(results_matrix_validation)))
    #print("With C value: {}".format(c_bank[result[0][0]]))
    #print("With Penalty type: {}".format(penalties[result[1][0]]))
    #print("------------------------------------")
    final_c.append(c_bank[result[0][0]])
    final_reg.append(penalties[result[1][0]])

    result = np.where(
        results_matrix_large_test == np.amax(results_matrix_large_test))
    #print("------------------------------------")
    #print("Best Average Training Logistic Regression Recall: {}".format(np.amax(results_matrix_large_test)))
    #print("With C value: {}".format(c_bank[result[0][0]]))
    #print("With Penalty type: {}".format(penalties[result[1][0]]))
    #print("------------------------------------")
    final_c.append(c_bank[result[0][0]])
    final_reg.append(penalties[result[1][0]])

    final_c = Counter(final_c).most_common(1)[0]
    final_reg = Counter(final_reg).most_common(1)[0]

    ##print("Best Overall C value: {}, Best Overall Regularizer: {}".format(final_c[0], final_reg[0]))
    ##print("------------------------------------")

    log_reg = LogisticRegression(C=float(final_c[0]),
                                 penalty=str(final_reg[0]),
                                 solver='liblinear')
    log_reg.fit(X_train_undersample, y_train_undersample.values.ravel())
    # train set
    y_pred_undersample = log_reg.predict(X_train_undersample)
    recall_train = np.round(recall_score(y_train_undersample.values,
                                         y_pred_undersample),
                            decimals=4)
    # validation set
    y_pred_undersample = log_reg.predict(X_test_undersample)
    recall_test = np.round(recall_score(y_test_undersample.values,
                                        y_pred_undersample),
                           decimals=4)
    # large test
    y_pred_undersample = log_reg.predict(X_test)
    recall_large_test = np.round(recall_score(y_test.values,
                                              y_pred_undersample),
                                 decimals=4)

    ##print("------------------------------------")
    ##print("------------------------------------")
    ##print("------------------------------------")
    ##print('Logistic Regression Test Set Recall: {}'.format(recall_large_test))
    ##print('Logistic Regression Test Set Confusion Matrix:')
    ##print(pd.DataFrame(confusion_matrix(y_test.values,y_pred_undersample,labels=[0,1]), index=['true:0', 'true:1'], columns=['pred:0', 'pred:1']))
    return (final_c[0], final_reg[0])
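
# Hedged usage sketch (an assumption, not in the original file): the function
# above returns the (C, penalty) pair that wins most often across the averaged
# recall matrices, so a final classifier can be refit with it.
def fit_selected_logistic_regression(sample_times, undersample_amount, data_dir):
    C_val, penalty = Logistic_Regression_Selection(sample_times,
                                                   undersample_amount, data_dir)
    undersample, test_set = preprocessing.create_datasets(
        data_dir, undersample_amount, undersample_amount)
    X_train, X_val, y_train, y_val = undersample
    X_test, y_test = test_set
    clf = LogisticRegression(C=C_val, penalty=penalty, solver='liblinear')
    clf.fit(X_train, y_train.values.ravel())
    return clf, recall_score(y_test.values, clf.predict(X_test))
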
Example #6
import numpy as np
from preprocessing import create_datasets, create_generators
from extracting import extract_features
from model import create_model, get_conv_base
from visualizing import display_progress

base_dir: str = '/Users/Jan/developer/ML/dogs/dataset'

train, test, val = create_datasets(base_dir)

conv_base = get_conv_base()

train_features, train_labels = extract_features(conv_base, train, 2000)
val_features, val_labels = extract_features(conv_base, val, 1000)
test_features, test_labels = extract_features(conv_base, test, 1000)

train_features = np.reshape(train_features, (2000, 4 * 4 * 512))
val_features = np.reshape(val_features, (1000, 4 * 4 * 512))
test_features = np.reshape(test_features, (1000, 4 * 4 * 512))

model = create_model()
model.summary()

p: str = '/Users/Jan/Developer/ML/dogs/dataset'

train_generator, val_generator = create_generators(f'{p}/train', f'{p}/val',
                                                   f'{p}/test')

history = model.fit_generator(train_generator,
                              steps_per_epoch=100,
                              epochs=30,
                              # the original snippet is cut off here; closing the
                              # call with the validation generator is an assumption
                              validation_data=val_generator)
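
# Hedged sketch (an assumption, not the repo's create_model): the reshaped
# (4 * 4 * 512) feature vectors extracted above could also feed a small densely
# connected head trained directly on those features, assuming a tf.keras stack.
from tensorflow.keras import layers, models

def build_feature_classifier(input_dim=4 * 4 * 512):
    clf = models.Sequential()
    clf.add(layers.Dense(256, activation='relu', input_dim=input_dim))
    clf.add(layers.Dropout(0.5))
    clf.add(layers.Dense(1, activation='sigmoid'))
    clf.compile(optimizer='rmsprop',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return clf

# clf = build_feature_classifier()
# clf.fit(train_features, train_labels, epochs=30, batch_size=20,
#         validation_data=(val_features, val_labels))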
Example #7
import nmf1
import nmf2
from complement import *
from preprocessing import create_datasets
from metric import evaluate
import numpy as np
import matplotlib.pyplot as plt

if __name__ == '__main__':

    dataset1, dataset2, dataset3, dataset4, dataset5 = create_datasets()

    print('************datasets were created!!***************')

    R_train1 = np.array(dataset1[0])
    R_test1 = np.array(dataset1[1])
    R_train2 = np.array(dataset2[0])
    R_test2 = np.array(dataset2[1])
    R_train3 = np.array(dataset3[0])
    R_test3 = np.array(dataset3[1])
    R_train4 = np.array(dataset4[0])
    R_test4 = np.array(dataset4[1])
    R_train5 = np.array(dataset5[0])
    R_test5 = np.array(dataset5[1])

    K = 19 # number of latent features

    # initialize P and Q with random values
    P = np.random.rand(R_train1.shape[0], K)
    Q = np.random.rand(R_train1.shape[1], K)
    iteration = args.iteration
    settings = f'dim{dim}--layer_hidden{layer_hidden}--layer_output{layer_output}--lr{lr}--lr_decay{lr_decay}--decay_interval{decay_interval}--batch{batch_train}'
    print(settings)

    # Slack Message
    url = 'http://xxx.xxx.xxx'
    message = {"text": f"GNN train start"}
    message_ = {"text": f"GNN train end.)"}
    requests.post(url, data=json.dumps(message))

    # Preprocess datasets
    print('Creating datasets from molecular graphs.')
    print('The training set is split into subsets based on K-fold CV.')
    print('Just a moment......')
    datasets_train, datasets_valid, dataset_test, N_fingerprints, valid_indexes = pp.create_datasets(
        train_path, test_path, radius, task, device)

    # Make a directory for saving results
    os.mkdir(f'{args.date}')
    ### Training and Prediction ###
    for a in range(5):
        dataset_train = datasets_train[a]
        dataset_valid = datasets_valid[a]
        print('-' * 100)
        print('The preprocess has finished!')
        print('# of training data allocations:', a)
        print('# of training data samples:', len(dataset_train))
        print('# of development data samples:', len(dataset_valid))
        print('# of test data samples:', len(dataset_test))