def plot_raw_data(df, NUM_SAMPLES_PER_CLASS):
    userids = create_userids(df)
    for userid in userids:
        print(userid)
        # Select this user's rows and drop the label column
        user_data = df.loc[df.iloc[:, -1].isin([userid])]
        user_data = user_data.drop(user_data.columns[-1], axis=1)
        user_array = user_data.values[0:NUM_SAMPLES_PER_CLASS, :]

        plt.clf()
        plt.xlabel('Time')
        plt.title("User " + str(userid))
        for row in user_array:
            plt.plot(row)
        output_file = str(userid) + '.png'
        print(output_file)
        plt.savefig(st.OUTPUT_FIGURES + "/" + output_file)
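
# create_userids() is referenced throughout this file but defined elsewhere.
# The sketch below is a minimal stand-in inferred from the call sites in this
# file, assuming the user label lives in the last column of the frame; the real
# helper may order or filter the ids differently.
def create_userids(df):
    # Distinct user labels from the last column, in a stable sorted order
    return sorted(df.iloc[:, -1].unique())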
def normalize_users_columns(df, norm_type):
    """Normalize each user's feature columns independently, then reassemble the frame."""
    print(df.shape)
    userids = create_userids(df)
    X_parts = []
    y_parts = []
    for userid in userids:
        user_array = df.loc[df.iloc[:, -1].isin([userid])].values
        user_X = user_array[:, 0:-1]
        user_y = user_array[:, -1]
        # Normalization is fit per user, not globally
        if norm_type == st.NormalizationType.MINMAX:
            user_X = MinMaxScaler().fit_transform(user_X)
        elif norm_type == st.NormalizationType.ZSCORE:
            user_X = preprocessing.scale(user_X)
        X_parts.append(user_X)
        y_parts.append(user_y)
    df = pd.DataFrame(np.vstack(X_parts))
    df['user'] = np.concatenate(y_parts)
    return df
def plot_user_dx_dy_histo(df):
    set_style()
    userids = create_userids(df)
    for userid in userids:
        print(userid)
        # Select this user's rows and drop the label column
        user_data = df.loc[df.iloc[:, -1].isin([userid])]
        user_data = user_data.drop(user_data.columns[-1], axis=1)
        # The first 128 columns hold dx values, the next 128 hold dy values
        user_dx = user_data.iloc[:, 0:128]
        user_dy = user_data.iloc[:, 128:256]

        plt.clf()
        sns.distplot(user_dx.values.ravel(), norm_hist=True, color='green', bins=32)
        plt.xlabel('Bins')
        plt.ylabel('Density')
        plt.title('dx histogram')
        output_file = str(userid) + '_dx.png'
        print(output_file)
        plt.savefig(st.OUTPUT_FIGURES + "/" + output_file)

        plt.clf()
        sns.distplot(user_dy.values.ravel(), norm_hist=True, color='red', bins=32)
        plt.xlabel('Bins')
        plt.ylabel('Density')
        plt.title('dy histogram')
        output_file = str(userid) + '_dy.png'
        plt.savefig(st.OUTPUT_FIGURES + "/" + output_file)
def evaluate_authentication(df, verbose=False):
    print(df.shape)
    userids = create_userids(df)
    auc_list = list()
    eer_list = list()
    global_positive_scores = list()
    global_negative_scores = list()
    for userid in userids:
        # Select this user's data for training and drop the label column
        user_train_data = df.loc[df.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)
        user_array = user_train_data.values
        num_samples = user_array.shape[0]
        train_samples = int(num_samples * 0.66)
        user_train = user_array[0:train_samples, :]
        user_test = user_array[train_samples:num_samples, :]

        # All other users' data serve as negative (impostor) test samples
        other_users_data = df.loc[~df.iloc[:, -1].isin([userid])]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)
        other_users_array = other_users_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)

        # Aggregate scores in place over sliding blocks of AGGREGATE_BLOCK_NUM
        # samples; work on copies so the raw scores collected below stay
        # unaggregated
        y_pred_positive = positive_scores.copy()
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        y_pred_negative = negative_scores.copy()
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer = compute_AUC_EER(y_pred_positive, y_pred_negative)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ", " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    if verbose:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores, global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
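
# compute_AUC_EER() is defined elsewhere; the sketch below is a minimal
# stand-in for the two-value variant used above, assuming genuine scores rank
# higher than impostor scores. The four-value variant used later in this file
# would additionally return the (fpr, tpr) arrays.
from scipy.interpolate import interp1d
from scipy.optimize import brentq
from sklearn.metrics import roc_curve, auc as compute_auc

def compute_AUC_EER(positive_scores, negative_scores):
    labels = np.concatenate([np.ones(len(positive_scores)),
                             np.zeros(len(negative_scores))])
    scores = np.concatenate([positive_scores, negative_scores])
    fpr, tpr, _ = roc_curve(labels, scores)
    # EER: the operating point where the false positive rate equals 1 - TPR
    eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    return compute_auc(fpr, tpr), eer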
def evaluate_authentication_cross_day(df1, df2, verbose=False):
    print("Session 1 shape: " + str(df1.shape))
    print("Session 2 shape: " + str(df2.shape))
    userids = create_userids(df1)
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for userid in userids:
        # Train on session 1; the same user's session 2 data are the positives
        user_session1_data = df1.loc[df1.iloc[:, -1].isin([userid])]
        user_session1_data = user_session1_data.drop(user_session1_data.columns[-1], axis=1)
        user_session1_array = user_session1_data.values

        user_session2_data = df2.loc[df2.iloc[:, -1].isin([userid])]
        user_session2_data = user_session2_data.drop(user_session2_data.columns[-1], axis=1)
        user_session2_array = user_session2_data.values

        # Other users' session 2 data are the negatives
        other_users_session2_data = df2.loc[~df2.iloc[:, -1].isin([userid])]
        other_users_session2_data = other_users_session2_data.drop(other_users_session2_data.columns[-1], axis=1)
        other_users_session2_array = other_users_session2_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_session1_array)
        positive_scores = clf.score_samples(user_session2_array)
        negative_scores = clf.score_samples(other_users_session2_array)

        # Aggregate scores in place over sliding blocks; work on copies so the
        # raw scores collected below stay unaggregated
        y_pred_positive = positive_scores.copy()
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        y_pred_negative = negative_scores.copy()
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer = compute_AUC_EER(y_pred_positive, y_pred_negative)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    if verbose:
        global_auc, global_eer = compute_AUC_EER(global_positive_scores, global_negative_scores)
        print("Global AUC: " + str(global_auc))
        print("Global EER: " + str(global_eer))
    return auc_list, eer_list
def evaluate_authentication_train_test(df_train, df_test, data_type, num_blocks,
                                       representation_type, verbose=False,
                                       roc_data=False, roc_data_filename=TEMP_NAME):
    print("Training: " + str(df_train.shape))
    print("Testing: " + str(df_test.shape))
    userids = create_userids(df_train)
    auc_list = list()
    eer_list = list()
    global_positive_scores = list()
    global_negative_scores = list()
    for userid in userids:
        # Select this user's training and positive test data; drop the labels
        user_train_data = df_train.loc[df_train.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)
        user_test_data = df_test.loc[df_test.iloc[:, -1].isin([userid])]
        user_test_data = user_test_data.drop(user_test_data.columns[-1], axis=1)
        # Other users' test data serve as negative samples
        other_users_data = df_test.loc[~df_test.iloc[:, -1].isin([userid])]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train_data)
        positive_scores = clf.score_samples(user_test_data)
        negative_scores = clf.score_samples(other_users_data)

        # Aggregate scores in place over sliding blocks of num_blocks samples;
        # work on copies so the raw scores collected below stay unaggregated
        y_pred_positive = positive_scores.copy()
        for j in range(len(positive_scores) - num_blocks + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + num_blocks], axis=0)
        y_pred_negative = negative_scores.copy()
        for j in range(len(negative_scores) - num_blocks + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + num_blocks], axis=0)

        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        if SCORE_NORMALIZATION:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ", " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print("\nNumber of blocks: ", num_blocks)
    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    print("#positives: " + str(len(global_positive_scores)))
    print("#negatives: " + str(len(global_negative_scores)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data:
        roc_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
        roc_df.to_csv(roc_data_filename, index=False)
    print(data_type.value + " Global AUC: " + str(global_auc))
    print(data_type.value + " Global EER: " + str(global_eer))
    return auc_list, eer_list
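
# score_normalization() is toggled by SCORE_NORMALIZATION above but defined
# elsewhere. A plausible sketch, under the assumption that it min-max
# normalizes one user's pooled scores so that score ranges are comparable
# across users before the global AUC/EER is computed; the actual normalization
# used by the original code may differ.
def score_normalization(positive_scores, negative_scores):
    pooled = np.concatenate([positive_scores, negative_scores])
    lo, hi = pooled.min(), pooled.max()
    span = (hi - lo) if hi > lo else 1.0  # guard against constant scores
    return ((np.asarray(positive_scores) - lo) / span,
            (np.asarray(negative_scores) - lo) / span)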
def evaluate_dataset(current_dataset, dataset_amount, num_actions, num_training_actions):
    # filename = FEAT_DIR + '/' + datasetname(current_dataset, dataset_amount, num_training_actions)
    filename1 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/lee_log"
    filename2 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/liu_log"
    """
    filename1 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/mouse_log"
    filename2 = "/home/bwbwchen/temp/mouse_dynamics_balabit_chaoshen_dfl/measurements/liu_log"
    """
    dataset = get_user_data(filename1, filename2)
    df = pd.DataFrame(dataset)
    num_features = int(dataset.shape[1])
    array = dataset.values
    X = array[:, 0:num_features - 1]
    y = array[:, num_features - 1]

    userids = create_userids(current_dataset)
    userids = [1]  # evaluate a single user only

    fpr = {}
    tpr = {}
    roc_auc = {}

    # Balance the classes: use the same number of samples from each
    correct = df.loc[df.iloc[:, -1].isin([1])]
    wrong = df.loc[df.iloc[:, -1].isin([2])]
    numSamples = min(correct.shape[0], wrong.shape[0])

    for i in userids:
        # Select positive samples that belong to the current user
        user_positive_data = df.loc[df.iloc[:, -1].isin([i])]
        user_positive_data = user_positive_data.iloc[np.random.choice(
            user_positive_data.shape[0], numSamples)]
        array_positive = copy.deepcopy(user_positive_data.values)
        array_positive[:, -1] = 1

        # Negative data for the current user
        user_negative_data = select_negatives_from_other_users(dataset, i, numSamples)
        array_negative = copy.deepcopy(user_negative_data.values)
        array_negative[:, -1] = 0

        # Concatenate positive and negative data
        dataset_user = pd.concat(
            [pd.DataFrame(array_positive), pd.DataFrame(array_negative)]).values
        X = dataset_user[:, 0:-1]
        y = dataset_user[:, -1]

        if CURRENT_SPLIT_TYPE == SPLIT_TYPE.RANDOM:
            X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
                X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
            print("random split")
        else:
            X_train, X_validation, y_train, y_validation = keeporder_split(
                X, y, test_size=TEST_SIZE)

        model = RandomForestClassifier(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)

        scores = cross_validate(model, X_train, y_train, cv=25, return_train_score=False)
        cv_accuracy = scores['test_score']
        print("CV Accuracy: %0.2f (+/- %0.2f)" % (cv_accuracy.mean(), cv_accuracy.std() * 2))
        print("validation shape ", X_validation.shape)

        y_predicted = model.predict(X_validation)
        test_accuracy = accuracy_score(y_validation, y_predicted)
        print("Test Accuracy: %0.2f" % test_accuracy, "y_predicted[0]:", y_predicted[0])

        # Save the trained model
        with open('outmodel.pkl', 'wb') as f:
            pickle.dump(model, f)

        fpr[i], tpr[i], thr = evaluate_sequence_of_samples(
            model, X_validation, y_validation, num_actions)
        threshold = -1
        try:
            eer = brentq(lambda x: 1. - x - interp1d(fpr[i], tpr[i])(x), 0., 1.)
            threshold = interp1d(fpr[i], thr)(eer)
        except (ZeroDivisionError, ValueError):
            print("EER computation failed")
        roc_auc[i] = auc(fpr[i], tpr[i])
        print(str(i) + ": " + str(roc_auc[i]) + " threshold: " + str(threshold))
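
# select_negatives_from_other_users() is defined elsewhere; the sketch below is
# inferred from its call sites: draw num_samples rows at random from all users
# other than userid, where the last column holds the user label. Whether the
# original samples with or without replacement is an assumption.
def select_negatives_from_other_users(dataset, userid, num_samples):
    others = pd.DataFrame(dataset)
    others = others.loc[~others.iloc[:, -1].isin([userid])]
    return others.iloc[np.random.choice(others.shape[0], num_samples)]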
def train_model(df, model_name="foo.h5", fcn_filters=128, representation_learning=False): userids = create_userids(df) # print(userids) nbclasses = len(userids) print('number of classes: ' + str(nbclasses)) array = df.values nsamples, nfeatures = array.shape nfeatures = nfeatures - 1 X = array[:, 0:nfeatures] y = array[:, -1] enc = OneHotEncoder() enc.fit(y.reshape(-1, 1)) y = enc.transform(y.reshape(-1, 1)).toarray() X = X.reshape(-1, stt.FEATURES, stt.DIMENSIONS) if (representation_learning == False): X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=stt.RANDOM_STATE) X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.25, random_state=stt.RANDOM_STATE) else: X_train, X_val, y_train, y_val = train_test_split( X, y, test_size=0.2, random_state=stt.RANDOM_STATE) print("Train, validation (and test shapes): ") print(X_train.shape) print(X_val.shape) if (representation_learning == False): print(X_test.shape) mini_batch_size = int(min(X_train.shape[0] / 10, stt.BATCH_SIZE)) if (model_name == "foo.h5"): model_name = stt.MODEL_NAME filepath = stt.TRAINED_MODELS_PATH + "/" + model_name print(filepath) cb, model = build_fcn((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath, fcn_filters) # model.summary() X_train = np.asarray(X_train).astype(np.float32) X_val = np.asarray(X_val).astype(np.float32) # convert to tensorflow dataset train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)) val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)) BATCH_SIZE = mini_batch_size SHUFFLE_BUFFER_SIZE = 100 train_ds = train_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE) val_ds = val_ds.batch(BATCH_SIZE) start_time = time.time() hist = model.fit(train_ds, epochs=stt.EPOCHS, verbose=True, validation_data=val_ds, callbacks=cb) hist_df = pd.DataFrame(hist.history) # plot training curve plot_training(hist, model_name, metrics='loss') plot_training(hist, model_name, metrics='accuracy') duration = time.time() - start_time print("Training duration: " + str(duration / 60)) if (representation_learning == False): X_test = np.asarray(X_test).astype(np.float32) y_true = np.argmax(y_test, axis=1) y_pred = np.argmax(model.predict(X_test), axis=1) accuracy = metrics.accuracy_score(y_true, y_pred) print("Test accuracy: " + str(accuracy)) return model
def train_model(df, model_name="foo.h5"): userids = create_userids(df) nbclasses = len(userids) print(nbclasses) array = df.values nsamples, nfeatures = array.shape nfeatures = nfeatures - 1 X = array[:, 0:nfeatures] y = array[:, -1] enc = OneHotEncoder() enc.fit(y.reshape(-1, 1)) y = enc.transform(y.reshape(-1, 1)).toarray() X = X.reshape(-1, stt.FEATURES, stt.DIMENSIONS) X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=stt.RANDOM_STATE) X_train, X_val, y_train, y_val = train_test_split( X_train, y_train, test_size=0.25, random_state=stt.RANDOM_STATE) print(X_train.shape) print(X_test.shape) print(X_val.shape) mini_batch_size = int(min(X_train.shape[0] / 10, stt.BATCH_SIZE)) if (model_name == "foo.h5"): model_name = stt.MODEL_NAME filepath = stt.TRAINED_MODELS_PATH + "/" + model_name if (stt.MODEL_TYPE == stt.ModelType.FCN): cb, model = build_fcn((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) if (stt.MODEL_TYPE == stt.ModelType.RESNET): cb, model = build_resnet((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) if (stt.MODEL_TYPE == stt.ModelType.MLP): cb, model = build_mlp((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) if (stt.MODEL_TYPE == stt.ModelType.MCDCNN): cb, model = build_mcdcnn((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) if (stt.MODEL_TYPE == stt.ModelType.TLENET): cb, model = build_tlenet((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) if (stt.MODEL_TYPE == stt.ModelType.CNN): cb, model = build_cnn((stt.FEATURES, stt.DIMENSIONS), nbclasses, filepath) # if stt.UPDATE_WEIGHTS == True: # model = set_weights_from_pretrained_model(model) X_train = np.asarray(X_train).astype(np.float32) X_val = np.asarray(X_val).astype(np.float32) # convert to tensorflow dataset train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)) val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)) BATCH_SIZE = mini_batch_size SHUFFLE_BUFFER_SIZE = 100 train_ds = train_ds.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE) val_ds = val_ds.batch(BATCH_SIZE) start_time = time.time() hist = model.fit(train_ds, epochs=stt.EPOCHS, verbose=True, validation_data=val_ds, callbacks=cb) hist_df = pd.DataFrame(hist.history) # save history to csv: hist_csv_file = 'histories/history.csv' with open(hist_csv_file, mode='w') as f: hist_df.to_csv(f) duration = time.time() - start_time print("Training duration: " + str(duration / 60)) # EVALUATION X_test = np.asarray(X_test).astype(np.float32) y_true = np.argmax(y_test, axis=1) y_pred = np.argmax(model.predict(X_test), axis=1) accuracy = metrics.accuracy_score(y_true, y_pred) print(accuracy) return model
def evaluate_dataset(current_dataset, dataset_amount, num_actions, num_training_actions):
    filename = FEAT_DIR + '/' + datasetname(current_dataset, dataset_amount, num_training_actions)
    print(filename)
    dataset = pd.read_csv(filename)
    print(dataset.shape)

    df = pd.DataFrame(dataset)
    num_features = int(dataset.shape[1])
    print("Num features: ", num_features)
    array = dataset.values
    X = array[:, 0:num_features - 1]
    y = array[:, num_features - 1]

    userids = create_userids(current_dataset)
    print(userids)

    # Train a user-specific classifier for each user and evaluate it
    items = userids
    fpr = {}
    tpr = {}
    roc_auc = {}
    for i in userids:
        # Select all positive samples that belong to the current user
        user_positive_data = df.loc[df.iloc[:, -1].isin([i])]
        numSamples = user_positive_data.shape[0]
        array_positive = copy.deepcopy(user_positive_data.values)
        array_positive[:, -1] = 1

        # Negative data for the current user, sampled from the other users
        user_negative_data = select_negatives_from_other_users(dataset, i, numSamples)
        array_negative = copy.deepcopy(user_negative_data.values)
        array_negative[:, -1] = 0

        # Concatenate positive and negative data
        dataset_user = pd.concat(
            [pd.DataFrame(array_positive), pd.DataFrame(array_negative)]).values
        X = dataset_user[:, 0:-1]
        y = dataset_user[:, -1]

        if CURRENT_SPLIT_TYPE == SPLIT_TYPE.RANDOM:
            X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
                X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
        else:
            X_train, X_validation, y_train, y_validation = keeporder_split(
                X, y, test_size=TEST_SIZE)

        model = RandomForestClassifier(random_state=RANDOM_STATE)
        model.fit(X_train, y_train)

        scores = cross_validate(model, X_train, y_train, cv=10, return_train_score=False)
        cv_accuracy = scores['test_score']
        print("CV Accuracy: %0.2f (+/- %0.2f)" % (cv_accuracy.mean(), cv_accuracy.std() * 2))

        y_predicted = model.predict(X_validation)
        test_accuracy = accuracy_score(y_validation, y_predicted)
        print("Test Accuracy: %0.2f" % test_accuracy)

        fpr[i], tpr[i], thr = evaluate_sequence_of_samples(
            model, X_validation, y_validation, num_actions)
        threshold = -1
        try:
            eer = brentq(lambda x: 1. - x - interp1d(fpr[i], tpr[i])(x), 0., 1.)
            threshold = interp1d(fpr[i], thr)(eer)
        except (ZeroDivisionError, ValueError):
            print("EER computation failed")
        roc_auc[i] = auc(fpr[i], tpr[i])
        print(str(i) + ": " + str(roc_auc[i]) + " threshold: " + str(threshold))

    plotROCs(fpr, tpr, roc_auc, items)
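
# keeporder_split() is defined elsewhere; a minimal sketch of what the name and
# call site suggest: an order-preserving split that trains on the earliest
# (1 - test_size) fraction of the samples and validates on the remainder, with
# no shuffling. The exact rounding behavior of the original is an assumption.
def keeporder_split(X, y, test_size=0.25):
    split = int(len(X) * (1.0 - test_size))
    return X[:split], X[split:], y[:split], y[split:]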
def evaluate_authentication(df, data_type, representation_type, verbose=False,
                            roc_data=False, roc_data_filename=TEMP_NAME):
    print(df.shape)
    userids = create_userids(df)
    auc_list = list()
    eer_list = list()
    global_positive_scores = list()
    global_negative_scores = list()
    for userid in userids:
        # Select this user's data for training and drop the label column
        user_train_data = df.loc[df.iloc[:, -1].isin([userid])]
        user_train_data = user_train_data.drop(user_train_data.columns[-1], axis=1)
        user_array = user_train_data.values
        num_samples = user_array.shape[0]
        train_samples = int(num_samples * 0.66) + 1
        test_samples = num_samples - train_samples
        if verbose:
            print(str(userid) + ". #train_samples: " + str(train_samples) +
                  "\t#test_samples: " + str(test_samples))
        user_train = user_array[0:train_samples, :]
        user_test = user_array[train_samples:num_samples, :]

        # Other users' data serve as negative test samples
        other_users_data = df.loc[~df.iloc[:, -1].isin([userid])]
        other_users_data = other_users_data.drop(other_users_data.columns[-1], axis=1)
        other_users_array = other_users_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_train)
        positive_scores = clf.score_samples(user_test)
        negative_scores = clf.score_samples(other_users_array)

        # Aggregate scores in place over sliding blocks; work on copies so the
        # raw scores collected below stay unaggregated
        y_pred_positive = positive_scores.copy()
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        y_pred_negative = negative_scores.copy()
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        if SCORE_NORMALIZATION:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ", " + str(auc) + ", " + str(eer) + "\n")
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    print("#positives: " + str(len(global_positive_scores)))
    print("#negatives: " + str(len(global_negative_scores)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data:
        pd.DataFrame({'FPR': fpr, 'TPR': tpr}).to_csv(roc_data_filename, index=False)
        # Write the per-user AUC/EER values next to the ROC data
        head, _, tail = roc_data_filename.rpartition('/')
        auc_eer_data_filename = head + '/auc_eer_' + tail
        pd.DataFrame({'AUC': auc_list, 'EER': eer_list}).to_csv(auc_eer_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
    return auc_list, eer_list
def evaluate_authentication_cross_day(df1, df2, data_type, representation_type,
                                      verbose=False, roc_data=False,
                                      roc_data_filename=TEMP_NAME):
    print("Session 1 shape: " + str(df1.shape))
    print("Session 2 shape: " + str(df2.shape))
    userids = create_userids(df1)
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for userid in userids:
        # Train on session 1; the same user's session 2 data are the positives
        user_session1_data = df1.loc[df1.iloc[:, -1].isin([userid])]
        user_session1_data = user_session1_data.drop(user_session1_data.columns[-1], axis=1)
        user_session1_array = user_session1_data.values

        user_session2_data = df2.loc[df2.iloc[:, -1].isin([userid])]
        user_session2_data = user_session2_data.drop(user_session2_data.columns[-1], axis=1)
        user_session2_array = user_session2_data.values

        # Other users' session 2 data are the negatives
        other_users_session2_data = df2.loc[~df2.iloc[:, -1].isin([userid])]
        other_users_session2_data = other_users_session2_data.drop(other_users_session2_data.columns[-1], axis=1)
        other_users_session2_array = other_users_session2_data.values

        clf = OneClassSVM(gamma='scale')
        clf.fit(user_session1_array)
        positive_scores = clf.score_samples(user_session2_array)
        negative_scores = clf.score_samples(other_users_session2_array)

        # Aggregate scores in place over sliding blocks; work on copies so the
        # raw scores collected below stay unaggregated
        y_pred_positive = positive_scores.copy()
        for j in range(len(positive_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_positive[j] = np.average(y_pred_positive[j:j + AGGREGATE_BLOCK_NUM], axis=0)
        y_pred_negative = negative_scores.copy()
        for j in range(len(negative_scores) - AGGREGATE_BLOCK_NUM + 1):
            y_pred_negative[j] = np.average(y_pred_negative[j:j + AGGREGATE_BLOCK_NUM], axis=0)

        auc, eer, _, _ = compute_AUC_EER(y_pred_positive, y_pred_negative)
        if SCORE_NORMALIZATION:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data:
        pd.DataFrame({'FPR': fpr, 'TPR': tpr}).to_csv(roc_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
    return auc_list, eer_list
def evaluate_authentication_skilledforgeries(df_genuine, df_forgery, data_type,
                                             representation_type, verbose=False,
                                             roc_data=False, roc_data_filename=TEMP_NAME):
    print("Genuine shape: " + str(df_genuine.shape))
    print("Forgery shape: " + str(df_forgery.shape))
    userids = create_userids(df_genuine)
    global_positive_scores = list()
    global_negative_scores = list()
    auc_list = list()
    eer_list = list()
    for userid in userids:
        user_genuine_data = df_genuine.loc[df_genuine.iloc[:, -1].isin([userid])]
        user_forgery_data = df_forgery.loc[df_forgery.iloc[:, -1].isin([userid])]

        user_genuine_data = user_genuine_data.drop(user_genuine_data.columns[-1], axis=1)
        user_genuine_array = user_genuine_data.values
        num_samples = user_genuine_array.shape[0]
        train_samples = int(num_samples * 0.66)
        # MCYT: train_samples = 15, test_samples = 10
        user_genuine_train = user_genuine_array[0:train_samples, :]
        user_genuine_test = user_genuine_array[train_samples:num_samples, :]

        user_forgery_data = user_forgery_data.drop(user_forgery_data.columns[-1], axis=1)
        user_forgery_array = user_forgery_data.values

        # Train on genuine samples; skilled forgeries serve as negatives
        clf = OneClassSVM(gamma='scale')
        clf.fit(user_genuine_train)
        positive_scores = clf.score_samples(user_genuine_test)
        negative_scores = clf.score_samples(user_forgery_array)

        auc, eer, _, _ = compute_AUC_EER(positive_scores, negative_scores)
        if SCORE_NORMALIZATION:
            positive_scores, negative_scores = score_normalization(positive_scores, negative_scores)
        global_positive_scores.extend(positive_scores)
        global_negative_scores.extend(negative_scores)
        if verbose:
            print(str(userid) + ": " + str(auc) + ", " + str(eer))
        auc_list.append(auc)
        eer_list.append(eer)

    print('AUC mean: %7.4f, std: %7.4f' % (np.mean(auc_list), np.std(auc_list)))
    print('EER mean: %7.4f, std: %7.4f' % (np.mean(eer_list), np.std(eer_list)))
    global_auc, global_eer, fpr, tpr = compute_AUC_EER(global_positive_scores, global_negative_scores)
    filename = 'output_png/scores_' + str(data_type.value) + '_' + str(representation_type.value)
    if SCORES:
        plot_scores(global_positive_scores, global_negative_scores, filename,
                    title='Scores distribution')
    if roc_data:
        pd.DataFrame({'FPR': fpr, 'TPR': tpr}).to_csv(roc_data_filename, index=False)
    print("Global AUC: " + str(global_auc))
    print("Global EER: " + str(global_eer))
    return auc_list, eer_list
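
# plot_scores() is defined elsewhere; a plausible sketch based on its call
# sites: overlaid density histograms of the genuine and impostor score
# distributions, saved under the given filename. The bin count, colors, and
# output format are assumptions.
def plot_scores(positive_scores, negative_scores, filename, title='Scores distribution'):
    plt.clf()
    plt.hist(positive_scores, bins=50, density=True, alpha=0.6, color='green', label='genuine')
    plt.hist(negative_scores, bins=50, density=True, alpha=0.6, color='red', label='impostor')
    plt.xlabel('Score')
    plt.ylabel('Density')
    plt.title(title)
    plt.legend()
    plt.savefig(filename + '.png')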