def get_transformed_datasets(self, transformer: AbstractTransformer):
    """transform data and save it, to avoid doing it over and over again"""
    # large dataset that doesn't fit in memory
    # TODO: fix this with an efficient transformation calculator
    if self.crop_size == 63 or self.crop_size is None:
        warning_txt = "Dataset of image size %s is too large and cannot " \
                      "be previously transformed" % self.crop_size
        warnings.warn(warning_txt)
        return self.get_outlier_detection_datasets()
    # TODO: this could be refactored to a method; the init and final parts
    #  could be refactored into a single method
    transformed_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, '%s_outlier' % transformer.name)
    if os.path.exists(transformed_data_path):
        print('loading pickle')
        return pd.read_pickle(transformed_data_path)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
        self.get_outlier_detection_datasets()
    x_train_transformed, train_transform_inds = \
        transformer.apply_all_transforms(x_train)
    x_val_transformed, val_transform_inds = \
        transformer.apply_all_transforms(x_val)
    x_test_transformed, test_transform_inds = \
        transformer.apply_all_transforms(x_test)
    sets_tuple = ((x_train_transformed, train_transform_inds),
                  (x_val_transformed, val_transform_inds),
                  (x_test_transformed, test_transform_inds))
    utils.save_pickle(sets_tuple, transformed_data_path)
    return sets_tuple
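# The method above follows a compute-once, pickle-cache pattern that recurs
# throughout these loaders: if a pickle already exists at the derived path it
# is loaded, otherwise the result is computed and saved. A minimal,
# self-contained sketch of the pattern (cached_computation is illustrative,
# not part of this codebase):
def cached_computation(save_path, compute_fn):
    import os
    import pickle
    import pandas as pd
    # return the cached result if it exists, otherwise compute and pickle it
    if os.path.exists(save_path):
        return pd.read_pickle(save_path)
    result = compute_fn()
    with open(save_path, 'wb') as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return result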
def get_unsplitted_dataset(self) -> Dataset:
    """get preprocessed dataset, prior to outlier-inlier splitting"""
    # check if preprocessing has already been done
    unsplitted_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, 'unsplitted')
    if os.path.exists(unsplitted_data_path):
        return pd.read_pickle(unsplitted_data_path)
    # params for the HiTS loader; it performs a by-sample 0-1 normalization
    # that I think is not useful, because the data is already in [0, 1]
    params = {
        param_keys.DATA_PATH_TRAIN: self.data_path,
        param_keys.BATCH_SIZE: 0
    }
    # TODO: check the None thing; label_value is None because it is not
    #  used, retrieving both labels
    data_loader = HiTSLoader(
        params, label_value=None,
        first_n_samples_by_class=self.n_samples_by_class,
        test_size=None, validation_size=None,
        channels_to_get=self.used_channels)
    dataset = data_loader.get_single_dataset()
    utils.save_pickle(dataset, unsplitted_data_path)
    return dataset
def get_outlier_detection_datasets(self):
    """get outlier train-val and test sets, by selecting class 0 as
    outliers (bogus in HiTS) and generating a train-val set of only
    inliers, and a test set with half inliers and half outliers"""
    outlier_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, 'outlier')
    if os.path.exists(outlier_data_path):
        return pd.read_pickle(outlier_data_path)
    dataset = self.get_preprocessed_unsplitted_dataset()
    # labels from 5 classes to 0-1 as bogus-real
    bogus_class_indx = 0
    new_labels = (dataset.data_label.flatten() != bogus_class_indx) * 1.0
    print(np.mean(new_labels == dataset.data_label))
    inlier_task = 1
    n_outliers = int(
        np.round(self.test_percentage_all_data * self.n_samples_by_class))
    # separate data into train-val-test
    outlier_indexes = np.where(new_labels != inlier_task)[0]
    np.random.RandomState(seed=self.random_seed).shuffle(outlier_indexes)
    test_outlier_idxs = outlier_indexes[:n_outliers]
    inlier_indexes = np.where(new_labels == inlier_task)[0]  # real == inliers
    val_size_inliers = int(
        np.round(
            np.sum(new_labels == inlier_task) * self.val_inlier_percentage))
    np.random.RandomState(seed=self.random_seed).shuffle(inlier_indexes)
    split_one_inlier_idxs = inlier_indexes[val_size_inliers:]
    val_inlier_idxs = inlier_indexes[:val_size_inliers]
    # train-test inlier indexes
    test_inlier_idxs = split_one_inlier_idxs[:n_outliers]
    train_inlier_idxs = split_one_inlier_idxs[n_outliers:]
    X_train, y_train = dataset.data_array[train_inlier_idxs], new_labels[
        train_inlier_idxs]
    X_val, y_val = dataset.data_array[val_inlier_idxs], new_labels[
        val_inlier_idxs]
    X_test, y_test = np.concatenate([
        dataset.data_array[test_inlier_idxs],
        dataset.data_array[test_outlier_idxs]
    ]), np.concatenate(
        [new_labels[test_inlier_idxs], new_labels[test_outlier_idxs]])
    print('train: ', np.unique(y_train, return_counts=True))
    print('val: ', np.unique(y_val, return_counts=True))
    print('test: ', np.unique(y_test, return_counts=True))
    sets_tuple = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    utils.save_pickle(sets_tuple, outlier_data_path)
    return sets_tuple
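# A toy check of the split logic above, assuming label 0 marks outliers:
# inliers go to train/val, and the test set pairs n_outliers inliers with
# n_outliers outliers, giving the half-half test set the docstring describes.
# (_toy_split_demo and its numbers are illustrative, not from the codebase.)
def _toy_split_demo():
    import numpy as np
    labels = np.array([1, 1, 1, 1, 1, 1, 0, 0])  # 6 inliers, 2 outliers
    outlier_idx = np.where(labels == 0)[0]
    inlier_idx = np.where(labels == 1)[0]
    np.random.RandomState(seed=42).shuffle(inlier_idx)
    n_outliers = len(outlier_idx)
    val_size = 2
    val_idx = inlier_idx[:val_size]
    rest = inlier_idx[val_size:]
    test_inlier_idx = rest[:n_outliers]
    train_idx = rest[n_outliers:]
    # train: 2 inliers, val: 2 inliers, test: 2 inliers + 2 outliers
    print(len(train_idx), len(val_idx), len(test_inlier_idx) + n_outliers)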
def get_preprocessed_unsplitted_dataset(self):
    preproc_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, 'preproc')
    if os.path.exists(preproc_data_path):
        return pd.read_pickle(preproc_data_path)
    dataset = self.get_unsplitted_dataset()
    # preprocessing: normalize to [-1, 1]
    dataset.data_array = 2 * (dataset.data_array /
                              np.max(dataset.data_array)) - 1
    utils.save_pickle(dataset, preproc_data_path)
    return dataset
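# Note that 2 * (x / max(x)) - 1 maps the global maximum to 1 but only maps
# the minimum to -1 when the data minimum is 0, which holds here because the
# HiTS loader comment above says the data is already 0-1 normalized. A quick
# numpy check (_normalization_range_demo is illustrative only):
def _normalization_range_demo():
    import numpy as np
    x = np.array([0.0, 0.25, 0.5, 1.0])
    print(2 * (x / np.max(x)) - 1)  # [-1.  -0.5  0.   1. ]
    # with a nonzero minimum the lower bound -1 is never reached:
    y = np.array([0.5, 1.0])
    print(2 * (y / np.max(y)) - 1)  # [0. 1.], not [-1. 1.]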
def get_outlier_detection_datasets(self):
    """get outlier train-val and test sets, by selecting class 4 as
    outliers (bogus in ZTF) and generating a train-val set of only
    inliers, and a test set with half inliers and half outliers"""
    outlier_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, 'outlier')
    if os.path.exists(outlier_data_path):
        return pd.read_pickle(outlier_data_path)
    dataset = self.get_unsplitted_dataset()
    # labels from 5 classes to 0-1 as bogus-real
    bogus_class_indx = 4
    new_labels = (dataset.data_label.flatten() != bogus_class_indx) * 1.0
    inlier_task = 1
    # separate data into train-val-test
    outlier_indexes = np.where(new_labels != inlier_task)[0]
    inlier_indexes = np.where(new_labels == inlier_task)[0]  # real == inliers
    val_size_inliers = int(np.round(
        np.sum(new_labels == inlier_task) * self.val_inlier_percentage))
    np.random.RandomState(seed=self.random_seed).shuffle(inlier_indexes)
    # large dataset that doesn't fit in memory
    # TODO: fix this with an efficient transformation calculator
    if self.crop_size == 63 or self.crop_size is None:
        inlier_indexes = inlier_indexes[:8000]
        val_size_inliers = 1000
    # train-val inlier indexes
    split_one_inlier_idxs = inlier_indexes[val_size_inliers:]
    val_inlier_idxs = inlier_indexes[:val_size_inliers]
    # train-test inlier indexes
    n_outliers = np.sum(new_labels != inlier_task)
    test_inlier_idxs = split_one_inlier_idxs[:n_outliers]
    train_inlier_idxs = split_one_inlier_idxs[n_outliers:]
    X_train, y_train = dataset.data_array[train_inlier_idxs], new_labels[
        train_inlier_idxs]
    X_val, y_val = dataset.data_array[val_inlier_idxs], new_labels[
        val_inlier_idxs]
    X_test, y_test = np.concatenate(
        [dataset.data_array[test_inlier_idxs],
         dataset.data_array[outlier_indexes]]), np.concatenate(
        [new_labels[test_inlier_idxs], new_labels[outlier_indexes]])
    print('train: ', np.unique(y_train, return_counts=True))
    print('val: ', np.unique(y_val, return_counts=True))
    print('test: ', np.unique(y_test, return_counts=True))
    sets_tuple = ((X_train, y_train), (X_val, y_val), (X_test, y_test))
    utils.save_pickle(sets_tuple, outlier_data_path)
    return sets_tuple
def save_normal_and_transformed_data(
        transformer, normal_data_name='tf2_normal.pkl',
        transformed_data_name='tf2_old_transformed.pkl'):
    save_dir = os.path.join(PROJECT_PATH, 'tests', 'aux_data')
    utils.check_paths(save_dir)
    hits_params = {
        loader_keys.DATA_PATH: os.path.join(
            PROJECT_PATH, '../datasets/HiTS2013_300k_samples.pkl'),
        loader_keys.N_SAMPLES_BY_CLASS: 10000,
        loader_keys.TEST_PERCENTAGE: 0.2,
        loader_keys.VAL_SET_INLIER_PERCENTAGE: 0.1,
        loader_keys.USED_CHANNELS: [0, 1, 2, 3],
        loader_keys.CROP_SIZE: 21,
        general_keys.RANDOM_SEED: 42,
        loader_keys.TRANSFORMATION_INLIER_CLASS_VALUE: 1
    }
    hits_outlier_dataset = HiTSOutlierLoader(hits_params)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
        hits_outlier_dataset.get_outlier_detection_datasets()
    x_train_transform, y_train_transform = transformer.apply_all_transforms(
        x=x_train)
    x_val_transform, y_val_transform = transformer.apply_all_transforms(
        x=x_val)
    x_test_transform, y_test_transform = transformer.apply_all_transforms(
        x=x_test)
    normal_data = ((x_train, y_train), (x_val, y_val), (x_test, y_test))
    transformed_data = ((x_train_transform, y_train_transform),
                        (x_val_transform, y_val_transform),
                        (x_test_transform, y_test_transform))
    utils.save_pickle(normal_data, os.path.join(save_dir, normal_data_name))
    utils.save_pickle(transformed_data,
                      os.path.join(save_dir, transformed_data_name))
def get_unsplitted_dataset(self) -> Dataset:
    """get preprocessed dataset, prior to outlier-inlier splitting"""
    # check if preprocessing has already been done
    unsplitted_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, 'unsplitted')
    if os.path.exists(unsplitted_data_path):
        return pd.read_pickle(unsplitted_data_path)
    # reusing the converted file avoids having FrameToInput redo the
    # DataFrame-to-pickle-dict conversion
    data_path = self.data_path
    unprocessed_unsplitted_data_path = \
        utils.add_text_to_beginning_of_file_path(
            self.data_path, 'unprocessed_unsplitted')
    if os.path.exists(unprocessed_unsplitted_data_path):
        data_path = unprocessed_unsplitted_data_path
    # params for the frame-input ZTF loader and preprocessing
    params = {
        param_keys.DATA_PATH_TRAIN: data_path,
        param_keys.BATCH_SIZE: 0,
        param_keys.CHANNELS_TO_USE: self.used_channels,
        param_keys.TEST_SIZE: 0,  # not used
        param_keys.VAL_SIZE: 0,  # not used
        param_keys.NANS_TO: 0,
        param_keys.CROP_SIZE: self.crop_size,
        param_keys.CONVERTED_DATA_SAVEPATH: unprocessed_unsplitted_data_path
    }
    # instantiate loader, set preprocessing pipeline, load dataset
    data_loader = FrameToInput(params)
    data_loader.dataset_preprocessor.set_pipeline([
        data_loader.dataset_preprocessor.check_single_image,
        data_loader.dataset_preprocessor.clean_misshaped,
        data_loader.dataset_preprocessor.select_channels,
        data_loader.dataset_preprocessor.normalize_by_image,
        data_loader.dataset_preprocessor.nan_to_num,
        data_loader.dataset_preprocessor.crop_at_center
    ])
    dataset = data_loader.get_single_dataset()
    utils.save_pickle(dataset, unsplitted_data_path)
    return dataset
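# set_pipeline above registers an ordered list of preprocessing steps for the
# loader to apply. A minimal sketch of how such a pipeline could be applied,
# assuming each step maps a dataset to a dataset (apply_pipeline is
# illustrative, not the preprocessor's actual API):
def apply_pipeline(dataset, steps):
    # apply each preprocessing step in order, feeding its output forward
    for step in steps:
        dataset = step(dataset)
    return dataset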
def get_transformed_datasets(self, transformer: AbstractTransformer):
    """transform data and save it, to avoid doing it over and over again"""
    transformed_data_path = utils.add_text_to_beginning_of_file_path(
        self.template_save_path, '%s_outlier' % transformer.name)
    if os.path.exists(transformed_data_path):
        return pd.read_pickle(transformed_data_path)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
        self.get_outlier_detection_datasets()
    print('train: ', np.unique(y_train, return_counts=True))
    print('val: ', np.unique(y_val, return_counts=True))
    print('test: ', np.unique(y_test, return_counts=True))
    x_train_transformed, train_transform_inds = \
        transformer.apply_all_transforms(x_train)
    x_val_transformed, val_transform_inds = \
        transformer.apply_all_transforms(x_val)
    x_test_transformed, test_transform_inds = \
        transformer.apply_all_transforms(x_test)
    sets_tuple = ((x_train_transformed, train_transform_inds),
                  (x_val_transformed, val_transform_inds),
                  (x_test_transformed, test_transform_inds))
    utils.save_pickle(sets_tuple, transformed_data_path)
    return sets_tuple
def hits4c_tr18():
    hits_params = {
        loader_keys.DATA_PATH: os.path.join(
            PROJECT_PATH, '../datasets/HiTS2013_300k_samples.pkl'),
        loader_keys.N_SAMPLES_BY_CLASS: 10000,
        loader_keys.TEST_PERCENTAGE: 0.2,
        loader_keys.VAL_SET_INLIER_PERCENTAGE: 0.1,
        loader_keys.USED_CHANNELS: [0, 1, 2, 3],
        loader_keys.CROP_SIZE: 21,
        general_keys.RANDOM_SEED: 42,
        loader_keys.TRANSFORMATION_INLIER_CLASS_VALUE: 1
    }
    data_loader = HiTSOutlierLoader(hits_params)
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
        data_loader.get_outlier_detection_datasets()
    transformer = transformations_tf.KernelTransformer(
        flips=True, gauss=False, log=False)
    mdl = EnsembleOVOTransformODSimpleModel(
        data_loader, transformer=transformer, input_shape=x_train.shape[1:],
        results_folder_name='transform_selection_1')
    mdl.fit(x_train, x_val, train_batch_size=1024, verbose=0)
    train_matrix_scores = mdl.predict_matrix_score(
        x_train, transform_batch_size=1024)
    val_matrix_scores = mdl.predict_matrix_score(
        x_val, transform_batch_size=1024)
    test_outlier_matrix_scores = mdl.predict_matrix_score(
        x_test[y_test == 0], transform_batch_size=1024)
    utils.save_pickle(
        train_matrix_scores,
        os.path.join(
            mdl.main_model_path,
            'train_matrix_scores_translations+flip(18)_train_step.pkl'))
    utils.save_pickle(
        val_matrix_scores,
        os.path.join(
            mdl.main_model_path,
            'val_matrix_scores_translations+flip(18)_train_step.pkl'))
    utils.save_pickle(
        test_outlier_matrix_scores,
        os.path.join(
            mdl.main_model_path,
            'test_matrix_scores_translations+flip(18)_train_step.pkl'))
def test_model_loading(transformer, mdl, loader, dataset_name='hits-4-c',
                       single_class_ind=1, tf_version='tf1',
                       transformer_name='transformed', model_name='resnet',
                       epochs=None):
    results_dir = os.path.join(PROJECT_PATH, 'tests', 'aux_results')
    save_dir = os.path.join(PROJECT_PATH, 'tests', 'aux_data')
    utils.check_path(results_dir)
    utils.check_path(save_dir)
    utils.check_path(os.path.join(results_dir, dataset_name))
    # load-save data
    normal_data_path = os.path.join(
        save_dir,
        'normal_data_%s_%s_loading.pkl' % (dataset_name, tf_version))
    if os.path.exists(normal_data_path):
        (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
            pd.read_pickle(normal_data_path)
    else:
        (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
            loader(return_val=True)
        normal_data = (x_train, y_train), (x_val, y_val), (x_test, y_test)
        utils.save_pickle(normal_data, normal_data_path)
    # compile the model that was passed in
    mdl.compile('adam', 'categorical_crossentropy', ['acc'])
    # select inliers
    x_train = x_train[y_train.flatten() == single_class_ind]
    x_val = x_val[y_val.flatten() == single_class_ind]
    # load-save transformed data
    transformed_data_path = os.path.join(
        save_dir, '%s_data_%s_%s_loading.pkl' % (
            transformer_name, dataset_name, tf_version))
    if os.path.exists(transformed_data_path):
        (x_train_transform_tf1, y_train_transform_tf1), \
        (x_val_transform_tf1, y_val_transform_tf1), \
        (x_test_transform_tf1, y_test_transform_tf1) = \
            pd.read_pickle(transformed_data_path)
    else:
        # transform all data
        y_train_transform_tf1 = np.tile(np.arange(transformer.n_transforms),
                                        len(x_train))
        x_train_transform_tf1 = transformer.transform_batch(
            np.repeat(x_train, transformer.n_transforms, axis=0),
            y_train_transform_tf1)
        y_val_transform_tf1 = np.tile(np.arange(transformer.n_transforms),
                                      len(x_val))
        x_val_transform_tf1 = transformer.transform_batch(
            np.repeat(x_val, transformer.n_transforms, axis=0),
            y_val_transform_tf1)
        y_test_transform_tf1 = np.tile(np.arange(transformer.n_transforms),
                                       len(x_test))
        x_test_transform_tf1 = transformer.transform_batch(
            np.repeat(x_test, transformer.n_transforms, axis=0),
            y_test_transform_tf1)
        transformed_data = ((x_train_transform_tf1, y_train_transform_tf1),
                            (x_val_transform_tf1, y_val_transform_tf1),
                            (x_test_transform_tf1, y_test_transform_tf1))
        utils.save_pickle(transformed_data, transformed_data_path)
    print(x_train.shape)
    print(x_train_transform_tf1.shape)
    print(x_test.shape)
    print(x_test_transform_tf1.shape)
    # train model
    batch_size = 128
    if epochs is None:
        epochs = int(np.ceil(200 / transformer.n_transforms))
    mdl.fit(x=x_train_transform_tf1, y=to_categorical(y_train_transform_tf1),
            batch_size=batch_size, epochs=epochs)
    # evaluate Dirichlet-based normality scores per transform
    scores = np.zeros((len(x_test),))
    matrix_evals = np.zeros(
        (len(x_test), transformer.n_transforms, transformer.n_transforms))
    x_pred_train = mdl.predict(x_train_transform_tf1, batch_size=1024)
    x_pred_test = mdl.predict(x_test_transform_tf1, batch_size=1024)
    print(x_pred_train.shape)
    print(x_pred_test.shape)
    for t_ind in range(transformer.n_transforms):
        ind_x_pred_equal_to_t_ind = np.where(
            y_train_transform_tf1 == t_ind)[0]
        observed_dirichlet = x_pred_train[ind_x_pred_equal_to_t_ind]
        log_p_hat_train = np.log(observed_dirichlet).mean(axis=0)
        alpha_sum_approx = calc_approx_alpha_sum(observed_dirichlet)
        alpha_0 = observed_dirichlet.mean(axis=0) * alpha_sum_approx
        mle_alpha_t = fixed_point_dirichlet_mle(alpha_0, log_p_hat_train)
        ind_x_pred_test_equal_to_t_ind = np.where(
            y_test_transform_tf1 == t_ind)[0]
        x_test_p = x_pred_test[ind_x_pred_test_equal_to_t_ind]
        matrix_evals[:, :, t_ind] += x_test_p
        scores += dirichlet_normality_score(mle_alpha_t, x_test_p)
    scores /= transformer.n_transforms
    matrix_evals /= transformer.n_transforms
    scores_simple = np.trace(matrix_evals, axis1=1, axis2=2)
    scores_entropy = -1 * get_entropy(matrix_evals)
    scores_xH = -1 * get_xH(transformer, matrix_evals)
    labels = y_test.flatten() == single_class_ind
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores, labels=labels,
                      experiment_name='%s-%s-loading-%s' % (
                          model_name, transformer_name, tf_version))
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_simple, labels=labels,
                      experiment_name='%s-%s-simple-loading-%s' % (
                          model_name, transformer_name, tf_version))
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_entropy, labels=labels,
                      experiment_name='%s-%s-entropy-loading-%s' % (
                          model_name, transformer_name, tf_version))
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_xH, labels=labels,
                      experiment_name='%s-%s-xH-loading-%s' % (
                          model_name, transformer_name, tf_version))
    mdl_weights_name = '{}_{}_{}_{}_loading_{}_weights.h5'.format(
        model_name, transformer_name, dataset_name, tf_version,
        get_class_name_from_index(single_class_ind, dataset_name))
    mdl_weights_path = os.path.join(results_dir, dataset_name,
                                    mdl_weights_name)
    mdl.save_weights(mdl_weights_path)
    reset_weights()
    """
    Time test_model_original(transformer, load_hits4c,
                             dataset_name='hits-4-c', tf_version='tf1')
    00:04:31.65
    (0.992217, 0.9895665, 0.99131725, 0.989478125)
    (0.99240075, 0.9900822499999999, 0.99215325, 0.9901300000000001)
    """
    return get_roc_auc(scores, labels), get_roc_auc(scores_simple, labels), \
           get_roc_auc(scores_entropy, labels), get_roc_auc(scores_xH, labels)
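# The np.repeat / np.tile pairing used above builds one (sample, transform)
# row per combination: repeat duplicates each sample n_transforms times
# consecutively, while tile cycles the transform indices, so each repeated
# copy lines up with a distinct transform label. A small numpy check of that
# alignment (_repeat_tile_demo is illustrative only):
def _repeat_tile_demo():
    import numpy as np
    x = np.array([[10], [20]])  # 2 samples
    n_transforms = 3
    x_rep = np.repeat(x, n_transforms, axis=0)
    t_inds = np.tile(np.arange(n_transforms), len(x))
    print(x_rep.flatten())  # [10 10 10 20 20 20]
    print(t_inds)           # [0 1 2 0 1 2]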
def test_model_original(transformer, loader, dataset_name='hits-4-c',
                        single_class_ind=1):
    results_dir = os.path.join(PROJECT_PATH, 'tests', 'aux_results')
    save_dir = os.path.join(PROJECT_PATH, 'tests', 'aux_data')
    utils.check_path(results_dir)
    utils.check_path(save_dir)
    utils.check_path(os.path.join(results_dir, dataset_name))
    # load-save data
    (x_train, y_train), (x_val, y_val), (x_test, y_test) = \
        loader(return_val=True)
    normal_data = (x_train, y_train), (x_val, y_val), (x_test, y_test)
    utils.save_pickle(
        normal_data,
        os.path.join(save_dir,
                     'normal_data_%s_tf1_original.pkl' % dataset_name))
    # create model
    n, k = (10, 4)
    mdl = create_wide_residual_network(
        x_train.shape[1:], transformer.n_transforms, n, k)
    mdl.compile('adam', 'categorical_crossentropy', ['acc'])
    # get inliers of the specific class
    x_train_task = x_train[y_train.flatten() == single_class_ind]
    print(x_train_task.shape)
    # transform inliers
    transformations_inds = np.tile(np.arange(transformer.n_transforms),
                                   len(x_train_task))
    x_train_task_transformed = transformer.transform_batch(
        np.repeat(x_train_task, transformer.n_transforms, axis=0),
        transformations_inds)
    print(x_train_task_transformed.shape)
    # train model
    batch_size = 128
    mdl.fit(x=x_train_task_transformed,
            y=to_categorical(transformations_inds),
            batch_size=batch_size,
            epochs=int(np.ceil(200 / transformer.n_transforms)))
    scores = np.zeros((len(x_test),))
    matrix_evals = np.zeros(
        (len(x_test), transformer.n_transforms, transformer.n_transforms))
    observed_data = x_train_task
    for t_ind in range(transformer.n_transforms):
        observed_dirichlet = mdl.predict(
            transformer.transform_batch(observed_data,
                                        [t_ind] * len(observed_data)),
            batch_size=1024)
        log_p_hat_train = np.log(observed_dirichlet).mean(axis=0)
        alpha_sum_approx = calc_approx_alpha_sum(observed_dirichlet)
        alpha_0 = observed_dirichlet.mean(axis=0) * alpha_sum_approx
        mle_alpha_t = fixed_point_dirichlet_mle(alpha_0, log_p_hat_train)
        x_test_p = mdl.predict(
            transformer.transform_batch(x_test, [t_ind] * len(x_test)),
            batch_size=1024)
        matrix_evals[:, :, t_ind] += x_test_p
        scores += dirichlet_normality_score(mle_alpha_t, x_test_p)
    scores /= transformer.n_transforms
    matrix_evals /= transformer.n_transforms
    scores_simple = np.trace(matrix_evals, axis1=1, axis2=2)
    scores_entropy = -1 * get_entropy(matrix_evals)
    scores_xH = -1 * get_xH(transformer, matrix_evals)
    labels = y_test.flatten() == single_class_ind
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores, labels=labels,
                      experiment_name='transformations')
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_simple, labels=labels,
                      experiment_name='transformations-simple')
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_entropy, labels=labels,
                      experiment_name='transformations-entropy')
    save_results_file(results_dir, dataset_name, single_class_ind,
                      scores=scores_xH, labels=labels,
                      experiment_name='transformations-xH')
    mdl_weights_name = '{}_tf1_original_{}_weights.h5'.format(
        dataset_name,
        get_class_name_from_index(single_class_ind, dataset_name))
    mdl_weights_path = os.path.join(results_dir, dataset_name,
                                    mdl_weights_name)
    mdl.save_weights(mdl_weights_path)
    """
    Time test_model_original(transformer, load_hits4c,
                             dataset_name='hits-4-c')
    00:06:58.37
    (0.9917134999999999, 0.9350055, 0.9872614999999999, 0.94142025)
    (0.9938067500000001, 0.9923547500000001, 0.9931685, 0.992637375)
    (0.9912172499999999, 0.9883357499999998, 0.9909070000000001,
     0.9886706249999999)

    # train only
    Time test_model_original(transformer, load_hits4c,
                             dataset_name='hits-4-c', tf_version='tf1')
    00:03:48.29
    """
    return get_roc_auc(scores, labels), get_roc_auc(scores_simple, labels), \
           get_roc_auc(scores_entropy, labels), get_roc_auc(scores_xH, labels)
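# In both test functions above, matrix_evals[i, :, t] stores the softmax that
# sample i receives under applied transform t, so the trace used for
# scores_simple sums the probability mass assigned to the correct transform:
# samples whose applied transform the model recognizes (inlier-like) get high
# traces. A toy check of that reading (_trace_score_demo and its numbers are
# illustrative only):
def _trace_score_demo():
    import numpy as np
    # one sample, 2 transforms; rows: predicted transform, cols: applied
    inlier_like = np.array([[[0.9, 0.1], [0.2, 0.8]]])   # diagonal-heavy
    outlier_like = np.array([[[0.5, 0.5], [0.5, 0.5]]])  # uninformative
    print(np.trace(inlier_like, axis1=1, axis2=2))   # [1.7]
    print(np.trace(outlier_like, axis1=1, axis2=2))  # [1.0]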