def main(args): """Main function of DVRL for corrupted sample discovery experiment. Args: args: data_name, train_no, valid_no, noise_rate, normalization, network parameters """ # Data loading and sample corruption data_name = args.data_name # The number of training and validation samples dict_no = dict() dict_no['train'] = args.train_no dict_no['valid'] = args.valid_no # Additional noise ratio noise_rate = args.noise_rate # Checkpoint file name checkpoint_file_name = args.checkpoint_file_name # Data loading and label corruption noise_idx = data_loading.load_tabular_data(data_name, dict_no, noise_rate) # noise_idx: ground truth noisy label indices print('Finished data loading.') # Data preprocessing # Normalization methods: 'minmax' or 'standard' normalization = args.normalization # Extracts features and labels. Then, normalizes features x_train, y_train, x_valid, y_valid, x_test, y_test, _ = \ data_loading.preprocess_data(normalization, 'train.csv', 'valid.csv', 'test.csv') print('Finished data preprocess.') # Run DVRL # Resets the graph tf.reset_default_graph() # Network parameters parameters = dict() parameters['hidden_dim'] = args.hidden_dim parameters['comb_dim'] = args.comb_dim parameters['activation'] = tf.nn.relu parameters['iterations'] = args.iterations parameters['layer_number'] = args.layer_number parameters['batch_size'] = args.batch_size parameters['learning_rate'] = args.learning_rate # In this example, we consider a classification problem and we use Logistic # Regression as the predictor model. problem = 'classification' pred_model = linear_model.LogisticRegression(solver='lbfgs') # Flags for using stochastic gradient descent / pre-trained model flags = {'sgd': False, 'pretrain': False} # Initalizes DVRL dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem, pred_model, parameters, checkpoint_file_name, flags) # Trains DVRL dvrl_class.train_dvrl('auc') print('Finished dvrl training.') # Outputs # Data valuation dve_out = dvrl_class.data_valuator(x_train, y_train) print('Finished date valuation.') # Evaluations # Evaluation model eval_model = lightgbm.LGBMClassifier() # 1. Robust learning (DVRL-weighted learning) robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy') print('DVRL-weighted learning performance: ' + str(np.round(robust_perf, 4))) # 2. Performance after removing high/low values _ = dvrl_metrics.remove_high_low(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy', plot=True) # 3. Corrupted sample discovery # If noise_rate is positive value. if noise_rate > 0: # Evaluates corrupted_sample_discovery # and plot corrupted sample discovery results _ = dvrl_metrics.discover_corrupted_sample(dve_out, noise_idx, noise_rate, plot=True)
def main(args): """Main function of DVRL for data valuation experiment. Args: args: data_name, train_no, valid_no, normalization, network parameters, number of examples """ # Data loading and sample corruption data_name = args.data_name # The number of training and validation samples dict_no = dict() dict_no['train'] = args.train_no dict_no['valid'] = args.valid_no # Network parameters parameters = dict() parameters['hidden_dim'] = args.hidden_dim parameters['comb_dim'] = args.comb_dim parameters['iterations'] = args.iterations parameters['activation'] = tf.nn.relu parameters['inner_iterations'] = args.inner_iterations parameters['layer_number'] = args.layer_number parameters['learning_rate'] = args.learning_rate parameters['batch_size'] = args.batch_size parameters['batch_size_predictor'] = args.batch_size_predictor # The number of examples n_exp = args.n_exp # Checkpoint file name checkpoint_file_name = args.checkpoint_file_name # Data loading _ = data_loading.load_tabular_data(data_name, dict_no, 0.0) print('Finished data loading.') # Data preprocessing # Normalization methods: 'minmax' or 'standard' normalization = args.normalization # Extracts features and labels. Then, normalizes features x_train, y_train, x_valid, y_valid, x_test, y_test, col_names = \ data_loading.preprocess_data(normalization, 'train.csv', 'valid.csv', 'test.csv') print('Finished data preprocess.') # Run DVRL # Resets the graph tf.reset_default_graph() keras.backend.clear_session() # Here, we assume a classification problem and we assume a predictor model # in the form of a simple multi-layer perceptron. problem = 'classification' # Predictive model define pred_model = keras.models.Sequential() pred_model.add( keras.layers.Dense(parameters['hidden_dim'], activation='relu')) pred_model.add( keras.layers.Dense(parameters['hidden_dim'], activation='relu')) pred_model.add(keras.layers.Dense(2, activation='softmax')) pred_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) # Flags for using stochastic gradient descent / pre-trained model flags = {'sgd': True, 'pretrain': False} # Initializes DVRL dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem, pred_model, parameters, checkpoint_file_name, flags) # Trains DVRL dvrl_class.train_dvrl('auc') print('Finished dvrl training.') # Outputs # Data valuation dve_out = dvrl_class.data_valuator(x_train, y_train) print('Finished data valuation.') # Evaluations # 1. Data valuation # Data valuation sorted_idx = np.argsort(-dve_out) sorted_x_train = x_train[sorted_idx] # Indices of top n high valued samples print('Indices of top ' + str(n_exp) + ' high valued samples: ' + str(sorted_idx[:n_exp])) print( pd.DataFrame(data=sorted_x_train[:n_exp, :], index=range(n_exp), columns=col_names).head()) # Indices of top n low valued samples print('Indices of top ' + str(n_exp) + ' low valued samples: ' + str(sorted_idx[-n_exp:])) print( pd.DataFrame(data=sorted_x_train[-n_exp:, :], index=range(n_exp), columns=col_names).head()) # 2. Performance after removing high/low values # Here, as the evaluation model, we use LightGBM. eval_model = lightgbm.LGBMClassifier() # Performance after removing high/low values _ = dvrl_metrics.remove_high_low(dve_out, eval_model, x_train, y_train, x_valid, y_valid, x_test, y_test, 'accuracy', plot=True) return
def main(args): """Main function of DVRL with transfer learning for image data. Main function of DVRL for corrupted sample discovery and robust learning with transfer learning for image data. Args: args: data_name, train_no, valid_no, noise_rate, normalization, network parameters """ # Data name (either cifar10 or cifar100) data_name = args.data_name # The number of training and validation samples dict_no = dict() dict_no['train'] = args.train_no dict_no['valid'] = args.valid_no dict_no['test'] = args.test_no # Additional noise ratio noise_rate = args.noise_rate # Checkpoint file name checkpoint_file_name = args.checkpoint_file_name # Data loading and label corruption noise_idx = data_loading.load_image_data(data_name, dict_no, noise_rate) # noise_idx: ground truth noisy label indices print('Finished data loading.') # Extracts features and labels. x_train, y_train, x_valid, y_valid, x_test, y_test = \ data_loading.load_image_data_from_file('train.npz', 'valid.npz', 'test.npz') print('Finished data preprocess.') # Encodes samples # The preprocessing function used on the pre-training dataset is also # applied while encoding the inputs. preprocess_function = applications.inception_v3.preprocess_input input_shape = (299, 299) def encoder_model(architecture='inception_v3', pre_trained_dataset='imagenet', downsample_factor=8): """Returns encoder model. Defines the encoder model to learn the representations for image dataset. In this example, we are considering the InceptionV3 model trained on ImageNet dataset, followed by simple average pooling-based downsampling. Args: architecture: Base architecture of encoder model (e.g. 'inception_v3') pre_trained_dataset: The dataset used to pre-train the encoder model downsample_factor: Downsample factor for the outputs Raises: NameError: Returns name errors if architecture is not 'inception_v3' """ tf_input = layers.Input(shape=(input_shape[0], input_shape[1], 3)) if architecture == 'inception_v3': model = applications.inception_v3.InceptionV3( input_tensor=tf_input, weights=pre_trained_dataset, include_top=False) output_pooled = \ layers.AveragePooling2D((downsample_factor, downsample_factor), strides=(downsample_factor, downsample_factor))(model.output) else: raise NameError('Invalid architecture') return models.Model(model.input, output_pooled) # Encodes training samples enc_x_train = \ data_loading.encode_image(x_train, encoder_model, input_shape, preprocess_function) # Encodes validation samples enc_x_valid = \ data_loading.encode_image(x_valid, encoder_model, input_shape, preprocess_function) # Encodes testing samples enc_x_test = \ data_loading.encode_image(x_test, encoder_model, input_shape, preprocess_function) print('Finished data encoding') # Run DVRL # Resets the graph tf.reset_default_graph() keras.backend.clear_session() # Network parameters parameters = dict() parameters['hidden_dim'] = args.hidden_dim parameters['comb_dim'] = args.comb_dim parameters['activation'] = tf.nn.relu parameters['iterations'] = args.iterations parameters['layer_number'] = args.layer_number parameters['batch_size'] = args.batch_size parameters['learning_rate'] = args.learning_rate parameters['inner_iterations'] = args.inner_iterations parameters['batch_size_predictor'] = args.batch_size_predictor # Defines problem problem = 'classification' # Defines predictive model pred_model = keras.models.Sequential() pred_model.add(keras.layers.Dense(len(set(y_train)), activation='softmax')) pred_model.compile(optimizer='adam', loss='categorical_crossentropy', 
metrics=['accuracy']) # Flags for using stochastic gradient descent / pre-trained model flags = {'sgd': True, 'pretrain': False} # Initalizes DVRL dvrl_class = dvrl.Dvrl(enc_x_train, y_train, enc_x_valid, y_valid, problem, pred_model, parameters, checkpoint_file_name, flags) # Trains DVRL dvrl_class.train_dvrl('accuracy') print('Finished DVRL training.') # Outputs # Data valuation dve_out = dvrl_class.data_valuator(enc_x_train, y_train) print('Finished data valuation.') # Evaluations # Evaluation model eval_model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=2000) # 1. Robust learning (DVRL-weighted learning) robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, enc_x_train, y_train, enc_x_valid, y_valid, enc_x_test, y_test, 'accuracy') print('DVRL-weighted learning performance: ' + str(np.round(robust_perf, 4))) # 2. Performance after removing high/low values _ = dvrl_metrics.remove_high_low(dve_out, eval_model, enc_x_train, y_train, enc_x_valid, y_valid, enc_x_test, y_test, 'accuracy', plot=True) # 3. Corrupted sample discovery # If noise_idx variable exist (explicit indices for noisy sample) # and noise_rate is positive value. if noise_rate > 0: # Evaluates corrupted_sample_discovery # and plot corrupted sample discovery results _ = dvrl_metrics.discover_corrupted_sample(dve_out, noise_idx, noise_rate, plot=True)
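
# Encoding every split through InceptionV3 is the most expensive
# non-training step in the image experiment above, so when iterating on
# DVRL hyperparameters it can pay to cache the encoded features on disk.
# A minimal sketch follows; the cache file naming is a hypothetical
# convention, and only standard NumPy calls are used.
def encode_image_with_cache(x, split_name, encoder_model, input_shape,
                            preprocess_function):
  """Encodes `x` once and reuses the cached features on later runs."""
  import os

  cache_file = 'enc_' + split_name + '.npz'
  if os.path.exists(cache_file):
    # Reuses the previously computed encodings
    return np.load(cache_file)['enc_x']

  # Falls back to the same encoding routine used above
  enc_x = data_loading.encode_image(x, encoder_model, input_shape,
                                    preprocess_function)
  np.savez(cache_file, enc_x=enc_x)
  return enc_x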