Example #1
  def testDvrlPrediction(self):
    """Tests predictions of DVRL."""

    tf.reset_default_graph()

    dvrl_class = dvrl.Dvrl(
        x_train=self.x_train, y_train=self.y_train,
        x_valid=self.x_valid, y_valid=self.y_valid,
        problem=self.problem, pred_model=self.pred_model,
        parameters=self.parameters,
        checkpoint_file_name=self.checkpoint_file_name,
        flags=self.flags)

    dvrl_class.train_dvrl('auc')

    y_test_hat = dvrl_class.dvrl_predictor(x_test=self.x_test)

    self.assertAllEqual([2000, 2], y_test_hat.shape)
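
The [2000, 2] shape indicates that dvrl_predictor returns per-class probabilities for each of the 2000 test samples. A minimal follow-up sketch (with stand-in arrays, since the test fixtures are not shown here) of turning that output into hard labels and an AUC score:

import numpy as np
from sklearn.metrics import roc_auc_score

y_test_hat = np.random.rand(2000, 2)                 # stand-in for the predictor output
y_test_hat /= y_test_hat.sum(axis=1, keepdims=True)  # rows sum to 1, like softmax output
y_test = np.random.randint(0, 2, size=2000)          # stand-in for the true binary labels

y_pred = np.argmax(y_test_hat, axis=1)               # hard class labels
auc = roc_auc_score(y_test, y_test_hat[:, 1])        # AUC scores the positive-class probability
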
Example #2
  def testDvrlDataValuation(self):
    """Tests data valuation of DVRL."""

    tf.reset_default_graph()

    dvrl_class = dvrl.Dvrl(
        x_train=self.x_train, y_train=self.y_train,
        x_valid=self.x_valid, y_valid=self.y_valid,
        problem=self.problem, pred_model=self.pred_model,
        parameters=self.parameters,
        checkpoint_file_name=self.checkpoint_file_name,
        flags=self.flags)

    dvrl_class.train_dvrl('auc')

    dve_out = dvrl_class.data_valuator(
        x_train=self.x_train, y_train=self.y_train)

    self.assertAllEqual([1000,], dve_out.shape)
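
data_valuator returns one scalar value per training sample, hence the [1000] shape. A minimal sketch, with stand-in values, of using those scores to filter a training set by dropping the lowest-valued samples:

import numpy as np

dve_out = np.random.rand(1000)               # stand-in for the data_valuator output
keep = dve_out >= np.quantile(dve_out, 0.2)  # boolean mask: top 80% most valuable samples
# x_train_kept, y_train_kept = x_train[keep], y_train[keep]  # hypothetical filtered set
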
Example #3
def main(args):
    """Main function of DVRL for domain adaptation experiment.

  Args:
    args: train_no, valid_no,
          normalization, network parameters
  """

    # Data loading
    # The number of training and validation samples
    dict_no = dict()
    dict_no['source'] = args.train_no
    dict_no['valid'] = args.valid_no

    # Setting and target store type
    setting = 'train-on-rest'
    target_store_type = 'B'

    # Network parameters
    parameters = dict()
    parameters['hidden_dim'] = args.hidden_dim
    parameters['comb_dim'] = args.comb_dim
    parameters['iterations'] = args.iterations
    parameters['activation'] = tf.nn.tanh
    parameters['layer_number'] = args.layer_number
    parameters['batch_size'] = args.batch_size
    parameters['learning_rate'] = args.learning_rate

    # Checkpoint file name
    checkpoint_file_name = args.checkpoint_file_name

    # Data loading
    data_loading.load_rossmann_data(dict_no, setting, target_store_type)

    print('Finished data loading.')

    # Data preprocessing
    # Normalization methods: 'minmax' or 'standard'
    normalization = args.normalization

    # Extracts features and labels. Then, normalizes features
    x_source, y_source, x_valid, y_valid, x_target, y_target, _ = \
        data_loading.preprocess_data(normalization,
                                     'source.csv', 'valid.csv', 'target.csv')

    print('Finished data preprocessing.')

    # Run DVRL
    # Resets the graph
    tf.reset_default_graph()

    problem = 'regression'
    # Predictor model definition
    pred_model = lightgbm.LGBMRegressor()

    # Flags for using stochastic gradient descent / pre-trained model
    flags = {'sgd': False, 'pretrain': False}
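    # (As used throughout these examples, 'sgd' presumably selects minibatch
    # gradient-descent training of a neural predictor inside DVRL, while
    # 'pretrain' presumably restores a previously saved predictor from the
    # checkpoint; both stay False here since LightGBM models are fit directly.)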

    # Initializes DVRL
    dvrl_class = dvrl.Dvrl(x_source, y_source, x_valid, y_valid, problem,
                           pred_model, parameters, checkpoint_file_name, flags)

    # Trains DVRL
    dvrl_class.train_dvrl('rmspe')

    print('Finished DVRL training.')

    # Outputs
    # Data valuation
    dve_out = dvrl_class.data_valuator(x_source, y_source)

    print('Finished data valuation.')

    # Evaluations
    # Evaluation model
    eval_model = lightgbm.LGBMRegressor()

    # DVRL-weighted learning
    dvrl_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, x_source,
                                             y_source, x_valid, y_valid,
                                             x_target, y_target, 'rmspe')

    # Baseline prediction performance (treat all training samples equally)
    base_perf = dvrl_metrics.learn_with_baseline(eval_model, x_source,
                                                 y_source, x_target, y_target,
                                                 'rmspe')

    print('Finished evaluation.')
    print('DVRL learning performance: ' + str(np.round(dvrl_perf, 4)))
    print('Baseline performance: ' + str(np.round(base_perf, 4)))
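
The 'rmspe' strings above select root mean squared percentage error, the usual Rossmann sales metric. A sketch of the standard definition (assuming dvrl_metrics follows it; zero targets must be masked out to avoid division by zero):

import numpy as np

def rmspe(y_true, y_pred):
    """Root mean squared percentage error over nonzero targets."""
    mask = y_true != 0
    pct_err = (y_true[mask] - y_pred[mask]) / y_true[mask]
    return np.sqrt(np.mean(pct_err ** 2))
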
Example #4
def main(args):
    """Main function of DVRL for data valuation experiment.

  Args:
    args: data_name, train_no, valid_no,
          normalization, network parameters, number of examples
  """
    # Data loading and sample corruption
    data_name = args.data_name

    # The number of training and validation samples
    dict_no = dict()
    dict_no['train'] = args.train_no
    dict_no['valid'] = args.valid_no

    # Network parameters
    parameters = dict()
    parameters['hidden_dim'] = args.hidden_dim
    parameters['comb_dim'] = args.comb_dim
    parameters['iterations'] = args.iterations
    parameters['activation'] = tf.nn.relu
    parameters['inner_iterations'] = args.inner_iterations
    parameters['layer_number'] = args.layer_number
    parameters['learning_rate'] = args.learning_rate
    parameters['batch_size'] = args.batch_size
    parameters['batch_size_predictor'] = args.batch_size_predictor

    # The number of examples
    n_exp = args.n_exp

    # Checkpoint file name
    checkpoint_file_name = args.checkpoint_file_name

    # Data loading
    _ = data_loading.load_tabular_data(data_name, dict_no, 0.0)
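    # (The third argument of load_tabular_data is the noise rate, as in the
    # corrupted sample discovery example below; 0.0 leaves the labels
    # uncorrupted for this data valuation experiment.)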

    print('Finished data loading.')

    # Data preprocessing
    # Normalization methods: 'minmax' or 'standard'
    normalization = args.normalization

    # Extracts features and labels. Then, normalizes features
    x_train, y_train, x_valid, y_valid, x_test, y_test, col_names = \
        data_loading.preprocess_data(normalization, 'train.csv',
                                     'valid.csv', 'test.csv')

    print('Finished data preprocessing.')

    # Run DVRL
    # Resets the graph
    tf.reset_default_graph()
    keras.backend.clear_session()

    # Here, we assume a classification problem and we assume a predictor model
    # in the form of a simple multi-layer perceptron.
    problem = 'classification'
    # Defines the predictive model
    pred_model = keras.models.Sequential()
    pred_model.add(
        keras.layers.Dense(parameters['hidden_dim'], activation='relu'))
    pred_model.add(
        keras.layers.Dense(parameters['hidden_dim'], activation='relu'))
    pred_model.add(keras.layers.Dense(2, activation='softmax'))
    pred_model.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

    # Flags for using stochastic gradient descent / pre-trained model
    flags = {'sgd': True, 'pretrain': False}

    # Initializes DVRL
    dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem,
                           pred_model, parameters, checkpoint_file_name, flags)

    # Trains DVRL
    dvrl_class.train_dvrl('auc')

    print('Finished DVRL training.')

    # Outputs
    # Data valuation
    dve_out = dvrl_class.data_valuator(x_train, y_train)

    print('Finished data valuation.')

    # Evaluations
    # 1. Data valuation
    sorted_idx = np.argsort(-dve_out)
    sorted_x_train = x_train[sorted_idx]

    # Indices of top n high valued samples
    print('Indices of top ' + str(n_exp) + ' high valued samples: ' +
          str(sorted_idx[:n_exp]))
    print(
        pd.DataFrame(data=sorted_x_train[:n_exp, :],
                     index=range(n_exp),
                     columns=col_names).head())

    # Indices of top n low valued samples
    print('Indices of top ' + str(n_exp) + ' low valued samples: ' +
          str(sorted_idx[-n_exp:]))
    print(
        pd.DataFrame(data=sorted_x_train[-n_exp:, :],
                     index=range(n_exp),
                     columns=col_names).head())

    # 2. Performance after removing high/low values
    # Here, as the evaluation model, we use LightGBM.
    eval_model = lightgbm.LGBMClassifier()

    _ = dvrl_metrics.remove_high_low(dve_out,
                                     eval_model,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     y_valid,
                                     x_test,
                                     y_test,
                                     'accuracy',
                                     plot=True)

    return
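
A rough sketch of what the remove-high/low experiment measures (not the dvrl_metrics implementation): retrain the evaluation model after discarding a fraction of the highest- or lowest-valued samples and compare test accuracy. Dropping low-valued samples should hurt performance far less than dropping high-valued ones.

import lightgbm
import numpy as np

def perf_after_removal(dve_out, x_tr, y_tr, x_te, y_te, frac=0.2, drop='low'):
    """Test accuracy after removing a fraction of samples by data value."""
    order = np.argsort(dve_out)          # ascending: lowest-valued samples first
    k = int(frac * len(dve_out))
    if drop == 'low':
        keep = order[k:]                 # discard the k lowest-valued samples
    else:
        keep = order[:len(order) - k]    # discard the k highest-valued samples
    model = lightgbm.LGBMClassifier().fit(x_tr[keep], y_tr[keep])
    return model.score(x_te, y_te)       # mean accuracy on the held-out set
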
Example #5
def main(args):
    """Main function of DVRL for corrupted sample discovery experiment.

  Args:
    args: data_name, train_no, valid_no, noise_rate,
          normalization, network parameters
  """
    # Data loading and sample corruption
    data_name = args.data_name

    # The number of training and validation samples
    dict_no = dict()
    dict_no['train'] = args.train_no
    dict_no['valid'] = args.valid_no

    # Additional noise ratio
    noise_rate = args.noise_rate

    # Checkpoint file name
    checkpoint_file_name = args.checkpoint_file_name

    # Data loading and label corruption
    noise_idx = data_loading.load_tabular_data(data_name, dict_no, noise_rate)
    # noise_idx: ground truth noisy label indices

    print('Finished data loading.')

    # Data preprocessing
    # Normalization methods: 'minmax' or 'standard'
    normalization = args.normalization

    # Extracts features and labels. Then, normalizes features
    x_train, y_train, x_valid, y_valid, x_test, y_test, _ = \
        data_loading.preprocess_data(normalization, 'train.csv',
                                     'valid.csv', 'test.csv')

    print('Finished data preprocessing.')

    # Run DVRL
    # Resets the graph
    tf.reset_default_graph()

    # Network parameters
    parameters = dict()
    parameters['hidden_dim'] = args.hidden_dim
    parameters['comb_dim'] = args.comb_dim
    parameters['activation'] = tf.nn.relu
    parameters['iterations'] = args.iterations
    parameters['layer_number'] = args.layer_number
    parameters['batch_size'] = args.batch_size
    parameters['learning_rate'] = args.learning_rate

    # In this example, we consider a classification problem and we use Logistic
    # Regression as the predictor model.
    problem = 'classification'
    pred_model = linear_model.LogisticRegression(solver='lbfgs')

    # Flags for using stochastic gradient descent / pre-trained model
    flags = {'sgd': False, 'pretrain': False}

    # Initializes DVRL
    dvrl_class = dvrl.Dvrl(x_train, y_train, x_valid, y_valid, problem,
                           pred_model, parameters, checkpoint_file_name, flags)

    # Trains DVRL
    dvrl_class.train_dvrl('auc')

    print('Finished DVRL training.')

    # Outputs
    # Data valuation
    dve_out = dvrl_class.data_valuator(x_train, y_train)

    print('Finished data valuation.')

    # Evaluations
    # Evaluation model
    eval_model = lightgbm.LGBMClassifier()

    # 1. Robust learning (DVRL-weighted learning)
    robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model, x_train,
                                               y_train, x_valid, y_valid,
                                               x_test, y_test, 'accuracy')

    print('DVRL-weighted learning performance: ' +
          str(np.round(robust_perf, 4)))

    # 2. Performance after removing high/low values
    _ = dvrl_metrics.remove_high_low(dve_out,
                                     eval_model,
                                     x_train,
                                     y_train,
                                     x_valid,
                                     y_valid,
                                     x_test,
                                     y_test,
                                     'accuracy',
                                     plot=True)

    # 3. Corrupted sample discovery
    # Runs only if noise_rate is a positive value.
    if noise_rate > 0:
        # Evaluates corrupted sample discovery
        # and plots the discovery results
        _ = dvrl_metrics.discover_corrupted_sample(dve_out,
                                                   noise_idx,
                                                   noise_rate,
                                                   plot=True)
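
What discover_corrupted_sample evaluates, in essence: corrupted labels should receive low data values, so inspecting samples in ascending value order should recover noise_idx quickly. A minimal sketch of that discovery rate (an assumption about the metric's core, not the plotted implementation):

import numpy as np

def discovery_rate(dve_out, noise_idx, inspect_frac):
    """Fraction of ground-truth noisy samples among the lowest-valued ones."""
    n_inspect = int(inspect_frac * len(dve_out))
    inspected = np.argsort(dve_out)[:n_inspect]
    return len(np.intersect1d(inspected, noise_idx)) / len(noise_idx)
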
Example #6
def main(args):
  """Main function of DVRL with transfer learning for image data.

  Main function of DVRL for corrupted sample discovery and robust learning
  with transfer learning for image data.

  Args:
    args: data_name, train_no, valid_no, noise_rate,
          normalization, network parameters
  """
  # Data name (either cifar10 or cifar100)
  data_name = args.data_name

  # The number of training and validation samples
  dict_no = dict()
  dict_no['train'] = args.train_no
  dict_no['valid'] = args.valid_no
  dict_no['test'] = args.test_no

  # Additional noise ratio
  noise_rate = args.noise_rate

  # Checkpoint file name
  checkpoint_file_name = args.checkpoint_file_name

  # Data loading and label corruption
  noise_idx = data_loading.load_image_data(data_name, dict_no, noise_rate)
  # noise_idx: ground truth noisy label indices

  print('Finished data loading.')

  # Extracts features and labels.
  x_train, y_train, x_valid, y_valid, x_test, y_test = \
      data_loading.load_image_data_from_file('train.npz', 'valid.npz',
                                             'test.npz')
  print('Finished data preprocessing.')

  # Encodes samples
  # The preprocessing function used on the pre-training dataset is also
  # applied while encoding the inputs.
  preprocess_function = applications.inception_v3.preprocess_input
  input_shape = (299, 299)

  def encoder_model(architecture='inception_v3', pre_trained_dataset='imagenet',
                    downsample_factor=8):
    """Returns encoder model.

    Defines the encoder model to learn the representations for image dataset.
    In this example, we are considering the InceptionV3 model trained on
    ImageNet dataset, followed by simple average pooling-based downsampling.

    Args:
      architecture: Base architecture of encoder model (e.g. 'inception_v3')
      pre_trained_dataset: The dataset used to pre-train the encoder model
      downsample_factor: Downsample factor for the outputs

    Raises:
      NameError: If architecture is not 'inception_v3'.
    """
    tf_input = layers.Input(shape=(input_shape[0], input_shape[1], 3))
    if architecture == 'inception_v3':
      model = applications.inception_v3.InceptionV3(
          input_tensor=tf_input, weights=pre_trained_dataset, include_top=False)
      output_pooled = \
          layers.AveragePooling2D((downsample_factor, downsample_factor),
                                  strides=(downsample_factor,
                                           downsample_factor))(model.output)
    else:
      raise NameError('Invalid architecture')
    return models.Model(model.input, output_pooled)
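
  # Shape bookkeeping for the encoder above (standard InceptionV3 geometry):
  # a 299x299x3 input yields an 8x8x2048 feature map with include_top=False,
  # and AveragePooling2D(pool=8, stride=8) reduces it to 1x1x2048, i.e. a
  # 2048-dimensional embedding per image.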

  # Encodes training samples
  enc_x_train = \
      data_loading.encode_image(x_train,
                                encoder_model,
                                input_shape,
                                preprocess_function)
  # Encodes validation samples
  enc_x_valid = \
      data_loading.encode_image(x_valid,
                                encoder_model,
                                input_shape,
                                preprocess_function)
  # Encodes testing samples
  enc_x_test = \
      data_loading.encode_image(x_test,
                                encoder_model,
                                input_shape,
                                preprocess_function)

  print('Finished data encoding.')

  # Run DVRL
  # Resets the graph
  tf.reset_default_graph()
  keras.backend.clear_session()

  # Network parameters
  parameters = dict()
  parameters['hidden_dim'] = args.hidden_dim
  parameters['comb_dim'] = args.comb_dim
  parameters['activation'] = tf.nn.relu
  parameters['iterations'] = args.iterations
  parameters['layer_number'] = args.layer_number
  parameters['batch_size'] = args.batch_size
  parameters['learning_rate'] = args.learning_rate
  parameters['inner_iterations'] = args.inner_iterations
  parameters['batch_size_predictor'] = args.batch_size_predictor

  # Defines problem
  problem = 'classification'

  # Defines predictive model
  pred_model = keras.models.Sequential()
  pred_model.add(keras.layers.Dense(len(set(y_train)), activation='softmax'))
  pred_model.compile(optimizer='adam', loss='categorical_crossentropy',
                     metrics=['accuracy'])
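  # (With frozen InceptionV3 features as input, this single softmax layer
  # amounts to multinomial logistic regression on the 2048-dim embeddings;
  # the output width is inferred from the number of distinct labels in
  # y_train.)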

  # Flags for using stochastic gradient descent / pre-trained model
  flags = {'sgd': True, 'pretrain': False}

  # Initializes DVRL
  dvrl_class = dvrl.Dvrl(enc_x_train, y_train, enc_x_valid, y_valid,
                         problem, pred_model, parameters,
                         checkpoint_file_name, flags)

  # Trains DVRL
  dvrl_class.train_dvrl('accuracy')

  print('Finished DVRL training.')

  # Outputs
  # Data valuation
  dve_out = dvrl_class.data_valuator(enc_x_train, y_train)

  print('Finished data valuation.')

  # Evaluations
  # Evaluation model
  eval_model = linear_model.LogisticRegression(solver='lbfgs',
                                               multi_class='auto',
                                               max_iter=2000)

  # 1. Robust learning (DVRL-weighted learning)
  robust_perf = dvrl_metrics.learn_with_dvrl(dve_out, eval_model,
                                             enc_x_train, y_train,
                                             enc_x_valid, y_valid,
                                             enc_x_test, y_test, 'accuracy')

  print('DVRL-weighted learning performance: ' + str(np.round(robust_perf, 4)))

  # 2. Performance after removing high/low values
  _ = dvrl_metrics.remove_high_low(dve_out, eval_model, enc_x_train, y_train,
                                   enc_x_valid, y_valid, enc_x_test, y_test,
                                   'accuracy', plot=True)

  # 3. Corrupted sample discovery
  # Runs only if noise_idx exists (explicit indices of noisy samples)
  # and noise_rate is a positive value.

  if noise_rate > 0:

    # Evaluates corrupted sample discovery
    # and plots the discovery results
    _ = dvrl_metrics.discover_corrupted_sample(dve_out,
                                               noise_idx, noise_rate,
                                               plot=True)