Example #1
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
    """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the test set as the number of labeled training
  points increases, for both PL and AL.
  Caveats: the training method used is sensitive to the ordering of the
    data, so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods.  Pass the class itself,
      not an instance; it is instantiated inside this function.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for data shuffle and other sources of randomness in sampler
      or model training
    warmstart_size: float or int.  float indicates percentage of train data
      to use for initial model
    batch_size: float or int.  float indicates batch size as a percent of
      training data
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  Model must implement fit,
      predict and depending on AL method may also need decision_function.
    confusion: percentage of labels of one class to flip to the other
    active_p: percent of batch to allocate to active learning
    max_points: limit dataset size for preliminary experiments
    standardize_data: whether to standardize the data to zero mean and unit
      variance
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for.  Percent of training data.

  Returns:
    results: dictionary of results for all samplers
    sampler: the instantiated sampler object, returned for debugging
  """

    # TODO(lishal): add option to find best hyperparameter setting first on
    # full dataset and fix the hyperparameter for the rest of the routine
    # This will save computation and also lead to more stable behavior for the
    # test accuracy

    # TODO(lishal): remove mixture parameter and have the mixture be specified
    # as a mixture of samplers strategy
    def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                     **kwargs):
        n_active = int(mixture * N)
        n_passive = N - n_active
        kwargs["N"] = n_active
        kwargs["already_selected"] = already_selected
        batch_AL = sampler.select_batch(**kwargs)
        already_selected = already_selected + batch_AL
        kwargs["N"] = n_passive
        kwargs["already_selected"] = already_selected
        batch_PL = uniform_sampler.select_batch(**kwargs)
        return batch_AL + batch_PL

    # Seed NumPy's global RNG so the shuffle and any sampler/model
    # randomness are reproducible across runs.
    np.random.seed(seed)
    data_splits = [2. / 3, 1. / 6, 1. / 6]

    # 2/3 of data for training
    if max_points is None:
        max_points = len(y)
    train_size = int(min(max_points, len(y)) * data_splits[0])

    # A batch_size below 1 is interpreted as a fraction of train_size.
    if batch_size < 1:
        batch_size = batch_size * train_size
    batch_size = int(batch_size)

    # Use a warm start.
    if warmstart_size < 1:
        # Set seed batch to provide enough samples to get at least 4 per class
        # TODO(lishal): switch to sklearn stratified sampler
        seed_batch = int(warmstart_size * train_size)
    else:
        seed_batch = int(warmstart_size)
    seed_batch = max(seed_batch, 6 * len(np.unique(y)))

    # Split the data.  TODO: switch to an sklearn data splitter?
    indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
        utils.get_train_val_test_splits(X,
                                        y,
                                        max_points,
                                        seed,
                                        confusion,
                                        seed_batch,
                                        split=data_splits))

    # Preprocess data
    if norm_data:
        print("Normalizing data")
        X_train = normalize(X_train)
        X_val = normalize(X_val)
        X_test = normalize(X_test)
    if standardize_data:
        print("Standardizing data")
        scaler = StandardScaler(with_mean=False).fit(X_train)
        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    print("active percentage: {} warmstart batch: {} "
          "batch size: {} confusion: {} seed: {}".format(
              active_p, seed_batch, batch_size, confusion, seed))

    # Initialize samplers
    uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
    sampler = sampler(X_train, y_train, seed)

    results = {}
    data_sizes = []
    accuracy = []
    selected_inds = list(range(seed_batch))

    # If select model is None, use score_model
    same_score_select = False
    if select_model is None:
        select_model = score_model
        same_score_select = True

    n_batches = int(
        np.ceil(
            (train_horizon * train_size - seed_batch) * 1.0 / batch_size)) + 1
    for b in range(n_batches):
        n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
        print("Training model on " + str(n_train) + " datapoints")

        assert n_train == len(selected_inds)
        data_sizes.append(n_train)

        # Sort the selected indices so that the end result matches that of
        # uniform sampling.
        partial_X = X_train[sorted(selected_inds)]
        partial_y = y_train[sorted(selected_inds)]
        score_model.fit(partial_X, partial_y)
        if not same_score_select:
            select_model.fit(partial_X, partial_y)
        acc = score_model.score(X_test, y_test)
        accuracy.append(acc)
        print("Sampler: %s, Accuracy: %.2f%%" %
              (sampler.name, accuracy[-1] * 100))

        n_sample = min(batch_size, train_size - len(selected_inds))
        select_batch_inputs = {
            "model": select_model,
            "labeled": dict(zip(selected_inds, y_train[selected_inds])),
            "eval_acc": accuracy[-1],
            "X_test": X_val,
            "y_test": y_val,
            "y": y_train
        }
        new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                                 selected_inds, **select_batch_inputs)
        selected_inds.extend(new_batch)

        # The sampler must return exactly as many samples as requested and
        # must not re-select already labeled indices; the asserts guard both.
        print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
        assert len(new_batch) == n_sample
        assert len(list(set(selected_inds))) == len(selected_inds)

    # Check that the returned indices are correct and allow mapping back
    # from the training set to the original data.
    assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
    results["accuracy"] = accuracy
    results["selected_inds"] = selected_inds
    results["data_sizes"] = data_sizes
    results["indices"] = indices
    results["noisy_targets"] = y_noise
    return results, sampler
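A minimal usage sketch for this variant, with assumptions flagged inline: the AL_MAPPING import path and the "margin" key mirror the sampling_methods package these examples rely on but are not guaranteed here, and LogisticRegression stands in for any score_model with fit/predict/score.

from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

# Assumed import path for the sampler registry used throughout these examples.
from sampling_methods.constants import AL_MAPPING

X, y = load_digits(return_X_y=True)

results, sampler_state = generate_one_curve(
    X, y,
    sampler=AL_MAPPING["margin"],   # pass the class, not an instance
    score_model=LogisticRegression(max_iter=1000),
    seed=42,
    warmstart_size=0.02,            # float: 2% of the training split
    batch_size=0.02,                # float: label another 2% per round
    confusion=0.0,                  # no label noise
    active_p=1.0,                   # fully active batches, no PL mixture
    standardize_data=True,
    train_horizon=0.5)              # stop at 50% of the training data
print(list(zip(results["data_sizes"], results["accuracy"])))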
Example #2
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       max_points=None):
    """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the test set as the number of labeled training
  points increases, for both PL and AL.
  Caveats: the training method used is sensitive to the ordering of the
    data, so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods.  Pass the class itself,
      not an instance; it is instantiated inside this function.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for data shuffle and other sources of randomness in sampler
      or model training
    warmstart_size: float or int.  float indicates percentage of train data
      to use for initial model
    batch_size: float or int.  float indicates batch size as a percent of
      training data
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  Model must implement fit, predict
      and depending on AL method may also need decision_function.
    max_points: limit dataset size for preliminary experiments

  Returns:
    results: dictionary of results for all samplers
    sampler: the instantiated sampler object, returned for debugging
  """
    def select_batch(sampler, N, already_selected, **kwargs):
        kwargs["N"] = N
        kwargs["already_selected"] = already_selected
        batch_AL = sampler.select_batch(**kwargs)
        return batch_AL

    np.random.seed(seed)
    data_splits = [2. / 3, 1. / 6, 1. / 6]

    # 2/3 of data for training
    if max_points is None:
        max_points = len(y)
    if max_points < 1:
        max_points = int(max_points * len(y))
    else:
        max_points = int(max_points)
    train_size = int(min(max_points, len(y) * data_splits[0]))
    if batch_size < 1:
        batch_size = int(batch_size * train_size)
    else:
        batch_size = int(batch_size)
    if warmstart_size < 1:
        seed_batch = int(warmstart_size * train_size)
    else:
        seed_batch = int(warmstart_size)
    seed_batch = max(seed_batch, 6 * len(np.unique(y)))

    indices, X_train, y_train, X_val, y_val, X_test, y_test = (
        utils.get_train_val_test_splits(X,
                                        y,
                                        max_points,
                                        seed,
                                        seed_batch,
                                        split=data_splits))

    print(" warmstart batch: " + str(seed_batch) + " batch size: " +
          str(batch_size) + " seed: " + str(seed))

    # Initialize samplers
    sampler = sampler(X_train, seed)

    results = {}
    data_sizes = []
    accuracy = []
    selected_inds = list(range(seed_batch))

    # If select model is None, use score_model
    same_score_select = False
    if select_model is None:
        select_model = score_model
        same_score_select = True

    n_batches = int(np.ceil((train_size - seed_batch) * 1.0 / batch_size)) + 1
    for b in range(n_batches):
        n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
        print("Training model on " + str(n_train) + " datapoints")

        assert n_train == len(selected_inds)
        data_sizes.append(n_train)

        # Sort the selected indices so that the end result matches that of
        # uniform sampling.
        partial_X = X_train[sorted(selected_inds)]
        partial_y = y_train[sorted(selected_inds)]
        score_model.fit(partial_X, partial_y)
        if not same_score_select:
            select_model.fit(partial_X, partial_y)
        acc = score_model.score(X_test, y_test)
        accuracy.append(acc)
        print("Sampler: %s, Accuracy: %.2f%%" %
              (sampler.name, accuracy[-1] * 100))

        n_sample = min(batch_size, train_size - len(selected_inds))
        select_batch_inputs = {
            "model": select_model,
            "labeled": dict(zip(selected_inds, y_train[selected_inds])),
            "eval_acc": accuracy[-1],
            "X_test": X_val,
            "y_test": y_val,
            "y": y_train
        }
        new_batch = select_batch(sampler, n_sample, selected_inds,
                                 **select_batch_inputs)
        selected_inds.extend(new_batch)
        print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
        assert len(new_batch) == n_sample
        assert len(list(set(selected_inds))) == len(selected_inds)

    results["accuracy"] = accuracy
    results["selected_inds"] = selected_inds
    results["data_sizes"] = data_sizes
    results["indices"] = indices

    return results, sampler
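The float-versus-int convention shared by warmstart_size, batch_size, and (in this variant) max_points is easy to trip over; a small sketch of the rule, with a helper name of our own choosing rather than anything from the source:

def resolve_count(value, total):
    # Values below 1 are read as fractions of `total`; anything else is an
    # absolute count.  Mirrors the branching inside generate_one_curve.
    return int(value * total) if value < 1 else int(value)

assert resolve_count(0.1, 600) == 60   # float -> 10% of 600
assert resolve_count(25, 600) == 25    # int -> absolute size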
Example #3
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
  """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the test set as the number of labeled training
  points increases, for both PL and AL.
  Caveats: the training method used is sensitive to the ordering of the
    data, so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods.  Pass the class itself,
      not an instance; it is instantiated inside this function.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for data shuffle and other sources of randomness in sampler
      or model training
    warmstart_size: float or int.  float indicates percentage of train data
      to use for initial model
    batch_size: float or int.  float indicates batch size as a percent of
      training data
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  Model must implement fit, predict
      and depending on AL method may also need decision_function.
    confusion: percentage of labels of one class to flip to the other
    active_p: percent of batch to allocate to active learning
    max_points: limit dataset size for preliminary experiments
    standardize_data: whether to standardize the data to zero mean and unit
      variance
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for.  Percent of training data.

  Returns:
    results: dictionary of results for all samplers
    sampler: the instantiated sampler object, returned for debugging
    accuracy: list of test accuracies, one per round
  """

  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    n_passive = N - n_active
    kwargs["N"] = n_active
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    already_selected = list(already_selected) + batch_AL
    kwargs["N"] = n_passive
    kwargs["already_selected"] = already_selected
    batch_PL = uniform_sampler.select_batch(**kwargs)
    return batch_AL + batch_PL

  np.random.seed(seed)

  # Specify the train, validation, and test split

  cifar10 = [8./10, 1./30, 1./15] #Train: 48000, Val: 2000, Test: 10000
  mnist = [29./35, 1./35 , 1./7]  #Train: 58000, Val: 2000, Test: 10000
  medical = [0.48388, 0.06452, 0.4516]
  data_splits = cifar10

  if max_points is None:
    max_points = len(y)
  train_size = int(min(max_points, len(y)) * data_splits[0])
  if batch_size < 1:
    batch_size = int(batch_size * train_size)
  else:
    batch_size = int(batch_size)
  if warmstart_size < 1:
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  # if FLAGS.dataset == "audi":
  #
  #     print("FLAGS DATASET = AUDI")
  #
  #     # Specify train, validation, and test split for the Audi data set
  #
  #     audi_split = [0.5284, 0.07868, 0.39171]
  #
  #     indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
  #         utils.get_train_val_test_splits(X,y,max_points,seed,confusion,
  #                                         seed_batch, split=audi_split))
  #     print('X_train:', X_train.shape)
  #     print('y_train:', y_train.shape)
  #     print('X_val:', X_val.shape)
  #     print('y_val:', y_val.shape)
  #
  #
  #     X_test, y_test = shuffle(X_test, y_test, random_state=0)
  #
  #     print('X_test:', X_test.shape)
  #     print('y_test:', y_test.shape)

  indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
          utils.get_train_val_test_splits(X,y,max_points,seed,confusion,
                                          seed_batch, split=data_splits))

  print('X_train:', X_train.shape)
  print('y_train:', y_train.shape)
  print('X_val:', X_val.shape)
  print('y_val:', y_val.shape)
  print('X_test:', X_test.shape)
  print('y_test:', y_test.shape)

  # Preprocess data
  # if norm_data:
  #   print("Normalizing data")
  #   X_train = normalize(X_train)
  #   X_val = normalize(X_val)
  #   X_test = normalize(X_test)
  # if standardize_data:
  #   print("Standardizing data")
  #   print(X_train.shape)
  #   scaler = StandardScaler().fit(X_train)
  #   X_train = scaler.transform(X_train)
  #   X_val = scaler.transform(X_val)
  #   X_test = scaler.transform(X_test)
  # print("active percentage: " + str(active_p) + " warmstart batch: " +
  #       str(seed_batch) + " batch size: " + str(batch_size) + " confusion: " +
  #       str(confusion) + " seed: " + str(seed))

  # Initialize samplers
  uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
  sampler = sampler(X_train, y_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(np.ceil((train_horizon * train_size - seed_batch) *
                          1.0 / batch_size)) + 1
  print('Number of active learning rounds:', n_batches)
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end results matches that of uniform sampling
    partial_X = X_train[sorted(selected_inds)]
    partial_y = y_train[sorted(selected_inds)]

    partial_X = np.array(partial_X)
    partial_y = np.array(partial_y)

    print('PARTIAL_X')
    print(partial_X.shape)
    print('PARTIAL_Y')
    print(partial_y.shape)

    n_ensembles = 5
    mean_acc = []

    X_Pool_Dropout = X_train
    All_Dropout_Classes = np.zeros(shape=(X_Pool_Dropout.shape[0], 1))  # added for AUDI (-16)
    print('Use trained model for test time dropout')

    for i in range(n_ensembles):
        print('N_ENSEMBLE: '+ str(i+1))

        score_model.build_model(X_val, y_val, X_test, y_test)
        score_model.fit(partial_X, partial_y, X_val, y_val, FLAGS)
        if not same_score_select:
            select_model.fit(partial_X, partial_y)


        # Predictions at test time
        try:
            pred = score_model.predict(X_Pool_Dropout)
        except Exception:
            # pred = model.predict_proba(self.X)
            print('An error occurred during prediction')
            raise  # pred is undefined past this point, so do not continue

        dropout_classes = np.argmax(pred, axis=1)
        print('DROPOUT CLASSES.SHAPE')
        print(dropout_classes.shape)
        dropout_classes = np.array([dropout_classes]).T

        All_Dropout_Classes = np.append(All_Dropout_Classes, dropout_classes, axis=1)

        # Calculate test accuracy as an average of the ensembles
        acc = score_model.score(X_test, y_test, FLAGS)
        mean_acc.append(acc)


    with open('./trained_models/All_Dropout_Classes', 'wb') as fp:
        pickle.dump(All_Dropout_Classes, fp)

    # Calculate mean for each model
    accuracy.append(np.mean(mean_acc))
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name, accuracy[-1]*100))

    with open('./test_accuracy/ResNet_ensemble_varRatio_lr0.0005_batch64' +
              str(seed) + '.json', 'w') as f:
        json.dump(str(accuracy), f)

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                             selected_inds, **select_batch_inputs)
    selected_inds.extend(new_batch)
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  # Check that the returned indices are correct and allow mapping back
  # from the training set to the original data.
  assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  results["noisy_targets"] = y_noise
  return results, sampler, accuracy
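This variant pickles the stacked per-pass predictions (All_Dropout_Classes) but never reduces them in-function, even though its output filename mentions varRatio. A sketch of how variation ratios could be computed from that array, under our assumption that the zero-initialized first column should be dropped first:

import numpy as np

def variation_ratios(all_dropout_classes):
    # Drop the zero-initialized first column left over from np.zeros(...).
    votes = all_dropout_classes[:, 1:].astype(int)  # (n_points, n_ensembles)
    n_ensembles = votes.shape[1]
    ratios = np.empty(votes.shape[0])
    for i, row in enumerate(votes):
        mode_count = np.bincount(row).max()  # size of the majority vote
        # Disagreement in [0, 1): 0 when all passes agree on one class.
        ratios[i] = 1.0 - float(mode_count) / n_ensembles
    return ratios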
Example #4
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
    """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the test set as the number of labeled training
  points increases, for both PL and AL.
  Caveats: the training method used is sensitive to the ordering of the
    data, so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods.  Pass the class itself,
      not an instance; it is instantiated inside this function.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for data shuffle and other sources of randomness in sampler
      or model training
    warmstart_size: float or int.  float indicates percentage of train data
      to use for initial model
    batch_size: float or int.  float indicates batch size as a percent of
      training data
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  Model must implement fit, predict
      and depending on AL method may also need decision_function.
    confusion: percentage of labels of one class to flip to the other
    active_p: percent of batch to allocate to active learning
    max_points: limit dataset size for preliminary experiments
    standardize_data: whether to standardize the data to zero mean and unit
      variance
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for.  Percent of training data.

  Returns:
    results: dictionary of results for all samplers
    sampler: the instantiated sampler object, returned for debugging
    accuracy: list of test accuracies, one per round
    sampling_time_measurement: per-round batch selection times, in seconds
  """
    def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                     **kwargs):
        n_active = int(mixture * N)
        n_passive = N - n_active
        kwargs["N"] = n_active
        kwargs["already_selected"] = already_selected
        batch_AL = sampler.select_batch(**kwargs)
        already_selected = list(already_selected) + batch_AL
        kwargs["N"] = n_passive
        kwargs["already_selected"] = already_selected
        batch_PL = uniform_sampler.select_batch(**kwargs)
        return batch_AL + batch_PL

    np.random.seed(seed)

    # Specify the train, validation, and test split

    cifar10 = [8. / 10, 1. / 30,
               1. / 15]  #Train: 48000, Val: 2000, Test: 10000
    mnist = [29. / 35, 1. / 35, 1. / 7]  #Train: 58000, Val: 2000, Test: 10000
    audi = [0.744, 0.0732, 0.177]
    data_splits = cifar10

    if max_points is None:
        max_points = len(y)
    train_size = int(min(max_points, len(y)) * data_splits[0])
    if batch_size < 1:
        batch_size = int(batch_size * train_size)
    else:
        batch_size = int(batch_size)
    if warmstart_size < 1:
        seed_batch = int(warmstart_size * train_size)
    else:
        seed_batch = int(warmstart_size)
    seed_batch = max(seed_batch, 6 * len(np.unique(y)))

    indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
        utils.get_train_val_test_splits(X,
                                        y,
                                        max_points,
                                        seed,
                                        confusion,
                                        seed_batch,
                                        split=data_splits))
    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)
    print('X_val:', X_val.shape)
    print('y_val:', y_val.shape)
    print('X_test:', X_test.shape)
    print('y_test:', y_test.shape)

    # Preprocess data
    # if norm_data:
    #   print("Normalizing data")
    #   X_train = normalize(X_train)
    #   X_val = normalize(X_val)
    #   X_test = normalize(X_test)
    # if standardize_data:
    #   print("Standardizing data")
    #   print(X_train.shape)
    #   scaler = StandardScaler().fit(X_train)
    #   X_train = scaler.transform(X_train)
    #   X_val = scaler.transform(X_val)
    #   X_test = scaler.transform(X_test)
    # print("active percentage: " + str(active_p) + " warmstart batch: " +
    #       str(seed_batch) + " batch size: " + str(batch_size) + " confusion: " +
    #       str(confusion) + " seed: " + str(seed))

    # Initialize samplers
    uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
    sampler = sampler(X_train, y_train, seed)

    results = {}
    data_sizes = []
    accuracy = []
    selected_inds = list(range(seed_batch))

    # If select model is None, use score_model
    same_score_select = False
    if select_model is None:
        select_model = score_model
        same_score_select = True

    n_batches = int(
        np.ceil(
            (train_horizon * train_size - seed_batch) * 1.0 / batch_size)) + 1
    print('Number of active learning rounds:', n_batches)
    sampling_time_measurement = []
    for b in range(n_batches):
        n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
        print("Training model on " + str(n_train) + " datapoints")

        assert n_train == len(selected_inds)
        data_sizes.append(n_train)

        # Sort the selected indices so that the end result matches that of
        # uniform sampling.
        partial_X = X_train[sorted(selected_inds)]
        partial_y = y_train[sorted(selected_inds)]

        partial_X = np.array(partial_X)
        partial_y = np.array(partial_y)

        print('PARTIAL_X')
        print(partial_X.shape)
        print('PARTIAL_Y')
        print(partial_y.shape)
        print('Histogram of labeled data')
        print(np.histogram(partial_y)[0] / partial_y.shape[0])

        score_model.fit(partial_X, partial_y)
        if not same_score_select:
            select_model.fit(partial_X, partial_y)
        acc = score_model.score(X_test, y_test)
        accuracy.append(acc)
        print("Sampler: %s, Accuracy: %.2f%%" %
              (sampler.name, accuracy[-1] * 100))

        n_sample = min(batch_size, train_size - len(selected_inds))
        select_batch_inputs = {
            "model": select_model,
            "labeled": dict(zip(selected_inds, y_train[selected_inds])),
            "eval_acc": accuracy[-1],
            "X_test": X_val,
            "y_test": y_val,
            "y": y_train
        }
        start = time.time()
        new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                                 selected_inds, **select_batch_inputs)
        selected_inds.extend(new_batch)
        end = time.time()
        execution_time = end - start
        sampling_time_measurement.append(execution_time)
        print('Time elapsed for batch selection: ', execution_time)
        print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
        assert len(new_batch) == n_sample
        assert len(list(set(selected_inds))) == len(selected_inds)

    # Check that the returned indices are correct and allow mapping back
    # from the training set to the original data.
    assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
    results["accuracy"] = accuracy
    results["selected_inds"] = selected_inds
    results["data_sizes"] = data_sizes
    results["indices"] = indices
    results["noisy_targets"] = y_noise
    return results, sampler, accuracy, sampling_time_measurement
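Since this variant also returns per-round selection times, a small sketch of how both curves might be inspected after a run; matplotlib is our assumption, not something the example itself imports:

import matplotlib.pyplot as plt

def plot_curve_with_timing(results, sampling_times):
    # One accuracy value and one timing value are recorded per AL round,
    # so both align with results["data_sizes"].
    fig, (ax_acc, ax_time) = plt.subplots(1, 2, figsize=(9, 3))
    ax_acc.plot(results["data_sizes"], results["accuracy"], marker="o")
    ax_acc.set_xlabel("labeled points")
    ax_acc.set_ylabel("test accuracy")
    ax_time.plot(results["data_sizes"], sampling_times, marker="o")
    ax_time.set_xlabel("labeled points")
    ax_time.set_ylabel("selection time (s)")
    fig.tight_layout()
    plt.show()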
Example #5
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
    """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the test set as the number of labeled training
  points increases, for both PL and AL.
  Caveats: the training method used is sensitive to the ordering of the
    data, so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods.  Pass the class itself,
      not an instance; it is instantiated inside this function.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for data shuffle and other sources of randomness in sampler
      or model training
    warmstart_size: float or int.  float indicates percentage of train data
      to use for initial model
    batch_size: float or int.  float indicates batch size as a percent of
      training data
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  Model must implement fit, predict
      and depending on AL method may also need decision_function.
    confusion: percentage of labels of one class to flip to the other
    active_p: percent of batch to allocate to active learning
    max_points: limit dataset size for preliminary experiments
    standardize_data: whether to standardize the data to zero mean and unit
      variance
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for.  Percent of training data.

  Returns:
    results: dictionary of results for all samplers
    sampler: the instantiated sampler object, returned for debugging
  """
    """
  参数:
  X:训练集
  Y:测试集
  sampler:来自samples_methods的采样类,假定传入的引用和采样器尚未实例化
  score_model:用于对采样器评分的模型。
  seed:用于数据混洗,和采样器或模型训练中其他随机种子相同
  warmstart_size:初始统一的采样示例作为种子数据,可以是整数,可以是浮点数。浮点数表示总训练数据的百分比,整数表示原始数据集大小。
  batch_size:每个批次中要请求的数据数量。 浮点数表示总训练数据的百分比,整数表示原始大小
  select_model:默认为None,在这种情况下,分数模型将用于选择要标记的新数据点。模型必须实现拟合,预测和依赖于AL的方法,可能还需要Decision_function。
  confusion: 一类标签翻转到另一类标签的百分比
  active_p:每批次分配给主动学习百分比
  max_points:初步限制数据集大小
  standardize_data:是否把数据标准化为平均值为0
  norm_data:是否规范化数据。
  train_horizo​​n:绘制曲线的时间长度。
  返回值:
  results:所有采样器的结果字典
  sampler_states:用于调试的采样器对象字典
  """

    # TODO(lishal): add option to find best hyperparameter setting first on
    # full dataset and fix the hyperparameter for the rest of the routine
    # This will save computation and also lead to more stable behavior for the
    # test accuracy

    # TODO(lishal): remove mixture parameter and have the mixture be specified as
    # a mixture of samplers strategy
    def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                     **kwargs):
        n_active = int(mixture * N)
        n_passive = N - n_active
        kwargs["N"] = n_active
        kwargs["already_selected"] = already_selected
        batch_AL = sampler.select_batch(**kwargs)
        already_selected = already_selected + batch_AL
        kwargs["N"] = n_passive
        kwargs["already_selected"] = already_selected
        batch_PL = uniform_sampler.select_batch(**kwargs)
        return batch_AL + batch_PL

    np.random.seed(seed)
    data_splits = [2. / 3, 1. / 6, 1. / 6]

    # 2/3 of data for training
    if max_points is None:
        max_points = len(y)
    train_size = int(min(max_points, len(y)) * data_splits[0])
    if batch_size < 1:
        batch_size = int(batch_size * train_size)
    else:
        batch_size = int(batch_size)
    if warmstart_size < 1:
        # Set seed batch to provide enough samples to get at least 4 per class
        # TODO(lishal): switch to sklearn stratified sampler
        seed_batch = int(warmstart_size * train_size)
    else:
        seed_batch = int(warmstart_size)
    seed_batch = max(seed_batch, 6 * len(np.unique(y)))

    indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
        utils.get_train_val_test_splits(X,
                                        y,
                                        max_points,
                                        seed,
                                        confusion,
                                        seed_batch,
                                        split=data_splits))

    # Preprocess data
    if norm_data:
        print("Normalizing data")
        X_train = normalize(X_train)
        X_val = normalize(X_val)
        X_test = normalize(X_test)
    if standardize_data:
        print("Standardizing data")
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
    print("active percentage: " + str(active_p) + " warmstart batch: " +
          str(seed_batch) + " batch size: " + str(batch_size) +
          " confusion: " + str(confusion) + " seed: " + str(seed))

    # Initialize samplers
    uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
    sampler = sampler(X_train, y_train, seed)

    results = {}
    data_sizes = []
    accuracy = []
    selected_inds = list(range(seed_batch))

    # If select model is None, use score_model
    same_score_select = False
    if select_model is None:
        select_model = score_model
        same_score_select = True

    n_batches = int(
        np.ceil(
            (train_horizon * train_size - seed_batch) * 1.0 / batch_size)) + 1
    for b in range(n_batches):
        n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
        print("Training model on " + str(n_train) + " datapoints")

        assert n_train == len(selected_inds)
        data_sizes.append(n_train)

        # Sort the selected indices so that the end result matches that of
        # uniform sampling.
        partial_X = X_train[sorted(selected_inds)]
        partial_y = y_train[sorted(selected_inds)]
        score_model.fit(partial_X, partial_y)
        if not same_score_select:
            select_model.fit(partial_X, partial_y)
        acc = score_model.score(X_test, y_test)
        accuracy.append(acc)
        print("Sampler: %s, Accuracy: %.2f%%" %
              (sampler.name, accuracy[-1] * 100))

        n_sample = min(batch_size, train_size - len(selected_inds))
        select_batch_inputs = {
            "model": select_model,
            "labeled": dict(zip(selected_inds, y_train[selected_inds])),
            "eval_acc": accuracy[-1],
            "X_test": X_val,
            "y_test": y_val,
            "y": y_train
        }
        new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                                 selected_inds, **select_batch_inputs)
        selected_inds.extend(new_batch)
        print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
        assert len(new_batch) == n_sample
        assert len(list(set(selected_inds))) == len(selected_inds)

    # Check that the returned indices are correct and allow mapping back
    # from the training set to the original data.
    assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
    results["accuracy"] = accuracy
    results["selected_inds"] = selected_inds
    results["data_sizes"] = data_sizes
    results["indices"] = indices
    results["noisy_targets"] = y_noise
    return results, sampler
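All five variants drive their samplers through the same select_batch keyword contract: select_batch(N=..., already_selected=..., **extras) returning a list of fresh indices, plus a name attribute and a constructor over the training pool and a seed (most variants also pass y). A minimal uniform sampler sketch satisfying that contract; this is a stand-in of ours, not the project's AL_MAPPING["uniform"] implementation:

import numpy as np

class UniformSamplerSketch:
    """Picks N not-yet-selected indices uniformly at random, ignoring the
    model-related keyword arguments the driver also passes along."""

    def __init__(self, X, y, seed):
        self.name = "uniform"
        self.n_points = X.shape[0]
        self.rng = np.random.RandomState(seed)

    def select_batch(self, N, already_selected, **kwargs):
        candidates = np.setdiff1d(np.arange(self.n_points),
                                  np.asarray(already_selected, dtype=int))
        # Assumes N <= len(candidates); the driver guarantees this by capping
        # n_sample at train_size - len(selected_inds).
        return self.rng.choice(candidates, size=N, replace=False).tolist()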