# The variants below assume the surrounding active-learning codebase.
# Imports reconstructed for completeness: utils and AL_MAPPING come from
# that repo, and FLAGS (used by one variant) is assumed to be the module's
# absl/gflags FLAGS object.
import json
import pickle
import time

import numpy as np
from sklearn.preprocessing import StandardScaler, normalize

from sampling_methods.constants import AL_MAPPING
from utils import utils


def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
  """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the validation set as the number of training
  data points increases for both PL and AL.
  Caveats: the training method used is sensitive to the sorting of the data,
    so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods; assumes a class reference
      is passed in and the sampler is not yet instantiated.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for the data shuffle and other sources of randomness in
      sampler or model training.
    warmstart_size: float or int.  A float indicates the percentage of train
      data to use for the initial model.
    batch_size: float or int.  A float indicates the batch size as a percent
      of training data.
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  The model must implement fit
      and predict, and depending on the AL method may also need
      decision_function.
    confusion: percentage of labels of one class to flip to the other.
    active_p: percent of batch to allocate to active learning.
    max_points: limit dataset size for preliminary experiments.
    standardize_data: whether to standardize the data to zero mean and unit
      variance.
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for, as a percent of the
      training data.

  Returns:
    results: dictionary of results for all samplers
    sampler_states: dictionary of sampler objects for debugging
  """
  # TODO(lishal): add option to find best hyperparameter setting first on
  # full dataset and fix the hyperparameter for the rest of the routine.
  # This will save computation and also lead to more stable behavior for
  # the test accuracy.

  # TODO(lishal): remove mixture parameter and have the mixture be specified
  # as a mixture-of-samplers strategy.
  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    n_passive = N - n_active
    kwargs["N"] = n_active
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    already_selected = already_selected + batch_AL
    kwargs["N"] = n_passive
    kwargs["already_selected"] = already_selected
    batch_PL = uniform_sampler.select_batch(**kwargs)
    return batch_AL + batch_PL

  # Seed numpy's global RNG; the data shuffle, samplers, and model training
  # all draw from it.
  np.random.seed(seed)
  data_splits = [2. / 3, 1. / 6, 1. / 6]  # 2/3 of data for training

  if max_points is None:
    max_points = len(y)
  train_size = int(min(max_points, len(y)) * data_splits[0])
  # A batch_size below 1 is interpreted as a fraction of the train partition.
  if batch_size < 1:
    batch_size = batch_size * train_size
  batch_size = int(batch_size)
  # Use a warm start.
  if warmstart_size < 1:
    # Set seed batch to provide enough samples to get at least 6 per class
    # (enforced by the floor below).
    # TODO(lishal): switch to sklearn stratified sampler
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  # Split the data.  TODO: consider switching to an sklearn data splitter.
  indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
      utils.get_train_val_test_splits(X, y, max_points, seed, confusion,
                                      seed_batch, split=data_splits))

  # Preprocess data
  if norm_data:
    print("Normalizing data")
    X_train = normalize(X_train)
    X_val = normalize(X_val)
    X_test = normalize(X_test)
  if standardize_data:
    print("Standardizing data")
    scaler = StandardScaler(with_mean=False).fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
  print("active percentage: {} warmstart batch: {} "
        "batch size: {} confusion: {} seed: {}".format(
            active_p, seed_batch, batch_size, confusion, seed))

  # Initialize samplers
  uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
  sampler = sampler(X_train, y_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(
      np.ceil((train_horizon * train_size - seed_batch) * 1.0 /
              batch_size)) + 1
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end result matches that of uniform
    # sampling.
    partial_X = X_train[sorted(selected_inds)]
    partial_y = y_train[sorted(selected_inds)]
    score_model.fit(partial_X, partial_y)
    if not same_score_select:
      select_model.fit(partial_X, partial_y)
    acc = score_model.score(X_test, y_test)
    accuracy.append(acc)
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name,
                                             accuracy[-1] * 100))

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                             selected_inds, **select_batch_inputs)
    selected_inds.extend(new_batch)
    # A difference between the requested and selected sample counts is
    # possible, maybe when samples were already selected.
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  # Check that the returned indices are correct and will allow mapping to
  # the training set from the original data.
  assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  results["noisy_targets"] = y_noise
  return results, sampler
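
# A minimal usage sketch of the function above. Assumptions: AL_MAPPING
# exposes a "margin" sampler (as in the active-learning repo), the sklearn
# model satisfies the fit/predict/score contract, and the synthetic data is
# illustrative only.
def _example_generate_curve():
  from sklearn.linear_model import LogisticRegression

  rng = np.random.RandomState(0)
  X = rng.randn(600, 10)
  y = (X[:, 0] + 0.1 * rng.randn(600) > 0).astype(int)

  results, sampler_state = generate_one_curve(
      X, y,
      sampler=AL_MAPPING["margin"],     # class reference, not an instance
      score_model=LogisticRegression(),
      seed=0,
      warmstart_size=0.02,              # float: 2% of the train partition
      batch_size=0.05,                  # float: 5% per AL round
      active_p=1.0)                     # whole batch chosen actively
  print(results["data_sizes"], results["accuracy"])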
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       max_points=None):
  """Creates one learning curve for active learning.

  Will calculate accuracy on the validation set as the number of training
  data points increases.
  Caveats: the training method used is sensitive to the sorting of the data,
    so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods; assumes a class reference
      is passed in and the sampler is not yet instantiated.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for the data shuffle and other sources of randomness in
      sampler or model training.
    warmstart_size: float or int.  A float indicates the percentage of train
      data to use for the initial model.
    batch_size: float or int.  A float indicates the batch size as a percent
      of training data.
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  The model must implement fit
      and predict, and depending on the AL method may also need
      decision_function.
    max_points: limit dataset size for preliminary experiments.  A float
      below 1 is treated as a fraction of the dataset.

  Returns:
    results: dictionary of results for all samplers
    sampler_states: dictionary of sampler objects for debugging
  """

  def select_batch(sampler, N, already_selected, **kwargs):
    kwargs["N"] = N
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    return batch_AL

  np.random.seed(seed)
  data_splits = [2. / 3, 1. / 6, 1. / 6]  # 2/3 of data for training

  if max_points is None:
    max_points = len(y)
  if max_points < 1:
    max_points = int(max_points * len(y))
  else:
    max_points = int(max_points)
  # Train partition is data_splits[0] of the (possibly capped) dataset;
  # the original parenthesization, min(max_points, len(y) * split), could
  # exceed the actual train partition size.
  train_size = int(min(max_points, len(y)) * data_splits[0])
  if batch_size < 1:
    batch_size = int(batch_size * train_size)
  else:
    batch_size = int(batch_size)
  if warmstart_size < 1:
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  indices, X_train, y_train, X_val, y_val, X_test, y_test = (
      utils.get_train_val_test_splits(X, y, max_points, seed, seed_batch,
                                      split=data_splits))

  print("warmstart batch: " + str(seed_batch) + " batch size: " +
        str(batch_size) + " seed: " + str(seed))

  # Initialize samplers
  sampler = sampler(X_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  # range() is not extendable in Python 3, so materialize it as a list.
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(np.ceil((train_size - seed_batch) * 1.0 / batch_size)) + 1
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end result matches that of uniform
    # sampling.
    partial_X = X_train[sorted(selected_inds)]
    partial_y = y_train[sorted(selected_inds)]
    score_model.fit(partial_X, partial_y)
    if not same_score_select:
      select_model.fit(partial_X, partial_y)
    acc = score_model.score(X_test, y_test)
    accuracy.append(acc)
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name,
                                             accuracy[-1] * 100))

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    new_batch = select_batch(sampler, n_sample, selected_inds,
                             **select_batch_inputs)
    selected_inds.extend(new_batch)
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  return results, sampler
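
# A worked example of the float-versus-int size resolution used above
# (a sketch with illustrative numbers; train_size is taken as given).
def _example_size_resolution():
  y = np.repeat([0, 1, 2], 1000)  # 3000 labels, 3 classes
  train_size = 2000               # e.g. the 2/3 train partition

  batch_size = 0.05               # float -> fraction of the train partition
  batch_size = int(batch_size * train_size) if batch_size < 1 else int(
      batch_size)
  print(batch_size)               # 100

  warmstart_size = 0.005          # float -> fraction of the train partition
  seed_batch = int(warmstart_size * train_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))
  print(seed_batch)               # 18: the floor of 6 samples per class wins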
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
  """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the validation set as the number of training
  data points increases for both PL and AL.
  Caveats: the training method used is sensitive to the sorting of the data,
    so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods; assumes a class reference
      is passed in and the sampler is not yet instantiated.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for the data shuffle and other sources of randomness in
      sampler or model training.
    warmstart_size: float or int.  A float indicates the percentage of train
      data to use for the initial model.
    batch_size: float or int.  A float indicates the batch size as a percent
      of training data.
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  The model must implement fit
      and predict, and depending on the AL method may also need
      decision_function.
    confusion: percentage of labels of one class to flip to the other.
    active_p: percent of batch to allocate to active learning.
    max_points: limit dataset size for preliminary experiments.
    standardize_data: whether to standardize the data to zero mean and unit
      variance.
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for, as a percent of the
      training data.

  Returns:
    results: dictionary of results for all samplers
    sampler_states: dictionary of sampler objects for debugging
  """

  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    n_passive = N - n_active
    kwargs["N"] = n_active
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    already_selected = list(already_selected) + batch_AL
    kwargs["N"] = n_passive
    kwargs["already_selected"] = already_selected
    batch_PL = uniform_sampler.select_batch(**kwargs)
    return batch_AL + batch_PL

  np.random.seed(seed)

  # Specify the train, validation, and test split.
  cifar10 = [8. / 10, 1. / 30, 1. / 15]  # Train: 48000, Val: 2000, Test: 10000
  mnist = [29. / 35, 1. / 35, 1. / 7]    # Train: 58000, Val: 2000, Test: 10000
  medical = [0.48388, 0.06452, 0.4516]
  data_splits = cifar10

  if max_points is None:
    max_points = len(y)
  train_size = int(min(max_points, len(y)) * data_splits[0])
  if batch_size < 1:
    batch_size = int(batch_size * train_size)
  else:
    batch_size = int(batch_size)
  if warmstart_size < 1:
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  # An earlier, commented-out branch selected a separate split for the Audi
  # data set: audi_split = [0.5284, 0.07868, 0.39171].
  indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
      utils.get_train_val_test_splits(X, y, max_points, seed, confusion,
                                      seed_batch, split=data_splits))
  print('X_train:', X_train.shape)
  print('y_train:', y_train.shape)
  print('X_val:', X_val.shape)
  print('y_val:', y_val.shape)
  print('X_test:', X_test.shape)
  print('y_test:', y_test.shape)

  # Preprocessing is disabled in this variant.
  # if norm_data:
  #   print("Normalizing data")
  #   X_train = normalize(X_train)
  #   X_val = normalize(X_val)
  #   X_test = normalize(X_test)
  # if standardize_data:
  #   print("Standardizing data")
  #   scaler = StandardScaler().fit(X_train)
  #   X_train = scaler.transform(X_train)
  #   X_val = scaler.transform(X_val)
  #   X_test = scaler.transform(X_test)

  # Initialize samplers
  uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
  sampler = sampler(X_train, y_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(
      np.ceil((train_horizon * train_size - seed_batch) * 1.0 /
              batch_size)) + 1
  print('Number of active learning rounds:', n_batches)
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end result matches that of uniform
    # sampling.
    partial_X = np.array(X_train[sorted(selected_inds)])
    partial_y = np.array(y_train[sorted(selected_inds)])
    print('partial_X shape:', partial_X.shape)
    print('partial_y shape:', partial_y.shape)

    n_ensembles = 5
    mean_acc = []
    X_Pool_Dropout = X_train
    # Added for AUDI (-16).
    All_Dropout_Classes = np.zeros(shape=(X_Pool_Dropout.shape[0], 1))
    print('Use trained model for test time dropout')
    for i in range(n_ensembles):
      print('N_ENSEMBLE: ' + str(i + 1))
      score_model.build_model(X_val, y_val, X_test, y_test)
      score_model.fit(partial_X, partial_y, X_val, y_val, FLAGS)
      if not same_score_select:
        select_model.fit(partial_X, partial_y)
      # Predictions at test time.
      try:
        pred = score_model.predict(X_Pool_Dropout)
      except Exception:
        print('An error occurred during prediction')
        raise
      dropout_classes = np.argmax(pred, axis=1)
      print('dropout_classes shape:', dropout_classes.shape)
      dropout_classes = np.array([dropout_classes]).T
      All_Dropout_Classes = np.append(All_Dropout_Classes, dropout_classes,
                                      axis=1)
      # Collect test accuracy of each ensemble member.
      acc = score_model.score(X_test, y_test, FLAGS)
      mean_acc.append(acc)
    with open('./trained_models/All_Dropout_Classes', 'wb') as fp:
      pickle.dump(All_Dropout_Classes, fp)
    # Report the mean accuracy over the ensemble members.
    accuracy.append(np.mean(mean_acc))
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name,
                                             accuracy[-1] * 100))
    with open('./test_accuracy/ResNet_ensemble_varRatio_lr0.0005_batch64' +
              str(seed) + '.json', 'w') as f:
      json.dump(str(accuracy), f)

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                             selected_inds, **select_batch_inputs)
    selected_inds.extend(new_batch)
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  # Check that the returned indices are correct and will allow mapping to
  # the training set from the original data.
  assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  results["noisy_targets"] = y_noise
  return results, sampler, accuracy
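
# The ensemble loop above stacks one column of predicted classes per
# test-time-dropout pass into All_Dropout_Classes. A sketch of how a
# variation-ratio acquisition score could be derived from that matrix (the
# helper name is hypothetical; dropping the zero-initialized first column
# follows the construction above).
def variation_ratios(all_dropout_classes):
  """Per-sample disagreement across dropout passes: 1 - mode frequency."""
  votes = all_dropout_classes[:, 1:].astype(int)  # drop the init column
  scores = np.empty(votes.shape[0])
  for i, row in enumerate(votes):
    counts = np.bincount(row)
    # 0 when all passes agree on one class; approaches 1 - 1/n_passes when
    # the passes disagree maximally.
    scores[i] = 1.0 - counts.max() / float(len(row))
  return scores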
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
  """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the validation set as the number of training
  data points increases for both PL and AL.
  Caveats: the training method used is sensitive to the sorting of the data,
    so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods; assumes a class reference
      is passed in and the sampler is not yet instantiated.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for the data shuffle and other sources of randomness in
      sampler or model training.
    warmstart_size: float or int.  A float indicates the percentage of train
      data to use for the initial model.
    batch_size: float or int.  A float indicates the batch size as a percent
      of training data.
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  The model must implement fit
      and predict, and depending on the AL method may also need
      decision_function.
    confusion: percentage of labels of one class to flip to the other.
    active_p: percent of batch to allocate to active learning.
    max_points: limit dataset size for preliminary experiments.
    standardize_data: whether to standardize the data to zero mean and unit
      variance.
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for, as a percent of the
      training data.

  Returns:
    results: dictionary of results for all samplers
    sampler_states: dictionary of sampler objects for debugging
  """

  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    n_passive = N - n_active
    kwargs["N"] = n_active
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    already_selected = list(already_selected) + batch_AL
    kwargs["N"] = n_passive
    kwargs["already_selected"] = already_selected
    batch_PL = uniform_sampler.select_batch(**kwargs)
    return batch_AL + batch_PL

  np.random.seed(seed)

  # Specify the train, validation, and test split.
  cifar10 = [8. / 10, 1. / 30, 1. / 15]  # Train: 48000, Val: 2000, Test: 10000
  mnist = [29. / 35, 1. / 35, 1. / 7]    # Train: 58000, Val: 2000, Test: 10000
  audi = [0.744, 0.0732, 0.177]
  data_splits = cifar10

  if max_points is None:
    max_points = len(y)
  train_size = int(min(max_points, len(y)) * data_splits[0])
  if batch_size < 1:
    batch_size = int(batch_size * train_size)
  else:
    batch_size = int(batch_size)
  if warmstart_size < 1:
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
      utils.get_train_val_test_splits(X, y, max_points, seed, confusion,
                                      seed_batch, split=data_splits))
  print('X_train:', X_train.shape)
  print('y_train:', y_train.shape)
  print('X_val:', X_val.shape)
  print('y_val:', y_val.shape)
  print('X_test:', X_test.shape)
  print('y_test:', y_test.shape)

  # Preprocessing is disabled in this variant.
  # if norm_data:
  #   print("Normalizing data")
  #   X_train = normalize(X_train)
  #   X_val = normalize(X_val)
  #   X_test = normalize(X_test)
  # if standardize_data:
  #   print("Standardizing data")
  #   scaler = StandardScaler().fit(X_train)
  #   X_train = scaler.transform(X_train)
  #   X_val = scaler.transform(X_val)
  #   X_test = scaler.transform(X_test)

  # Initialize samplers
  uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
  sampler = sampler(X_train, y_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(
      np.ceil((train_horizon * train_size - seed_batch) * 1.0 /
              batch_size)) + 1
  print('Number of active learning rounds:', n_batches)
  sampling_time_measurement = []
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end result matches that of uniform
    # sampling.
    partial_X = np.array(X_train[sorted(selected_inds)])
    partial_y = np.array(y_train[sorted(selected_inds)])
    print('partial_X shape:', partial_X.shape)
    print('partial_y shape:', partial_y.shape)
    print('Histogram of labeled data')
    print(np.histogram(partial_y)[0] / partial_y.shape[0])

    score_model.fit(partial_X, partial_y)
    if not same_score_select:
      select_model.fit(partial_X, partial_y)
    acc = score_model.score(X_test, y_test)
    accuracy.append(acc)
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name,
                                             accuracy[-1] * 100))

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    # Time only the batch selection itself.
    start = time.time()
    new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                             selected_inds, **select_batch_inputs)
    end = time.time()
    selected_inds.extend(new_batch)
    execution_time = end - start
    sampling_time_measurement.append(execution_time)
    print('Time elapsed for batch selection: ', execution_time)
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  # Check that the returned indices are correct and will allow mapping to
  # the training set from the original data.
  assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  results["noisy_targets"] = y_noise
  return results, sampler, accuracy, sampling_time_measurement
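
# time.time() follows the wall clock and can jump under clock adjustments;
# time.perf_counter() is the monotonic choice for measuring intervals. A
# sketch of the same batch-selection timing wrapped in a reusable context
# manager (the helper is hypothetical, not part of the original code):
from contextlib import contextmanager


@contextmanager
def _timed(log, label):
  """Append the elapsed wall time of the wrapped block to `log`."""
  start = time.perf_counter()
  yield
  elapsed = time.perf_counter() - start
  log.append(elapsed)
  print('Time elapsed for %s: %.3fs' % (label, elapsed))

# Usage sketch:
#   with _timed(sampling_time_measurement, 'batch selection'):
#     new_batch = select_batch(...)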
def generate_one_curve(X,
                       y,
                       sampler,
                       score_model,
                       seed,
                       warmstart_size,
                       batch_size,
                       select_model=None,
                       confusion=0.,
                       active_p=1.0,
                       max_points=None,
                       standardize_data=False,
                       norm_data=False,
                       train_horizon=0.5):
  """Creates one learning curve for both active and passive learning.

  Will calculate accuracy on the validation set as the number of training
  data points increases for both PL and AL.
  Caveats: the training method used is sensitive to the sorting of the data,
    so we re-sort all intermediate datasets.

  Args:
    X: training data
    y: training labels
    sampler: sampling class from sampling_methods; assumes a class reference
      is passed in and the sampler is not yet instantiated.
    score_model: model used to score the samplers.  Expects fit and predict
      methods to be implemented.
    seed: seed used for the data shuffle and other sources of randomness in
      sampler or model training.
    warmstart_size: float or int.  A float indicates the percentage of train
      data to use for the initial model.
    batch_size: float or int.  A float indicates the batch size as a percent
      of training data.
    select_model: defaults to None, in which case the score model will be
      used to select new datapoints to label.  The model must implement fit
      and predict, and depending on the AL method may also need
      decision_function.
    confusion: percentage of labels of one class to flip to the other.
    active_p: percent of batch to allocate to active learning.
    max_points: limit dataset size for preliminary experiments.
    standardize_data: whether to standardize the data to zero mean and unit
      variance.
    norm_data: whether to normalize the data.  Default is False for logistic
      regression.
    train_horizon: how long to draw the curve for, as a percent of the
      training data.

  Returns:
    results: dictionary of results for all samplers
    sampler_states: dictionary of sampler objects for debugging
  """
  # TODO(lishal): add option to find best hyperparameter setting first on
  # full dataset and fix the hyperparameter for the rest of the routine.
  # This will save computation and also lead to more stable behavior for
  # the test accuracy.

  # TODO(lishal): remove mixture parameter and have the mixture be specified
  # as a mixture-of-samplers strategy.
  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    n_passive = N - n_active
    kwargs["N"] = n_active
    kwargs["already_selected"] = already_selected
    batch_AL = sampler.select_batch(**kwargs)
    already_selected = already_selected + batch_AL
    kwargs["N"] = n_passive
    kwargs["already_selected"] = already_selected
    batch_PL = uniform_sampler.select_batch(**kwargs)
    return batch_AL + batch_PL

  np.random.seed(seed)
  data_splits = [2. / 3, 1. / 6, 1. / 6]  # 2/3 of data for training

  if max_points is None:
    max_points = len(y)
  train_size = int(min(max_points, len(y)) * data_splits[0])
  if batch_size < 1:
    batch_size = int(batch_size * train_size)
  else:
    batch_size = int(batch_size)
  if warmstart_size < 1:
    # Set seed batch to provide enough samples to get at least 6 per class
    # (enforced by the floor below).
    # TODO(lishal): switch to sklearn stratified sampler
    seed_batch = int(warmstart_size * train_size)
  else:
    seed_batch = int(warmstart_size)
  seed_batch = max(seed_batch, 6 * len(np.unique(y)))

  indices, X_train, y_train, X_val, y_val, X_test, y_test, y_noise = (
      utils.get_train_val_test_splits(X, y, max_points, seed, confusion,
                                      seed_batch, split=data_splits))

  # Preprocess data
  if norm_data:
    print("Normalizing data")
    X_train = normalize(X_train)
    X_val = normalize(X_val)
    X_test = normalize(X_test)
  if standardize_data:
    print("Standardizing data")
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)
  print("active percentage: " + str(active_p) + " warmstart batch: " +
        str(seed_batch) + " batch size: " + str(batch_size) + " confusion: " +
        str(confusion) + " seed: " + str(seed))

  # Initialize samplers
  uniform_sampler = AL_MAPPING["uniform"](X_train, y_train, seed)
  sampler = sampler(X_train, y_train, seed)

  results = {}
  data_sizes = []
  accuracy = []
  selected_inds = list(range(seed_batch))

  # If select model is None, use score_model
  same_score_select = False
  if select_model is None:
    select_model = score_model
    same_score_select = True

  n_batches = int(
      np.ceil((train_horizon * train_size - seed_batch) * 1.0 /
              batch_size)) + 1
  for b in range(n_batches):
    n_train = seed_batch + min(train_size - seed_batch, b * batch_size)
    print("Training model on " + str(n_train) + " datapoints")

    assert n_train == len(selected_inds)
    data_sizes.append(n_train)

    # Sort active_ind so that the end result matches that of uniform
    # sampling.
    partial_X = X_train[sorted(selected_inds)]
    partial_y = y_train[sorted(selected_inds)]
    score_model.fit(partial_X, partial_y)
    if not same_score_select:
      select_model.fit(partial_X, partial_y)
    acc = score_model.score(X_test, y_test)
    accuracy.append(acc)
    print("Sampler: %s, Accuracy: %.2f%%" % (sampler.name,
                                             accuracy[-1] * 100))

    n_sample = min(batch_size, train_size - len(selected_inds))
    select_batch_inputs = {
        "model": select_model,
        "labeled": dict(zip(selected_inds, y_train[selected_inds])),
        "eval_acc": accuracy[-1],
        "X_test": X_val,
        "y_test": y_val,
        "y": y_train
    }
    new_batch = select_batch(sampler, uniform_sampler, active_p, n_sample,
                             selected_inds, **select_batch_inputs)
    selected_inds.extend(new_batch)
    print('Requested: %d, Selected: %d' % (n_sample, len(new_batch)))
    assert len(new_batch) == n_sample
    assert len(list(set(selected_inds))) == len(selected_inds)

  # Check that the returned indices are correct and will allow mapping to
  # the training set from the original data.
  assert all(y_noise[indices[selected_inds]] == y_train[selected_inds])
  results["accuracy"] = accuracy
  results["selected_inds"] = selected_inds
  results["data_sizes"] = data_sizes
  results["indices"] = indices
  results["noisy_targets"] = y_noise
  return results, sampler
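
# A toy demonstration of the mixture logic in select_batch: with
# active_p=0.5, half of each request comes from the active sampler and the
# remainder from the uniform sampler, with the active picks excluded from
# the uniform draw. The stub sampler is a stand-in for the sampling_methods
# classes (an assumption for illustration only).
class _StubSampler(object):
  """Returns the lowest-numbered pool indices not yet selected."""

  def __init__(self, pool_size):
    self.pool = list(range(pool_size))

  def select_batch(self, N, already_selected, **kwargs):
    fresh = [i for i in self.pool if i not in already_selected]
    return fresh[:N]


def _example_mixture():
  def select_batch(sampler, uniform_sampler, mixture, N, already_selected,
                   **kwargs):
    n_active = int(mixture * N)
    kwargs.update(N=n_active, already_selected=already_selected)
    batch_AL = sampler.select_batch(**kwargs)
    kwargs.update(N=N - n_active,
                  already_selected=list(already_selected) + batch_AL)
    return batch_AL + uniform_sampler.select_batch(**kwargs)

  active, uniform = _StubSampler(100), _StubSampler(100)
  # Prints [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]: five active then five uniform.
  print(select_batch(active, uniform, 0.5, 10, already_selected=[0, 1, 2]))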