def data_change(file_path):
    # read the original converted data: one space-separated record per line
    file_name = file_path + '_conv_old.txt'
    data = fo.FileReader(file_name)

    # perturb every value with Gaussian noise (mean 0, std 0.1)
    new_data = []
    for data_i in data:
        data_i = data_i.split(' ')
        new_data_i = []
        for each_num in data_i:
            each_num = float(each_num)
            each_num = each_num + random.normalvariate(0, 0.1)
            new_data_i.append(each_num)
        new_data.append(new_data_i)

    # force the last record to be non-increasing: clip every value that would
    # rise above its predecessor
    change_index = len(new_data) - 1
    for i in range(len(new_data[change_index]) - 1):
        if new_data[change_index][i] < new_data[change_index][i + 1]:
            new_data[change_index][i + 1] = new_data[change_index][i]
    print('changed data: ', new_data[change_index])

    # serialize the perturbed records and write them out
    buff = []
    for buff_data in new_data:
        buff.append(list2string(buff_data))
    new_file_name = file_path + '_conv_change.txt'
    fo.FileWriter(new_file_name, buff, style='w')
    return
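# The helpers below are a minimal sketch of the serialization utilities this
# file assumes (the real list2string / string2list are imported from elsewhere
# in the project and may differ). The space-separated record format matches
# how data_change() splits its input lines; distinct names are used here to
# avoid shadowing the real imports.
def list2string_sketch(values):
    # join numbers into one space-separated record
    return ' '.join(str(v) for v in values)


def string2list_sketch(record):
    # parse a space-separated record back into floats
    return [float(v) for v in record.split(' ') if v != '']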
def run_exp_racos_for_synthetic_problem_analysis():
    # optimization parameters
    sample_size = 10         # the instance number of sampling in an iteration
    budget = 500             # budget in online style
    positive_num = 2         # the set size of PosPop
    rand_probability = 0.99  # the probability of sampling in the model
    uncertain_bit = 1        # the dimension size that is sampled randomly
    adv_threshold = 10       # advance sample size
    opt_repeat = 10

    # problem parameters
    dimension_size = 10
    problem_name = 'sphere'
    problem_num = 200
    start_index = 0
    bias_region = 0.2

    dimension = Dimension()
    dimension.set_dimension_size(dimension_size)
    dimension.set_regions([[-1.0, 1.0] for _ in range(dimension_size)],
                          [0 for _ in range(dimension_size)])

    log_buffer = []  # logging

    learner_path = './ExpLearner/SyntheticProbsLearner/' + problem_name + '/dimension' + str(dimension_size) \
                   + '/DirectionalModel/' + 'learner-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' \
                   + 'bias' + str(bias_region) + '-'
    problem_path = './ExpLog/SyntheticProbsLog/' + problem_name + '/dimension' + str(dimension_size) \
                   + '/DirectionalModel/' + 'bias-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' \
                   + 'bias' + str(bias_region) + '-'

    func = DistributedFunction(dimension, bias_region=[-0.5, 0.5])
    target_bias = [0.1 for _ in range(dimension_size)]
    func.setBias(target_bias)

    if problem_name == 'ackley':
        prob_fct = func.DisAckley
    else:
        prob_fct = func.DisSphere

    relate_error_list = []
    for prob_i in range(problem_num):
        print(start_index + prob_i,
              '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        log_buffer.append(str(start_index + prob_i) +
                          '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        log_buffer.append('+++++++++++++++++++++++++++++++')
        log_buffer.append('optimization parameters')
        log_buffer.append('sample size: ' + str(sample_size))
        log_buffer.append('budget: ' + str(budget))
        log_buffer.append('positive num: ' + str(positive_num))
        log_buffer.append('random probability: ' + str(rand_probability))
        log_buffer.append('uncertain bits: ' + str(uncertain_bit))
        log_buffer.append('advance num: ' + str(adv_threshold))
        log_buffer.append('+++++++++++++++++++++++++++++++')
        log_buffer.append('problem parameters')
        log_buffer.append('dimension size: ' + str(dimension_size))
        log_buffer.append('problem name: ' + problem_name)
        log_buffer.append('bias_region: ' + str(bias_region))
        log_buffer.append('+++++++++++++++++++++++++++++++')

        # load the source problem bias and check its index
        problem_file = problem_path + str(start_index + prob_i) + '.txt'
        problem_str = fo.FileReader(problem_file)[0].split(',')
        problem_index = int(problem_str[0])
        problem_bias = string2list(problem_str[1])
        if problem_index != (start_index + prob_i):
            print('problem error!')
            exit(0)
        print('source bias: ', problem_bias)
        log_buffer.append('source bias: ' + list2string(problem_bias))

        # squared Euclidean distance between the target bias and the source bias
        residual = np.array(target_bias) - np.array(problem_bias)
        this_distance = np.sum(residual * residual)

        # load the pre-trained learner for this source problem
        learner_file = learner_path + str(start_index + prob_i) + '.pkl'
        log_buffer.append('learner file: ' + learner_file)
        print('learner file: ', learner_file)
        net = torch.load(learner_file)
        net_list = [net]

        opt_error_list = []
        for i in range(opt_repeat):
            print('optimize ', i, '===================================================')
            log_buffer.append('optimize ' + str(i) +
                              '===================================================')
            exp_racos = ExpRacosOptimization(dimension, net_list)

            start_t = time.time()
            exp_racos.exp_mix_opt(obj_fct=prob_fct, ss=sample_size, bud=budget,
                                  pn=positive_num, rp=rand_probability,
                                  ub=uncertain_bit, at=adv_threshold)
            end_t = time.time()

            print('total budget is ', budget)
            log_buffer.append('total budget is ' + str(budget))
            hour, minute, second = time_formulate(start_t, end_t)
            print('spending time: ', hour, ':', minute, ':', second)
            log_buffer.append('spending time: ' + str(hour) + ':' + str(minute) + ':' + str(second))

            optimal = exp_racos.get_optimal()
            opt_error = optimal.get_fitness()
            optimal_x = optimal.get_features()
            opt_error_list.append(opt_error)
            print('validation optimal value: ', opt_error)
            log_buffer.append('validation optimal value: ' + str(opt_error))
            print('optimal x: ', optimal_x)
            log_buffer.append('optimal x: ' + list2string(optimal_x))

        opt_mean = np.mean(np.array(opt_error_list))
        opt_std = np.std(np.array(opt_error_list))
        relate_error_list.append([this_distance, opt_mean])
        print('--------------------------------------------------')
        print('optimization result: ', opt_mean, '#', opt_std)
        log_buffer.append('--------------------------------------------------')
        log_buffer.append('optimization result: ' + str(opt_mean) + '#' + str(opt_std))

    result_path = './Results/SyntheticProbs/' + problem_name + '/dimension' + str(dimension_size) + '/'

    # distance-vs-error records, one 'distance,error' line per source problem
    relate_error_file = result_path + 'relate-error-' + problem_name + '-dim' + str(dimension_size) + '-bias' \
                        + str(bias_region) + '.txt'
    temp_buffer = []
    for i in range(len(relate_error_list)):
        relate, error = relate_error_list[i]
        temp_buffer.append(str(relate) + ',' + str(error))
    print('relate error logging: ', relate_error_file)
    log_buffer.append('relate error logging: ' + relate_error_file)
    fo.FileWriter(relate_error_file, temp_buffer, style='w')

    optimization_log_file = result_path + 'opt-log-' + problem_name + '-dim' + str(dimension_size) + '-bias' \
                            + str(bias_region) + '.txt'
    print('optimization logging: ', optimization_log_file)
    fo.FileWriter(optimization_log_file, log_buffer, style='w')
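# Hedged sketch of the time_formulate() helper used throughout this file: from
# its call sites it turns a (start, end) timestamp pair into an
# (hour, minute, second) triple. The project's real implementation lives
# elsewhere and may differ; this version is only illustrative.
def time_formulate_sketch(start_t, end_t):
    elapsed = int(end_t - start_t)      # elapsed wall-clock seconds
    hour, rest = divmod(elapsed, 3600)  # whole hours
    minute, second = divmod(rest, 60)   # whole minutes and seconds
    return hour, minute, second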
def chosen_single_classifier(dataset_list):
    data_path = 'data_set/'
    logging_path = 'results/baseline/'
    repeat = 5

    for dataset_name in dataset_list:
        log_buffer = []
        print('========================================================')
        log_buffer.append('========================================================')
        print('dataset: ', dataset_name)
        log_buffer.append('dataset: ' + dataset_name)

        # load training and testing data
        train_file = data_path + dataset_name + '/' + dataset_name + '_train_data.pkl'
        test_file = data_path + dataset_name + '/' + dataset_name + '_test_data.pkl'
        train_feature, train_label, test_feature, test_label = dataset_reader(train_file, test_file)
        print(' train feature size:', train_feature.shape[0],
              ', feature dimension:', train_feature.shape[1])
        log_buffer.append(' train feature size:' + str(train_feature.shape[0]) +
                          ', feature dimension:' + str(train_feature.shape[1]))
        print(' test feature size:', test_feature.shape[0],
              ', feature dimension:', test_feature.shape[1])
        log_buffer.append(' test feature size:' + str(test_feature.shape[0]) +
                          ', feature dimension:' + str(test_feature.shape[1]))

        # candidate classifiers
        dtc = DecisionTreeClassifier()
        mlpc = MLPClassifier()
        lr = LogisticRegression()
        svc = classes.SVC()
        gpc = GaussianProcessClassifier()
        pac = PassiveAggressiveClassifier()
        gnb = GaussianNB()
        sgdc = SGDClassifier()
        rfc = RandomForestClassifier()
        knn = KNeighborsClassifier()
        classifiers = [dtc, mlpc, lr, svc, gpc, pac, gnb, sgdc, rfc, knn]
        classifier_names = [
            'Decision Tree Classifier', 'MLPClassifier', 'LogisticRegression',
            'SVC', 'Gaussian Process Classifier', 'Passive Aggressive Classifier',
            'GaussianNB', 'SGDClassifier', 'Random Forest Classifier',
            'K-Neighbors Classifier'
        ]

        # estimate each classifier's k-fold cross-validation error
        classifier_vali = []
        for c_i in range(len(classifiers)):
            classifier = classifiers[c_i]
            classifier_name = classifier_names[c_i]
            print('--------------------------------------------------------')
            log_buffer.append('--------------------------------------------------------')
            print(classifier_name, ':')
            log_buffer.append(classifier_name + ':')
            start_t = time.time()
            vali_error = validation_error(classifier, train_feature, train_label, k=5)
            end_t = time.time()
            hour, minute, second = time_formulate(start_t, end_t)
            print(' training time: ', hour, ' hours, ', minute, ' minutes, ', second, ' seconds')
            log_buffer.append(' training time: ' + str(hour) + ' hours, ' + str(minute) +
                              ' minutes, ' + str(second) + ' seconds')
            classifier_vali.append(vali_error)
            print('validation error:', vali_error)

        # pick the classifier with the smallest validation error
        min_index = classifier_vali.index(min(classifier_vali))
        best_c = classifiers[min_index]
        best_c_name = classifier_names[min_index]
        print('test best============================================')
        print('best classifier: ', best_c_name)
        log_buffer.append('===================================')
        log_buffer.append('best classifier: ' + best_c_name)

        # retrain the best classifier repeatedly and measure its test error
        test_errors = []
        for r_i in range(repeat):
            print('test repeat ', r_i, '-----------------------------')
            start_t = time.time()
            best_c = best_c.fit(train_feature, train_label)
            end_t = time.time()
            hour, minute, second = time_formulate(start_t, end_t)
            print(' training time: ', hour, ' hours, ', minute, ' minutes, ', second, ' seconds')
            log_buffer.append(' training time: ' + str(hour) + ' hours, ' + str(minute) +
                              ' minutes, ' + str(second) + ' seconds')
            predictions = best_c.predict(test_feature)
            accuracy = accuracy_score(test_label, predictions)
            print('error: ', 1 - accuracy)
            test_errors.append(1 - accuracy)
        log_buffer.append('errors: ' + list2string(test_errors))
        mean_error = np.mean(np.array(test_errors))
        print('mean error: ', mean_error)
        log_buffer.append('mean_error: ' + str(mean_error))

        logging_file = logging_path + dataset_name + '_chosen_single.txt'
        print(dataset_name, ' logging...')
        fo.FileWriter(logging_file, log_buffer, 'w')
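# Hedged sketch of the validation_error() helper called above: a k-fold
# cross-validation error estimate. The project's real implementation may
# differ; this version assumes scikit-learn's cross_val_score is acceptable.
from sklearn.model_selection import cross_val_score


def validation_error_sketch(classifier, feature, label, k=5):
    # mean accuracy over k folds, converted to an error rate
    scores = cross_val_score(classifier, feature, label, cv=k)
    return 1.0 - scores.mean()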
def dataset_processing():
    path = 'data_set/'
    # feature type: False means categorical, True means numerical

    # cylinder
    if False:
        dataset_name = 'cylinder'
        ori_file = 'bands.data.txt'
        feature_types = [True]
        for i in range(19):
            feature_types.append(False)
        for i in range(19):
            feature_types.append(True)
        feature_types.append(False)
        label_index = 39

    # abalone
    if False:
        dataset_name = 'abalone'
        ori_file = 'abalone.data.txt'
        feature_types = [False]
        for i in range(7):
            feature_types.append(True)
        feature_types.append(False)
        label_index = 8

    # anneal
    if False:
        dataset_name = 'annealing'
        ori_train_file = 'anneal.data.txt'
        ori_test_file = 'anneal.test.txt'
        feature_types = [False for i in range(39)]
        number_index = [3, 4, 7, 12, 32, 33, 34, 37]
        for index in number_index:
            feature_types[index] = True
        useless_index = [10, 12, 13, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                         27, 28, 29, 30, 35, 37]
        label_index = 38

    # balanceScale
    if False:
        dataset_name = 'balanceScale'
        ori_file = 'balance-scale.data.txt'
        feature_types = [False, True, True, True, True]
        label_index = 0

    # banknote
    if False:
        dataset_name = 'banknote'
        ori_file = 'data_banknote_authentication.txt'
        feature_types = [True, True, True, True, False]
        label_index = 4

    # car
    if False:
        dataset_name = 'car'
        ori_file = 'car.data.txt'
        feature_types = [False, False, False, False, False, False, False]
        label_index = 6

    # chess
    if False:
        dataset_name = 'chess'
        ori_file = 'chess.data.txt'
        feature_types = [False for i in range(37)]
        label_index = 36

    # chess2
    if False:
        dataset_name = 'chess2'
        ori_file = 'krkopt.data.txt'
        feature_types = [False for i in range(7)]
        label_index = 6

    # cmc
    if False:
        dataset_name = 'cmc'
        ori_file = 'cmc.data.txt'
        feature_types = [True, False, False, True, False, False, False, False,
                        False, False]
        label_index = 9

    # CNAE9
    if False:
        dataset_name = 'CNAE9'
        ori_file = 'CNAE-9.data.txt'
        feature_types = [False]
        for i in range(856):
            feature_types.append(True)
        label_index = 0

    # credit
    if False:
        dataset_name = 'credit'
        ori_file = 'crx.data.txt'
        feature_types = [False, True, True, False, False, False, False, True,
                         False, False, True, False, False, True, True, False]
        label_index = 15

    # eeg
    if False:
        dataset_name = 'eeg'
        ori_file = 'EEG_Eye_State.arff.txt'
        feature_types = [True for i in range(14)]
        feature_types.append(False)
        label_index = 14

    # german credit
    if False:
        dataset_name = 'german credit'
        ori_file = 'german.data.txt'
        feature_types = [False for i in range(21)]
        feature_types[1] = True
        feature_types[4] = True
        feature_types[7] = True
        feature_types[10] = True
        feature_types[12] = True
        feature_types[15] = True
        feature_types[17] = True
        label_index = 20

    # gisette, use No.3 sub-processing
    if False:
        dataset_name = 'gisette'
        ori_train_data = 'gisette_train.data.txt'
        ori_train_label = 'gisette_train.labels.txt'
        ori_test_data = 'gisette_valid.data'
        ori_test_label = 'gisette_valid.labels'
        feature_types = [True for i in range(5000)]

    # jsbach, No.1
    if False:
        dataset_name = 'jsbach'
        ori_file = 'jsbach_chorals_harmony.data'
        feature_types = [False for i in range(17)]
        feature_types[15] = True
        label_index = 16

    # imageSegmentation_car, No.2
    if False:
        dataset_name = 'imageSegmentation_car'
        ori_train_file = 'segmentation.data.txt'
        ori_test_file = 'segmentation.test.txt'
        feature_types = [True for i in range(20)]
        feature_types[0] = False
        useless_index = []
        label_index = 0

    # iris, No.1
    if False:
        dataset_name = 'iris'
        ori_file = 'iris.data'
        feature_types = [True, True, True, True, False]
        label_index = 4

    # letterRecognition, No.1
    if False:
        dataset_name = 'letterRecognition'
        ori_file = 'letter-recognition.data'
        feature_types = [True for i in range(17)]
        feature_types[0] = False
        label_index = 0

    # madelon, No.3
    if False:
        dataset_name = 'madelon'
        ori_train_data = 'madelon_train.data.txt'
        ori_train_label = 'madelon_train.labels.txt'
        ori_test_data = 'madelon_valid.data.txt'
        ori_test_label = 'madelon_valid.labels.txt'
        feature_types = [True for i in range(500)]

    # magic04, No.1
    if False:
        dataset_name = 'magic04'
        ori_file = 'magic04.data.txt'
        feature_types = [True for i in range(11)]
        feature_types[10] = False
        label_index = 10

    # Diabetic Retinopathy Debrecen Data Set, No.1
    if False:
        dataset_name = 'messidor'
        ori_file = 'messidor_features.arff'
        feature_types = [True for i in range(20)]
        feature_types[19] = False
        label_index = 19

    # mushroom, No.1
    if False:
        dataset_name = 'mushroom'
        ori_file = 'agaricus-lepiota.data.txt'
        feature_types = [False for i in range(23)]
        label_index = 0

    # nursery, No.1
    if False:
        dataset_name = 'nursery'
        ori_file = 'nursery.data.txt'
        feature_types = [False for i in range(9)]
        label_index = 8

    # occupancy, No.4, delete 0-th and 1-st features
    if False:
        dataset_name = 'occupancy'
        ori_file = 'datatraining.txt'
        feature_types = [False, False, True, True, True, True, True, False]
        useless_index = [0, 1]
        label_index = 7

    # seismic, No.1
    if False:
        dataset_name = 'seismic'
        ori_file = 'seismic-bumps.arff.txt'
        feature_types = [True for i in range(19)]
        feature_types[0] = False
        feature_types[1] = False
        feature_types[2] = False
        feature_types[7] = False
        feature_types[18] = False
        label_index = 18

    # spambase, No.1
    if False:
        dataset_name = 'spambase'
        ori_file = 'spambase.data.txt'
        feature_types = [True for i in range(58)]
        feature_types[57] = False
        label_index = 57

    # statlogSegment, No.1
    if False:
        dataset_name = 'statlogSegment'
        ori_file = 'segment.data.txt'
        feature_types = [True for i in range(20)]
        feature_types[19] = False
        label_index = 19

    # wilt, No.5
    if False:
        dataset_name = 'wilt'
        ori_train_file = 'training.csv'
        ori_test_file = 'testing.csv'
        feature_types = [False, True, True, True, True, True]
        label_index = 0

    # wine_quality_red, No.1
    if False:
        dataset_name = 'wine_quality_red'
        ori_file = 'winequality-red.csv'
        feature_types = [True for i in range(12)]
        feature_types[11] = False
        label_index = 11

    # wine_quality_white, No.1
    if False:
        dataset_name = 'wine_quality_white'
        ori_file = 'winequality-white.csv'
        feature_types = [True for i in range(12)]
        feature_types[11] = False
        label_index = 11

    # yeast, No.4
    if False:
        dataset_name = 'yeast'
        ori_file = 'yeast.data.txt'
        feature_types = [False, True, True, True, True, True, True, True,
                         True, False]
        useless_index = [0]
        label_index = 9

    # adult, No.6
    if False:
        dataset_name = 'adult'
        ori_train_file = 'adult.data'
        ori_test_file = 'adult.test'
        feature_types = [True, False, True, False, True, False, False, False,
                         False, False, True, True, True, False, False]
        label_index = 14

    # arcene, No.3
    if False:
        dataset_name = 'arcene'
        ori_train_data = 'arcene_train.data'
        ori_train_label = 'arcene_train.labels'
        ori_test_data = 'arcene_valid.data'
        ori_test_label = 'arcene_valid.labels'
        feature_types = [True for i in range(10000)]

    # breast_cancer_wisconsin, No.4
    if False:
        dataset_name = 'breast_cancer_wisconsin'
        ori_file = 'breast-cancer-wisconsin.data'
        feature_types = [False for i in range(11)]
        useless_index = [0]
        label_index = 10

    # covtype, No.1
    if False:
        dataset_name = 'covtype'
        ori_file = 'covtype.data'
        feature_types = [True for i in range(10)]
        for i in range(45):
            feature_types.append(False)
        label_index = 54

    # drug_consumption, No.1
    if False:
        dataset_name = 'drug_consumption'
        ori_file = 'drug_consumption.data'
        feature_types = [False]
        for i in range(12):
            feature_types.append(True)
        for i in range(19):
            feature_types.append(False)
        useless_index = [0]
        label_index = 31

    # ecoli, No.1
    if False:
        dataset_name = 'ecoli'
        ori_file = 'ecoli.data'
        feature_types = [False, True, True, True, True, True, True, True, False]
        label_index = 8

    # flags, No.4, choose religion as the prediction target
    if False:
        dataset_name = 'flag'
        ori_file = 'flag.data'
        feature_types = [False for i in range(30)]
        feature_types[3] = True
        feature_types[4] = True
        for i in range(3):
            feature_types[7 + i] = True
        for i in range(5):
            feature_types[18 + i] = True
        useless_index = [0]
        label_index = 6

    # glass, No.4
    if False:
        dataset_name = 'glass'
        ori_file = 'glass.data'
        feature_types = [False]
        for i in range(9):
            feature_types.append(True)
        feature_types.append(False)
        useless_index = [0]
        label_index = 10

    # horse_colic, No.2
    if False:
        dataset_name = 'horse_colic'
        ori_train_file = 'horse-colic.data'
        ori_test_file = 'horse-colic.test'
        feature_types = [False for i in range(28)]
        feature_types[3] = True
        feature_types[4] = True
        feature_types[5] = True
        feature_types[15] = True
        feature_types[18] = True
        feature_types[19] = True
        feature_types[21] = True
        useless_index = [2]
        label_index = 23

    # HTRU2, No.1, line is split by '\r'
    if False:
        dataset_name = 'HTRU2'
        ori_file = 'HTRU_2.arff'
        feature_types = [True for i in range(8)]
        feature_types.append(False)
        label_index = 8

    # wdbc, No.4
    if False:
        dataset_name = 'wdbc'
        ori_file = 'wdbc.data'
        feature_types = [True for i in range(32)]
        feature_types[0] = False
        feature_types[1] = False
        useless_index = [0]
        label_index = 1

    # wpbc, No.4
    if False:
        dataset_name = 'wpbc'
        ori_file = 'wpbc.data'
        feature_types = [True for i in range(35)]
        feature_types[0] = False
        feature_types[1] = False
        useless_index = [0]
        label_index = 1

    # house_vote, No.1
    if True:
        dataset_name = 'house_vote'
        ori_file = 'house-votes-84.data.txt'
        feature_types = [False for i in range(17)]
        useless_index = []
        label_index = 0

    # processing--------------------------------------------------------------

    # No.1: just one data file, so we should split it into training and testing
    # data (cylinder, abalone, balanceScale, banknote, car, chess, chess2)
    if True:
        file_name = path + dataset_name + '/' + ori_file
        # reading data from file
        data = data_reader(file_name, feature_types)
        # missing value processing
        data = missing_value_processing(data, feature_types)
        # extract label
        data, label = extracting_label(data, label_index)
        del feature_types[label_index]
        # categorical feature encoding
        data = categorical_feature_encoding(data, feature_types)
        label, label_name = label_encoding(label)
        # split data into training and testing
        train_data, train_label, test_data, test_label = split_data(
            data, label, percent=0.2)

    # No.2: training and testing data are in different files (anneal)
    if False:
        train_file_name = path + dataset_name + '/' + ori_train_file
        test_file_name = path + dataset_name + '/' + ori_test_file
        # get training and testing data
        train_data = data_reader(train_file_name, feature_types)
        test_data = data_reader(test_file_name, feature_types)
        # extract label from data
        train_data, train_label = extracting_label(train_data, label_index)
        test_data, test_label = extracting_label(test_data, label_index)
        del feature_types[label_index]
        # delete useless features from data
        train_data = feature_selected(train_data, useless_index)
        test_data = feature_selected(test_data, useless_index)
        new_feature_types = []
        for i in range(len(feature_types)):
            if not (i in useless_index):
                new_feature_types.append(feature_types[i])
        feature_types = new_feature_types
        # process training and testing data at the same time
        train_data_len = len(train_data)
        test_data_len = len(test_data)
        data = []
        data.extend(train_data)
        data.extend(test_data)
        label = []
        label.extend(train_label)
        label.extend(test_label)
        # missing value processing
        data = missing_value_processing(data, feature_types)
        # encoding categorical features
        data = categorical_feature_encoding(data, feature_types)
        # encoding label
        label, label_name = label_encoding(label)
        # regain training and testing data from data
        train_data = []
        train_label = []
        for i in range(train_data_len):
            train_data.append(data[0])
            train_label.append(label[0])
            del data[0]
            del label[0]
        test_data = data
        test_label = label

    # No.3: training/testing features and labels are in four separate files
    # (gisette, madelon)
    if False:
        train_data_file_name = path + dataset_name + '/' + ori_train_data
        train_label_file_name = path + dataset_name + '/' + ori_train_label
        test_data_file_name = path + dataset_name + '/' + ori_test_data
        test_label_file_name = path + dataset_name + '/' + ori_test_label
        # get training and testing data
        train_data = data_reader(train_data_file_name, feature_types)
        train_label = data_reader(train_label_file_name, [False])
        train_label = np.array(train_label).reshape(len(train_label)).tolist()
        test_data = data_reader(test_data_file_name, feature_types)
        test_label = data_reader(test_label_file_name, [False])
        test_label = np.array(test_label).reshape(len(test_label)).tolist()
        # process training and testing data at the same time
        train_data_len = len(train_data)
        print('training data length:', train_data_len)
        test_data_len = len(test_data)
        print('testing data length:', test_data_len)
        data = []
        data.extend(train_data)
        data.extend(test_data)
        label = []
        label.extend(train_label)
        label.extend(test_label)
        # missing value processing
        data = missing_value_processing(data, feature_types)
        # encoding categorical features
        data = categorical_feature_encoding(data, feature_types)
        # encoding label
        label, label_name = label_encoding(label)
        # regain training and testing data from data
        train_data = []
        train_label = []
        for i in range(train_data_len):
            train_data.append(data[0])
            train_label.append(label[0])
            del data[0]
            del label[0]
        test_data = data
        test_label = label

    # No.4: like No.1, but delete some features (occupancy, yeast)
    if False:
        file_name = path + dataset_name + '/' + ori_file
        # reading data from file
        data = data_reader(file_name, feature_types)
        # missing value processing
        data = missing_value_processing(data, feature_types)
        # extract label
        data, label = extracting_label(data, label_index)
        del feature_types[label_index]
        # delete useless features
        data = feature_selected(data, useless_index)
        new_feature_types = []
        for i in range(len(feature_types)):
            if not (i in useless_index):
                new_feature_types.append(feature_types[i])
        feature_types = new_feature_types
        # categorical feature encoding
        data = categorical_feature_encoding(data, feature_types)
        label, label_name = label_encoding(label)
        # split data into training and testing
        train_data, train_label, test_data, test_label = split_data(
            data, label, percent=0.2)

    # No.5: reading csv files (wilt)
    if False:
        train_file_name = path + dataset_name + '/' + ori_train_file
        test_file_name = path + dataset_name + '/' + ori_test_file
        train_data = csv_data_reader(train_file_name, feature_types)
        test_data = csv_data_reader(test_file_name, feature_types)
        train_data, train_label = extracting_label(train_data, label_index)
        test_data, test_label = extracting_label(test_data, label_index)
        del feature_types[label_index]
        train_data_len = len(train_data)
        test_data_len = len(test_data)
        data = []
        label = []
        data.extend(train_data)
        data.extend(test_data)
        label.extend(train_label)
        label.extend(test_label)
        data = missing_value_processing(data, feature_types)
        data = categorical_feature_encoding(data, feature_types)
        label, label_name = label_encoding(label)
        # regain training and testing data from data
        train_data = []
        train_label = []
        for i in range(train_data_len):
            train_data.append(data[0])
            train_label.append(label[0])
            del data[0]
            del label[0]
        test_data = data
        test_label = label

    # No.6: training and testing data are in different files, processing based
    # on No.1 (adult)
    if False:
        train_file_name = path + dataset_name + '/' + ori_train_file
        test_file_name = path + dataset_name + '/' + ori_test_file
        train_data = data_reader(train_file_name, feature_types)
        test_data = data_reader(test_file_name, feature_types)
        train_data, train_label = extracting_label(train_data, label_index)
        test_data, test_label = extracting_label(test_data, label_index)
        del feature_types[label_index]
        train_data_len = len(train_data)
        test_data_len = len(test_data)
        data = []
        label = []
        data.extend(train_data)
        data.extend(test_data)
        label.extend(train_label)
        label.extend(test_label)
        data = missing_value_processing(data, feature_types)
        data = categorical_feature_encoding(data, feature_types)
        label, label_name = label_encoding(label)
        # regain training and testing data from data
        train_data = []
        train_label = []
        for i in range(train_data_len):
            train_data.append(data[0])
            train_label.append(label[0])
            del data[0]
            del label[0]
        test_data = data
        test_label = label

    print('-------------------------------------------------------------------------------------')
    print('training data length:', len(train_data),
          ', testing data length:', len(test_data))
    print('data feature size:', len(train_data[0]))
    print('-------------------------------------------------------------------------------------')

    train_data_file = path + dataset_name + '/' + dataset_name + '_train_data.pkl'
    test_data_file = path + dataset_name + '/' + dataset_name + '_test_data.pkl'
    label_class_file = path + dataset_name + '/' + dataset_name + '_label_class.txt'

    # writing train features and labels into a pickle file
    print('writing training data...')
    f = open(train_data_file, 'wb')
    pickle.dump(train_data, f, 2)
    pickle.dump(train_label, f, 2)
    f.close()

    # writing test features and labels into a pickle file
    print('writing testing data...')
    f = open(test_data_file, 'wb')
    pickle.dump(test_data, f, 2)
    pickle.dump(test_label, f, 2)
    f.close()

    # writing label classes into a txt file
    print('writing label class file...')
    buff = []
    buff.append(list2string(label_name))
    fo.FileWriter(label_class_file, buff, style='w')
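# Hedged sketch of the dataset_reader() counterpart to the pickling above (the
# real implementation lives elsewhere in the project). pickle.load() must be
# called in the same order as pickle.dump(): features first, then labels.
# Relies on the module-level `import pickle` that dataset_processing() uses.
def dataset_reader_sketch(train_file, test_file):
    with open(train_file, 'rb') as f:
        train_feature = pickle.load(f)  # dumped first
        train_label = pickle.load(f)    # dumped second
    with open(test_file, 'rb') as f:
        test_feature = pickle.load(f)
        test_label = pickle.load(f)
    return train_feature, train_label, test_feature, test_label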
def synthetic_problems_sample(budget=500, problem_name='sphere', problem_size=5,
                              max_bias=0.5, bias_step=0):
    sample_size = 10         # the instance number of sampling in an iteration
    positive_num = 2         # the set size of PosPop
    rand_probability = 0.99  # the probability of sample in model
    uncertain_bits = 2       # the dimension size that is sampled randomly
    start_index = 0
    repeat_num = 10

    exp_path = path + '/ExpLog/SyntheticProbsLog/'

    bias = 0
    dimension_size = 10
    dimension = Dimension()
    dimension.set_dimension_size(dimension_size)
    dimension.set_regions([[-1.0, 1.0] for _ in range(dimension_size)],
                          [0 for _ in range(dimension_size)])

    if bias_step > 0:
        problem_name += '_group-sample'

    for prob_i in range(problem_size):
        if bias_step > 0 and prob_i % (problem_size / max_bias * bias_step) == 0:
            bias += bias_step
        else:
            bias = max_bias

        # bias log format: 'index,bias_list: dim1 dim2 dim3...'
        bias_log = []
        running_log = []
        running_log.append('+++++++++++++++++++++++++++++++++')
        running_log.append('optimization setting: ')
        running_log.append('sample_size: ' + str(sample_size))
        running_log.append('positive_num: ' + str(positive_num))
        running_log.append('rand_probability: ' + str(rand_probability))
        running_log.append('uncertain_bits: ' + str(uncertain_bits))
        running_log.append('budget: ' + str(budget))
        running_log.append('group sample step: ' + str(bias_step))
        running_log.append('+++++++++++++++++++++++++++++++++')

        print(problem_name, ': ', start_index + prob_i,
              ' ==============================================')
        running_log.append(problem_name + ': ' + str(start_index + prob_i) +
                           ' ==============================================')

        # problem setting
        func = DistributedFunction(dim=dimension, bias_region=[-bias, bias])
        if 'ackley' in problem_name:
            prob = func.DisAckley
        elif 'sphere' in problem_name:
            prob = func.DisSphere
        elif 'rosenbrock' in problem_name:
            prob = func.DisRosenbrock
        else:
            print('Wrong function!')
            return

        # bias log
        bias_log.append(str(prob_i) + ',' + list2string(func.getBias()))
        print('function: ', problem_name, ', this bias: ', func.getBias())
        running_log.append('function: ' + problem_name + ', this bias: ' +
                           list2string(func.getBias()))

        # optimization setting
        optimizer = RacosOptimization(dimension)

        positive_set = []
        negative_set = []
        new_sample_set = []
        label_set = []
        for repeat_i in range(repeat_num):
            print('repeat ', repeat_i, ' ----------------------------------------')
            running_log.append('repeat ' + str(repeat_i) +
                               ' ----------------------------------------')

            # optimization process
            start_t = time.time()
            optimizer.mix_opt(obj_fct=prob, ss=sample_size, bud=budget,
                              pn=positive_num, rp=rand_probability,
                              ub=uncertain_bits)
            end_t = time.time()
            hour, minute, second = time_formulate(start_t, end_t)

            # optimization results
            optimal = optimizer.get_optimal()
            print('optimal v: ', optimal.get_fitness(), ' - ', optimal.get_features())
            running_log.append('optimal v: ' + str(optimal.get_fitness()) + ' - ' +
                               list2string(optimal.get_features()))
            print('spent time: ', hour, ':', minute, ':', second)
            running_log.append('spent time: ' + str(hour) + ':' + str(minute) +
                               ':' + str(second))

            # log samples
            this_positive, this_negative, this_new, this_label = optimizer.get_log()
            print('sample number: ', len(this_positive), ':', len(this_label))
            running_log.append('sample number: ' + str(len(this_positive)) + ':' +
                               str(len(this_label)))
            positive_set.extend(this_positive)
            negative_set.extend(this_negative)
            new_sample_set.extend(this_new)
            label_set.extend(this_label)

        print('----------------------------------------------')
        print('sample finish!')
        print('all sample number: ', len(positive_set), '-', len(negative_set),
              '-', len(new_sample_set), '-', len(label_set))
        running_log.append('----------------------------------------------')
        running_log.append('all sample number: ' + str(len(positive_set)) + '-' +
                           str(len(negative_set)) + '-' + str(len(new_sample_set)) +
                           '-' + str(len(label_set)))

        data_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/DataLog/' + \
                        'data-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' \
                        + str(bias) + '-' + str(start_index + prob_i) + '.pkl'
        bias_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/RecordLog/' + 'bias-' \
                        + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' + str(bias) \
                        + '-' + str(start_index + prob_i) + '.txt'
        running_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/RecordLog/' + \
                           'running-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' \
                           + str(bias) + '-' + str(start_index + prob_i) + '.txt'

        print('data logging: ', data_log_file)
        running_log.append('data log path: ' + data_log_file)
        save_log(positive_set, negative_set, new_sample_set, label_set, data_log_file)

        print('bias logging: ', bias_log_file)
        running_log.append('bias log path: ' + bias_log_file)
        fo.FileWriter(bias_log_file, bias_log, style='w')

        print('running logging: ', running_log_file)
        fo.FileWriter(running_log_file, running_log, style='w')
    return
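# Example invocation (illustrative, not part of the original script): with the
# default bias_step=0, every problem is sampled at the fixed bias max_bias;
# passing bias_step > 0 enables the grouped, step-wise bias schedule above.
# synthetic_problems_sample(budget=500, problem_name='sphere', problem_size=5,
#                           max_bias=0.5, bias_step=0)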
def run_racos():
    # parameters
    sample_size = 10         # the instance number of sampling in an iteration
    budget = 500             # budget in online style
    positive_num = 2         # the set size of PosPop
    rand_probability = 0.99  # the probability of sample in model
    uncertain_bit = 1        # the dimension size that is sampled randomly
    bias_region = 0.5
    repeat = 10

    # dimension setting
    dimension_size = 10
    dimension = Dimension()
    dimension.set_dimension_size(dimension_size)
    dimension.set_regions([[-1.0, 1.0] for _ in range(dimension_size)],
                          [0 for _ in range(dimension_size)])

    # note: problem_name and log_buffer are read from module scope here
    func = DistributedFunction(dim=dimension, bias_region=[-bias_region, bias_region])
    if problem_name == 'rosenbrock':
        prob = func.DisRosenbrock
    else:
        prob = func.DisSphere

    # optimization
    racos = RacosOptimization(dimension)
    opt_error_list = []
    for i in range(repeat):
        start_t = time.time()
        racos.mix_opt(prob, ss=sample_size, bud=budget, pn=positive_num,
                      rp=rand_probability, ub=uncertain_bit)
        end_t = time.time()
        optimal = racos.get_optimal()
        hour, minute, second = time_formulate(start_t, end_t)
        print('total budget is ', budget, '------------------------------')
        print('spending time: ', hour, ' hours ', minute, ' minutes ', second, ' seconds')
        print('optimal value: ', optimal.get_fitness())

        opt_error = optimal.get_fitness()
        optimal_x = optimal.get_features()
        opt_error_list.append(opt_error)
        print('validation optimal value: ', opt_error)
        log_buffer.append('validation optimal value: ' + str(opt_error))
        print('optimal x: ', optimal_x)
        log_buffer.append('optimal x: ' + list2string(optimal_x))

    opt_mean = np.mean(np.array(opt_error_list))
    opt_std = np.std(np.array(opt_error_list))
    print('--------------------------------------------------')
    print('optimization result: ', opt_mean, '#', opt_std)
    log_buffer.append('--------------------------------------------------')
    log_buffer.append('optimization result: ' + str(opt_mean) + '#' + str(opt_std))
    return opt_mean
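# Example setup (illustrative): run_racos() reads problem_name and log_buffer
# from module scope, so a caller would define them first.
# problem_name = 'sphere'
# log_buffer = []
# mean_error = run_racos()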
def run_for_synthetic_problem():
    sample_size = 10         # the instance number of sampling in an iteration
    budget = 50              # budget in online style
    positive_num = 2         # the set size of PosPop
    rand_probability = 0.99  # the probability of sample in model
    uncertain_bit = 1        # the dimension size that is sampled randomly
    adv_threshold = 10       # advance sample size

    opt_repeat = 10
    dimension_size = 10
    problem_name = 'sphere'
    bias_region = 0.5

    eta = 0.9
    step = 100

    dimension = Dimension()
    dimension.set_dimension_size(dimension_size)
    dimension.set_regions([[-1.0, 1.0] for _ in range(dimension_size)],
                          [0 for _ in range(dimension_size)])

    log_buffer = []

    # problem define
    func = DistributedFunction(dimension, bias_region=[-0.5, 0.5])
    target_bias = [0.2 for _ in range(dimension_size)]
    func.setBias(target_bias)
    if problem_name == 'ackley':
        prob_fct = func.DisAckley
    else:
        prob_fct = func.DisSphere

    log_buffer.append('+++++++++++++++++++++++++++++++')
    log_buffer.append('optimization parameters')
    log_buffer.append('sample size: ' + str(sample_size))
    log_buffer.append('budget: ' + str(budget))
    log_buffer.append('positive num: ' + str(positive_num))
    log_buffer.append('random probability: ' + str(rand_probability))
    log_buffer.append('uncertain bits: ' + str(uncertain_bit))
    log_buffer.append('advance num: ' + str(adv_threshold))
    log_buffer.append('+++++++++++++++++++++++++++++++')
    log_buffer.append('problem parameters')
    log_buffer.append('dimension size: ' + str(dimension_size))
    log_buffer.append('problem name: ' + problem_name)
    log_buffer.append('bias: ' + list2string(target_bias))
    log_buffer.append('+++++++++++++++++++++++++++++++')

    predictors, load_buffer = get_predicotrs()
    expert = Experts(predictors=predictors, eta=eta, step=step)
    log_buffer.extend(load_buffer)

    opt_error_list = []
    for i in range(opt_repeat):
        print('optimize ', i, '===================================================')
        log_buffer.append('optimize ' + str(i) +
                          '===================================================')
        exp_racos = ExpAdaRacosOptimization(dimension, expert)

        start_t = time.time()
        exp_racos.exp_ada_mix_opt(obj_fct=prob_fct, ss=sample_size, bud=budget,
                                  pn=positive_num, rp=rand_probability,
                                  ub=uncertain_bit, at=adv_threshold)
        end_t = time.time()

        print('total budget is ', budget)
        log_buffer.append('total budget is ' + str(budget))
        hour, minute, second = time_formulate(start_t, end_t)
        print('spending time: ', hour, ':', minute, ':', second)
        log_buffer.append('spending time: ' + str(hour) + ':' + str(minute) + ':' + str(second))

        optimal = exp_racos.get_optimal()
        opt_error = optimal.get_fitness()
        optimal_x = optimal.get_features()
        opt_error_list.append(opt_error)
        print('validation optimal value: ', opt_error)
        log_buffer.append('validation optimal value: ' + str(opt_error))
        print('optimal x: ', optimal_x)
        log_buffer.append('optimal x: ' + list2string(optimal_x))

    opt_mean = np.mean(np.array(opt_error_list))
    opt_std = np.std(np.array(opt_error_list))
    print('--------------------------------------------------')
    print('optimization result: ', opt_mean, '#', opt_std)
    log_buffer.append('--------------------------------------------------')
    log_buffer.append('optimization result: ' + str(opt_mean) + '#' + str(opt_std))

    result_path = path + '/Results/Ada/' + problem_name + '/dimension' + str(dimension_size) + '/'
    optimization_log_file = result_path + 'opt-log-' + problem_name + '-dim' + str(dimension_size) + '-bias' \
                            + str(bias_region) + '.txt'
    print('optimization logging: ', optimization_log_file)
    fo.FileWriter(optimization_log_file, log_buffer, style='w')
    return
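# Illustrative entry point: apart from get_predicotrs() and the module-level
# `path`, run_for_synthetic_problem() is self-contained.
# if __name__ == '__main__':
#     run_for_synthetic_problem()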
def run(run_type):
    # run() relies on module-level experiment globals defined below / elsewhere:
    # dimension, nets, predictors, prob_fct, sample_size, budget, positive_num,
    # rand_probability, uncertain_bit, adv_threshold, opt_repeat, eta, step and
    # log_buffer. The optimizers are expected to return the error curve over
    # the budget, which is what the plotting code at module level consumes.
    opt_error_list = []
    log_buffer.append('+++++++++++++++++++++++++++++++')
    log_buffer.append('Running: ' + run_type)
    log_buffer.append('+++++++++++++++++++++++++++++++')
    print('+++++++++++++++++++++++++++++++')
    print('Running: ' + run_type)
    print('+++++++++++++++++++++++++++++++')

    if run_type == 'ada':
        # pre = sorted(predictors, key=lambda a: a.dist)
        expert = Experts(predictors=predictors, eta=eta, bg=budget)

    for i in range(opt_repeat):
        print('optimize ', i, '===================================================')
        log_buffer.append('optimize ' + str(i) +
                          '===================================================')
        start_t = time.time()
        if run_type == 'ave':
            exp_racos = ExpRacosOptimization(dimension, nets)
            opt_error = exp_racos.exp_mix_opt(obj_fct=prob_fct, ss=sample_size,
                                              bud=budget, pn=positive_num,
                                              rp=rand_probability,
                                              ub=uncertain_bit, at=adv_threshold)
        elif run_type == 'ada':
            exp_racos = ExpAdaRacosOptimization(dimension, expert)
            opt_error = exp_racos.exp_ada_mix_opt(obj_fct=prob_fct, ss=sample_size,
                                                  bud=budget, pn=positive_num,
                                                  rp=rand_probability,
                                                  ub=uncertain_bit,
                                                  at=adv_threshold, step=step)
        elif run_type == 'ground truth':
            exp_racos = ExpRacosOptimization(dimension, nets[:step])
            opt_error = exp_racos.exp_mix_opt(obj_fct=prob_fct, ss=sample_size,
                                              bud=budget, pn=positive_num,
                                              rp=rand_probability,
                                              ub=uncertain_bit, at=adv_threshold)
        else:
            print('Wrong type!')
            return
        end_t = time.time()
        hour, minute, second = time_formulate(start_t, end_t)
        print('spending time: ', hour, ':', minute, ':', second)
        log_buffer.append('spending time: ' + str(hour) + ':' + str(minute) + ':' + str(second))

        optimal = exp_racos.get_optimal()
        optimal_x = optimal.get_features()
        # keep the per-run error curve; the final optimum is logged separately
        opt_error_list.append(opt_error)
        print('validation optimal value: ', optimal.get_fitness())
        log_buffer.append('validation optimal value: ' + str(optimal.get_fitness()))
        print('optimal x: ', optimal_x)
        log_buffer.append('optimal x: ' + list2string(optimal_x))

    # element-wise mean/std over the repeated curves
    opt_mean = np.mean(np.array(opt_error_list), axis=0)
    opt_std = np.std(np.array(opt_error_list), axis=0)
    print('--------------------------------------------------')
    print('optimization result for ' + str(opt_repeat) + ' times average: ',
          opt_mean, ', standard variance is: ', opt_std)
    log_buffer.append('--------------------------------------------------')
    log_buffer.append('optimization result for ' + str(opt_repeat) +
                      ' times average: ' + str(opt_mean) +
                      ', standard variance is: ' + str(opt_std))
    return opt_mean, opt_std
    # tail of the module-level problem selection above: unknown problem names
    # abort the script
    print('Wrong function!')
    exit()

log_buffer.append('+++++++++++++++++++++++++++++++')
log_buffer.append('optimization parameters')
log_buffer.append('sample size: ' + str(sample_size))
log_buffer.append('budget: ' + str(budget))
log_buffer.append('positive num: ' + str(positive_num))
log_buffer.append('random probability: ' + str(rand_probability))
log_buffer.append('uncertain bits: ' + str(uncertain_bit))
log_buffer.append('advance num: ' + str(adv_threshold))
log_buffer.append('+++++++++++++++++++++++++++++++')
log_buffer.append('problem parameters')
log_buffer.append('dimension size: ' + str(dimension_size))
log_buffer.append('problem name: ' + problem_name)
log_buffer.append('bias: ' + list2string(target_bias))
log_buffer.append('+++++++++++++++++++++++++++++++')

predictors, nets = get_mixed_predicotrs()

# compare the strategies; run() returns the mean error curve and its standard
# deviation over opt_repeat repeats
opt_mean_gt, opt_std_gt = run('ground truth')
opt_mean_ada, opt_std_ada = run('ada')
opt_mean_ave, opt_std_ave = run('ave')
opt_mean_ne, opt_std_ne = run_no_expert()

# plot the mean curves; the no-expert result is drawn as a constant baseline
x = [i for i in range(len(opt_mean_ada))]
y0 = [opt_mean_ne for _ in range(len(opt_mean_ada))]
plt.plot(x, y0)
plt.plot(x, opt_mean_ada)
plt.plot(x, opt_mean_ave)
plt.show()
def synthetic_problems_sample(prob_i):
    # sample_size, positive_num, rand_probability, uncertain_bits, budget,
    # problem_name, problem_num, start_index, bias_region, repeat_num,
    # dimension, dimension_size and exp_path are read from module scope here

    # bias log format: 'index,bias_list: dim1 dim2 dim3...'
    bias_log = []
    running_log = []
    running_log.append('+++++++++++++++++++++++++++++++++')
    running_log.append('optimization setting: ')
    running_log.append('sample_size: ' + str(sample_size))
    running_log.append('positive_num: ' + str(positive_num))
    running_log.append('rand_probability: ' + str(rand_probability))
    running_log.append('uncertain_bits: ' + str(uncertain_bits))
    running_log.append('budget: ' + str(budget))
    running_log.append('+++++++++++++++++++++++++++++++++')

    print(problem_name, ': ', start_index + prob_i,
          ' ==============================================')
    running_log.append(problem_name + ': ' + str(start_index + prob_i) +
                       ' ==============================================')

    # bias setting: problems are split into groups, and the bias region grows
    # by bias_step from one group to the next
    group_num = 10
    group_size = problem_num / group_num
    bias_step = bias_region / group_num
    new_bias_region = int(prob_i / group_size) * bias_step

    # problem setting
    func = DistributedFunction(dim=dimension,
                               bias_region=[-new_bias_region, new_bias_region])
    if 'ackley' in problem_name:
        prob_fct = func.DisAckley
    elif 'sphere' in problem_name:
        prob_fct = func.DisSphere
    elif 'rosenbrock' in problem_name:
        prob_fct = func.DisRosenbrock
    else:
        print('Wrong function!')
        exit()

    # bias log
    bias_log.append(str(prob_i + start_index) + ',' + list2string(func.getBias()))
    print('function: ', problem_name, ', this bias: ', func.getBias())
    running_log.append('function: ' + problem_name + ', this bias: ' +
                       list2string(func.getBias()))

    # optimization setting
    optimizer = RacosOptimization(dimension)

    positive_set = []
    negative_set = []
    new_sample_set = []
    label_set = []
    for repeat_i in range(repeat_num):
        print('repeat ', repeat_i, ' ----------------------------------------')
        running_log.append('repeat ' + str(repeat_i) +
                           ' ----------------------------------------')

        # optimization process
        start_t = time.time()
        optimizer.mix_opt(obj_fct=prob_fct, ss=sample_size, bud=budget,
                          pn=positive_num, rp=rand_probability,
                          ub=uncertain_bits)
        end_t = time.time()
        hour, minute, second = time_formulate(start_t, end_t)

        # optimization results
        optimal = optimizer.get_optimal()
        print('optimal v: ', optimal.get_fitness(), ' - ', optimal.get_features())
        running_log.append('optimal v: ' + str(optimal.get_fitness()) + ' - ' +
                           list2string(optimal.get_features()))
        print('spent time: ', hour, ':', minute, ':', second)
        running_log.append('spent time: ' + str(hour) + ':' + str(minute) + ':' +
                           str(second))

        # log samples
        this_positive, this_negative, this_new, this_label = optimizer.get_log()
        print('sample number: ', len(this_positive), ':', len(this_label))
        running_log.append('sample number: ' + str(len(this_positive)) + ':' +
                           str(len(this_label)))
        positive_set.extend(this_positive)
        negative_set.extend(this_negative)
        new_sample_set.extend(this_new)
        label_set.extend(this_label)

    print('----------------------------------------------')
    print('sample finish!')
    print('all sample number: ', len(positive_set), '-', len(negative_set), '-',
          len(new_sample_set), '-', len(label_set))
    running_log.append('----------------------------------------------')
    running_log.append('all sample number: ' + str(len(positive_set)) + '-' +
                       str(len(negative_set)) + '-' + str(len(new_sample_set)) +
                       '-' + str(len(label_set)))

    data_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/DataLog/' + \
                    'data-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' \
                    + str(bias_region) + '-' + str(start_index + prob_i) + '.pkl'
    bias_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/RecordLog/' + 'bias-' \
                    + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' + str(bias_region) \
                    + '-' + str(start_index + prob_i) + '.txt'
    running_log_file = exp_path + str(problem_name) + '/dimension' + str(dimension_size) + '/RecordLog/' + \
                       'running-' + problem_name + '-' + 'dim' + str(dimension_size) + '-' + 'bias' \
                       + str(bias_region) + '-' + str(start_index + prob_i) + '.txt'

    print('data logging: ', data_log_file)
    running_log.append('data log path: ' + data_log_file)
    save_log(positive_set, negative_set, new_sample_set, label_set, data_log_file)

    print('bias logging: ', bias_log_file)
    running_log.append('bias log path: ' + bias_log_file)
    fo.FileWriter(bias_log_file, bias_log, style='w')

    print('running logging: ', running_log_file)
    fo.FileWriter(running_log_file, running_log, style='w')
    return
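# Illustrative driver (not part of the original script): each problem index is
# independent and writes its own log files, so the per-problem sampler above
# can be fanned out across worker processes. This assumes the module-level
# globals the function reads are defined at import time; on platforms that
# spawn rather than fork, they must be set up in each worker as well.
from multiprocessing import Pool


def sample_all_problems(worker_num=4):
    # map the independent problem indices over a small process pool
    with Pool(worker_num) as pool:
        pool.map(synthetic_problems_sample, range(problem_num))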