def cv_scores_scales(X, y, clf, cv, amount, information):
    """Run cv-fold cross-validation on a rescaled copy of (X, y).

    Parameters
    ----------
    X, y : dataset as lists; shuffled via `shuffle_set` and rescaled via
        `split(X, y, amount)` before folding.
    clf : classifier template; a fresh `copy` is fitted per fold.
    cv : number of folds.
    amount : rescaling amount forwarded to `split`.
    information : verbosity level — 1 returns scores only, 2 adds the
        guessed class distributions, >= 3 adds raw predictions and timings.

    Returns
    -------
    scorings                            if information <= 1
    scorings, guessed                   if information == 2
    scorings, guessed, predicts, time   if information >= 3
        where `time` is [total fit seconds, total predict seconds].
    """
    X, y = shuffle_set(X, y)
    sc = 1  # number of scored classifier variants (only one here)
    scorings = [[]]
    score = [[]]
    predict = [[]]
    guessed = [[]]
    predicts = [[], []]
    time = [0, 0]  # [fit seconds, predict seconds], summed over folds
    X, y = split(X, y, amount)
    fold = len(X) // cv
    for i in range(cv):
        cv_clf = copy(clf)
        # BUG FIX: the original hard-coded `i == 9` and sliced middle folds
        # with `len(X)//10` even though `cv` is a parameter; generalized to
        # `cv` (behaviour is identical for cv == 10).
        if i == 0:
            # first fold: test on the trailing chunk
            X_train, X_test = X[:len(X) - fold], X[len(X) - fold:]
            y_train, y_test = y[:len(y) - fold], y[len(y) - fold:]
        elif i == cv - 1:
            # last fold: test on the leading chunk
            X_train, X_test = X[fold:], X[:fold]
            y_train, y_test = y[fold:], y[:fold]
        else:
            X_train = X[:fold * i]
            X_train.extend(X[fold * (i + 1):])
            X_test = X[fold * i:fold * (i + 1)]
            y_train = y[:fold * i]
            y_train.extend(y[fold * (i + 1):])
            y_test = y[fold * i:fold * (i + 1)]
        with stopwatch() as sw:
            _ = cv_clf.fit(X_train, y_train)
        time[0] += sw.duration
        with stopwatch() as sw:
            predict[0] = cv_clf.predict(X_test)
        time[1] += sw.duration
        for k in range(sc):
            score[k] = 0
        if information >= 2:
            for k in range(sc):
                guessed[k].append(distr_guessed(predict[k]))
        for k in range(sc):
            scorings[k].append(accuracy_score(y_test, predict[k]))
        if information >= 3:
            for k in range(sc):
                predicts[k].append(predict[k])
            predicts[sc].append(y_test)  # true labels stored after predictions
    if information >= 3:
        return scorings, guessed, predicts, time
    elif information == 2:
        return scorings, guessed
    return scorings
def information_theoretic_metafeatures(X, y, categorical):
    """Compute information-theoretic meta-features of dataset (X, y).

    Parameters
    ----------
    X : 2-D array of attribute values.
    y : 1-D array of class labels.
    categorical : boolean mask marking which columns of X are categorical.

    Returns
    -------
    OrderedDict mapping feature names to values; if there are no
    categorical attributes, all features are set to -1 instead.
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    classes, counts = np.unique(y, return_counts=True)
    features["ClassEntropy"] = scipy.stats.entropy(counts, base=2)
    # The remaining information-theoretic meta-features only apply to
    # categorical attributes.
    if sum(categorical) == 0:
        # `-1` passed positionally: dict.fromkeys takes `value`
        # positional-only, so the keyword form can raise TypeError.
        return OrderedDict.fromkeys(
            information_theoretic_metafeature_names(), -1)
    with utils.stopwatch() as sw:
        feature_entropies = [scipy.stats.entropy(column[0])
                             for column in X[:, np.where(categorical)].T]
        mean_feature_entropy = np.mean(feature_entropies)
        features["MeanFeatureEntropy"] = mean_feature_entropy
        mutual_informations = [sklearn.metrics.mutual_info_score(y, column[0])
                               for column in X[:, np.where(categorical)].T]
        mean_mutual_information = np.mean(mutual_informations)
        features["MeanMutualInformation"] = mean_mutual_information
        # BUG FIX: the original assigned 0 for the zero-information case
        # and then unconditionally recomputed the ratio, dividing by zero.
        if mean_mutual_information == 0:
            features["NoiseToSignalRatio"] = 0
        else:
            features["NoiseToSignalRatio"] = (
                (mean_feature_entropy - mean_mutual_information)
                / mean_mutual_information)
    features["InformationFeatureTime"] = sw.duration
    return features
def ga_main1(model, out='result', clear_directory=False):
    """Run the GA for `model` ('nsga2' or 'moead'), then plot the result.

    Optionally clears the output directory first; pickles the final
    population once the last epoch completes.
    """
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    pop_size = 500
    epoch = 100
    ksize = 5

    env_class = {'nsga2': NSGA2_ENV, 'moead': MOEAD_ENV}[model]
    with env_class(pop_size=pop_size, ksize=ksize) as optimizer:
        with ut.stopwatch('main'):
            # GA run: build the initial population, then evolve.
            optimizer.init_population()
            for gen in range(1, epoch + 1):
                optimizer.advance()
                print('epoch:', gen, 'popsize:', len(optimizer.population),
                      end='\r')
                if gen == epoch:  # save only after the final epoch
                    file = f'pop_size{pop_size}_epoch{gen}_{ut.strnow("%Y%m%d_%H%M")}.pickle'
                    optimizer.save(file=os.path.join(out, file))
    ga_result11(model, out=out, show=False)
def ga_main1(method_name, out='result', clear_directory=False):
    """Run one GA optimization and return (env, optimizer, history).

    Parameters
    ----------
    method_name : optimizer key forwarded to `M.Optimize_ENV`.
    out : output directory for the pickled result.
    clear_directory : when True, delete `out` before running.
    """
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)
    popsize = 100
    epoch = 100
    ksize = 5
    # Acquire the optimization environment for the requested method.
    with M.Optimize_ENV(method_name, popsize=popsize, ksize=ksize) as env:
        optimizer = env.optimizer
        creator = env.creator
        with ut.stopwatch('main'):
            # GA start: initial population, then evolve epoch by epoch.
            population = optimizer.init_population(creator, popsize=popsize)
            history = [population]
            for i in range(1, epoch + 1):
                population = optimizer(population)
                history.append(population)
                print('epoch:', i, 'popsize:', len(population), end='\r')
                if i == epoch:
                    # Write the model to a file on the final epoch.
                    # BUG FIX: the original interpolated `ut.snow` (an
                    # uncalled attribute) into the file name; the sibling
                    # ga_main1 variants use ut.strnow("%Y%m%d_%H%M") —
                    # assumed typo, made consistent.
                    file = f'popsize{popsize}_epoch{i}_{ut.strnow("%Y%m%d_%H%M")}.pkl'
                    file = os.path.join(out, file)
                    print('save:', file)
                    ut.save(file, (env, optimizer, history))
    return env, optimizer, history
def simple_metafeatures(X, y, categorical):
    """Counting-based ("simple") meta-features of dataset (X, y).

    Returns an OrderedDict of feature name -> value, including the
    wall-clock time spent ("SimpleFeatureTime").
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    n = X.shape[0]
    p = X.shape[1]
    with utils.stopwatch() as sw:
        features["NumberOfInstances"] = n
        features["LogNumberOfInstances"] = np.log(n)
        features["NumberOfFeatures"] = p
        features["LogNumberOfFeatures"] = np.log(p)
        features["DatasetRatio"] = p / n
        features["LogDatasetRatio"] = np.log(p / n)
        features["InverseDatasetRatio"] = n / p
        features["LogInverseDatasetRatio"] = np.log(n / p)

        classes, counts = np.unique(y, return_counts=True)
        nr_nominal = sum(categorical)
        nr_numeric = len(categorical) - nr_nominal
        features["NumberOfClasses"] = classes.shape[0]
        features["NumberOfCategoricalFeatures"] = nr_nominal
        features["NumberOfNumericFeatures"] = nr_numeric
        features["RatioNumericalToNominal"] = (
            nr_numeric / nr_nominal if nr_nominal > 0 else 0)
        features["RatioNominalToNumerical"] = (
            nr_nominal / nr_numeric if nr_numeric > 0 else 0)

        class_probabilities = [count / n for count in counts]
        features["ClassProbabilityMin"] = np.min(class_probabilities)
        features["ClassProbabilityMax"] = np.max(class_probabilities)
        features["ClassProbabilityMean"] = np.mean(class_probabilities)
        features["ClassProbabilitySTD"] = np.std(class_probabilities)

        # Distinct-symbol counts per categorical column.
        symbols_per_column = [np.unique(col).shape[0]
                              for col in X[:, np.where(categorical)].T]
        if symbols_per_column:
            features["SymbolsMin"] = np.min(symbols_per_column)
            features["SymbolsMax"] = np.max(symbols_per_column)
            features["SymbolsMean"] = np.mean(symbols_per_column)
            features["SymbolsSTD"] = np.std(symbols_per_column)
            features["SymbolsSum"] = np.sum(symbols_per_column)
        else:
            for key in ("SymbolsMin", "SymbolsMax", "SymbolsMean",
                        "SymbolsSTD", "SymbolsSum"):
                features[key] = 0
    features["SimpleFeatureTime"] = sw.duration
    # Missing-value features omitted: only datasets without missing
    # values were selected.
    return features
def optimizeRF(did, amount):
    """Random-search a RandomForestClassifier on dataset `did`.

    `amount` copy-features are appended to X first. Returns the best
    estimator, its 10-fold CV scores plus the held-out test score, and
    the search duration in seconds.
    """
    X, y = read_did(did)
    X = add_copy_features(X, amount)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    search_space = {
        'max_features': range(1, len(X[0])),
        'min_samples_split': range(2, 20),
    }
    search = RandomizedSearchCV(RandomForestClassifier(),
                                param_distributions=search_space,
                                n_iter=40, n_jobs=3)
    with stopwatch() as sw:
        _ = search.fit(X_train, y_train)
    duration = sw.duration

    estimator = search.best_estimator_
    cv_score = list(cross_val_score(search.best_estimator_, X, y, cv=10))
    cv_score.append(search.score(X_test, y_test))  # held-out score last
    return estimator, cv_score, duration
def simple_metafeatures(X, y, categorical):
    """Compute the simple (instance/feature/class counting) meta-features.

    Returns an OrderedDict of feature name -> value; the elapsed time is
    recorded under "SimpleFeatureTime".
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    n = X.shape[0]
    p = X.shape[1]
    with utils.stopwatch() as sw:
        ratio = p / n
        features["NumberOfInstances"] = n
        features["LogNumberOfInstances"] = np.log(n)
        features["NumberOfFeatures"] = p
        features["LogNumberOfFeatures"] = np.log(p)
        features["DatasetRatio"] = ratio
        features["LogDatasetRatio"] = np.log(ratio)
        features["InverseDatasetRatio"] = n / p
        features["LogInverseDatasetRatio"] = np.log(n / p)

        labels, label_counts = np.unique(y, return_counts=True)
        nominal_count = sum(categorical)
        numeric_count = len(categorical) - nominal_count
        features["NumberOfClasses"] = labels.shape[0]
        features["NumberOfCategoricalFeatures"] = nominal_count
        features["NumberOfNumericFeatures"] = numeric_count
        # Ratios default to 0 when the denominator attribute kind is absent.
        if nominal_count > 0:
            features["RatioNumericalToNominal"] = numeric_count / nominal_count
        else:
            features["RatioNumericalToNominal"] = 0
        if numeric_count > 0:
            features["RatioNominalToNumerical"] = nominal_count / numeric_count
        else:
            features["RatioNominalToNumerical"] = 0

        probs = [c / n for c in label_counts]
        features["ClassProbabilityMin"] = np.min(probs)
        features["ClassProbabilityMax"] = np.max(probs)
        features["ClassProbabilityMean"] = np.mean(probs)
        features["ClassProbabilitySTD"] = np.std(probs)

        symbol_counts = [np.unique(column).shape[0]
                         for column in X[:, np.where(categorical)].T]
        if len(symbol_counts) > 0:
            features["SymbolsMin"] = np.min(symbol_counts)
            features["SymbolsMax"] = np.max(symbol_counts)
            features["SymbolsMean"] = np.mean(symbol_counts)
            features["SymbolsSTD"] = np.std(symbol_counts)
            features["SymbolsSum"] = np.sum(symbol_counts)
        else:
            # No categorical columns: zero out every symbol statistic.
            features["SymbolsMin"] = features["SymbolsMax"] = features[
                "SymbolsMean"] = features["SymbolsSTD"] = features[
                    "SymbolsSum"] = 0
    features["SimpleFeatureTime"] = sw.duration
    # Missing-value features omitted: only complete datasets were selected.
    return features
def cross_validate_classifier(clf, X, y, folds):
    """Cross-validate `clf` over the precomputed `folds`.

    Reports mean accuracy and timing (total time was later dropped from
    the study but is still returned).

    Returns
    -------
    (mean accuracy, total time, mean time per fold)
    """
    accuracies = []
    times = []
    for train_idx, test_idx in folds:
        # Time the fit/score of this fold as one unit.
        with stopwatch() as sw:
            clf.fit(X[train_idx, :], y[train_idx])
            fold_accuracy = clf.score(X[test_idx], y[test_idx])
        accuracies.append(fold_accuracy)
        times.append(sw.duration)
    return np.mean(accuracies), sum(times), np.mean(times)
def optimizeADA(did, amount):
    """Random-search an AdaBoostClassifier's learning rate on dataset `did`.

    `amount` copy-features are appended to X first. Returns the best
    estimator, its 10-fold CV scores plus the held-out test score, and
    the search duration in seconds.
    """
    X, y = read_did(did)
    iters = 40
    X = add_copy_features(X, amount)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Candidate learning rates drawn uniformly from (0.1, 2.0).
    rates = [random() * 1.9 + 0.1 for _ in range(iters * 4)]
    search = RandomizedSearchCV(AdaBoostClassifier(),
                                param_distributions={'learning_rate': rates},
                                n_iter=iters, n_jobs=3)
    with stopwatch() as sw:
        _ = search.fit(X_train, y_train)
    duration = sw.duration

    estimator = search.best_estimator_
    cv_score = list(cross_val_score(search.best_estimator_, X, y, cv=10))
    cv_score.append(search.score(X_test, y_test))  # held-out score last
    return estimator, cv_score, duration
def cross_validate_classifier(clf, X, y, folds):
    """Run a cross-validation experiment for classifier `clf`.

    Mean accuracy and mean per-fold time are the figures of interest;
    the total time is also returned (later left out of the study).

    Returns
    -------
    (mean accuracy, total time, mean time per fold)
    """
    results = []
    for train, test in folds:
        with stopwatch() as sw:
            clf.fit(X[train], y[train])
            acc = clf.score(X[test], y[test])
        results.append((acc, sw.duration))
    accuracies = [acc for acc, _ in results]
    times = [dur for _, dur in results]
    return np.mean(accuracies), sum(times), np.mean(times)
def statistical_metafeatures(X, y, categorical):
    """Statistical meta-features (kurtosis/skew/std over numeric columns).

    Only numerical attributes are considered; if the dataset has none,
    every feature is reported as -1 (imperfect: -1 is also a legal value
    for some of these statistics).
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    numerical = [not is_cat for is_cat in categorical]
    if sum(numerical) == 0:
        return OrderedDict.fromkeys(statistical_metafeature_names(), value=-1)
    with utils.stopwatch() as sw:
        numeric_columns = X[:, np.where(numerical)].T
        # Kurtosis-of-kurtosis / skewness-of-kurtosis follow Reif et al.,
        # Meta2-features (2012).
        col_kurtosis = [scipy.stats.kurtosis(col[0]) for col in numeric_columns]
        features["KurtosisMin"] = np.min(col_kurtosis)
        features["KurtosisMax"] = np.max(col_kurtosis)
        features["KurtosisMean"] = np.mean(col_kurtosis)
        features["KurtosisSTD"] = np.std(col_kurtosis)
        features["KurtosisKurtosis"] = scipy.stats.kurtosis(col_kurtosis)
        features["KurtosisSkewness"] = scipy.stats.skew(col_kurtosis)

        col_skewness = [scipy.stats.skew(col[0]) for col in numeric_columns]
        features["SkewnessMin"] = np.min(col_skewness)
        features["SkewnessMax"] = np.max(col_skewness)
        features["SkewnessMean"] = np.mean(col_skewness)
        features["SkewnessSTD"] = np.std(col_skewness)
        features["SkewnessKurtosis"] = scipy.stats.kurtosis(col_skewness)
        features["SkewnessSkewness"] = scipy.stats.skew(col_skewness)

        col_stds = [np.std(col[0]) for col in numeric_columns]
        features["MeanSTDOfNumerical"] = np.mean(col_stds)
        features["STDSTDOfNumerical"] = np.std(col_stds)
    features["StatisticalFeatureTime"] = sw.duration
    return features
def ga_main2(out='result', clear_directory=False):
    """Benchmark MOEA/D repeatedly against the known optimal front.

    Runs 100 repetitions, printing convergence/diversity per run and
    their averages at the end.
    """
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    epoch = 250
    save_trigger = lambda i: i == epoch  # only the final epoch
    optimal_front = get_optomal_front()
    stat = []

    with MOEAD_ENV() as optimizer:
        for rep in range(100):
            with ut.stopwatch(f'epoch{epoch+1}'):
                optimizer.create_initial_population()
                for gen in range(1, epoch + 1):
                    optimizer.advance()
                    print('epoch:', gen, 'popsize:',
                          len(optimizer.population), end='\r')
            # Score the final population against the optimal front.
            last_population = optimizer.get_individuals()
            last_population.sort(key=lambda ind: ind.value)
            conv = convergence(last_population, optimal_front)
            div = diversity(last_population,
                            optimal_front[0], optimal_front[-1])
            stat.append((conv, div))
            print("Convergence: ", conv)
            print("Diversity: ", div)

    print('=' * 20, 'Average', '=' * 20)
    print("Convergence: ", np.mean([x[0] for x in stat]))
    print("Diversity: ", np.mean([x[1] for x in stat]))
def ga_main1(method_name, out='result', clear_directory=False):
    """Execute one GA run ('nsga2' or 'moead') and return
    (env, optimizer, history); the run is pickled after the last epoch."""
    if clear_directory and os.path.isdir(out):
        shutil.rmtree(out)

    popsize = 100
    epoch = 100
    ksize = 5
    save_trigger = lambda i: i == epoch  # save only on the final epoch

    env_cls = {'nsga2': om.NSGA2_ENV, 'moead': om.MOEAD_ENV}[method_name]
    with env_cls(popsize=popsize, ksize=ksize) as env:
        optimizer = env.optimizer
        creator = env.creator
        with ut.stopwatch('main'):
            # GA start: initial population, then evolve.
            population = optimizer.init_population(creator, popsize=popsize)
            history = [population]
            for gen in range(1, epoch + 1):
                population = optimizer(population)
                history.append(population)
                print('epoch:', gen, 'popsize:', len(population), end='\r')
                if save_trigger(gen):
                    file = f'popsize{popsize}_epoch{gen}_{ut.strnow("%Y%m%d_%H%M")}.pkl'
                    file = os.path.join(out, file)
                    print('save:', file)
                    ut.save(file, (env, optimizer, history))
    return env, optimizer, history
def statistical_metafeatures(X, y, categorical):
    """Compute statistical meta-features over the numerical attributes.

    If there are no numerical attributes every feature is listed as -1
    (a known wart: -1 is a valid value for some of these features).
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    numerical = [not cat for cat in categorical]
    if sum(numerical) == 0:
        return OrderedDict.fromkeys(statistical_metafeature_names(), value=-1)
    with utils.stopwatch() as sw:
        def _spread_stats(prefix, values):
            # Min/max/mean/std plus kurtosis and skewness of `values`;
            # higher-order stats of stats per Reif et al.,
            # Meta2-features (2012).
            features[prefix + "Min"] = np.min(values)
            features[prefix + "Max"] = np.max(values)
            features[prefix + "Mean"] = np.mean(values)
            features[prefix + "STD"] = np.std(values)
            features[prefix + "Kurtosis"] = scipy.stats.kurtosis(values)
            features[prefix + "Skewness"] = scipy.stats.skew(values)

        _spread_stats("Kurtosis",
                      [scipy.stats.kurtosis(column[0])
                       for column in X[:, np.where(numerical)].T])
        _spread_stats("Skewness",
                      [scipy.stats.skew(column[0])
                       for column in X[:, np.where(numerical)].T])

        standard_deviations = [np.std(column[0])
                               for column in X[:, np.where(numerical)].T]
        features["MeanSTDOfNumerical"] = np.mean(standard_deviations)
        features["STDSTDOfNumerical"] = np.std(standard_deviations)
    features["StatisticalFeatureTime"] = sw.duration
    return features
def information_theoretic_metafeatures(X, y, categorical):
    """Information-theoretic meta-features of dataset (X, y).

    Parameters
    ----------
    X : 2-D array of attribute values.
    y : 1-D array of class labels.
    categorical : boolean mask marking the categorical columns of X.

    Returns
    -------
    OrderedDict of feature name -> value; -1 for every feature when the
    dataset has no categorical attributes.
    """
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    classes, counts = np.unique(y, return_counts=True)
    features["ClassEntropy"] = scipy.stats.entropy(counts, base=2)
    # Everything below only applies to categorical attributes.
    if sum(categorical) == 0:
        # `-1` passed positionally: dict.fromkeys takes `value`
        # positional-only, so the keyword form can raise TypeError.
        return OrderedDict.fromkeys(
            information_theoretic_metafeature_names(), -1)
    with utils.stopwatch() as sw:
        feature_entropies = [
            scipy.stats.entropy(column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_feature_entropy = np.mean(feature_entropies)
        features["MeanFeatureEntropy"] = mean_feature_entropy
        mutual_informations = [
            sklearn.metrics.mutual_info_score(y, column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_mutual_information = np.mean(mutual_informations)
        features["MeanMutualInformation"] = mean_mutual_information
        # BUG FIX: the original set the ratio to 0 for zero mutual
        # information but then unconditionally recomputed it, dividing
        # by zero.
        if mean_mutual_information == 0:
            features["NoiseToSignalRatio"] = 0
        else:
            features["NoiseToSignalRatio"] = (
                mean_feature_entropy - mean_mutual_information
            ) / mean_mutual_information
    features["InformationFeatureTime"] = sw.duration
    return features
def cv_scores_features1(X, y, clf, cv, amount, information):
    """Cross-validate `clf` twice per fold: on all features and on a
    feature-reduced copy built by `remove_features`.

    Parameters
    ----------
    X, y : dataset as lists; shuffled via `shuffle_set` before folding.
    clf : classifier template; two fresh copies are fitted per fold.
    cv : number of folds.
    amount : reduction amount forwarded to `remove_features`.
    information : verbosity level — 1 returns scores only, 2 adds the
        guessed class distributions, >= 3 adds raw predictions and timings.

    Returns
    -------
    scorings                            if information <= 1
    scorings, guessed                   if information == 2
    scorings, guessed, predicts, time   if information >= 3
        where `time` is [full fit, full predict, reduced fit,
        reduced predict] seconds.
    """
    X, y = shuffle_set(X, y)
    sc = 2  # scored variants: [0] full features, [1] reduced features
    scorings = [[], []]
    score = [[], []]
    predict = [[], []]
    guessed = [[], []]
    predicts = [[], [], []]
    time = [0, 0, 0, 0]
    fold = len(X) // cv
    for i in range(cv):
        cv_clf = copy(clf)
        cv_clf2 = copy(clf)
        # BUG FIX: the original hard-coded `i == 9` and sliced middle folds
        # with `len(X)//10` even though `cv` is a parameter; generalized to
        # `cv` (behaviour is identical for cv == 10).
        if i == 0:
            # first fold: test on the trailing chunk
            X_train, X_test = X[:len(X) - fold], X[len(X) - fold:]
            y_train, y_test = y[:len(y) - fold], y[len(y) - fold:]
        elif i == cv - 1:
            # last fold: test on the leading chunk
            X_train, X_test = X[fold:], X[:fold]
            y_train, y_test = y[fold:], y[:fold]
        else:
            X_train = X[:fold * i]
            X_train.extend(X[fold * (i + 1):])
            X_test = X[fold * i:fold * (i + 1)]
            y_train = y[:fold * i]
            y_train.extend(y[fold * (i + 1):])
            y_test = y[fold * i:fold * (i + 1)]
        # Feature-reduced variants of this fold (hoisted out of the
        # branches above: identical in every branch of the original).
        train_X = remove_features(X_train, amount)
        test_X = remove_features(X_test, amount)
        with stopwatch() as sw:
            _ = cv_clf.fit(X_train, y_train)
        time[0] += sw.duration
        with stopwatch() as sw:
            predict[0] = cv_clf.predict(X_test)
        time[1] += sw.duration
        for k in range(sc):
            score[k] = 0
        with stopwatch() as sw:
            _ = cv_clf2.fit(train_X, y_train)
        time[2] += sw.duration
        with stopwatch() as sw:
            predict[1] = cv_clf2.predict(test_X)
        time[3] += sw.duration
        if information >= 2:
            for k in range(sc):
                guessed[k].append(distr_guessed(predict[k]))
        for k in range(sc):
            scorings[k].append(accuracy_score(y_test, predict[k]))
        if information >= 3:
            for k in range(sc):
                predicts[k].append(predict[k])
            predicts[sc].append(y_test)  # true labels stored after predictions
    if information >= 3:
        return scorings, guessed, predicts, time
    elif information == 2:
        return scorings, guessed
    return scorings
# Calculate meta-features log("simple-mf") simple_features = simple_metafeatures(X_s, y_s, categorical) log("stat-mf") statistical_features = statistical_metafeatures(X_s, y_s, categorical) log("info-mf") info_features = information_theoretic_metafeatures(X_s, y_s, categorical) log("landmark-mf") landmark_features = landmarker_metafeatures(X_s, y_s, categorical, folds) # Run baseleaner experiments baselearner_results = OrderedDict() for baselearner in config.base_learners: log("base-learners: {}".format(baselearner.__name__)) with stopwatch() as sw: baselearner().fit(X_s, y_s) baselearner_results[baselearner.__name__] = sw.duration # result if type(result) is float else "E" with open(config.document_name, 'a') as fh: feature_list = [[did, i], simple_features.values(), statistical_features.values(), info_features.values(), landmark_features.values(), subsample_features.values(),baselearner_results.values()] list_as_string = ",".join([str(item) for sublist in feature_list for item in sublist]) fh.write(list_as_string + "\n") del X_s, y_s del X, y, categorical, dataset, task except Exception as err: log_traceback(config.logfile_name)