def fit(self, x, y):
    from genetic_selection import GeneticSelectionCV
    import random as rn

    self.X = x
    self.y = y

    if self.seed is not None:
        np.random.seed(self.seed)
        rn.seed(self.seed)

    # calculate ga
    selector = GeneticSelectionCV(self.estimator,
                                  cv=10,
                                  verbose=1,
                                  scoring="accuracy",
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=50,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  caching=True,
                                  n_jobs=-1)
    self.selector = selector.fit(self.X, self.y)

    print('GA - Selection')
    print(f'Number of selected features: {self.selector.n_features_}')
    print('Selected index:')
    print(pd.Series(self.colname).values[:self.feature][self.selector.support_])
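Seeding both NumPy and Python's own `random` module here is deliberate: sklearn-genetic runs its evolutionary loop on DEAP, which draws from Python's `random`, while the scikit-learn estimator and CV splits draw from NumPy. A minimal, self-contained sketch of that reproducibility setup (the helper name is ours, not part of any library):

import random

import numpy as np

def seed_everything(seed):
    # DEAP (used internally by sklearn-genetic) consumes Python's `random`;
    # scikit-learn estimators and CV splitters consume NumPy's RNG.
    random.seed(seed)
    np.random.seed(seed)

seed_everything(42)  # call once before constructing GeneticSelectionCV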
def genetic_select(X, y, columns):
    # Features need to be scaled in order to reduce the problem's complexity;
    # otherwise, LogisticRegression will fail to converge!
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    estimator = LogisticRegression(solver="liblinear", multi_class="ovr")
    selector = GeneticSelectionCV(estimator,
                                  cv=3,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=min(X.shape[1], 30),
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=5,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    support_names = [columns[i] for i, s in enumerate(selector.support_) if s]
    return {
        # pick selected feature names
        'support': support_names,
        # pick feature coefficients
        #'coef': {support_names[i]: c for i, c in enumerate(selector.estimator_.coef_)},
    }
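A minimal way to exercise `genetic_select`, assuming the imports used by the function above (MinMaxScaler, LogisticRegression, GeneticSelectionCV) are in scope; the synthetic dataset and column names are invented for illustration:

import pandas as pd
from sklearn.datasets import make_classification

# Hypothetical toy data: 200 samples, 15 features, 5 of them informative.
X_demo, y_demo = make_classification(n_samples=200, n_features=15,
                                     n_informative=5, random_state=0)
cols = ['f{}'.format(i) for i in range(X_demo.shape[1])]

result = genetic_select(pd.DataFrame(X_demo, columns=cols), y_demo, cols)
print(result['support'])  # names of the features the GA kept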
def main():
    data = pd.read_csv('predict_house.csv')

    # Some noisy data not correlated with the target. Note: E is generated
    # here but never appended to X, so it has no effect in this example.
    E = np.random.uniform(0, 0.1, size=(len(data), 20))

    X = data.iloc[:, 0:79]
    y = data.iloc[:, -1]

    # Note: LogisticRegression with "accuracy" scoring treats the last column
    # as class labels; for a continuous price target a regressor with a
    # regression scorer would be more natural.
    estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)

    # Print actual labels and the selector's predictions pairwise.
    predictions = selector.predict(X)
    for actual, predicted in zip(y, predictions):
        print(actual)
        print(predicted)
        print()
def randomforest_genetic(X, y, X_test, y_test, columns):
    logger.info("Start RandomForest + Genetic")
    estimator = RandomForestClassifier(**CLASSIFIER_PARAMS)
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=0,
                                  scoring="accuracy",
                                  max_features=min(X.shape[1], 30),
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=5,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    logger.info("End RandomForest + Genetic")

    support_names = [columns[i] for i, s in enumerate(selector.support_) if s]
    # estimator_ is refit on the selected features only, so its importances
    # line up with support_names rather than with the full column list.
    importances = {name: v for name, v in
                   zip(support_names, selector.estimator_.feature_importances_)}
    labeled = {str(k): v
               for k, v in sorted(importances.items(), key=lambda item: -item[1])}
    return {
        # pick selected feature names
        'support': support_names,
        # pick feature coefficients
        #'coef': {support_names[i]: c for i, c in enumerate(selector.estimator_.coef_)},
        'feature_importances': labeled,
        'score': selector.score(X, y),
        'test_score': selector.score(X_test, y_test)
    }
def main():
    iris = datasets.load_iris()

    # Some noisy data not correlated
    E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))

    X = np.hstack((iris.data, E))
    y = iris.target

    estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=5,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  caching=True,
                                  n_jobs=-1)
    selector = selector.fit(X, y)
    print(selector.support_)
def optim_feature_genetic(clf, df, dftest, colX, coly, params=None):
    """
    Genetic-algorithm feature selection.
    https://github.com/manuel-calzolari/sklearn-genetic

    Typical params, e.g.:
        cv=5, verbose=1, scoring="accuracy", max_features=5,
        n_population=50, crossover_proba=0.5, mutation_proba=0.2,
        n_generations=40, crossover_independent_proba=0.5,
        mutation_independent_proba=0.05, tournament_size=3,
        n_gen_no_change=10, caching=True, n_jobs=-1
    """
    from genetic_selection import GeneticSelectionCV

    params = params or {}
    # Unpack params as keyword arguments; passing the dict positionally
    # would bind it to `cv` instead.
    selector = GeneticSelectionCV(clf, **params)
    selector = selector.fit(df[colX].values, df[coly].values)
    return selector
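A sketch of how this helper might be called; the data, column names, and parameter values below are all illustrative:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Hypothetical frames standing in for the caller's train/test data.
X_arr, y_arr = make_classification(n_samples=150, n_features=10, random_state=0)
feature_cols = ['x{}'.format(i) for i in range(X_arr.shape[1])]
df = pd.DataFrame(X_arr, columns=feature_cols)
df['target'] = y_arr
dftest = df.sample(frac=0.3, random_state=0)

clf = LogisticRegression(solver="liblinear")
params = {'cv': 3, 'scoring': "accuracy", 'max_features': 5,
          'n_generations': 10, 'n_jobs': -1}

selector = optim_feature_genetic(clf, df, dftest, feature_cols, 'target', params)
print(selector.support_)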
def run():
    candidates = CandidateFeatureVector.objects.all().values()
    candidates_df = pd.DataFrame(candidates)
    candidates_df.set_index('id', inplace=True)
    candidates_df.drop(columns=['candidate_id'], inplace=True)

    candidates_df = EncodingUtil.basic_label_encode_cols(
        candidates_df, ConstantsUtil.BASIC_ENCODE_COLS)
    candidates_df = EncodingUtil.sort_position_cols_and_encode(
        candidates_df, ConstantsUtil.STRING_TUPLE_ENCODE_COLS)

    svm = SVM(C=.75, kernel='poly')
    X_train, X_test, y_train, y_test = svm.split_test_data(
        candidates_df, .3, 'classification', True)
    svm.fit_and_predict(X_train, X_test, y_train)
    print(svm.get_confusion_matrix(y_test))
    print(svm.get_classification_report(y_test))

    estimator = svm.get_model()
    selector = GeneticSelectionCV(estimator,
                                  cv=5,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=50,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=40,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)

    X, y = svm.get_data()
    selector = selector.fit(X, y)
    print(selector.support_)
def GA_features(x, y):
    rf = RandomForestClassifier(max_depth=8, n_estimators=10)
    selector = GeneticSelectionCV(
        rf,
        cv=TimeSeriesSplit(n_splits=4),
        verbose=1,
        scoring="accuracy",
        max_features=80,
        n_population=200,
        crossover_proba=0.5,
        mutation_proba=0.2,
        n_generations=100,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.05,
        tournament_size=3,
        n_gen_no_change=5,
        caching=True,
        n_jobs=-1
    )
    selector = selector.fit(x, y)
    features = x.columns[selector.support_]
    return features
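`GA_features` indexes `x.columns`, so it expects a DataFrame, and the `TimeSeriesSplit` CV keeps every validation fold strictly after its training fold, which is the point for ordered data. A hypothetical invocation on synthetic ordered data (assumes the imports used by the function above are in scope):

import numpy as np
import pandas as pd

# Invented ordered data: 300 time steps, 12 candidate features.
rng = np.random.default_rng(0)
x = pd.DataFrame(rng.normal(size=(300, 12)),
                 columns=['feat_{}'.format(i) for i in range(12)])
y = (x['feat_0'] + rng.normal(scale=0.5, size=300) > 0).astype(int)

selected = GA_features(x, y)
print(list(selected))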
def main(dataset):
    indexFile = 'data/datasets/{}/index.json'.format(dataset)
    resultFile = 'data/datasets/{}/feature_selection.json'.format(dataset)
    with open(indexFile) as f:
        index = json.load(f)
    result = {}
    for _sym, files in index.items():
        params = {
            'estimator': LogisticRegression(**{
                'solver': 'liblinear',
                'multi_class': 'ovr'
            }),
            'cv': 3,
            'verbose': 1,
            'scoring': "accuracy",
            'max_features': 10,
            'n_population': 50,
            'crossover_proba': 0.5,
            'mutation_proba': 0.2,
            'n_generations': 80,
            'crossover_independent_proba': 0.5,
            'mutation_independent_proba': 0.05,
            'tournament_size': 5,
            'n_gen_no_change': 10,
            'caching': True,
            'n_jobs': -1
        }
        pipe = Pipeline([
            ('scaler', MinMaxScaler()),
            # step name kept from the source; despite the label, this step is
            # GA selection wrapped around LogisticRegression, not an SVC
            ('SVC', GeneticSelectionCV(**params)),
        ])
random.shuffle(together)  # groups based on first item of x_data, which should be shot!
final_random = [i for j in together for i in j]

X_data = (np.array(final_random))[:, 1:-1]
Y_data = (np.array(final_random, dtype=int))[:, -1]

scaler = StandardScaler()
scaler.fit(X_data)
X_data_v2 = scaler.transform(X_data)

X = X_data_v2
y = Y_data

estimator = linear_model.LogisticRegression()
selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              n_population=50,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=40,
                              crossover_independent_proba=0.5,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              caching=True,
                              n_jobs=-1)
selector = selector.fit(X, y)
print(selector.support_)
df = pd.read_csv(str(path) + 'PLMF.csv')
n_col = df.shape[1]
X = df.iloc[:, 1:(n_col - 1)]
y = df.iloc[:, n_col - 1:]

# max_iter must be an integer (1e9 is a float literal); note also that
# `normalize=` was removed from Lasso in scikit-learn 1.2, so on newer
# versions the features should be scaled beforehand instead.
estimator = linear_model.Lasso(1e-3, normalize=True, max_iter=int(1e9))
selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="explained_variance",
                              max_features=100,
                              n_population=30,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=20,
                              crossover_independent_proba=0.1,
                              mutation_independent_proba=0.05,
                              tournament_size=5,
                              caching=True,
                              n_jobs=-1)
selector = selector.fit(X, y)
print(selector.score(X, y))
#print(selector.support_)

selection = pd.DataFrame(X.columns, columns=['features'])
selection['support'] = selector.support_
selection['Flag'] = selection['support'].astype(int)
pop_size = [50]
cross_over = [0.2, 0.5, 0.8]
mutation = [0.01, 0.05, 0.1]
variations = [i for i in itertools.product(pop_size, cross_over, mutation)]

run = 0
best_fitness_values = [0] * len(variations)
for var_index, var in enumerate(variations):
    bsf_score_run = 0
    selector = GeneticSelectionCV(
        estimator,
        cv=rkf,
        verbose=0,
        scoring="accuracy",
        max_features=len(allfeats),
        n_population=var[0],
        crossover_proba=var[1],
        mutation_proba=var[2],
        n_generations=30,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.1,
        #tournament_size=3,
        n_gen_no_change=10,
        caching=True,
        n_jobs=-1)
    for i in range(30):
        print("-------------------------run {} ----------------------".format(i))
        selector = selector.fit(x_train, y_train)
        run += 1
        genfeats = data[allfeats].columns[selector.support_]
        genfeats = list(genfeats)
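The excerpt above initializes `best_fitness_values` but stops before recording any scores into it. Once it is filled in, reducing it to a winning (population, crossover, mutation) triple could look like the following sketch; the scores shown are made up purely for illustration:

import itertools

def best_variation(variations, best_fitness_values):
    # Pair each (population, crossover, mutation) triple with the best
    # fitness observed for it and return the winner.
    idx = max(range(len(variations)), key=lambda i: best_fitness_values[i])
    return variations[idx], best_fitness_values[idx]

variations = list(itertools.product([50], [0.2, 0.5, 0.8], [0.01, 0.05, 0.1]))
fake_scores = [0.71, 0.74, 0.70, 0.73, 0.76, 0.72, 0.69, 0.75, 0.70]  # illustrative
print(best_variation(variations, fake_scores))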
def process_feature_selection(self, estimator, features, trainX, trainY, file, norm):
    model = None
    selection = None
    subsets = []

    #### SequentialFeatureSelector from sklearn
    if self.dat["featureselection"]["name"] == "fs_importance":
        # Train model
        model = estimator.fit(trainX, trainY)
        selection = estimator.feature_importances_.argsort().tolist()
    elif self.dat["featureselection"]["name"] == "fs_extratrees":
        #estimator = ExtraTreesClassifier(n_estimators=150, n_jobs=-1)
        model = SelectFromModel(estimator, prefit=False).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_svc":
        #estimator = LinearSVC(C=0.01, penalty="l2", dual=False)
        model = SelectFromModel(estimator, prefit=False).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    ## only for positive values
    #elif self.feature_selection == "fs_chi2":
    #    model = SelectKBest(score_func=chi2, k=trainX.shape[1] - 5)
    #    trainX = model.fit_transform(trainX)
    #    testX = model.transform(testX)
    elif self.dat["featureselection"]["name"] == "fs_geuni":
        model = GenericUnivariateSelect(score_func=mutual_info_classif,
                                        mode='percentile',
                                        param=70).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_rfecv":
        #clf = DecisionTreeClassifier()
        #clf = LogisticRegression(C=1, penalty='l2')
        #estimator = SVR(kernel="linear")
        #model = RFECV(clf, trainX.shape[1] - 15)
        #estimator = LinearSVC(C=0.01, penalty="l2", dual=False)
        model = RFECV(estimator,
                      min_features_to_select=int(len(features) / 3.),
                      n_jobs=-1).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_lasso":
        #estimator = LassoCV(cv=5, normalize=True, n_jobs=1)
        model = SelectFromModel(estimator,
                                threshold=0.25,
                                norm_order=1,
                                max_features=None,
                                prefit=False).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_genetic":
        #estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")
        #estimator = ExtraTreesClassifier(n_estimators=150)
        #estimator = KNeighborsClassifier(n_neighbors=2, n_jobs=-1)
        model = GeneticSelectionCV(estimator,
                                   cv=5,
                                   verbose=1,
                                   scoring="f1",
                                   max_features=int((len(features)) - (len(features) / 3)),
                                   n_population=70,
                                   crossover_proba=0.5,
                                   mutation_proba=0.2,
                                   n_generations=40,
                                   crossover_independent_proba=0.5,
                                   mutation_independent_proba=0.05,
                                   tournament_size=3,
                                   n_gen_no_change=10,
                                   caching=True,
                                   n_jobs=-1).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_sequential_forward":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #estimator = LogisticRegression()
        #estimator = RandomForestClassifier(n_estimators=50, random_state=7)
        model = SequentialFeatureSelector(
            estimator,
            direction="forward",
            n_features_to_select=self.dat["featureselection"]["n_features"],
            n_jobs=-1).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_sequential_backward":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #cls = LogisticRegression()
        #cls = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
        model = SequentialFeatureSelector(estimator,
                                          direction="backward",
                                          n_features_to_select=None,
                                          n_jobs=-1).fit(trainX, trainY)
        selection = model.get_support(indices=True).tolist()
    elif self.dat["featureselection"]["name"] == "fs_mlxtend_sequential_forward":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #cls = LogisticRegression()
        #estimator = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
        #estimator = svm.SVC(kernel="rbf")
        model = SFS(estimator,
                    k_features=int(len(features) / 2.),
                    forward=True,
                    floating=False,
                    verbose=2,
                    scoring='f1',
                    cv=3,
                    n_jobs=-1).fit(trainX, trainY)
        selection = list(model.k_feature_idx_)
        #subsets = model.subsets_
    elif self.dat["featureselection"]["name"] == "fs_mlxtend_sequential_backward":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #cls = LogisticRegression()
        #cls = RandomForestClassifier(n_estimators=100, random_state=7, n_jobs=1)
        model = SFS(estimator,
                    k_features=50,
                    #k_features=int(len(features)/2.),
                    forward=False,
                    floating=False,
                    scoring='accuracy',
                    cv=4,
                    n_jobs=-1).fit(trainX, trainY)
        selection = list(model.k_feature_idx_)
        #subsets = model.subsets_
    elif self.dat["featureselection"]["name"] == "fs_mlxtend_sequential_forward_floating":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #estimator = LogisticRegression()
        #estimator = RandomForestClassifier(n_estimators=50, random_state=7)
        #estimator = XGBClassifier(
        #    learning_rate=0.2, n_estimators=50, max_depth=4,
        #    min_child_weight=2, gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        #    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
        #    ##tree_method='gpu_hist'  # THE MAGICAL PARAMETER
        #)
        #estimator = svm.SVC(kernel="rbf")
        model = SFS(estimator,
                    k_features=self.dat["featureselection"]["n_features"],
                    #k_features=int(len(features)/2.),
                    forward=True,
                    floating=True,
                    verbose=2,
                    scoring='f1',
                    cv=3,
                    n_jobs=-1).fit(trainX, trainY)
        selection = list(model.k_feature_idx_)
        #subsets = model.subsets_
    elif self.dat["featureselection"]["name"] == "fs_mlxtend_sequential_backward_floating":
        #estimator = KNeighborsClassifier(n_neighbors=2)
        #estimator = LogisticRegression()
        #estimator = RandomForestClassifier(n_estimators=50, random_state=7)
        model = SFS(estimator,
                    k_features=self.dat["featureselection"]["n_features"],
                    forward=False,
                    floating=True,
                    verbose=2,
                    scoring='f1',
                    cv=4,
                    n_jobs=-1).fit(trainX, trainY)
        selection = list(model.k_feature_idx_)
        #subsets = model.subsets_
        #selection = model.get_support(indices=False).tolist()

    datafesel = {
        "features": features,
        "selected": selection,
        "subset": subsets
    }
    normshift = {"None": 0, "std": 1, "minmax": 2}
    Util.makedir(self.dat["outputdir"] + "/featureselection/")
    Util.write(
        self.dat["outputdir"] + "/featureselection/" +
        self.dat["featureselection"]["name"] + "_" +
        self.dat["featureselection"]["estimator"] + "_" +
        str(normshift[norm]) + "_" + file, datafesel)
    return datafesel
df_selected_features_filter = pd.DataFrame(
    df_selected_features_filter, columns=filter_method_selected_features)

# 4. Final feature selection with the wrapper method of the genetic algorithm
X = df_selected_features_filter.copy()
estimator = RandomForestClassifier(n_estimators=1000, n_jobs=1)
selector = GeneticSelectionCV(estimator,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              max_features=18,
                              n_population=300,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=50,
                              crossover_independent_proba=0.1,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              n_gen_no_change=10,
                              caching=True,
                              n_jobs=-1)
selector = selector.fit(X, y.values.ravel())
print(selector.support_)
print(X.columns)

# Keep only the columns the GA marked as selected
X.drop(X.columns[~selector.support_], axis=1, inplace=True)
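The two-stage pattern above (a cheap filter pass, then the GA wrapper on the survivors) can be sketched end to end on synthetic data; everything below is illustrative, including the variance threshold chosen as the filter and the deliberately small GA budget:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from genetic_selection import GeneticSelectionCV

# Invented data: 300 samples, 40 features, 8 informative.
X_arr, y_arr = make_classification(n_samples=300, n_features=40,
                                   n_informative=8, random_state=1)
X_df = pd.DataFrame(X_arr, columns=['c{}'.format(i) for i in range(40)])

# Stage 1: cheap filter pass.
flt = VarianceThreshold(threshold=0.0).fit(X_df)
X_filtered = X_df.loc[:, flt.get_support()]

# Stage 2: GA wrapper on the survivors.
ga = GeneticSelectionCV(RandomForestClassifier(n_estimators=50),
                        cv=3, scoring="accuracy", max_features=10,
                        n_population=20, n_generations=10, n_jobs=-1)
ga.fit(X_filtered, y_arr)
print(X_filtered.columns[ga.support_].tolist())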
def main():
    result = {}
    for _sym in SYMBOLS:
        dataset = 'data/result/datasets/csv/{}.csv'.format(_sym)
        df = pd.read_csv(dataset, sep=',', encoding='utf-8',
                         index_col='Date', parse_dates=True)
        df = df.replace([np.inf, -np.inf], np.nan).dropna()
        X = df[df.columns.difference(['target', 'target_pct', 'target_label'])]
        y = df['target']
        #print("======" + _sym + "======")
        #print(X.info())

        # Variance Threshold
        sel = VarianceThreshold()
        sel.fit_transform(X)
        sup = sel.get_support()
        X = X[[name for flag, name in zip(sup, X.columns) if flag]]

        ## SelectKBest
        sel = SelectKBest(chi2, k=30)
        sX = scale(X, scaler='minmax')
        sel.fit_transform(sX, y)
        sup = sel.get_support()
        sX = sX[[name for flag, name in zip(sup, sX.columns) if flag]]

        ## Recursive Feature Elimination
        # Create the RFE object and compute a cross-validated score.
        # The "accuracy" scoring is proportional to the number of correct
        # classifications
        # model = SVC(kernel="linear")
        # rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(2), scoring='accuracy', n_jobs=-1, verbose=1)
        # rfecv.fit(X, y)
        # X = X[[name for flag, name in zip(rfecv.support_, X.columns) if flag]]

        ### Genetic
        # estimator = MLPClassifier(**{
        #     'hidden_layer_sizes': (10, 4),
        #     'solver': 'lbfgs',
        #     'learning_rate': 'constant',
        #     'learning_rate_init': 0.001,
        #     'activation': 'logistic'
        # })
        estimator = LogisticRegression(solver="liblinear", multi_class="ovr")
        gscv = GeneticSelectionCV(estimator,
                                  cv=2,
                                  verbose=1,
                                  scoring="accuracy",
                                  max_features=30,
                                  n_population=50,
                                  crossover_proba=0.5,
                                  mutation_proba=0.2,
                                  n_generations=80,
                                  crossover_independent_proba=0.5,
                                  mutation_independent_proba=0.05,
                                  tournament_size=3,
                                  n_gen_no_change=10,
                                  caching=True,
                                  n_jobs=-1)
        gscv = gscv.fit(X, y)
        X = X[[name for flag, name in zip(gscv.support_, X.columns) if flag]]
        #print(X.columns)
        # print("[%s] Optimal number of features : %d Set: %s" % (_sym, rfecv.n_features_, ', '.join(X.columns)))
        # plt.figure()
        # plt.title(_sym + ' SVC RFECV K=2')
        # plt.xlabel("Number of features selected")
        # plt.ylabel("Cross validation score (nb of correct classifications)")
        # plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
        # plt.show()

        logger.info("{}: {}".format(_sym, X.columns))
        result[_sym] = {
            'dataset': dataset,
            'columns_genetic_lr_30': [c for c in X.columns],
            'columns_kbest_30': [c for c in sX.columns]
        }
    return result
def main():
    djia_in = []
    djia_out = []
    dat_ = []
    dat = []
    st = 0
    with open("DJIA.csv", "r") as djia_r:
        djia_r.readline()  # skip header
        for l in djia_r:
            x = l.strip().split(",")
            if x[1] == ".":
                continue
            # Note: djia_out is still empty while the file is being read, so
            # both lagged terms always fall back to the current value float(x[1]).
            u = [(datetime.datetime.strptime(x[0], "%Y-%m-%d") -
                  datetime.datetime(1970, 1, 1)).total_seconds(),
                 djia_out[-1] if len(djia_out) != 0 else float(x[1]),
                 djia_out[-2] if len(djia_out) > 1 else float(x[1])]
            if x[0] == "2018-11-09":  # Check this date
                st = len(dat_)
            v = float(x[1])
            dat_.append(u + [v])
    dat = dat_[st:]
    print(st)

    split = int(.8 * len(dat))
    scaler = MinMaxScaler()
    scaler.fit(dat)
    djia_s = scaler.transform(dat)
    djia_in = [x[:-1] for x in djia_s]
    djia_out = [x[-1] for x in djia_s]
    djia_in_train = np.array(djia_in[:split])
    djia_out_train = np.array(djia_out[:split])
    djia_in_test = np.array(djia_in[split:])
    djia_out_test = np.array(djia_out[split:])
    print(djia_s)

    m = svm.SVR(C=0.01, cache_size=1000, coef0=djia_out_train[-1], degree=5,
                epsilon=0.005, gamma='auto', kernel='poly', max_iter=5000,
                shrinking=True, tol=0.0001, verbose=True)
    model = m.fit(djia_in_train, djia_out_train)
    res = copy.deepcopy(m.predict(djia_in_test))
    xs = [x[0] for x in djia_in_test]
    plt.plot(xs, res, "b", label="SVR")

    m2 = GeneticSelectionCV(m,
                            cv=5,
                            verbose=1,
                            scoring="neg_mean_squared_error",
                            n_population=1000,
                            crossover_proba=0.5,
                            mutation_proba=0.2,
                            n_generations=2000,
                            crossover_independent_proba=0.5,
                            mutation_independent_proba=0.05,
                            tournament_size=3,
                            n_gen_no_change=10,
                            caching=True,
                            n_jobs=10)
    m2.fit(djia_in_train, djia_out_train)
    res2 = m2.predict(djia_in_test)
    plt.plot(xs, res2, "g", label="GA/SVR")
    plt.plot(xs, djia_out_test, "m", label="Actual")
    plt.xlabel('Time (scaled)')
    plt.ylabel('Points (scaled)')
    plt.legend()

    # Directional accuracy: count whether prediction and actual move the
    # same way between consecutive steps.
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for i in range(1, len(res)):
        dirres = res[i] - res[i - 1] >= 0
        diract = djia_out_test[i] - djia_out_test[i - 1] >= 0
        if dirres == diract:
            if dirres:
                tp += 1
            else:
                tn += 1
        else:
            if dirres:
                fp += 1
            else:
                fn += 1
    print(tp, tn, fp, fn)
    plt.suptitle("RMSE = " + str(math.sqrt(mean_squared_error(djia_out_test, res))))
    plt.show()
print('Test error: {}'.format(1 - clf.score(X_test[:, features], y_test)))

if d == 2:
    make_plot(X1[:, features], X2[:, features])

# GA
print('')
print('=== Genetic Algorithm ===')
clf = SVC(C=10, kernel='linear')
selector = GeneticSelectionCV(clf,
                              cv=5,
                              verbose=1,
                              scoring="accuracy",
                              n_population=20,
                              crossover_proba=0.5,
                              mutation_proba=0.2,
                              n_generations=10,
                              crossover_independent_proba=0.5,
                              mutation_independent_proba=0.05,
                              tournament_size=3,
                              caching=True,
                              n_jobs=8)
selector = selector.fit(X_train, y_train)
features = [i for i in range(X_train.shape[1]) if selector.support_[i]]
print('Features selected: {}'.format(features))

clf = SVC(C=10, kernel='linear')
clf.fit(X_train[:, features], y_train)
print('Train error: {}'.format(1 - clf.score(X_train[:, features], y_train)))
print('Test error: {}'.format(1 - clf.score(X_test[:, features], y_test)))