def featureSelection(train_x, train_y):
    # Create the RFE object and compute a cross-validated score.
    # The "accuracy" scoring is proportional to the number of correct classifications.
    svc = LinearSVC(C=1, class_weight='balanced')

    # Stability selection via randomized lasso.
    lasso = RandomizedLasso()
    lasso.fit(train_x, train_y)

    rfecv = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')
    rfecv.fit(train_x, train_y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    rankings = rfecv.ranking_
    lasso_ranks = lasso.get_support()
    lassoFeats = []
    recursiveFeats = []
    shouldUseFeats = []
    # `feats` is assumed to be the module-level list of feature names.
    for i in range(len(rankings)):
        if lasso_ranks[i]:
            lassoFeats.append(feats[i])
        if rankings[i] == 1:
            recursiveFeats.append(feats[i])
            if lasso_ranks[i]:
                # selected by both RFECV and the randomized lasso
                shouldUseFeats.append(feats[i])
    print('Should use ' + ', '.join(shouldUseFeats))

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def plot_stable_features(X_train, y_train, featnames, **kwargs):
    from sklearn.linear_model import LassoLarsCV, RandomizedLasso

    n_resampling = kwargs.pop('n_resampling', 200)
    n_jobs = kwargs.pop('n_jobs', -1)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        # estimate alphas via cross-validation
        lars_cv = LassoLarsCV(cv=6, n_jobs=n_jobs).fit(X_train, y_train)
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
        clf = RandomizedLasso(alpha=alphas, random_state=42, n_jobs=n_jobs,
                              n_resampling=n_resampling)
        clf.fit(X_train, y_train)

        importances = clf.scores_
        indices = np.argsort(importances)[::-1]

        pl.bar(range(len(featnames)), importances[indices],
               color="r", align="center")
        pl.xticks(np.arange(len(featnames)) + 0.5, featnames[indices],
                  rotation=45, horizontalalignment='right')
        pl.xlim(-0.5, len(featnames) - 0.5)
        pl.subplots_adjust(bottom=0.2)
        pl.ylim(0, np.max(importances) * 1.01)
        pl.ylabel('Selection frequency (%) for %d resamplings ' % n_resampling)
        pl.title("Stability Selection: Selection Frequencies")
def lasso_fs(X, y):
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)
    classes = range(0, X.shape[1])
    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), classes),
                 reverse=True)
def select_feature_importance():
    data4columns = dataset.drop(['Max overdue'], axis=1)
    column_names = np.asarray(data4columns.columns.values)

    lasso = RandomizedLasso(alpha=0.025)
    scaled_data = scaler.fit_transform(data)
    lasso.fit(scaled_data, target)
    scores = lasso.scores_

    # column_names
    # print scores
    print sorted(zip(map(lambda x: round(x, 4), scores), column_names),
                 reverse=True)
def featureselection(datset, output='results'):
    t0 = time()
    dataset = pd.read_csv(datset)
    dataset.to_csv(output[:-4] + '.csv')

    # Do feature evaluation and write an xlsx file.
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "feature selection scores"
    rownum = 2
    ws1.cell(column=1, row=1).value = 'Feature name'
    ws1.cell(column=2, row=1).value = 'Stability Selection'
    ws1.cell(column=3, row=1).value = 'Univariate using random forest regressor (r2 measure)'
    ws1.cell(column=4, row=1).value = 'Univariate using random forest regressor (auc measure)'
    ws1.cell(column=5, row=1).value = 'L1 regularization / Lasso'
    ws1.cell(column=6, row=1).value = 'L2 regularization / Ridge'
    ws1.cell(column=7, row=1).value = "Mean decrease impurity"
    ws1.cell(column=8, row=1).value = 'Recursive feature elimination'

    Collumnheadeers = list(dataset.columns.values)
    for imagebiom in Collumnheadeers:
        print imagebiom
        ws1.cell(column=1, row=rownum).value = imagebiom
        rownum += 1
    print dataset
    print Collumnheadeers

    # Create and save correlation plots.
    ## Get labels
    y = dataset['label'].values
    ## Delete labels from list
    Collumnheadeers.remove('label')
    X = dataset[Collumnheadeers]
    corplot(X, filesavename='all.pdf')

    # Perform feature selection: stability selection (randomized lasso).
    rlasso = RandomizedLasso(alpha=0.00025)
    rlasso.fit(X, y)
    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), Collumnheadeers),
                 reverse=True)
    rownum = 2
    for score_val in rlasso.scores_.tolist():
        ws1.cell(column=2, row=rownum).value = score_val
        rownum += 1
    print np.where(rlasso.scores_ > 0.8)[0] + 1
    print Collumnheadeers
    elementselect = np.where(rlasso.scores_ > 0.8)[0]
    Collumnheadeersel = []
    for i in elementselect:
        Collumnheadeersel.append(Collumnheadeers[i])
    corplot(X[Collumnheadeersel], filesavename='selectedstability.pdf')
    wb.save(filename='ResultTableIndividualFeature.xlsx')

    # Univariate selection with a random forest regressor (r2 measure).
    rf = RandomForestRegressor(n_estimators=20, max_depth=4)
    scores = []
    scoresval = []
    X1 = X.as_matrix()
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X1[:, i:i + 1], y, scoring="r2",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), Collumnheadeers[i]))
        scoresval.append(round(np.mean(score), 3))
    print sorted(scores, reverse=True)
    rownum = 2
    for score_val in scoresval:
        ws1.cell(column=3, row=rownum).value = score_val
        rownum += 1

    # Univariate selection with a random forest regressor (roc_auc measure).
    scoresval = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X1[:, i:i + 1], y, scoring="roc_auc",
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), Collumnheadeers[i]))
        scoresval.append(round(np.mean(score), 3))
    print sorted(scores, reverse=True)
    rownum = 2
    for score_val in scoresval:
        ws1.cell(column=4, row=rownum).value = score_val
        rownum += 1

    # L1 regularization / Lasso.
    scaler = StandardScaler()
    X3 = scaler.fit_transform(X1)
    lasso = Lasso(alpha=.0003)
    lasso.fit(X3, y)
    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), lasso.coef_), Collumnheadeers),
                 reverse=True)
    rownum = 2
    for score_val in lasso.coef_.tolist():
        ws1.cell(column=5, row=rownum).value = score_val
        rownum += 1

    # L2 regularization / Ridge.
    scaler = StandardScaler()
    X3 = scaler.fit_transform(X1)
    ridge = Ridge(alpha=10)
    ridge.fit(X3, y)
    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), ridge.coef_), Collumnheadeers),
                 reverse=True)
    rownum = 2
    for score_val in ridge.coef_.tolist():
        ws1.cell(column=6, row=rownum).value = score_val
        rownum += 1

    # Mean decrease impurity.
    rf = RandomForestRegressor()
    rf.fit(X, y)
    rownum = 2
    for score_val in rf.feature_importances_.tolist():
        ws1.cell(column=7, row=rownum).value = score_val
        rownum += 1

    # Recursive feature elimination, using linear regression as the model;
    # rank all features, i.e. continue the elimination until the last one.
    lr = LinearRegression()
    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, y)
    rownum = 2
    for score_val in rfe.ranking_.tolist():
        ws1.cell(column=8, row=rownum).value = score_val
        rownum += 1
    wb.save(filename='ResultTableIndividualFeature.xlsx')

    # Collect the generated artifacts and zip them.
    path_ = os.getcwd()
    directory = path_ + '/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    types = ('*.pdf', '*.csv', '*.xlsx')  # the tuple of file types
    files_grabbed = []
    for files in types:
        files_grabbed.extend(glob.glob(files))
    for file in files_grabbed:
        if os.path.isfile(file):
            shutil.copy2(file, directory)
    shutil.make_archive(output[:-4], 'zip', directory)
    return 0
def lass_varselect(train, num_vars, target, alpha):
    lass = RandomizedLasso(alpha=alpha, n_resampling=5)
    lass.fit(train[num_vars], train[target])
    return lass.get_support()
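`lass_varselect` returns a boolean support mask rather than the raw stability scores, so the caller still has to map the mask back onto column names. A minimal usage sketch follows; the DataFrame, column names, and threshold `alpha=0.025` are hypothetical, not from the original code.

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import RandomizedLasso  # deprecated; only present in older scikit-learn releases

# Hypothetical example data: three numeric predictors and a target driven mostly by x1.
rng = np.random.RandomState(0)
train = pd.DataFrame({'x1': rng.randn(100), 'x2': rng.randn(100), 'x3': rng.randn(100)})
train['y'] = 2 * train['x1'] + 0.1 * rng.randn(100)

num_vars = ['x1', 'x2', 'x3']
mask = lass_varselect(train, num_vars, 'y', alpha=0.025)

# Keep only the columns the randomized lasso marked as stable.
selected = [col for col, keep in zip(num_vars, mask) if keep]
print(selected)
```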
rfe = rfe.fit(X_train, y_train)

rfe2 = RFE(estimator=RandomForestClassifier(criterion='entropy', n_estimators=10,
                                            random_state=3, n_jobs=2),
           n_features_to_select=2)
rfe2 = rfe2.fit(X_train, y_train)

"""
from sklearn.svm import SVR
rfe3 = RFE(estimator=SVR(kernel="linear"), n_features_to_select=2)
rfe3 = rfe3.fit(X_train, y_train)
"""

"""stability selection: see which feature is selected most"""
from sklearn.linear_model import RandomizedLasso

rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X_train, y_train.values.ravel())
rlasso_score = rlasso.scores_
temp = rlasso.scores_.argsort()
ranks = np.empty_like(temp)
ranks[temp] = np.arange(len(rlasso.scores_))
# print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True))

rank_summary = pd.DataFrame()
rank_summary['features'] = names
# rank_summary['ranking_logi'] = 94 - rfe.ranking_
# rank_summary['ranking_RF'] = 94 - rfe2.ranking_
# rank_summary['ranking_svm'] = 94 - rfe3.ranking_
rank_summary['ranking_stab_sel'] = ranks
def feature_selection(df, target_column, id_column):
    """
    df = The training dataframe
    target_column = The column containing the target variable
    id_column = The column containing the id variable

    Based on the output column type (binary or numeric), it decides on the type of
    problem we are trying to solve. If the output column is binary (0/1), we use
    Genetic Algorithms for feature selection. If the output column is numeric, we
    keep the best half of the features using the feature importance from RandomForests.
    """
    print("IDENTIFYING TYPES...")
    lists = set(list(df))
    output_var = target_column
    list_inputs = [x for x in lists if not x == target_column]

    if df[output_var].isin([0, 1]).all():
        method_type = 'categorical'
    else:
        method_type = 'numerical'
    print(method_type)

    if method_type == "categorical":
        methods = ["SVM", "Decision Trees", "KNNs", "Logistic Regression", "Naive Bayes"]
    elif method_type == "numerical":
        methods = ["Linear Regression", "Random Forest", "Correlation", "Ridge", "Lasso"]

    if method_type == "categorical":
        print("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")
        #####
        # SETTING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL
        # (STARTING CANDIDATE POPULATION)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_bool, n=len(list_inputs))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        def evalOneMax(individual):
            return sum(individual),

        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        NPOPSIZE = 50  # RANDOM STARTING POOL SIZE
        population = toolbox.population(n=NPOPSIZE)

        #####
        # ASSESSING GINI ON THE STARTING POOL
        #####
        dic_gini = {}
        for i in range(np.shape(population)[0]):
            # TRANSLATING DNA INTO A LIST OF VARIABLES
            var_model = []
            for j in range(np.shape(population)[1]):
                if (population[i])[j] == 1:
                    var_model.append(list(list_inputs)[j])

            # ASSESSING GINI INDEX FOR EACH INDIVIDUAL IN THE INITIAL POOL
            X_train = df[var_model]
            Y_train = df[output_var]

            # CHANGE_HERE - you are very likely using a different technique by now, so change to yours.
            if "SVM" in methods:
                svc = svm.SVC(probability=True)
                model = svc.fit(X_train, Y_train)
                Y_predict = model.predict(X_train)

            # CHANGE_HERE - this uses the development gini to select variables; you should use a
            # different gini, either the OOT gini or sqrt(dev_gini * oot_gini).
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_power = abs(2 * auc - 1)

            gini = str(gini_power) + ";" + str(population[i]).replace('[', '').replace(', ', '').replace(']', '')
            dic_gini[gini] = population[i]
        list_gini = sorted(dic_gini.keys(), reverse=True)

    ####
    # ASSESSING RMSE ON THE STARTING POOL
    ####
    if method_type == "numerical":
        X_train = df[list_inputs]  # all candidate input columns
        Y_train = df[output_var]
        names = list(X_train)
        ranks = {}

        # Linear Regression model, feature scores from the coefficients
        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        # Ridge Regression model, feature scores from the coefficients
        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        # Lasso Regression model, feature scores from the coefficients
        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        # Randomized Lasso (stability selection) feature scores
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        # Random Forest Regression feature importances
        rf = RandomForestRegressor()
        rf.fit(X_train, Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        # Univariate correlation (F-test) feature scores
        f, pval = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        r = {}
        for name in names:
            # Truncating to 2 decimal points
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            # Printing out feature scores
            print("%s\t%s" % (name, "\t".join(
                map(str, [ranks[method][name] for method in methods]))))

        # Sorting features by importance with respect to random forest regression
        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF", 0, 0, inplace=True)
        print(ranks_f)  # Printing out sorted feature scores
        featureset = ranks_f.index.values[0:(len(ranks_f) // 2)]
        print(featureset)  # Printing out the selected features

    if method_type == "categorical":
        #####
        # GENETIC ALGORITHM MAIN LOOP - iterate until no improvement happens,
        # in order to find the optimal set of characteristics (variables).
        #####
        sum_current_gini = 0.0
        sum_current_gini_1 = 0.0
        sum_current_gini_2 = 0.0
        first = 0
        OK = 1
        a = 0
        while OK:  # repeat until the gini has not improved, at least a little, in 2 generations
            a = a + 1
            print('loop ', a)
            OK = 0

            # GENERATING OFFSPRING (cross-over probability = 50%, mutation probability = 10%)
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population = toolbox.select(offspring, k=len(population))

            sum_current_gini_2 = sum_current_gini_1
            sum_current_gini_1 = sum_current_gini
            sum_current_gini = 0.0

            # ASSESSING GINI ON THE OFFSPRING
            for j in range(np.shape(population)[0]):
                if population[j] not in dic_gini.values():
                    var_model = []
                    for i in range(np.shape(population)[1]):
                        if (population[j])[i] == 1:
                            var_model.append(list(list_inputs)[i])

                    X_train = df[var_model]
                    Y_train = df[output_var]

                    # CHANGE_HERE - swap in your own modelling technique here.
                    if "SVM" in methods:
                        svc = svm.SVC(probability=True)
                        model = svc.fit(X_train, Y_train)
                        Y_predict = model.predict(X_train)

                    # CHANGE_HERE - this uses the development gini; consider the OOT gini
                    # or sqrt(dev_gini * oot_gini) instead.
                    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
                    auc = metrics.auc(fpr, tpr)
                    gini_power = abs(2 * auc - 1)

                    gini = str(gini_power) + ";" + str(population[j]).replace('[', '').replace(', ', '').replace(']', '')
                    dic_gini[gini] = population[j]

            # SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING
            list_gini = sorted(dic_gini.keys(), reverse=True)
            population = []
            for i in list_gini[:NPOPSIZE]:
                population.append(dic_gini[i])
                gini = float(i.split(';')[0])
                sum_current_gini += gini

            # HAS THE GINI IMPROVED, AT LEAST A LITTLE, IN THE LAST 2 GENERATIONS?
            print('sum_current_gini=', sum_current_gini,
                  'sum_current_gini_1=', sum_current_gini_1,
                  'sum_current_gini_2=', sum_current_gini_2)
            if (sum_current_gini > sum_current_gini_1 + 0.0001
                    or sum_current_gini > sum_current_gini_2 + 0.0001):
                OK = 1
        #####
        # GENETIC ALGORITHM MAIN LOOP - END
        #####

    if method_type == "categorical":
        gini_max = list_gini[0]
        gini = float(gini_max.split(';')[0])
        features = gini_max.split(';')[1]

        ####
        # PRINTING OUT THE LIST OF FEATURES
        ####
        f = 0
        l = list()
        for i in range(len(features)):
            if features[i] == '1':
                f += 1
                print('feature ', f, ':', list(list_inputs)[i])
                l.append(list(list_inputs)[i])
        print('gini: ', gini)
        featureset = l

    # Returns the featureset from regression if the output column is numerical,
    # otherwise returns the featureset from the genetic algorithm if it is categorical.
    return (df[featureset])
    new_y = y[pass_vals]
    new_X = X[pass_vals]
    return new_X, new_y


X, labels = transform_Xy(X, labels)

expr = center_data(labels['Multicov'].values)
spec = center_data(labels['Specificity'].values)
slope, intercept, r_value, p_value, stderr = linregress(spec, expr)
residues = residual(spec, expr, slope, intercept)

if exp == 'full':
    y = expr
elif exp == 'res':
    y = residues

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5,
                                                    random_state=0)

rl = RandomizedLasso()
fs = rl.fit(X_train, y_train)
hist_scores = dict(zip(features, fs.scores_))
pickle.dump(hist_scores,
            open(folder + 'results/histScores' + condition + cell + exp + '.pkl', 'wb'))
def main(train_label, train_feat, modelsdir, selfeat):
    X_train = np.nan_to_num(np.genfromtxt(train_feat, delimiter=' '))
    y_train = np.nan_to_num(np.genfromtxt(train_label, delimiter=' '))

    X_trains = X_train
    scaler = StandardScaler().fit(X_train)
    X_trains = scaler.transform(X_train)

    # performs feature selection
    featsel_str = ".all-feats"
    if int(selfeat):
        print "Performing feature selection ..."
        # initializes selection estimator
        sel_est = RandomizedLasso(alpha="bic",
                                  verbose=True,
                                  max_iter=1000,
                                  n_jobs=int(config['n_jobs']),
                                  random_state=42,
                                  n_resampling=1000)
        sel_est.fit(X_trains, y_train)
        X_trains = sel_est.transform(X_trains)
        selected_mask = sel_est.get_support()
        selected_features = sel_est.get_support(indices=True)
        sel_feats_path = os.sep.join([modelsdir, os.path.basename(train_feat)])
        # saves indices
        np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d")
        # saves mask
        np.save(sel_feats_path + ".mask", selected_mask)
        featsel_str = ".randcv"

    estimator = ExtraTreesRegressor(random_state=42, n_jobs=int(config['n_jobs']))
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
    # rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    # performs parameter optimization using random search
    print "Performing parameter optimization ... "
    param_distributions = {
        "n_estimators": [5, 10, 50, 100, 200, 500],
        "max_depth": [3, 2, 1, None],
        "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1] / 2.0)],
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False]}
        # "criterion": ["gini", "entropy"]}
    search = RandomizedSearchCV(estimator,
                                param_distributions,
                                n_iter=int(config['RR_Iter']),
                                scoring=mae_scorer,
                                n_jobs=int(config['n_jobs']),
                                refit=True,
                                cv=KFold(X_train.shape[0], int(config['folds']),
                                         shuffle=True, random_state=42),
                                verbose=1,
                                random_state=42)

    # fits model using best parameters found
    search.fit(X_trains, y_train)

    # ................SHAHAB ........................
    models_dir = sorted(glob.glob(modelsdir + os.sep + "*"))
    estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"],
                                     max_depth=search.best_params_["max_depth"],
                                     max_features=search.best_params_["max_features"],
                                     min_samples_leaf=search.best_params_["min_samples_leaf"],
                                     min_samples_split=search.best_params_["min_samples_split"],
                                     n_estimators=search.best_params_["n_estimators"],
                                     verbose=1,
                                     random_state=42,
                                     n_jobs=int(config['n_jobs']))

    print "Train the model with the best parameters ..."
    estimator2.fit(X_trains, y_train)

    from sklearn.externals import joblib
    joblib.dump(estimator2, modelsdir + "/XRT.pkl")
    joblib.dump(scaler, modelsdir + "/scaler.pkl")
    joblib.dump(sel_est, modelsdir + "/sel_est.pkl")
def feature_selection(df, target_column):
    print("IDENTIFYING TYPES...")
    in_model = []
    list_ib = set()   # input binary
    list_icn = set()  # input categorical nominal
    list_ico = set()  # input categorical ordinal
    list_if = set()   # input numerical continuous (input float)
    list_inputs = set()
    output_var = target_column

    for var_name in df.columns:
        if re.search('^ib_', var_name):
            list_inputs.add(var_name)
            list_ib.add(var_name)
            print(var_name, "is input binary")
        elif re.search('^icn_', var_name):
            list_inputs.add(var_name)
            list_icn.add(var_name)
            print(var_name, "is input categorical nominal")
        elif re.search('^ico_', var_name):
            list_inputs.add(var_name)
            list_ico.add(var_name)
            print(var_name, "is input categorical ordinal")
        elif re.search('^if_', var_name):
            # list_inputs.add(var_name)
            list_if.add(var_name)
            print(var_name, "is input numerical continuous (input float)")
        elif re.search('^ob_', var_name):
            output_var = var_name
        else:
            print("ERROR: unable to identify the type of:", var_name)

    if df[output_var].isin([0, 1]).all():
        method_type = 'categorical'
    else:
        method_type = 'numerical'
    print(method_type)

    if method_type == "categorical":
        methods = ["SVM", "Decision Trees", "KNNs", "Logistic Regression", "Naive Bayes"]
    elif method_type == "numerical":
        methods = ["SVM", "Ridge", "Lasso"]

    if method_type == "categorical":
        print("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")
        #####
        # SETTING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL
        # (STARTING CANDIDATE POPULATION)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_bool, n=len(list_inputs))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        def evalOneMax(individual):
            return sum(individual),

        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        NPOPSIZE = 50  # RANDOM STARTING POOL SIZE
        population = toolbox.population(n=NPOPSIZE)

        #####
        # ASSESSING GINI ON THE STARTING POOL
        #####
        dic_gini = {}
        for i in range(np.shape(population)[0]):
            # TRANSLATING DNA INTO A LIST OF VARIABLES
            var_model = []
            for j in range(np.shape(population)[1]):
                if (population[i])[j] == 1:
                    var_model.append(list(list_inputs)[j])

            # ASSESSING GINI INDEX FOR EACH INDIVIDUAL IN THE INITIAL POOL
            X_train = df[var_model]
            Y_train = df[output_var]

            # CHANGE_HERE - you are very likely using a different technique by now, so change to yours.
            if "Logistic Regression" in methods:
                lr = sm.Logit(Y_train, X_train)
                model = lr.fit()
                Y_predict = model.predict(X_train)

            # CHANGE_HERE - this uses the development gini to select variables; you should use a
            # different gini, either the OOT gini or sqrt(dev_gini * oot_gini).
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_power = abs(2 * auc - 1)

            gini = str(gini_power) + ";" + str(population[i]).replace('[', '').replace(', ', '').replace(']', '')
            dic_gini[gini] = population[i]
        list_gini = sorted(dic_gini.keys(), reverse=True)

    ####
    # ASSESSING RMSE ON THE STARTING POOL
    ####
    if method_type == "numerical":
        X_train = df[var_model]
        Y_train = df["if_var_73"]
        names = list(X_train)
        ranks = {}

        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        rf = RandomForestRegressor()
        rf.fit(X_train, Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        f, pval = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print("%s\t%s" % (name, "\t".join(
                map(str, [ranks[method][name] for method in methods]))))

        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF", 0, 0, inplace=True)
        print(ranks_f)
        featureset = ranks_f.index.values[0:5]
        print(featureset)

    if method_type == "categorical":
        #####
        # GENETIC ALGORITHM MAIN LOOP - iterate until no improvement happens,
        # in order to find the optimal set of characteristics (variables).
        #####
        sum_current_gini = 0.0
        sum_current_gini_1 = 0.0
        sum_current_gini_2 = 0.0
        first = 0
        OK = 1
        a = 0
        while OK:  # repeat until the gini has not improved, at least a little, in 2 generations
            a = a + 1
            print('loop ', a)
            OK = 0

            # GENERATING OFFSPRING (cross-over probability = 50%, mutation probability = 10%)
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population = toolbox.select(offspring, k=len(population))

            sum_current_gini_2 = sum_current_gini_1
            sum_current_gini_1 = sum_current_gini
            sum_current_gini = 0.0

            # ASSESSING GINI ON THE OFFSPRING
            for j in range(np.shape(population)[0]):
                if population[j] not in dic_gini.values():
                    var_model = []
                    for i in range(np.shape(population)[1]):
                        if (population[j])[i] == 1:
                            var_model.append(list(list_inputs)[i])

                    X_train = df[var_model]
                    Y_train = df[output_var]

                    # CHANGE_HERE - swap in your own modelling technique here.
                    lr = sm.Logit(Y_train, X_train)
                    model = lr.fit()
                    Y_predict = model.predict(X_train)

                    # CHANGE_HERE - this uses the development gini; consider the OOT gini
                    # or sqrt(dev_gini * oot_gini) instead.
                    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
                    auc = metrics.auc(fpr, tpr)
                    gini_power = abs(2 * auc - 1)

                    gini = str(gini_power) + ";" + str(population[j]).replace('[', '').replace(', ', '').replace(']', '')
                    dic_gini[gini] = population[j]

            # SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING
            list_gini = sorted(dic_gini.keys(), reverse=True)
            population = []
            for i in list_gini[:NPOPSIZE]:
                population.append(dic_gini[i])
                gini = float(i.split(';')[0])
                sum_current_gini += gini

            # HAS THE GINI IMPROVED, AT LEAST A LITTLE, IN THE LAST 2 GENERATIONS?
            print('sum_current_gini=', sum_current_gini,
                  'sum_current_gini_1=', sum_current_gini_1,
                  'sum_current_gini_2=', sum_current_gini_2)
            if (sum_current_gini > sum_current_gini_1 + 0.0001
                    or sum_current_gini > sum_current_gini_2 + 0.0001):
                OK = 1
        #####
        # GENETIC ALGORITHM MAIN LOOP - END
        #####

    if method_type == "categorical":
        gini_max = list_gini[0]
        gini = float(gini_max.split(';')[0])
        features = gini_max.split(';')[1]

        ####
        # PRINTING OUT THE LIST OF FEATURES
        ####
        f = 0
        for i in range(len(features)):
            if features[i] == '1':
                f += 1
                print('feature ', f, ':', list(list_inputs)[i])
        print('gini: ', gini)
        featureset = features

    return featureset
def lasso_hq(X, y, alpha=0.3):
    ## feature selection based on randomized Lasso
    from sklearn.linear_model import RandomizedLasso
    rlasso = RandomizedLasso(alpha=alpha)
    rlasso.fit(X, y)
    return rlasso.scores_
def do_rank(self):
    house = pd.read_csv(self.data)
    house.head()

    # dropping the id and date columns
    house = house.drop(['date'], axis=1)

    str_list = []
    for colname, colvalue in house.iteritems():
        if type(colvalue[1]) == str:
            str_list.append(colname)
    num_list = house.columns.difference(str_list)
    house_num = house[num_list]

    Y = house.price.values
    house = house.drop(['price'], axis=1)
    X = house.as_matrix()
    colnames = house.columns

    ranks = {}

    def ranking(ranks, names, order=1):
        minmax = MinMaxScaler()
        ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
        ranks = map(lambda x: round(x, 2), ranks)
        return dict(zip(names, ranks))

    # Stability selection via randomized lasso (long running)
    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)
    print('finished')

    # Recursive feature elimination with linear regression
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    rfe = RFE(lr, n_features_to_select=1, verbose=3)
    rfe.fit(X, Y)
    ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)

    # Using linear regression
    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)

    # Using Ridge
    ridge = Ridge(alpha=7)
    ridge.fit(X, Y)
    ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

    # Using Lasso
    lasso = Lasso(alpha=0.05)
    lasso.fit(X, Y)
    ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)

    # Random forest importances (long running)
    rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
    rf.fit(X, Y)
    ranks["RF"] = ranking(rf.feature_importances_, colnames)

    r = {}
    for name in colnames:
        r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
    sns.factorplot(x='Mean Ranking', y='Feature', data=meanplot, kind='bar',
                   size=4, aspect=1.9, palette='coolwarm')
    plt.savefig('..\\Images\\feature_ranking.jpg')
def run(self):
    loanfreature_df = pd.read_csv(
        processData(loginemail=self.loginemail,
                    loginpassword=self.loginpassword).output().path,
        low_memory=False,
        encoding='ISO-8859-1')

    Y = loanfreature_df.int_rate
    loanfreature_df.drop('int_rate', axis=1, inplace=True)

    cols_to_keep = [
        'loan_amnt', 'term', 'emp_length', 'home_ownership_category',
        'annual_inc', 'verification_status_category', 'purpose', 'addr_state',
        'dti', 'delinq_2yrs', 'last_meanfico', 'inq_last_6mths', 'open_acc',
        'revol_bal', 'revol_util', 'total_acc', 'mths_since_last_major_derog',
        'funded_amnt_inv', 'installment', 'application_type', 'pub_rec',
        'addr_state'
    ]
    loanfreature_df = loanfreature_df[cols_to_keep]
    loanfreature_df = createDummies(loanfreature_df)

    X = loanfreature_df._get_numeric_data()
    names = ["%s" % i for i in X]
    ranks = {}

    lr = LinearRegression(normalize=True)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict((lr.coef_), names)

    ridge = Ridge(alpha=7)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict((ridge.coef_), names)

    lasso = Lasso(alpha=.05)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = RandomizedLasso(alpha=0.00)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict((rlasso.scores_), names)

    rf = RandomForestRegressor()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    # stop the search when 15 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=15)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=DeprecationWarning)
        rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(rfe.ranking_, X.columns, order=-1)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    r = {}
    for name in names:
        r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    # f_rank = pd.DataFrame()
    print("\t%s" % "\t".join(methods))
    temp = "\t".join(methods)
    f = open("testing.txt", 'w')
    f.write(temp)
    f.write("\n")
    for name in names:
        temp = name + "\t" + " \t".join(
            map(str, [ranks[method][name] for method in methods]))
        f.write(temp)
        f.write("\n")
        print("%s\t%s" % (name, "\t".join(
            map(str, [ranks[method][name] for method in methods]))))
    f.close()

    feature = pd.read_csv('testing.txt', sep='\t')
    feature.to_csv(self.output().path)
# Using Randomized Lasso
import pandas as pd
import numpy as np
from sklearn.linear_model import RandomizedLasso

dataset = pd.read_csv('ARJUNANADHI.csv')
X = dataset.iloc[:, 1:4].values
Y = dataset.iloc[:, [6, 9, 12]].values
y_paddy = (Y[:, [0]]).ravel()
y_maize = Y[:, [1]].ravel()
y_cereals = Y[:, [2]].ravel()

rlasso = RandomizedLasso(alpha=0.04)
mapping = {0: 'Meteorological', 1: 'Hydrological', 2: 'Agricultural '}

fit = rlasso.fit(X, y_paddy)
Paddy = pd.DataFrame(fit.scores_)
Paddy.columns = ['Scores']
Paddy = Paddy.rename(mapping)
Paddy.plot.bar(title='Paddy', color='g', rot=0)

fit = rlasso.fit(X, y_maize)
Maize = pd.DataFrame(fit.scores_)
Maize.columns = ['Scores']
Maize = Maize.rename(mapping)
Maize.plot.bar(title='Maize', color='y', rot=0)

fit = rlasso.fit(X, y_cereals)
Cereals = pd.DataFrame(fit.scores_)
Cereals.columns = ['Scores']
Cereals = Cereals.rename(mapping)
Cereals.plot.bar(title='Cereals', color='c', rot=0)
def machinelearningpipeline(datset, output='results.zip'):
    t0 = time()
    dataset = pd.read_csv(datset)
    dataset.to_csv(output[:-4] + '.csv')

    # Do feature evaluation and write an xlsx file.
    wb = Workbook()
    ws1 = wb.active
    ws1.title = "ResultTableIndividualFeature"
    rownum = 2
    ws1.cell(column=1, row=1).value = 'Feature name'
    ws1.cell(column=2, row=1).value = 'Az'
    ws1.cell(column=3, row=1).value = 'Optimal threshold'
    ws1.cell(column=4, row=1).value = 'Sensitivity'
    ws1.cell(column=5, row=1).value = 'Specificity'
    ws1.cell(column=6, row=1).value = 'Confidence interval: low'
    ws1.cell(column=7, row=1).value = 'Confidence interval: high'

    Collumnheadeers = list(dataset.columns.values)
    for imagebiom in Collumnheadeers:
        ValuesMetric = dataset[imagebiom].values
        Targets = dataset['label'].values
        roc_auc_score, optimalval, sens, spec, confidence_lower, confidence_upper = analyticscalc(
            ValuesMetric, Targets, imagebiom)
        ws1.cell(column=1, row=rownum).value = imagebiom
        ws1.cell(column=2, row=rownum).value = "{:0.3f}".format(roc_auc_score)
        ws1.cell(column=3, row=rownum).value = "{:0.3f}".format(optimalval)
        ws1.cell(column=4, row=rownum).value = "{:0.3f}".format(sens)
        ws1.cell(column=5, row=rownum).value = "{:0.3f}".format(spec)
        ws1.cell(column=6, row=rownum).value = "{:0.3f}".format(confidence_lower)
        ws1.cell(column=7, row=rownum).value = "{:0.3f}".format(confidence_upper)
        rownum += 1
    wb.save(filename='ResultTableIndividualFeature.xlsx')

    print dataset
    print Collumnheadeers

    # Create and save correlation plots.
    ## Get labels
    y = dataset['label'].values
    ## Delete labels from list
    Collumnheadeers.remove('label')
    X = dataset[Collumnheadeers]
    corplot(X, filesavename='all.pdf')

    # Perform feature selection with stability selection (randomized lasso).
    rlasso = RandomizedLasso(alpha=0.00025)
    rlasso.fit(X, y)
    print "Features sorted by their score:"
    print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), Collumnheadeers),
                 reverse=True)
    print np.where(rlasso.scores_ > 0.8)[0] + 1
    print Collumnheadeers
    elementselect = np.where(rlasso.scores_ > 0.8)[0]
    Collumnheadeersel = []
    for i in elementselect:
        Collumnheadeersel.append(Collumnheadeers[i])
    corplot(X[Collumnheadeersel], filesavename='selected.pdf')

    # Optimize and evaluate classifiers.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Compare classifier calibration
    calibrationplot(X_train, X_test, y_train, y_test,
                    filesavename='calibrationplot.pdf')

    X = X_train.copy()
    y = y_train.copy()

    # Run a quick example on non-optimal classifiers
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier()
    clf3 = GaussianNB()
    clf4 = SVC()
    print('5-fold cross validation:\n')
    for clf, label in zip(
            [clf1, clf2, clf3, clf4],
            ['Logistic Regression', 'Random Forest', 'naive Bayes', 'SVM']):
        scores = cross_validation.cross_val_score(clf, X, y, cv=5,
                                                  scoring='roc_auc', n_jobs=1)
        print("roc_auc: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

    # SVM: grid search over C and gamma
    scaler = StandardScaler()
    X1 = scaler.fit_transform(X)
    C_range = np.logspace(-2, 10, 13)
    gamma_range = np.logspace(-9, 3, 13)
    param_grid = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, cv=cv,
                        scoring='roc_auc')
    grid.fit(X1, y)

    scores = [x[1] for x in grid.grid_scores_]
    scores = np.array(scores).reshape(len(C_range), len(gamma_range))

    # Draw heatmap of the validation accuracy as a function of gamma and C
    plt.figure(figsize=(8, 6))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.jet)
    plt.xlabel('gamma')
    plt.ylabel('C')
    plt.colorbar()
    plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
    plt.yticks(np.arange(len(C_range)), C_range)
    plt.title('Validation accuracy')
    plt.savefig('SVMheatmap.pdf', dpi=300)
    print("The best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))

    # Check out random forest accuracy
    scores = ['roc_auc']  # ['precision_weighted', 'recall_weighted', 'roc_auc']
    Random_plot = []
    tuned_parameters = [{
        'n_estimators': [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                         200, 300, 400, 500, 600, 700, 800, 900, 1000]
    }]
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                           n_jobs=40, scoring='%s' % score)
        clf.fit(X, y)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
            Random_plot.append(mean_score)
        param = clf.best_params_

    f, ax = plt.subplots(figsize=(20, 20))
    plt.plot([1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
              200, 300, 400, 500, 600, 700, 800, 900, 1000],
             Random_plot, lw=2)
    plt.title("The best parameter is n_estimators=%s with area under ROC of %0.2f"
              % (param.get("n_estimators"), clf.best_score_), fontweight='bold')
    plt.xlabel('Numer of estimators', fontweight='bold')
    plt.ylabel('Area Under ROC (Az)', fontweight='bold')
    plt.savefig('RandomForrest.pdf', tight_layout=True, dpi=600)

    # Learning curves for the tuned SVM
    f, ax = plt.subplots(figsize=(20, 20))
    title = 'Learning Curves (SVM)'
    param = grid.best_params_
    estimator = SVC(kernel='rbf', C=param.get("C"), gamma=param.get("gamma"))
    print y_train
    cv = cross_validation.ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2,
                                       random_state=0)
    plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4)
    plt.savefig('LearningCurvesSVM.pdf', tight_layout=True, dpi=600)
    print str((time() - t0))

    # Evaluate the tuned SVM on the held-out test set
    estimator.fit(X, y)
    y_true, y_pred = y_test, estimator.predict(X_test)
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    plt.figure()
    # Normalize the confusion matrix by row (i.e. by the number of samples in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    plt.figure()
    plot_confusion_matrix(y_test, y_pred)
    plt.savefig('ConfusionMatrixSVM.pdf', tight_layout=True, dpi=600)

    # os.chdir(os.getcwd())
    # for file in glob.glob("*.pdf"):
    #     print(file)
    # shutil.make_archive(output_filename, 'zip', dir_name)

    # Collect the generated artifacts and zip them.
    path_ = os.getcwd()
    directory = path_ + '/output/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    types = ('*.pdf', '*.csv', '*.xlsx')  # the tuple of file types
    files_grabbed = []
    for files in types:
        files_grabbed.extend(glob.glob(files))
    for file in files_grabbed:
        if os.path.isfile(file):
            shutil.copy2(file, directory)
    shutil.make_archive(output[:-4], 'zip', directory)
    return 0
def feature_selection(df, dfo, target_column, id_column):
    """
    df = The training dataframe
    dfo = The test dataframe
    target_column = The column containing the target variable
    id_column = The column containing the id variable

    Based on the output column type (binary or numeric), it decides on the type of
    problem we are trying to solve. If the output column is binary (0/1), we use
    Genetic Algorithms for feature selection. If the
    """
    print("IDENTIFYING TYPES...")
    in_model = []
    list_ib = set()   # input binary
    list_icn = set()  # input categorical nominal
    list_ico = set()  # input categorical ordinal
    list_if = set()   # input numerical continuous (input float)
    list_inputs = set()
    output_var = target_column

    for var_name in df.columns:
        if re.search('^ib_', var_name):
            list_inputs.add(var_name)
            list_ib.add(var_name)
            print(var_name, "is input binary")
        elif re.search('^icn_', var_name):
            list_inputs.add(var_name)
            list_icn.add(var_name)
            print(var_name, "is input categorical nominal")
        elif re.search('^ico_', var_name):
            list_inputs.add(var_name)
            list_ico.add(var_name)
            print(var_name, "is input categorical ordinal")
        elif re.search('^if_', var_name):
            # list_inputs.add(var_name)
            list_if.add(var_name)
            print(var_name, "is input numerical continuous (input float)")
        elif re.search('^ob_', var_name):
            output_var = var_name
        else:
            print("ERROR: unable to identify the type of:", var_name)

    if df[output_var].isin([0, 1]).all():
        method_type = 'categorical'
    else:
        method_type = 'numerical'
    print(method_type)

    if method_type == "categorical":
        methods = ["SVM", "Decision Trees", "KNNs", "Logistic Regression", "Naive Bayes"]
    elif method_type == "numerical":
        methods = ["SVM", "Ridge", "Lasso"]

    if method_type == "categorical":
        print("GENETIC ALGORITHM FOR FEATURE SELECTION (CLASSIFICATION):")
        #####
        # SETTING UP THE GENETIC ALGORITHM and CALCULATING STARTING POOL
        # (STARTING CANDIDATE POPULATION)
        #####
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat, creator.Individual,
                         toolbox.attr_bool, n=len(list_inputs))
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        def evalOneMax(individual):
            return sum(individual),

        toolbox.register("evaluate", evalOneMax)
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
        toolbox.register("select", tools.selTournament, tournsize=3)

        NPOPSIZE = 50  # RANDOM STARTING POOL SIZE
        population = toolbox.population(n=NPOPSIZE)

        #####
        # ASSESSING GINI ON THE STARTING POOL
        #####
        dic_gini = {}
        for i in range(np.shape(population)[0]):
            # TRANSLATING DNA INTO A LIST OF VARIABLES
            var_model = []
            for j in range(np.shape(population)[1]):
                if (population[i])[j] == 1:
                    var_model.append(list(list_inputs)[j])

            # ASSESSING GINI INDEX FOR EACH INDIVIDUAL IN THE INITIAL POOL
            X_train = df[var_model]
            Y_train = df[output_var]

            # CHANGE_HERE - you are very likely using a different technique by now, so change to yours.
            if "Logistic Regression" in methods:
                lr = sm.Logit(Y_train, X_train)
                model = lr.fit()
                Y_predict = model.predict(X_train)

            # CHANGE_HERE - this uses the development gini to select variables; you should use a
            # different gini, either the OOT gini or sqrt(dev_gini * oot_gini).
            fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
            auc = metrics.auc(fpr, tpr)
            gini_power = abs(2 * auc - 1)

            gini = str(gini_power) + ";" + str(population[i]).replace('[', '').replace(', ', '').replace(']', '')
            dic_gini[gini] = population[i]
        list_gini = sorted(dic_gini.keys(), reverse=True)

    ####
    # ASSESSING RMSE ON THE STARTING POOL
    ####
    if method_type == "numerical":
        X_train = df[var_model]
        Y_train = df[output_var]
        names = list(X_train)
        ranks = {}

        lr = LinearRegression(normalize=True)
        lr.fit(X_train, Y_train)
        ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names)

        ridge = Ridge(alpha=7)
        ridge.fit(X_train, Y_train)
        ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

        lasso = Lasso(alpha=.05)
        lasso.fit(X_train, Y_train)
        ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X_train, Y_train)
        ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

        rf = RandomForestRegressor()
        rf.fit(X_train, Y_train)
        ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

        f, pval = f_regression(X_train, Y_train, center=True)
        ranks["Corr."] = rank_to_dict(f, names)

        r = {}
        for name in names:
            r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

        methods = sorted(ranks.keys())
        ranks["Mean"] = r
        methods.append("Mean")
        print(ranks["Mean"])
        print("\t\t%s" % "\t".join(methods))
        for name in names:
            print("%s\t%s" % (name, "\t".join(
                map(str, [ranks[method][name] for method in methods]))))

        ranks_f = pd.DataFrame(ranks)
        ranks_f.sort_values("RF", 0, 0, inplace=True)
        print(ranks_f)
        featureset = ranks_f.index.values[0:5]
        print(featureset)

    if method_type == "categorical":
        #####
        # GENETIC ALGORITHM MAIN LOOP - iterate until no improvement happens,
        # in order to find the optimal set of characteristics (variables).
        #####
        sum_current_gini = 0.0
        sum_current_gini_1 = 0.0
        sum_current_gini_2 = 0.0
        first = 0
        OK = 1
        a = 0
        while OK:  # repeat until the gini has not improved, at least a little, in 2 generations
            a = a + 1
            print('loop ', a)
            OK = 0

            # GENERATING OFFSPRING (cross-over probability = 50%, mutation probability = 10%)
            offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.1)
            fits = toolbox.map(toolbox.evaluate, offspring)
            for fit, ind in zip(fits, offspring):
                ind.fitness.values = fit
            population = toolbox.select(offspring, k=len(population))

            sum_current_gini_2 = sum_current_gini_1
            sum_current_gini_1 = sum_current_gini
            sum_current_gini = 0.0

            # ASSESSING GINI ON THE OFFSPRING
            for j in range(np.shape(population)[0]):
                if population[j] not in dic_gini.values():
                    var_model = []
                    for i in range(np.shape(population)[1]):
                        if (population[j])[i] == 1:
                            var_model.append(list(list_inputs)[i])

                    X_train = df[var_model]
                    Y_train = df[output_var]

                    # CHANGE_HERE - swap in your own modelling technique here.
                    lr = sm.Logit(Y_train, X_train)
                    model = lr.fit()
                    Y_predict = model.predict(X_train)

                    # CHANGE_HERE - this uses the development gini; consider the OOT gini
                    # or sqrt(dev_gini * oot_gini) instead.
                    fpr, tpr, thresholds = metrics.roc_curve(Y_train, Y_predict)
                    auc = metrics.auc(fpr, tpr)
                    gini_power = abs(2 * auc - 1)

                    gini = str(gini_power) + ";" + str(population[j]).replace('[', '').replace(', ', '').replace(']', '')
                    dic_gini[gini] = population[j]

            # SELECTING THE BEST FITTED AMONG ALL EVER CREATED POPULATION AND CURRENT OFFSPRING
            list_gini = sorted(dic_gini.keys(), reverse=True)
            population = []
            for i in list_gini[:NPOPSIZE]:
                population.append(dic_gini[i])
                gini = float(i.split(';')[0])
                sum_current_gini += gini

            # HAS THE GINI IMPROVED, AT LEAST A LITTLE, IN THE LAST 2 GENERATIONS?
            print('sum_current_gini=', sum_current_gini,
                  'sum_current_gini_1=', sum_current_gini_1,
                  'sum_current_gini_2=', sum_current_gini_2)
            if (sum_current_gini > sum_current_gini_1 + 0.0001
                    or sum_current_gini > sum_current_gini_2 + 0.0001):
                OK = 1
        #####
        # GENETIC ALGORITHM MAIN LOOP - END
        #####

    if method_type == "categorical":
        gini_max = list_gini[0]
        gini = float(gini_max.split(';')[0])
        features = gini_max.split(';')[1]

        ####
        # PRINTING OUT THE LIST OF FEATURES
        ####
        f = 0
        for i in range(len(features)):
            if features[i] == '1':
                f += 1
                print('feature ', f, ':', list(list_inputs)[i])
        print('gini: ', gini)
        featureset = features

    return featureset
def stable(ress, test, labels):
    # ress is training data
    x, y = ress.shape
    names = np.arange(y)

    rlasso = RandomizedLasso()
    rlasso.fit(ress, labels)

    # print "Features sorted by their scores according to the stability scoring function"
    val = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)
    print("len of val")  # newly constructed features
    print(len(val))
    global nc_val
    nc_val += len(val)

    finale = []
    for i in range(0, len(val)):
        r, s = val[i]  # 'r' represents scores, 's' represents column name
        if (r > 0.1):  # This is eta for stability selection
            finale.append(s)
            # finale.append(s)
    print("Total features after stability selection:")
    print(len(finale))  # finale stores col names - 2nd, 4th etc. of stable features.
    global stable_val
    stable_val += len(finale)

    dataset1 = np.zeros((len(ress), len(finale)), dtype=float)
    dataset3 = np.zeros((len(test), len(finale)), dtype=float)
    dataset1 = ress[:, finale]
    dataset3 = test[:, finale]
    # dataset3 = test.iloc[:, finale]

    if os.path.exists("sonar_stable_testfeatures.csv"):   # Name of output file generated
        os.remove("sonar_stable_testfeatures.csv")
    if os.path.exists("sonar_stable_trainfeatures.csv"):  # Name of output file generated
        os.remove("sonar_stable_trainfeatures.csv")
    with open("sonar_stable_testfeatures.csv", "wb") as myfile:
        np.savetxt(myfile, dataset3, delimiter=",", fmt="%s")
    with open("sonar_stable_trainfeatures.csv", "wb") as myfile:
        np.savetxt(myfile, dataset1, delimiter=",", fmt="%s")

    # -----------------------------------------------------------------------------------
    # check the inter-feature dependence - 2nd phase of the ensemble
    ress_new = SelectKBest(mutual_info_classif, k='all')
    ress_new.fit_transform(ress[:, finale], labels)
    # print "Features sorted by their scores according to the scoring function - mutual information gain:"
    feats = sorted(zip(map(lambda x: round(x, 4), ress_new.scores_), names), reverse=True)

    ensemble_finale = []
    for i in range(0, len(feats)):
        r, s = feats[i]
        if (r > 0):  # This is eta-o
            ensemble_finale.append(s)
    print("Total features after 2 phase selection:")
    print(len(ensemble_finale))  # col names further pruned in the 2nd phase of feature selection
    global ensemble_val
    ensemble_val += len(ensemble_finale)
    # print(ensemble_select)

    dataset2 = np.zeros((len(ress), len(ensemble_finale)), dtype=float)
    dataset4 = np.zeros((len(test), len(ensemble_finale)), dtype=float)
    dataset2 = ress[:, ensemble_finale]
    dataset4 = test[:, ensemble_finale]

    if os.path.exists("sonar_ensemble_testfeatures.csv"):   # Name of output file generated
        os.remove("sonar_ensemble_testfeatures.csv")
    if os.path.exists("sonar_ensemble_trainfeatures.csv"):  # Name of output file generated
        os.remove("sonar_ensemble_trainfeatures.csv")
    with open("sonar_ensemble_testfeatures.csv", "wb") as myfile:
        np.savetxt(myfile, dataset4, delimiter=",", fmt="%s")
    with open("sonar_ensemble_trainfeatures.csv", "wb") as myfile:
        np.savetxt(myfile, dataset2, delimiter=",", fmt="%s")
Lasso picks out the top performing features, while forcing other features to be
close to zero. It is useful when reducing the number of features is required.
"""
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names_hashingfile)

"""
Stability selection applies a feature selection algorithm on different subsets of the
data and with different subsets of the features. After repeating the process a number
of times, the selection results can be aggregated, for example by checking how many
times a feature ended up being selected as important when it was in an inspected
feature subset. We can expect strong features to have scores close to 100%, since they
are always selected when possible. Weaker, but still relevant, features will also have
non-zero scores, since they would be selected when stronger features are not present
in the currently selected subset, while irrelevant features would have scores (close
to) zero, since they would never be among the selected features.
"""
rlasso = RandomizedLasso(alpha=0.04)
rlasso.fit(X, Y)
ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names_hashingfile)

"""
Recursive feature elimination is a greedy optimization based on the idea of repeatedly
constructing a model and choosing either the best or worst performing feature, setting
that feature aside, and then repeating the process with the rest of the features. We
have constructed the model using Linear Regression.
"""
# stop the search when topk features are left (they will get equal scores)
rfe = RFE(lr, n_features_to_select=topk)
rfe.fit(X, Y)
ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names_hashingfile, order=-1)
"""
## RandomizedLasso, feature stability selection
from sklearn.linear_model import (RandomizedLasso, lasso_stability_path, LassoLarsCV)
import warnings
from sklearn.exceptions import ConvergenceWarning

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    warnings.simplefilter('ignore', ConvergenceWarning)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)

names = df_merge3.columns.tolist()[:-1]
print(sorted(zip(map(lambda x: round(x, 4), clf.scores_), names), reverse=True))

from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier()
clf = clf.fit(X, y)
df_tree = pd.DataFrame(clf.feature_importances_)
df_tree['fea_index'] = df_merge3.columns.tolist()[:-1]
df_tree.columns = ["weight", "feature_index"]
df_tree.sort_values("weight").tail(10)

# model = SelectFromModel(lsvc, prefit=True)
# X_new = model.transform(X)
# X_new.shape
# 4. Two top-level feature selection approaches
# 4.1 Stability selection, scores in [0, 1]
# The main idea is to run the feature selection algorithm on different subsets of the data
# and different subsets of the features, repeat this many times, and finally aggregate the
# selection results, e.g. by counting how often a feature was judged important
# (the number of times it was selected as important divided by the number of times its subset was tested).
from sklearn.linear_model import RandomizedLasso  # randomized Lasso
from sklearn.datasets import load_boston

boston = load_boston()  # using the Boston housing data.
# Data gets scaled automatically by sklearn's implementation
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

rlasso = RandomizedLasso(alpha=0.025)  # alpha: the optimal value can also be chosen automatically
rlasso.fit(X, Y)

print "Features sorted by their score:"  # scores: rlasso.scores_
print sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)
# Conclusion: good features do not get a score of 0 just because similar or correlated
# features exist, which is different from plain Lasso.
# For feature selection tasks, on many data sets and in many settings, stability selection
# is often one of the best performing methods.

# 4.2 Recursive feature elimination (RFE): a greedy algorithm for finding a good feature subset
# Repeatedly build a model (such as an SVM or a regression model), pick the best (or worst)
# feature (e.g. by its coefficient), set that feature aside, and repeat the process on the
# remaining features until all features have been visited. The order in which features are
# eliminated gives the feature ranking.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Ridge

boston = load_boston()
X = boston["data"]
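The RFE example above breaks off right after reloading the Boston data. A minimal sketch of how such an RFE ranking is typically completed follows; the choice of `LinearRegression` as the estimator and `n_features_to_select=1` is an assumption, not the original author's continuation.

```python
from sklearn.datasets import load_boston
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# Rank every feature by recursively eliminating the weakest one.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X, Y)

# ranking_ is 1 for the feature kept longest; higher numbers were eliminated earlier.
print(sorted(zip(map(int, rfe.ranking_), names)))
```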
def feature_importance(
        df, train, target, dummies=[], fill_na=-999,
        methods=['rlasso', 'RFE', 'LinReg', 'Ridge', 'Lasso', 'RF', 'GBM']):
    # accept the method names in lower and upper case as well
    methods_lower = [x.lower() for x in methods]
    methods_upper = [x.upper() for x in methods]
    # combine names
    methods = methods + methods_lower + methods_upper

    # target
    Y = df[target].values

    # deal with the training data
    df = df[train].fillna(fill_na)

    # dummies
    if dummies != []:
        for x in dummies:
            dummie_x = pd.get_dummies(df[x], prefix=x + '_').iloc[:, 1:]
            df = pd.concat([df, dummie_x], axis=1)

    def get_cat_features(df):
        return list(df.select_dtypes(include=['object']).columns)

    # automatically detect categorical variables
    cat_attr = get_cat_features(df)
    for x in cat_attr:
        dummie_x = pd.get_dummies(df[x], prefix=x + '_').iloc[:, 1:]
        df = pd.concat([df, dummie_x], axis=1)

    # get all attribute names
    colnames = df.columns
    # attributes
    X = df.values

    # Define dictionary to store our rankings
    ranks = {}

    # Create our function which stores the feature rankings to the ranks dictionary
    def ranking(ranks, names, order=1):
        minmax = MinMaxScaler()
        ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
        ranks = map(lambda x: round(x, 2), ranks)
        return dict(zip(names, ranks))

    ''' Randomized Lasso '''
    if 'rlasso' in methods:
        # Selection Stability method with Randomized Lasso
        rlasso = RandomizedLasso(alpha=0.04)
        rlasso.fit(X, Y)
        ranks["rlasso/Stability"] = ranking(np.abs(rlasso.scores_), colnames)

    ''' Recursive Feature Elimination (RFE) '''
    if 'RFE' in methods:
        # Construct our Linear Regression model
        lr = LinearRegression(normalize=True)
        lr.fit(X, Y)
        # stop the search when only the last feature is left
        rfe = RFE(lr, n_features_to_select=1, verbose=0)
        rfe.fit(X, Y)
        ranks['RFE'] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)

    ''' Linear Model Feature Ranking '''
    if 'LinReg' in methods:
        # Using Linear Regression
        lr = LinearRegression(normalize=True)
        lr.fit(X, Y)
        ranks['LinReg'] = ranking(np.abs(lr.coef_), colnames)

    # Using Ridge
    if 'Ridge' in methods:
        ridge = Ridge(alpha=7)
        ridge.fit(X, Y)
        ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

    # Using Lasso
    if 'Lasso' in methods:
        lasso = Lasso(alpha=.05)
        lasso.fit(X, Y)
        ranks['Lasso'] = ranking(np.abs(lasso.coef_), colnames)

    ''' Random forest '''
    if 'RF' in methods:
        # parameters
        rf_params = {
            'n_jobs': -1,
            'n_estimators': 100,
            'warm_start': True,
            'max_depth': 3,
            'min_samples_leaf': 2,
            'max_features': 'sqrt',
            'random_state': 100,
            'verbose': 0
        }
        rf = RandomForestRegressor(**rf_params)
        rf.fit(X, Y)
        ranks['RF'] = ranking(rf.feature_importances_, colnames)

    ''' Gradient Boosting Machine '''
    if 'GBM' in methods:
        # parameters
        gbm_params = {
            'nthread': -1,
            'colsample_bytree': 0.4,
            'gamma': 0,
            'reg_alpha': 0.75,
            'reg_lambda': 0.45,
            'subsample': 0.6,
            'learning_rate': 0.07,
            'max_depth': 3,
            'min_child_weight': 1.5,
            'n_estimators': 100,
            'seed': 100
        }
        gbm = xgb.XGBRegressor(**gbm_params)
        gbm.fit(X, Y)
        ranks['GBM'] = ranking(gbm.feature_importances_, colnames)

    # Create empty dictionary to store the mean value calculated from all the scores
    r = {}
    for name in colnames:
        r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    matrix_importance = pd.DataFrame(ranks)
    print(matrix_importance.columns)

    # change the display order of the columns
    ordered_cols = ['rlasso/Stability', 'LinReg', 'Lasso', 'Ridge', 'RFE',
                    'GBM', 'RF', 'Mean']
    matrix_importance = matrix_importance[ordered_cols]

    # display the summary table
    display(HTML(matrix_importance.to_html()))

    # Put the mean scores into a Pandas dataframe
    meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
    # Sort the dataframe
    meanplot = meanplot.sort_values('Mean Ranking', ascending=False)

    # Let's plot the ranking of the features
    sns.factorplot(x="Mean Ranking", y="Feature", data=meanplot, kind="bar",
                   size=14, aspect=1.9, palette='coolwarm')
for train_index, test_index in skf: X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y[train_index], y[test_index] print(X_train.shape) if feature_selection == "randomized_lasso": feature_selector = RandomizedLasso(sample_fraction=0.5, n_resampling=50, verbose=False, n_jobs=-1) elif feature_selection == "RFECV_linearSVM": # print(feature_selection % "selected") feature_selector = RFECV(SVC(kernel="linear"), step=1, cv=StratifiedKFold(y, 5), scoring="accuracy") else: print("Options are: randomized_lasso, RFECV_linearSVM") feature_selector.fit(X_train, y_train) result = { 'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test, 'feature_selector': feature_selector } list_dicts.append(result) dict_for_attribute[attribute] = list_dicts print("done in %0.3fs" % (time() - t0))
'from_this_person_to_poi', 'shared_receipt_with_poi','from_poi_fraction','to_poi_fraction',\ 'tot_to_salary','tot_to_bonus','restr_to_total'] data = featureFormat(data_dict, features_list) labels, features = targetFeatureSplit(data) #SCALE FEATURES: #For RandomForest and DecisionTree, scaling is not necessary. #scaler = MinMaxScaler() #features = scaler.fit_transform(features) #Stability Selection: #http://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/ rlasso = RandomizedLasso(random_state=2) rlasso.fit(features,labels) scores = rlasso.scores_ print scores for j in range(len(scores)): print features_list[j+1],": ",scores[j] features_list_selected = ['poi'] for j in np.where(scores > 0.3)[0]: features_list_selected.append(features_list[j+1]) print "-------------Selected features:-------------" print features_list_selected data = featureFormat(data_dict, features_list_selected)
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import RandomizedLasso
from scipy import io as sio
from tensorflow.python.framework import ops
from dfs2 import DeepFeatureSelectionNew
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import normalize
# ourdataB = sio.loadmat("/Volumes/TONY/Regeneron/Data/OriginalData/newDataB_2labels.mat")
ourdataB = sio.loadmat("/Users/xupeng.tong/Documents/Data/OriginalData/newDataB_2labels.mat")
# ourdataB = sio.loadmat("/home/REGENERON/xupeng.tong/newDataB_2labels.mat")
inputX = ourdataB['X']
inputX = normalize(inputX, axis=0)
inputY = ourdataB['Y'][0, :]
columnNames = ourdataB['columnNames']
X_train, X_test, y_train, y_test = train_test_split(inputX, inputY, test_size=0.2, random_state=42)
randomized_lasso = RandomizedLasso()
randomized_lasso.fit(X_train, y_train)
# boolean mask of the features kept by the randomized Lasso
featureMask = randomized_lasso.get_support()
X_train_lasso = X_train[:, featureMask]
X_test_lasso = X_test[:, featureMask]  # apply the same mask to the test split
columnNames[0][:100][featureMask]
sio.savemat('RandomLasso-result', {'X_train_lasso': X_train_lasso, \
    'X_test_lasso': X_test_lasso, 'featureMask': featureMask})
def main(): start = time.time() MAX_TRAIN_SIZE = 126838 train_size = 20000 val_size = MAX_TRAIN_SIZE - train_size data, test_data = get_data('data') X = data[0:train_size,0:-1] y = [lbl for lbl in data[0:train_size,-1]] print(X.shape) print(len(y)) # use randomized log regression for feature selection clfR = RandomizedLasso( alpha='aic', scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=0.25, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=2.2204460492503131e-16, random_state=None, n_jobs=1, pre_dispatch='3*n_jobs', #memory=Memory(cachedir=None) ) # fit regresion clfR.fit(X,y) # Transform Train Data to selected features X = np.array(X).copy() # little hack to fix assignment dest. read only error X_new = clfR.transform(X) X = X_new ## transform Quiz Dataset test_data = np.array(test_data).copy() # little hack to fix assignment dest. read only error transformed_test_data = clfR.transform(test_data) test_data = transformed_test_data print('Dimensions after feature Reduction: ' + str(X.shape) ) print("Elapsed Time For Feature Reduction: " + str(duration)) # Training classifier clf1 = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None, presort=False) # fit sub-classifiers clf1.fit(X,y) # fit voting classifier print("Elapsed Time For Classifier Training: " + str(duration)) # predict & calculate training error y_hat = clf1.predict(X) test_err = 1 for yi, y_hati in zip(y, y_hat): test_err += (yi == y_hati) test_err /= train_size print("train: " + str(test_err)) # validation data - calculate valdiation error val_start = train_size val_end = train_size + val_size # get validation data set # TODO: put this back in if MAX_TRAIN_SIZE - train_size > val_size: print("Beginning test validation...") X_val = data[val_start:val_end,0:-1] y_val = [lbl for lbl in data[val_start:val_end,-1]] y_val_hat = clf1.predict(X_val) test_err = 1 for yi, y_hati in zip(y_val, y_val_hat): test_err += (yi == y_hati) test_err /= X_val.shape[0] print("val: " + str(test_err)) #quiz data print("Beginning quiz validation...") # test_data = get_data('quiz') X_test = test_data[:,:] print(X_test.shape) y_test = [lbl for lbl in data[:,-1]] y_test_hat = clf1.predict(X_test) test_err = 1 # for yi, y_hati in zip(y_test, y_test_hat): # test_err += (yi == y_hati) # test_err /= X_test.shape[0] # print("test: " + str(test_err)) store_csv(y_test_hat, "prediction") end = time.time() duration = end - start print("Took this many seconds: " + str(duration))
def regression(file_name): import time startTime = time.time() import pandas as pd import numpy as np import sklearn from sklearn.ensemble import RandomForestRegressor import matplotlib.pyplot as plt from boruta import BorutaPy from sklearn.model_selection import train_test_split df = pd.read_csv(file_name) df.replace([np.inf, -np.inf], np.nan) df = df.dropna() df = df.astype(float) y = df['Target'].values X = df.drop(['Target'], axis=1) col = X.columns.tolist() col = ",".join(col) from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler(feature_range=(0, 1)) scaler.fit_transform(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=33) ################################################# SELECT K BEST ############################################################################## #Selected from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2, mutual_info_regression, f_regression num_features = len(X_train.columns) test = SelectKBest(score_func=f_regression, k=2) test.fit(X_train, y_train) scores = [] for i in range(num_features): scores.append(test.scores_[i]) Ranks = sorted(scores, reverse=True) writefp = open("Ranks_reg.csv", 'w') s = [str(i) for i in Ranks] res = (",".join(s)) writefp.write('Classifiers,' + col + '\n') writefp.write('Select K Best,' + res + '\n') writefp.close() ##################################################### EXTRA TREES REGRESSOR ########################################################################### #Selected from sklearn.datasets import make_classification from sklearn.ensemble import ExtraTreesRegressor # Build a forest and compute the feature importances forest = ExtraTreesRegressor(n_estimators=250, random_state=0) forest.fit(X_train, y_train) importances = forest.feature_importances_ writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in importances] res = (",".join(s)) writefp.write('Extra Trees Regressor,' + res + '\n') writefp.close() ############################################# RANDOM FOREST REGRESSOR ####################################################################3 # Takes 186 seconds to complete #Selected clf = RandomForestRegressor(n_estimators=10000, random_state=0, n_jobs=-1) clf.fit(X_train, y_train) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in clf.feature_importances_] res = (",".join(s)) writefp.write('Random Forest Regressor,' + res + '\n') writefp.close() ######################################### RIDGE ######################################## # fast #Selected from sklearn.linear_model import Ridge ridge = Ridge(alpha=7) ridge.fit(X_train, y_train) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in np.abs(ridge.coef_)] res = (",".join(s)) writefp.write('Ridge Regressor,' + res + '\n') writefp.close() ###################################### LINEAR REGRESSION ################################### #Fast #Selected from sklearn.linear_model import LinearRegression lr = LinearRegression(normalize=True) lr.fit(X_train, y_train) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in np.abs(lr.coef_)] res = (",".join(s)) writefp.write('Linear Regression ,' + res + '\n') writefp.close() ################################ F_REGRESSOR ################################################# from sklearn.feature_selection import RFE, f_regression f, pval = f_regression(X_train, y_train, center=True) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in f] res = (",".join(s)) s = [str(i) for i in pval] res1 = (",".join(s)) 
writefp.write('F_regressor,' + res + '\n') writefp.close() ################################# LASSO ###################################################### from sklearn.linear_model import Lasso lasso = Lasso(alpha=0.05, max_iter=5000) lasso.fit(X_train, y_train) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in np.abs(lasso.coef_)] res = (",".join(s)) writefp.write('Lasso ,' + res + '\n') writefp.close() ############################# RANDOMIZED LASSO ################################################ from sklearn.linear_model import RandomizedLasso rlasso = RandomizedLasso(alpha=0.04) rlasso.fit(X_train, y_train) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in np.abs(rlasso.scores_)] res = (",".join(s)) writefp.write('Randomized Lasso,' + res + '\n') writefp.close() ############################ CORRELATION ######################################################## corr = [] for i in X.columns.tolist(): corr.append(df['Target'].corr(df[i])) writefp = open("Ranks_reg.csv", 'a') s = [str(i) for i in corr] res = (",".join(s)) writefp.write('Correlation With Target,' + res + '\n') writefp.close() ################# endTime = time.time() final_time = endTime - startTime def convert(seconds): seconds = seconds % (24 * 3600) hour = seconds // 3600 seconds %= 3600 minutes = seconds // 60 seconds %= 60 return "%d:%02d:%02d" % (hour, minutes, seconds) n = final_time print(convert(n))
for key in final_feats: final_inputs[x][count] = final_feats[key][x] count = count+1 inputs = [input for input in final_inputs.values()] # Recursive feature elimination svr = SVR(kernel="linear") rfe = RFE(svr, step=1) rfe = rfe.fit(inputs,outputs[1]) rfe.support_ rfe.ranking_ # selected features by RFE selected_features = [] count = 0 for key in final_feats.keys(): if (rfe.support_[count] == True): selected_features.append(key) count = count + 1 # Randomized Lasso for feature selection rlasso = RandomizedLasso(alpha=1) rlasso.fit(inputs, outputs[2]) rlasso.scores_
def score_calculate(flag):
    # rows are the feature-selection algorithms, columns are the feature names
    algorithm = {}
    if flag == 'whole':
        tmp_sta, tmp_rf, tmp_gbdt, tmp_extra = {}, {}, {}, {}
        for n in range(10):
            #stability
            rlasso = RandomizedLasso(random_state=n)
            rlasso.fit(data, mark)
            new1 = rank_to_dict(np.abs(rlasso.scores_), names, cv=True)
            new_sta.append(new1['白球比'])
            tmp_sta = add(tmp_sta, rank_to_dict(np.abs(rlasso.scores_), names, cv=True))
            #rf
            rf = RandomForestClassifier(random_state=n)
            rf.fit(data, mark)
            new2 = rank_to_dict(rf.feature_importances_, names, cv=True)
            new_rf.append(new2['白球比'])
            tmp_rf = add(tmp_rf, rank_to_dict(rf.feature_importances_, names, cv=True))
            #GBDT
            gbdt = GradientBoostingClassifier(random_state=n)
            gbdt.fit(data, mark)
            new3 = rank_to_dict(gbdt.feature_importances_, names, cv=True)
            new_gbdt.append(new3['白球比'])
            tmp_gbdt = add(tmp_gbdt, rank_to_dict(gbdt.feature_importances_, names, cv=True))
            #Extra
            model = ExtraTreesClassifier(random_state=n)
            model.fit(data, mark)
            new4 = rank_to_dict(model.feature_importances_, names, cv=True)
            new_ex.append(new4['白球比'])
            tmp_extra = add(tmp_extra, rank_to_dict(model.feature_importances_, names, cv=True))
        algorithm["stability"], algorithm["RF"], algorithm["GBDT"], algorithm["Extra"] \
            = tmp_sta, tmp_rf, tmp_gbdt, tmp_extra
        #print(len(algorithm["stability"]))
        #MIC
        mine = MINE()
        mic_scores = []
        res = []
        for i in range(len(data[0])):
            for num in data:
                res.append(num[i])
            mine.compute_score(res, mark)
            m = mine.mic()
            mic_scores.append(m)
            res = []
        algorithm["MIC"] = rank_to_dict(mic_scores, names)
        #linear regression
        lr = LinearRegression(normalize=True)
        lr.fit(data, mark)
        algorithm["Linear"] = rank_to_dict(np.abs(lr.coef_), names)
        #ridge
        ridgecv = RidgeCV()
        ridgecv.fit(data, mark)
        #print(ridgecv.alpha_)
        ridge = Ridge(alpha=ridgecv.alpha_)
        ridge.fit(data, mark)
        algorithm["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)
        #lasso
        lassocv = LassoCV()
        lassocv.fit(data, mark)
        #print(lassocv.alpha_)
        lasso = Lasso(alpha=lassocv.alpha_)
        lasso.fit(data, mark)
        algorithm["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)
        #rfe
        log = LogisticRegression()
        rfe = RFE(log, n_features_to_select=10)
        rfe.fit(data, mark)
        algorithm["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1)
        '''
        #F-test
        f, pval = f_classif(data, mark)
        algorithm["Corr"] = rank_to_dict(f, names)
        '''
    elif flag == 'extra':
        model = ExtraTreesClassifier()
        model.fit(data, mark)
        algorithm["Extra"] = rank_to_dict(model.feature_importances_, names)
    elif flag == 'gbdt':
        gbdt = GradientBoostingClassifier()
        gbdt.fit(data, mark)
        algorithm["GBDT"] = rank_to_dict(gbdt.feature_importances_, names)
    elif flag == 'rf':
        rf = RandomForestClassifier()
        rf.fit(data, mark)
        algorithm["RF"] = rank_to_dict(rf.feature_importances_, names)
    r = {}
    for name in names:
        r[name] = round(np.mean([algorithm[method][name] for method in algorithm.keys()]), 4)
    methods = sorted(algorithm.keys())
    algorithm["Mean"] = r
    methods.append("Mean")
    content = []
    for name in names:
        content.append([algorithm[method][name] for method in methods])
    fea_matrix = pd.DataFrame(content, index=names)
    #fea_matrix.to_csv('/Users/hhy/Desktop/fea_importance_'+flag+'.csv', encoding='utf-8-sig', header=methods)
    return algorithm
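The function above relies on `rank_to_dict` and `add` helpers that this snippet does not define. A minimal sketch of what they could look like, modeled on the `rank_to_dict` used elsewhere in this document (MinMaxScaler over the raw scores); the `cv` flag and the dictionary-summing `add` are assumptions, since their exact behavior is not shown in the source:

from sklearn.preprocessing import MinMaxScaler
import numpy as np

def rank_to_dict(ranks, names, order=1, cv=False):
    # scale the raw scores into [0, 1]; order=-1 flips rankings where a
    # smaller value means a better feature (e.g. RFE ranks).
    # cv is accepted for compatibility with the calls above; its role in the
    # original code is not shown, so it is ignored here.
    minmax = MinMaxScaler()
    scaled = minmax.fit_transform(order * np.array([ranks], dtype=float).T).T[0]
    scaled = [round(x, 2) for x in scaled]
    return dict(zip(names, scaled))

def add(acc, new):
    # accumulate per-feature scores across the ten random_state runs
    return {k: acc.get(k, 0.0) + v for k, v in new.items()}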
# ###identify features and labels # In[ ]: Abalone_data_features = Abalone_data.iloc[:, :-1] Abalone_data_labels = Abalone_data.iloc[:, -1:] print(Abalone_data_features.head()) print(Abalone_data_labels.head()) # ###predictive models # In[ ]: model = RandomizedLasso(alpha=0.01) model.fit(Abalone_data_features, Abalone_data_labels["Rings"]) names = list(Abalone_data_features) print("Features by their score:") print( sorted(zip(map(lambda x: round(x, 4), model.scores_), names), reverse=True)) # In[ ]: sring_labels = Abalone_data.iloc[:, -1:] # In[ ]: splits = tts(Abalone_data_features, sring_labels, test_size=0.2) X_train, X_test, y_train, y_test = splits
def data_analyse(path,file_name): #This is used to load the dataset dataset = data_loader(path,file_name) #Clean up the dataset by assigning mean dataset = DataFrameImputer().fit_transform(dataset) #Our Class/Prediction Variable Y = dataset['loan_status'] ''' 1.) Transformations ''' #Transforming the Class Variable le = p.LabelEncoder() le.fit(Y) tr = le.transform(Y) #Penaly Application Variable for the lasso model alpha = 0.001 #Just to keep a count on if any exceptions occur count = 0 #Used as Attribute list from which we will select the best predictors for our Class Variable X = dataset[["id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","pymnt_plan","url","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","acc_now_delinq","tot_coll_amt","tot_cur_bal","total_rev_hi_lim","acc_open_past_24mths","avg_cur_bal","bc_open_to_buy","bc_util","chargeoff_within_12_mths","delinq_amnt","mo_sin_old_il_acct","mo_sin_old_rev_tl_op","mo_sin_rcnt_rev_tl_op","mo_sin_rcnt_tl","mort_acc","mths_since_recent_bc","mths_since_recent_bc_dlq","mths_since_recent_inq","mths_since_recent_revol_delinq","num_accts_ever_120_pd","num_actv_bc_tl","num_actv_rev_tl","num_bc_sats","num_bc_tl","num_il_tl","num_op_rev_tl","num_rev_accts","num_rev_tl_bal_gt_0","num_sats","num_tl_120dpd_2m","num_tl_30dpd","num_tl_90g_dpd_24m","num_tl_op_past_12m","pct_tl_nvr_dlq","percent_bc_gt_75","pub_rec_bankruptcies","tax_liens","tot_hi_cred_lim","total_bal_ex_mort","total_bc_limit","total_il_high_credit_limit"]] #Just to display the score and values of each attribute names = 
["id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","pymnt_plan","url","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","acc_now_delinq","tot_coll_amt","tot_cur_bal","total_rev_hi_lim","acc_open_past_24mths","avg_cur_bal","bc_open_to_buy","bc_util","chargeoff_within_12_mths","delinq_amnt","mo_sin_old_il_acct","mo_sin_old_rev_tl_op","mo_sin_rcnt_rev_tl_op","mo_sin_rcnt_tl","mort_acc","mths_since_recent_bc","mths_since_recent_bc_dlq","mths_since_recent_inq","mths_since_recent_revol_delinq","num_accts_ever_120_pd","num_actv_bc_tl","num_actv_rev_tl","num_bc_sats","num_bc_tl","num_il_tl","num_op_rev_tl","num_rev_accts","num_rev_tl_bal_gt_0","num_sats","num_tl_120dpd_2m","num_tl_30dpd","num_tl_90g_dpd_24m","num_tl_op_past_12m","pct_tl_nvr_dlq","percent_bc_gt_75","pub_rec_bankruptcies","tax_liens","tot_hi_cred_lim","total_bal_ex_mort","total_bc_limit","total_il_high_credit_limit"] #Walk through each attribute for x in X: try: #Transformation of Categorical Variable le = p.LabelEncoder() le.fit(dataset[x]) dataset[x] = le.transform(dataset[x]) except Exception: #If there are no values count += 1 #Just Refreshing X after transformation X = dataset[["id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","pymnt_plan","url","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","acc_now_delinq","tot_coll_amt","tot_cur_bal","total_rev_hi_lim","acc_open_past_24mths","avg_cur_bal","bc_open_to_buy","bc_util","chargeoff_within_12_mths","delinq_amnt","mo_sin_old_il_acct","mo_sin_old_rev_tl_op","mo_sin_rcnt_rev_tl_op","mo_sin_rcnt_tl","mort_acc","mths_since_recent_bc","mths_since_recent_bc_dlq","mths_since_recent_inq","mths_since_recent_revol_delinq","num_accts_ever_120_pd","num_actv_bc_tl","num_actv_rev_tl","num_bc_sats","num_bc_tl","num_il_tl","num_op_rev_tl","num_rev_accts","num_rev_tl_bal_gt_0","num_sats","num_tl_120dpd_2m","num_tl_30dpd","num_tl_90g_dpd_24m","num_tl_op_past_12m","pct_tl_nvr_dlq","percent_bc_gt_75","pub_rec_bankruptcies","tax_liens","tot_hi_cred_lim","total_bal_ex_mort","total_bc_limit","total_il_high_credit_limit"]] ''' 2.) 
Lasso Implementation ''' rlasso = RandomizedLasso(alpha=alpha) rlasso.fit(X, tr) #To sort the attributes according to the Lasso Suggested Score output = (sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), names), reverse=True)) #Just to tag the alpha related to the output, so that keeping track is easy output.insert(0,[alpha,"Alpha:"]) #Writing the output to a file for further reference. with open('./output/d_analysis-'+str(alpha)+'-.json','w') as output_file: json.dump(output,output_file,indent=4,ensure_ascii=False) ''' 3.) Linear Regression Model ''' #Selecting the output from Lasso and selecting variables with high scores dataset = dataset[['total_rec_late_fee','total_rec_int','last_pymnt_amnt','recoveries','acc_open_past_24mths','last_pymnt_d']] #Splitting the train data into parts len_train = int(len(dataset.index)*0.75) * (-1) X_train = dataset[:len_train] Y_train = tr[:len_train] #Splitting the data into test len_test = int(len(dataset.index)*0.25) * (-1) X_test = dataset[len_test:] Y_test = tr[len_test:] # Create linear regression object regr = linear_model.LinearRegression() regr.fit(X_train, Y_train) #Just to see how algorithm performed print('Coefficients: \n', regr.coef_) #R2 score print("Residual sum of squares: %.2f" % np.mean((regr.predict(X_test) - Y_test) ** 2)) # Explained variance score: 1 is perfect prediction print('Variance score: %.2f' % regr.score(X_test, Y_test))
def do_ml(day): ################################################################ # Modules to use ############################################################### USE_SAX = False FEATURE_REDUCTION = False DIM_REDUCTION_SEARCH = False #Create folder for administration try: os.mkdir('performance-{}-days'.format(day)) os.mkdir('performance-{}-days/models'.format(day)) except FileExistsError as e: None ################################################################ print('Using sax: {}'.format(USE_SAX)) if USE_SAX: X = pickle.load(open('X_sax.p', 'rb')) y = pickle.load(open('y_sax.p', 'rb')) else: X = pickle.load(open('X_reg.p', 'rb')) y = pickle.load(open('y_reg.p', 'rb')) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234) ################################################################ # Dimensionality reduction ################################################################ def pca_reduce(X, dim): pca = PCA(n_components=dim) X_reduced = pca.fit_transform(X) return X_reduced def isomap_reduce(X, dim): iso = Isomap(n_components=dim) X_reduced = iso.fit_transform(X) return X_reduced def _find_best_dim_red(dims, model, model_name, X, y, params): rows_list = [] for dim in dims: for f in [pca_reduce, isomap_reduce]: print( 'Start reducing dimensionality using {} to {} dimensions'. format(f.__name__, dim)) t0 = time.time() #reduce dimensionality #print(X.shape) X_red = f(X, dim) #print(X_red.shape) X_train_red, X_test_red, y_train, y_test = train_test_split( X_red, y, test_size=0.2, random_state=1234) X_train_red = f(X_train_red, dim) X_test_red = f(X_test_red, dim) t1 = time.time() print('Reducing dimensions cost {} seconds'.format(t1 - t0)) #Optimize model using grid search and cross validation print('Start optimizing {} model'.format(model_name)) t0 = time.time() optimized_model = GridSearchCV(model, params, cv=10, refit=True) optimized_model.fit(X_train_red, y_train) t1 = time.time() print('Optimizing took {} seconds'.format(t1 - t0)) #print('best found parameters') #print(optimized_model.best_params_) y_pred = optimized_model.predict(X_test_red) mse = sk.metrics.mean_squared_error(y_test, y_pred) print("MSE on test set: {}".format(mse)) #administration rows_list.append({ 'model': deepcopy(model_name), 'dimensions': deepcopy(dim), 'reduction technique': deepcopy(f.__name__), 'mse': deepcopy(mse), 'parameters': str(deepcopy(optimized_model.best_params_)) }) #store model doc = open( 'performance-{}-days/models/{}-{}-{}.pickle'.format( day, f.__name__, model_name, dim), 'wb') pickle.dump(optimized_model, doc) doc.close() adm_df = pd.DataFrame(rows_list) adm_df.to_csv('performance-{}-days/{}-dim_reduction.csv'.format( day, model_name)) def dim_reduction_search(X, y): rf_params = { "max_depth": [3, None], "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10], "bootstrap": [True, False], "criterion": ["mse"] } ada_params = { 'n_estimators': [10, 50, 100, 300, 500], 'learning_rate': [1, 0.5, 0.1, 0.01, 0.001], 'loss': ['linear', 'square', 'exponential'] } dims = [10, 20, 30] _find_best_dim_red(dims, AdaBoostRegressor(), 'AdaBoost', X, y, ada_params) _find_best_dim_red(dims, RandomForestRegressor(), 'RandomForest', X, y, rf_params) #X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2) ############################################################## # Feature selection ############################################################## #Recursive Feature Elimination def 
ReFeEl(nr_features, X_train, y_train, X_test, y_test, estimator, nr_models=5): print('Start selecting features') t1 = time.time() #estimator = AdaBoostRegressor(learning_rate= 0.001, loss ='square', n_estimators = 50) result = [] for nr_feature in nr_features: selector = RFE(estimator, nr_feature, step=1) selector.fit(X_train, y_train) y_pred = selector.predict(X_test) mse = sk.metrics.mean_squared_error(y_test, y_pred) result.append((mse, selector)) #sort models and take nr_models best ones result.sort(key=lambda x: x[0]) result = result[:nr_models] t2 = time.time() print('selecting features took {} seconds'.format(t2 - t1)) print(result[0][1].support_) print(result[0][1].ranking_) print('Minimum MSE: {}, number of selected features: {}'.format( result[0][0], len(result[0][1].support_[result[0][1].support_]))) return result if FEATURE_REDUCTION: if not USE_SAX: estimator = AdaBoostRegressor(learning_rate=0.001, loss='square', n_estimators=50) else: estimator = AdaBoostRegressor(learning_rate=0.01, loss='linear', n_estimators=50) _, total_nr_features = X.shape nr_features = range(1, total_nr_features) opt_features_models = ReFeEl(nr_features, X_train, y_train, X_test, y_test, estimator) print(opt_features_models) with open('optimal_features_model_{}.pickle'.format(USE_SAX), 'wb') as f: pickle.dump(opt_features_models, f) # Feature Importance using Extra Trees def ET_feature_selection(): estimator = ExtraTreesRegressor() estimator.fit(X, y) print(estimator.feature_importances_) print((len(estimator.feature_importances_[ estimator.feature_importances_ < 0.01]), len(estimator.feature_importances_))) print(np.mean(estimator.feature_importances_)) print(np.std(estimator.feature_importances_)) fig = plt.figure() ax = fig.add_subplot(111) bp = ax.boxplot(estimator.feature_importances_) fig.savefig('boxplot.png', bbox_inches='tight') input('hallo') ############################################################## #Dimensionality reduction search ############################################################## if DIM_REDUCTION_SEARCH: dim_reduction_search(X, y) ############################################################## #Grid search + CV ############################################################## def append_deep_copy(rows_list, model_name, nr_features, mse, params): result = { 'model': deepcopy(model_name), 'nr_features': deepcopy(nr_features), 'MSE': deepcopy(mse), 'parameters': str(deepcopy(params)) } rows_list.append(result) return rows_list params = { 'RF': { "max_depth": [3, None], "max_features": [1, 3, 10, 'sqrt', 'log2', 'auto'], "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10], "bootstrap": [True, False], "criterion": ["mse"] }, 'AdaBoost': { 'n_estimators': [10, 50, 100, 300, 500], 'learning_rate': [1, 0.5, 0.1, 0.01, 0.001], 'loss': ['linear', 'square', 'exponential'] } } ############################################################## # Recursive Feature Elimination ############################################################# for estimator in [(AdaBoostRegressor(), 'AdaBoost'), (RandomForestRegressor(), 'RF')]: #Get features and store feature selector print('Optimizing {} using CV RFE'.format(estimator[1])) t0 = time.time() selector = RFECV(estimator[0], step=1, cv=10) selector = selector.fit(X_train, y_train) X_train_transformed = selector.transform(X_train) X_test_transformed = selector.transform(X_test) t1 = time.time() print('Optimizing features done in {} seconds, storing model..'.format( t1 - t0)) #print('Selected features ({}): 
{}'.format(len(selector.get_support()[selector.get_support()]),selector.get_support())) doc = open( 'performance-{}-days/models/RFE-{}-selector.pickle'.format( day, estimator[1]), 'wb') pickle.dump(selector, doc) doc.close() #Optimize hyperparameters and evaluate model print('Start optimizing hyperparameters using determined features...') t0 = time.time() opt_model = GridSearchCV(estimator[0], params[estimator[1]], cv=10, refit=True) opt_model.fit(X_train_transformed, y_train) t1 = time.time() print('Optimizing took {} seconds'.format((t1 - t0))) #print('best found parameters') #print(opt_model.best_params_) y_pred = opt_model.predict(X_test_transformed) mse = sk.metrics.mean_squared_error(y_test, y_pred) print("MSE on test set: {}".format(mse)) model_doc = open( 'performance-{}-days/models/RFE-{}-model.pickle'.format( day, estimator[1]), 'wb') pickle.dump(opt_model, model_doc) model_doc.close() ############################################################## # Feature Stability Selection ############################################################# RL = RandomizedLasso(alpha='aic') print('Start optimizing using Randomized Lasso') t0 = time.time() RL.fit(X, y) t1 = time.time() print('Optimizing done in {} seconds'.format(t1 - t0)) #print('Best parameters: {}'.format(RL.get_params())) #print('Best features: {}'.format(RL.get_support())) doc = open( 'performance-{}-days/models/RandomizedLasso-selector.pickle'.format( day), 'wb') pickle.dump(RL, doc) doc.close() X_train_RL = RL.transform(X_train) X_test_RL = RL.transform(X_test) print('Using RL features to optimize model..') for estimator in [(AdaBoostRegressor(), 'AdaBoost'), (RandomForestRegressor(), 'RF')]: print('Optimizing {} using CV RFE'.format(estimator[0])) t0 = time.time() opt_model = GridSearchCV(estimator[0], params[estimator[1]], cv=10, refit=True) opt_model.fit(X_train_RL, y_train) t1 = time.time() print('Optimizing took {} seconds'.format((t1 - t0))) #print('best found parameters') #print(opt_model.best_params_) y_pred = opt_model.predict(X_test_RL) mse = sk.metrics.mean_squared_error(y_test, y_pred) print("MSE on test set: {}".format(mse)) model_doc = open( 'performance-{}-days/models/RandomizedLasso-{}-model.pickle'. format(day, estimator[1]), 'wb') pickle.dump(opt_model, model_doc) model_doc.close()
for x in range(0,len(outputs[1])): final_inputs[x] = np.zeros(len(final_feats)) count = 0 for key in final_feats: final_inputs[x][count] = final_feats[key][x] count = count+1 inputs = [input for input in final_inputs.values()] svr = SVR(kernel="linear") rfe = RFE(svr, step=1) rfe = rfe.fit(inputs,outputs[1]) rfe.support_ rfe.ranking_ selected_features = [] count = 0 for key in final_feats.keys(): if (rfe.support_[count] == True): selected_features.append(key) count = count + 1 rlasso = RandomizedLasso(alpha=1) rlasso.fit(inputs, outputs[2]) rlasso.scores_
data_final.columns.values y_all = data_final['CLM_YesNo'] X_all = data_final.drop('CLM_YesNo', axis=1) from sklearn import datasets from sklearn.feature_selection import RFE from sklearn.linear_model import RandomizedLasso from sklearn.linear_model import LogisticRegression logreg = LogisticRegression() #rfe = RFE(logreg, 13) #rfe = rfe.fit(X_all, y_all) #print (rfe.support_) #print (rfe.ranking_) #X_rfe = X_all[X_all.columns[rfe.support_]] rlasso = RandomizedLasso(scaling=0.025) rlasso.fit(X_all, y_all) print(rlasso.scores_)
def f(): # from minepy import MINE # np.random.seed(0) # # size = 750 # X = np.random.uniform(0, 1, (size, 14)) # # # "Friedamn #1” regression problem # Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5) ** 2 + # 10 * X[:, 3] + 5 * X[:, 4] + np.random.normal(0, 1)) # # Add 3 additional correlated variables (correlated with X1-X3) # X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4)) # # names = ["x%s" % i for i in range(1, 15)] boston = pd.read_csv('/Users/ufenqi/Documents/dataming/base1/data/data_1510/traindata_use.csv') boston.fillna(-1) target = '1' IDcol = '0' predictors = [x for x in boston.columns if x not in [target, IDcol]] print len(predictors) X = boston[predictors] Y = boston[target] names = predictors ranks = {} def rank_to_dict(ranks, names, order=1): minmax = MinMaxScaler() ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0] ranks = map(lambda x: round(x, 2), ranks) return dict(zip(names, ranks)) lr = LinearRegression(normalize=True) lr.fit(X, Y) ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names) ridge = Ridge(alpha=7) ridge.fit(X, Y) ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names) lasso = Lasso(alpha=.05) lasso.fit(X, Y) ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names) rlasso = RandomizedLasso(alpha=0.04) rlasso.fit(X, Y) ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names) # stop the search when 5 features are left (they will get equal scores) rfe = RFE(lr, n_features_to_select=5) rfe.fit(X, Y) ranks["RFE"] = rank_to_dict(map(float, rfe.ranking_), names, order=-1) rf = RandomForestRegressor() rf.fit(X, Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, names) f, pval = f_classif(X, Y) ranks["Corr."] = rank_to_dict(f, names) # mine = MINE() # mic_scores = [] # for i in range(X.shape[1]): # mine.compute_score(X[:, i], Y) # m = mine.mic() # mic_scores.append(m) # # ranks["MIC"] = rank_to_dict(mic_scores, names) r = {} for name in names: r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2) methods = sorted(ranks.keys()) ranks["Mean"] = r methods.append("Mean") print "\t%s" % "\t".join(methods) i=0 for name in names: print "%s\t%s" % (name, "\t".join(map(str, [ranks[method][name] for method in methods]))) i+=1 print i
def continuousFeaturesSelecting(df_continuous_removed_corr, *path_to_write):
    """Standardize all variables, split them into X (everything except
    ovd_daynum) and y (days overdue), use a randomized Lasso to score each
    X feature's contribution to the regression on y, keep the features whose
    score reaches the threshold used below (1.0), and return a DataFrame made
    of y plus the selected X features."""
    #print df_continuous_removed_corr.describe()
    datasets = df_continuous_removed_corr.values
    columns_list = df_continuous_removed_corr.columns.tolist()
    #print columns_list
    #1 Normalizer(): normalization
    #normal_scaler = Normalizer().fit_transform(datasets)
    #print normal_scaler[:, 0]
    #2 StandardScaler(): standardization
    std_scaled = StandardScaler().fit_transform(datasets)
    y = std_scaled[:, 0]
    X = std_scaled[:, 1:]
    # print datasets
    # print X.shape, '\n', X, '\n'
    # print y.shape, '\n', y, '\n'
    # RandomizedLasso
    rlasso = RandomizedLasso()
    rlasso.fit(X, y)
    # Attach the feature names to the ndarray of scores (y, the days overdue, is not included here)
    list_features_rank = sorted(zip(columns_list[1:],
                                    map(lambda x: round(x, 4), rlasso.scores_)),
                                key=lambda x: x[1], reverse=True)
    df_features_rank = pd.DataFrame(list_features_rank,
                                    columns=['features_label', 'features_rank', ])
    # If an output path was given, write the continuous features and their ranking scores to that file
    if path_to_write:
        df_features_rank.to_csv(path_to_write[0], index=False)
    # Keep the X features whose score is 1.0 or higher
    list_columns_features_selected =\
        df_features_rank[df_features_rank[
            'features_rank']>=1.0]['features_label'].values.tolist()
    # Prepend y (days overdue) so the result contains every feature needed for sample clustering
    list_columns_features_selected.insert(0, 'ovd_daynum')
    # print list_features_rank
    # print df_features_rank
    # print list_columns_features_selected
    # The input DataFrame reduced to the selected columns
    df_features_selected = df_continuous_removed_corr[
        list_columns_features_selected]
    # Standardize the reduced DataFrame before returning it, so it can go straight into sample clustering
    #print df_features_selected.values
    #print StandardScaler().fit_transform(df_features_selected.values)
    df_continuous_features_selected =\
        pd.DataFrame(StandardScaler().fit_transform(df_features_selected.values),
                     index=df_features_selected.index,
                     columns=df_features_selected.columns)
    #print df_features_selected.head()
    #print df_continuous_features_selected.head()
    # Inspect per-column means and variances
    #print df_continuous_features_selected.values.mean(axis=0)
    #print df_continuous_features_selected.values.var(axis=0)
    return df_continuous_features_selected
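A hypothetical call of the function above; the input frame name and the output path are placeholders, not from the original source:

# df_corr_pruned: continuous features with highly correlated columns already removed,
# first column 'ovd_daynum' (days overdue)
df_clustering_input = continuousFeaturesSelecting(
    df_corr_pruned, './output/continuous_feature_ranks.csv')
print df_clustering_input.head()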
def randomLasso_hq(X, y, alpha=0.025): ### random lasso or logistic regression ======= from sklearn.linear_model import RandomizedLasso rlasso = RandomizedLasso(alpha=alpha) rlasso.fit(X, y) return(rlasso.scores_)
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.datasets import make_classification
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.datasets import make_regression

X, y = make_classification(n_samples=100, n_features=100, n_informative=5,
                           n_redundant=2, random_state=101)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.30, random_state=101)
classifier = LogisticRegression(C=0.1, penalty='l1', random_state=101)
classifier.fit(X_train, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test, y_test))

selector = RandomizedLogisticRegression(n_resampling=300, random_state=101)
selector.fit(X_train, y_train)
print("Variables selected: %i" % sum(selector.get_support() != 0))
X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print("Out-of-sample accuracy: %0.3f" % classifier.score(X_test_s, y_test))

XX, yy = make_regression(n_samples=100, n_features=10, n_informative=4, random_state=101)
rlasso = RandomizedLasso()
rlasso.fit(XX, yy)
print(list(enumerate(rlasso.scores_)))
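To actually reduce the regression matrix with those stability scores, the fitted selector's mask and transform can be reused; a short sketch assuming the `rlasso`, `XX` and `yy` defined just above:

# features whose selection frequency cleared RandomizedLasso's
# default selection_threshold of 0.25
mask = rlasso.get_support()
XX_reduced = rlasso.transform(XX)
print("Kept %i of %i features" % (mask.sum(), XX.shape[1]))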
def train_and_analyse(_X, _y, features): X = _X Y = _y cv_l = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True, random_state=1) ranks = {} lr = LinearRegression(normalize=True) lr.fit(X, Y) ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features) ridge = RidgeCV(cv=cv_l) ridge.fit(X, Y) ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features) # Run the RandomizedLasso: we use a paths going down to .1*alpha_max # to avoid exploring the regime in which very noisy variables enter # the model lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000) lasso.fit(X, Y) ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features) rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42) rlasso.fit(X, Y) ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features) rfe = RFE(lr, n_features_to_select=1) rfe.fit(X,Y) ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1) rf = RandomForestRegressor(n_estimators=500) rf.fit(X,Y) ranks["RF"] = rank_to_dict(rf.feature_importances_, features) f, pval = f_regression(X, Y, center=True) ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features) mine = MINE() mic_scores = [] for i in range(X.shape[1]): mine.compute_score(X[:,i], Y) m = mine.mic() mic_scores.append(m) ranks["MIC"] = rank_to_dict(mic_scores, features) r = {} for name in features: r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2) methods = sorted(ranks.keys()) ranks["Mean"] = r methods.append("Mean") ranks = pd.DataFrame(ranks) selection_feature = ranks[ranks.Mean > 0.12].index.values return ranks, selection_feature
lr = LinearRegression(normalize=True) lr.fit(X, y) ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), names) ridge = Ridge(alpha=7) ridge.fit(X, y) ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names) lasso = Lasso(alpha=.05) lasso.fit(X, y) ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names) rlasso = RandomizedLasso(alpha=0.04) rlasso.fit(X, y) ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names) #stop the search when 5 features are left (they will get equal scores) rfe = RFE(lr, n_features_to_select=5) rfe.fit(X, y) ranks["RFE"] = rank_to_dict(list(map(float, rfe.ranking_)), names, order=-1) rf = RandomForestRegressor() rf.fit(X, y) ranks["RF"] = rank_to_dict(rf.feature_importances_, names) f, pval = f_regression(X, y, center=True) ranks["Corr."] = rank_to_dict(f, names) mine = MINE()
def run(args): X_train = np.nan_to_num( np.genfromtxt(args.training_data, delimiter=args.delimiter)) y_train = np.clip(np.genfromtxt(args.training_labels), 0, 1) X_trains = X_train if args.scale: print "Scaling features (mean removal divided by std)..." scaler = StandardScaler().fit(X_train) X_trains = scaler.transform(X_train) # create output folders outF = args.output_folder + "/" + os.path.basename( args.training_data) + "--FS_" + str( args.select_features) + "--i_" + str(args.iterations) buildDir(outF) maskF = outF + "/masks/" buildDir(maskF) #evaluation features first_experiments labels logs masks parameters # predictions src suca paramF = outF + "/parameters/" buildDir(paramF) #featF = outF+"/features/" #buildDir(featF) #evalF = buildDir(outF+"/evaluation") #os.path.basename( # args.training_data)]) + featsel_str + "--" + os.path.basename( # test_label # initializes numpy random seed np.random.seed(args.seed) # performs feature selection featsel_str = ".all-feats" if args.select_features: print "Performing feature selection ..." # initializes selection estimator sel_est = RandomizedLasso(alpha="bic", verbose=True, max_iter=1000, n_jobs=8, random_state=args.seed, n_resampling=1000) sel_est.fit(X_trains, y_train) X_trains = sel_est.transform(X_trains) selected_mask = sel_est.get_support() selected_features = sel_est.get_support(indices=True) sel_feats_path = os.sep.join( # [".", "masks", os.path.basename(args.training_data)]) [maskF, os.path.basename(args.training_data)]) # saves indices np.savetxt(sel_feats_path + ".idx", selected_features, fmt="%d") # saves mask np.save(sel_feats_path + ".mask", selected_mask) featsel_str = ".randcv" estimator = ExtraTreesRegressor(random_state=args.seed, n_jobs=1) mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False) #rmse_scorer = make_scorer(mean_absolute_error, greater_is_better=False) # performs parameter optimization using random search print "Performing parameter optimization ... " param_distributions = \ {"n_estimators": [5, 10, 50, 100, 200, 500], "max_depth": [3, 2, 1, None], "max_features": ["auto", "sqrt", "log2", int(X_trains.shape[1]/2.0)], "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False]} # "criterion": ["gini", "entropy"]} search = RandomizedSearchCV(estimator, param_distributions, n_iter=args.iterations, scoring=mae_scorer, n_jobs=8, refit=True, cv=KFold(X_train.shape[0], args.folds, shuffle=True, random_state=args.seed), verbose=1, random_state=args.seed) # fits model using best parameters found search.fit(X_trains, y_train) # ................SHAHAB ........................ models_dir = sorted(glob.glob(args.models_dir + os.sep + "*")) estimator2 = ExtraTreesRegressor(bootstrap=search.best_params_["bootstrap"], max_depth=search.best_params_["max_depth"], max_features=search.best_params_["max_features"], min_samples_leaf=search.best_params_["min_samples_leaf"], min_samples_split=search.best_params_["min_samples_split"], n_estimators=search.best_params_["n_estimators"], verbose=1, random_state=42, n_jobs=8) estimator2.fit(X_trains,y_train) from sklearn.externals import joblib print "koooonnn %s" % args.models_dir joblib.dump(estimator2, args.models_dir+"/XRT.pkl") joblib.dump(scaler, args.models_dir+"/scaler.pkl") joblib.dump(sel_est, args.models_dir+"/sel_est.pkl") # print "Kioonnn number of feat:\n", n_feature # ................SHAHAB ........................ 
print "Best parameters: ", search.best_params_ # saves parameters on yaml file #param_path = os.sep.join([".", "parameters", os.path.basename( param_path = os.sep.join([paramF, os.path.basename( args.training_data)]) + featsel_str + ".params.yaml" param_file = codecs.open(param_path, "w", "utf-8") yaml.dump(search.best_params_, stream=param_file) testF = os.sep.join([outF, "/test/"]) buildDir(testF) m = y_train.mean() # evaluates model on the different test sets test_features = sorted(glob.glob(args.test_data + os.sep + "*")) test_labels = sorted(glob.glob(args.test_labels + os.sep + "*")) for test_feature, test_label in zip(test_features, test_labels): print "Evaluating on %s" % test_label X_test = np.nan_to_num( np.genfromtxt(test_feature, delimiter=args.delimiter)) y_test = np.clip(np.genfromtxt(test_label), 0, 1) X_tests = X_test if args.scale: X_tests = scaler.transform(X_test) if args.select_features: X_tests = sel_est.transform(X_tests) # gets predictions on test set #y_pred = search.predict(X_tests) y_pred = np.clip(search.predict(X_tests), 0, 1) # evaluates on test set mae = mean_absolute_error(y_test, y_pred) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print "Test MAE = %2.8f" % mae print "Test RMSE = %2.8f" % rmse print "Prediction range: [%2.4f, %2.4f]" % (y_pred.min(), y_pred.max()) # saves evaluation testFX = testF + "/" + os.path.basename(test_label) buildDir(testFX) buildDir(testFX + "/evaluation/") eval_path = os.sep.join([testFX, "evaluation", os.path.basename( args.training_data)]) + featsel_str + "--" + os.path.basename( test_label) mae_eval = codecs.open(eval_path + ".mae", 'w', "utf-8") mae_eval.write(str(mae) + "\n") rmse_eval = codecs.open(eval_path + ".rmse", 'w', "utf-8") rmse_eval.write(str(rmse) + "\n") mu = m * np.ones(y_test.shape[0]) # baseline on test set maeB = mean_absolute_error(y_test, mu) rmseB = np.sqrt(mean_squared_error(y_test, mu)) print "Test MAE Baseline= %2.8f" % maeB print "Test RMSE Baseline= %2.8f" % rmseB mae_eval = codecs.open(eval_path + ".mae.Base", 'w', "utf-8") mae_eval.write(str(maeB) + "\n") rmse_eval = codecs.open(eval_path + ".rmse.Base", 'w', "utf-8") rmse_eval.write(str(rmseB) + "\n") # saves predictions buildDir(testFX + "/predictions/") preds_path = os.sep.join([testFX, "predictions", os.path.basename( args.training_data)]) + featsel_str + "--" + os.path.basename( test_label) + ".preds" np.savetxt(preds_path, y_pred, fmt="%2.15f")
from sklearn.linear_model import RandomizedLasso import csv data = [] mark = [] name = [] with open('/Users/hhy/Desktop/test.csv', 'r', encoding='utf-8_sig') as f: csv_reader = csv.reader(f) for x in csv_reader: data.append(list(map(int, x[0:-1]))) mark.append(int(x[-1])) with open('/Users/hhy/Desktop/feature.csv', 'r', encoding='utf-8_sig') as f: csv_reader = csv.reader(f) for x in csv_reader: name.append(x[-1]) rlasso = RandomizedLasso() rlasso.fit(data, mark) print("Features sorted by their score:") print( sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), name), reverse=True)) #rfe from sklearn import cross_validation from sklearn.linear_model import LinearRegression from sklearn.linear_model.logistic import LogisticRegression import csv data = [] mark = [] name = [] with open('/Users/hhy/Desktop/test.csv', 'r', encoding='utf-8_sig') as f: csv_reader = csv.reader(f)
def stability(features, labels): labels = labels.flatten() rlasso = RandomizedLasso(alpha=0.025) rlasso.fit(features, labels) return rlasso.scores_
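The `stability` helper above returns bare scores; pairing them with column names makes the output easier to read. A small sketch, assuming `features` is a pandas DataFrame (the snippet itself does not show its type):

scores = stability(features, labels)
ranked = sorted(zip(scores, features.columns), reverse=True)
for score, name in ranked:
    print("%s: %.4f" % (name, score))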
class LinearAll: """ A repertoire of Linear Variable Selection and Prediction Models Parameters ---------- n_jobs : int, optional Number of jobs to run in parallel (default 1). If -1 all CPUs are used. This will only provide speedup for n_targets > 1 and sufficient large problems pre_dispatch : int, or string, optional Controls the number of jobs that get dispatched during parallel execution. Reducing this number can be useful to avoid an explosion of memory consumption when more jobs get dispatched than CPUs can process. This parameter can be: None, in which case all the jobs are immediately created and spawned. Use this for lightweight and fast-running jobs, to avoid delays due to on-demand spawning of the jobs An int, giving the exact number of total jobs that are spawned A string, giving an expression as a function of n_jobs, as in ‘2*n_jobs’ refit : boolean Refit the best estimator with the entire dataset. If “False”, it is impossible to make predictions using this GridSearchCV instance after fitting. iid : boolean, optional If True, the data is assumed to be identically distributed across the folds, and the score is computed from all samples individually, and not the mean loss across the folds. (If the number of data points is the same across folds, either returns the same thing) Attributes ---------- ols_train, predictions models before variable selection predictions models after variable selection """ def __init__ (self, cv=20, scoring = 'mean_squared_error', n_jobs=1, refit=False, iid=False, pre_pred=True, param_ridge_post=list(np.arange(1,3,0.1)), rlasso_selection_threshold = 0.5): #self.__name__ = '__main__' """ CAUTION: we changed to __main__ so that parallelization works """ self.cv = cv self.scoring = scoring self.n_jobs = n_jobs self.refit = refit self.iid = iid self.pre_pred =pre_pred self.param_ridge_post = param_ridge_post self.rlasso_selection_threshold = rlasso_selection_threshold def run_models(self, X, y, param_ridge): """ Prediction Models. OLS, PLS, Ridge """ ################################## ## OLS CV ################################## #ols = linear_model.LinearRegression(fit_intercept=True, # normalize=False, # copy_X=True) #ols_cv_score = cross_validation.cross_val_score( # ols, X, y, # cv=self.cv, scoring=self.scoring, # n_jobs=self.n_jobs) """ self.ols_cv_score.shape = (cv,) """ ################################## ## PLS CV ################################## tuned_parameters = [{'n_components': range(1, 5)}] pls = PLSRegression() pls_cv = GridSearchCV(pls, tuned_parameters, cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs, refit=self.refit, iid=self.iid) pls_cv.fit(X, y) ################################## ## Ridge CV ################################## tuned_parameters = [{'alpha': param_ridge}] ridge = linear_model.Ridge(alpha = 1) ridge_cv = GridSearchCV(ridge, tuned_parameters, cv=self.cv, scoring=self.scoring, n_jobs=self.n_jobs, refit=self.refit, iid=self.iid) ridge_cv.fit(X, y) return (pls_cv, ridge_cv) def fit(self, X, y): """ Variable Selection and Prediction. Variable Selection Model: lasso Prediction Models: see self.predict() Parameters ---------- X : numpy array or sparse matrix of shape [n_samples,n_features] Training data y : numpy array of shape [n_samples, n_targets] Target values Returns ------- self : returns an instance of self. 
""" ################################## ## OLS Train ################################## #ols_train = linear_model.LinearRegression(fit_intercept=True, # normalize=False, # copy_X=True) #ols_train.fit(X, y) #self.rss_ols_train = np.sum((ols_train.predict(X) - y) ** 2) """ fit_intercept=True, center the data copy=True, because centering data invovles X -= X_mean CAUTION: normalization=False, otherwise involves taking squares of X, lose precision self.rss_ols_train.shape = (1,1) """ ################################## ## Pre Variable Selection Predictions ################################## self.pre_pred = False if self.pre_pred: print "Computing ... " param_ridge_pre = list(np.arange(1e9,2e9,1e8)) self.pls_pre, self.ridge_pre = \ self.run_models(X, y, param_ridge_pre) ################################## ## Lasso Variable Selection ################################## self.lasso_cv = LassoLarsCV(fit_intercept=True, normalize=True, precompute='auto', max_iter=X.shape[1]+1000, max_n_alphas=X.shape[1]+1000, eps= 2.2204460492503131e-16,copy_X=True, cv=self.cv, n_jobs=self.n_jobs) self.lasso_cv.fit(X, y) """ normalize=True, lasso seems to be able to handle itself """ if self.rlasso_selection_threshold == 0: self.lasso_refit = linear_model.LassoLars(alpha=self.lasso_cv.alpha_, fit_intercept=True, normalize=True, precompute='auto', max_iter=X.shape[1]+1000, eps=2.2204460492503131e-16, copy_X=True, fit_path=False) self.lasso_refit.fit(X, y) self.active = self.lasso_refit.coef_ != 0 self.active = self.active[0,:] X_selected = X[:, self.active] else: self.rlasso = RandomizedLasso(alpha=self.lasso_cv.alpha_, scaling=0.5, sample_fraction=0.75, n_resampling=200, selection_threshold=self.rlasso_selection_threshold, fit_intercept=True, verbose=False, normalize=True, precompute='auto', max_iter=500, eps=2.2204460492503131e-16, random_state=None, n_jobs=self.n_jobs, pre_dispatch='3*n_jobs',) self.rlasso.fit(X, y) X_selected = self.rlasso.transform(X) ################################## ## Post Variable Selection Predictions ################################## self.pls_post, self.ridge_post = \ self.run_models(X_selected, y, self.param_ridge_post) return self def predict(self, X_test): assert(self.refit == True) if self.pls_post.best_score_ > self.ridge_post.best_score_: self.best_model = self.pls_post print "Chosen Model: pls" else: self.best_model = self.ridge_post print "Chosen Model: ridge" if self.rlasso_selection_threshold == 0: X_test_selected = X_test[:, self.active] else: X_test_selected = self.rlasso.transform(X_test) return self.best_model.best_estimator_.predict(X_test_selected)
if attribute == "_all":
    continue
else:
    # select the columns containing the attribute
    attribute_columns = filter(lambda x: re.search(attribute, x),
                               data.iloc[:, 10:].columns)
    X = data[attribute_columns[:20]]  # use only 20 mode parameters
    remove_highly_correlated(X, threshold=0.98)
    print(X.columns.values)
    list_dicts = list()
    for train_index, test_index in skf:
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train.shape)
        if feature_selection == "randomized_lasso":
            feature_selector = RandomizedLasso(sample_fraction=0.5, n_resampling=50,
                                               verbose=False, n_jobs=-1)
        elif feature_selection == "RFECV_linearSVM":
            # print(feature_selection % "selected")
            feature_selector = RFECV(SVC(kernel="linear"), step=1,
                                     cv=StratifiedKFold(y, 5), scoring="accuracy")
        else:
            print("Options are: randomized_lasso, RFECV_linearSVM")
        feature_selector.fit(X_train, y_train)
        result = {'X_train': X_train, 'y_train': y_train,
                  'X_test': X_test, 'y_test': y_test,
                  'feature_selector': feature_selector}
        list_dicts.append(result)
    dict_for_attribute[attribute] = list_dicts
print("done in %0.3fs" % (time() - t0))