def test_f_regression_input_dtype():
    # Test whether f_regression returns the same value
    # for any numeric data_type
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    # np.int / np.float were plain aliases of the builtins and were removed
    # in NumPy 1.24; the builtin types select the same dtypes.
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    assert_array_almost_equal(F1, F2, 5)
    assert_array_almost_equal(pv1, pv2, 5)
def test_f_regression_input_dtype():
    """
    Test whether f_regression returns the same value
    for any numeric data_type
    """
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    # The np.int / np.float aliases no longer exist in NumPy >= 1.24; the
    # builtin int / float map to the same dtypes.
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    assert_array_almost_equal(F1, F2, 5)
    assert_array_almost_equal(pv1, pv2, 5)
def score_features(X, y):
    """Run the univariate F-test of every column of ``X`` against ``y``.

    The value returned by ``f_regression`` is neither stored nor returned,
    so this function itself returns ``None``.  The list below is output
    recorded from an earlier run, as (F-score, feature name) pairs.
    """
    f_regression(X, y)
    # F-test
    # [(460.65647274586092, 'RM'),
    #  (668.7235158135851, 'LSTAT'),
    #  (179.57156528039974, 'PTRATIO'),
    #  (632.59314407749389, 'RM * LSTAT'),
    #  (1.7387467537056727, 'RM * PTRATIO'),
    #  (533.63937869220888, 'RM * RM'),
    #  (365.17851177390713, 'LSTAT * LSTAT'),
    #  (748.91444579648237, 'PTRATIO * LSTAT'),
    #  (186.86355836745318, 'PTRATIO * PTRATIO')]
def fit_model(self):
    """Train the review-score random forest and dump it to ``self.filename``.

    Reads ``data_wrangle/df_final.csv``, one-hot encodes the chord scale and
    key, keeps the features whose univariate F-test p-value against the
    ``overall`` score is below 0.05, fits a class-weighted
    ``RandomForestClassifier`` on an 80% training split and persists it with
    ``joblib``.

    Returns:
        The fitted classifier.
    """
    # read data from csv
    df = pd.read_csv('data_wrangle/df_final.csv')
    # drop reviews that have no score
    df = df.dropna(subset=['overall'])
    df['overall'] = df['overall'].astype(int)

    # create dummy variables for chords_scale and chords_key
    for source_col in ('chords_scale', 'chords_key'):
        df = pd.concat([df, pd.get_dummies(df[source_col])], axis=1)

    columns = ['spectral_complexity', 'average_loudness', 'dissonance',
               'pitch_salience', 'dynamic_complexity', 'tuning_frequency',
               'chords_strength', 'chords_changes_rate', 'bpm',
               'danceability', 'beats_count', 'length',
               'A', 'A#', 'B', 'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#',
               'minor', 'major']
    df = shuffle(df)
    y = df['overall']
    x = df[columns]

    # Inspect the per-feature p-values.  Restricting the model to the
    # significant ones gave roughly a 1% score improvement with logistic
    # regression and the random forest.
    # The F-test is run once; both outputs come from the same call.
    _, p_values = feature_selection.f_regression(x, y)
    good_cols = []
    for column, p_value in zip(columns, p_values):
        if p_value < 0.05:
            print(column, p_value)
            good_cols.append(column)

    X_train, _, y_train, _ = model_selection.train_test_split(
        df[good_cols], y, test_size=0.2)

    #### rfc with best params ####
    class_weight = {1: 0.996, 2: 0.001, 3: 0.001, 4: 0.0015, 5: 0.0005}
    # class_weight = {1: 0.70, 2: 0.15, 3: 0.1, 4: 0.049, 5: 0.001}
    rtc_best = RandomForestClassifier(
        class_weight=class_weight,
        bootstrap=True,
        criterion='gini',
        max_depth=30,
        # 'auto' meant 'sqrt' for classifiers and has been removed from
        # recent scikit-learn releases; 'sqrt' keeps the same behaviour.
        max_features='sqrt',
        min_samples_split=4,
        n_estimators=300,
    )

    ## export model ##
    model = rtc_best.fit(X_train, y_train)
    joblib.dump(model, self.filename)
    return model
def test_f_regression_center():
    # Check that f_regression adjusts the degrees of freedom according to
    # the 'center' argument.  Both variates are already centered, so the
    # centered and uncentered F-scores differ only by a known dof ratio.

    # Toy example: one feature with zero mean.
    X = np.arange(-5, 6).reshape(-1, 1)
    n_samples = X.size

    # Targets are -1 at even positions and +1 at odd ones; zeroing the first
    # entry makes the mean of Y null as well.
    Y = np.where(np.arange(n_samples) % 2 == 0, -1.0, 1.0)
    Y[0] = 0.0

    f_centered, _ = f_regression(X, Y, center=True)
    f_uncentered, _ = f_regression(X, Y, center=False)

    rescaled = f_centered * (n_samples - 1.0) / (n_samples - 2.0)
    assert_array_almost_equal(rescaled, f_uncentered)
    # Reference value obtained from statsmodels OLS.
    assert_almost_equal(f_uncentered[0], 0.232558139)
def test_f_regression_center():
    # f_regression must preserve the degrees of freedom implied by 'center'.
    # With a zero-mean feature and a zero-mean target, centering changes the
    # F-score only through the dof, which gives a closed-form relation
    # between the two scores.
    feature = np.arange(-5, 6).reshape(-1, 1)  # zero mean by construction
    n_samples = feature.size

    target = np.ones(n_samples)
    target[::2] *= -1.
    target[0] = 0.  # makes the target mean null

    scores = {
        flag: f_regression(feature, target, center=flag)[0]
        for flag in (True, False)
    }

    assert_array_almost_equal(
        scores[True] * (n_samples - 1.) / (n_samples - 2.), scores[False])
    assert_almost_equal(scores[False][0], 0.232558139)  # statsmodels OLS value
def interaction_feature(feature, target):
    """Add pairwise interaction columns and screen the base features.

    Three groups of columns of ``feature`` are expanded with
    ``PolynomialFeatures(degree=2, interaction_only=True)`` and the resulting
    interaction columns are appended to ``feature``.  The original columns
    are then scored against ``target`` with a univariate F-test.

    Returns:
        A 3-tuple ``(new_features, kept, unkept)`` where ``new_features`` is
        ``feature`` plus the interaction columns, ``kept`` is
        ``new_features`` restricted to the original columns with
        p-value < 0.05, and ``unkept`` is the score table (``f_score``,
        ``p_value``) of the original columns with p-value > 0.05.
    """
    poly = PolynomialFeatures(degree=2, interaction_only=True)

    # NOTE(review): with interaction_only=True no squared term is produced,
    # so the last two labels are presumably 'record*gb' and 'streak*gb'.
    # They are kept as-is because callers may select columns by these names.
    record_int_cols = [
        'div_rank*record', 'div_rank*streak', 'div_rank*gb',
        'record*streak', 'record*record', 'streak*record',
    ]
    record = pd.DataFrame(
        poly.fit_transform(feature[['div_rank', 'record', 'streak', 'gb']]),
        index=feature.index,
        columns=['bias', 'div_rank', 'record', 'streak', 'gb'] + record_int_cols)
    record_int = record[record_int_cols]

    run = pd.DataFrame(
        poly.fit_transform(feature[['runs', 'runs_ma', 'runs_pg']]),
        index=feature.index,
        columns=['bias', 'runs', 'runs_ma', 'runs_pg',
                 'runs*runs_ma', 'runs*runs_pg', 'runs_ma*runs_pg'])
    # The constant 'bias' column is carried into the output here, as before.
    runs_int = run[['bias', 'runs*runs_ma', 'runs*runs_pg', 'runs_ma*runs_pg']]

    # Named time_poly rather than `time` so the stdlib module is not shadowed.
    time_int_cols = ['time*innings', 'time*runs_allowed', 'innings*runs_allowed']
    time_poly = pd.DataFrame(
        poly.fit_transform(feature[['time', 'innings', 'runs_allowed']]),
        index=feature.index,
        columns=['1', 'time', 'innings', 'runs_allowed'] + time_int_cols)
    time_int = time_poly[time_int_cols]

    new_features = pd.concat([feature, time_int, runs_int, record_int], axis=1)

    # Run the F-test once and use both of its outputs.
    f_scores, p_values = f_regression(feature, target)
    feature_p = pd.DataFrame({'f_score': f_scores, 'p_value': p_values},
                             index=feature.columns)

    kept_features = feature_p[feature_p['p_value'] < 0.05].index
    unkept_features = feature_p[feature_p['p_value'] > 0.05]
    return new_features, new_features[kept_features], unkept_features
def uniFeatureReg(index=0, taskID='filesReg'):
    """Return the univariate F-test p-values of the encoded training data.

    The training frame and target come from ``dataEncoding(index, taskID)``.
    Every column whose dtype is not object is min-max scaled in place before
    the test is run.
    """
    _, train_frame, target = dataEncoding(index, taskID)
    for name in train_frame.columns:
        column = train_frame[name]
        if column.dtype == 'O':
            # Object columns are left untouched.
            continue
        train_frame[name] = pre.minmax_scale(column.astype('float'))
    _, p_values = f_regression(train_frame, target)
    return p_values
def run_linear_model(model_class, X_train, X_test, y_train, y_test,
                     num_folds=None, alpha=None, l1_ratio=None,
                     print_results=False):
    """Fit a linear model and record its statistics in ``model_stats_dict``.

    The kind of model is inferred from the module that defines
    ``model_class`` (OLS, ridge, lasso or elastic net).  Training, optional
    cross-validation and test statistics are written into the module-level
    ``model_stats_dict``.

    Args:
        model_class: Estimator class to instantiate.
        X_train, X_test: Feature frames (``X_train.columns`` is read).
        y_train, y_test: Targets.
        num_folds: Number of CV folds; cross-validation is skipped if falsy.
        alpha: Regularisation strength for ridge / lasso / elastic net.
        l1_ratio: When given with a coordinate-descent model, selects
            elastic net instead of lasso.
        print_results: Whether to print the prediction metrics.

    Returns:
        ``None``.  An unrecognised ``model_class`` is reported and skipped.
    """
    # scikit-learn >= 0.22 prefixes its private modules with an underscore
    # (``_base``, ``_ridge``, ``_coordinate_descent``); strip it so both the
    # old and the new layouts are recognised.
    module_name = model_class.__module__.split('.')[-1].lstrip('_')

    if module_name == 'base':
        model_stats_dict['model_type'] = 'OLS'
        model = model_class()
    elif module_name == 'ridge':
        model_stats_dict['model_type'] = 'ridge'
        model = model_class(alpha=alpha)
    elif module_name == 'coordinate_descent' and l1_ratio is None:
        model_stats_dict['model_type'] = 'lasso'
        model = model_class(alpha=alpha)
    elif module_name == 'coordinate_descent' and l1_ratio is not None:
        model_stats_dict['model_type'] = 'elastic_net'
        model = model_class(alpha=alpha, l1_ratio=l1_ratio)
    else:
        print(f'Unrecognized model class: {model_class}')
        return None

    # Train model
    model.fit(X_train, y_train)
    F, pval = f_regression(X_train, y_train)
    model_stats_dict['train_stats']['r-squared'] = [
        model.score(X_train, y_train)
    ]
    model_stats_dict['train_stats']['features'] = X_train.columns.tolist()
    model_stats_dict['train_stats']['coeffs'] = model.coef_
    model_stats_dict['train_stats']['F-stat'] = F
    model_stats_dict['train_stats']['pval'] = pval

    # Perform cross-validation
    if num_folds:
        cv_scores = cross_val_score(model, X_train, y_train, cv=num_folds)
        model_stats_dict['cv_stats']['mean_r-squared'] = [np.mean(cv_scores)]

    # Evaluate predictions on test set
    y_pred = model.predict(X_test)

    # Save model datasets
    model_stats_dict['test_data']['y_test'] = y_test
    model_stats_dict['test_data']['y_pred'] = y_pred

    # Save model statistics
    model_stats_dict['test_stats']['r-squared'] = [model.score(X_test, y_test)]
    model_stats_dict['test_stats']['mse'] = [mse(y_test, y_pred)]
    model_stats_dict['test_stats']['rmse'] = [rmse(y_test, y_pred)]
    model_stats_dict['test_stats']['mae'] = [mae(y_test, y_pred)]
    model_stats_dict['test_stats']['mape'] = [mape(y_test, y_pred)]

    if print_results:
        print_prediction_metrics(y_test, y_pred)
def fun(File1, File2, File3, File4, File5):
    """Regress the year on each pixel's value and write R2 / p-value rasters.

    One raster per year (1980..2018) is read from
    ``<base dir> + File2 + <year> + File3``.  For every pixel a
    ``LinearRegression`` is fitted with the pixel's yearly values as the
    single feature and the year as the target; the R2 score is written via
    ``Write(File4, ...)`` and the F-test p-value via ``Write(File5, ...)``.
    ``File1`` is the raster that supplies the image size and band datatype.
    The elapsed time in seconds is printed at the end.
    """
    start = time.time()
    file1 = r'E:/study/资料/数据/'
    ds = gdal.Open(file1 + File1)  # open the reference raster
    im_width = ds.RasterXSize  # number of columns
    im_height = ds.RasterYSize  # number of rows
    img_datatype = ds.GetRasterBand(1).DataType

    # The year axis is derived from the same range that drives the loading
    # loop.  The previous np.linspace(1980, 2019, 39) produced non-integer
    # "years" ending at 2019 although the last raster loaded is 2018.
    years = range(1980, 2019)
    y_data = np.array(years, dtype=float)

    data1 = np.full((len(years), im_height, im_width), 1.0)
    data3 = np.full((im_height, im_width), 1.0)  # R2 per pixel
    data4 = np.full((im_height, im_width), 1.0)  # p-value per pixel

    for layer, year in enumerate(years):
        ds = gdal.Open(file1 + File2 + str(year) + File3)
        data1[layer] = ds.ReadAsArray()  # read the whole image as an array

    for x in range(im_height):
        for y in range(im_width):
            x_data = data1[:, x, y].reshape(-1, 1)
            regr = LinearRegression()
            regr.fit(x_data, y_data)
            data3[x][y] = r2_score(y_data, regr.predict(x_data))
            data4[x][y] = f_regression(x_data, y_data)[1][0]

    # `ds` is the last yearly raster opened; it is passed on to Write.
    Write(File4, data3, ds, img_datatype)
    Write(File5, data4, ds, img_datatype)
    end = time.time()
    print(end - start)
def f_regression_check(self, X, Y, names):
    """Rank the features named ``names`` by their centered F-score.

    NaN F-scores are replaced with 0 before the scores are handed to
    ``rank_to_dict``.  Progress messages, including the current process id,
    are printed before and after the computation.
    """
    print ('calc f_regression importance {}'.format(os.getpid()))
    scores, _ = f_regression(X, Y, center=True)
    scores = np.where(np.isnan(scores), 0, scores)
    print ('calc f_regression finished !')
    return rank_to_dict(scores, names)
def printMetrics(self):
    """Print evaluation metrics for the fitted model.

    Reports the feature names, the test-set mean squared error and explained
    variance, then the max-normalised mutual information and the centered
    F-test scores and p-values computed on the training set, and finally the
    stored R2 score.
    """
    rule = "================================================="

    print()
    print(rule)
    print("=========== METRICS =============================")
    print("Features (put in Pandas df): ", self.feature_names)

    test_mse = mean_squared_error(self.y_test, self.y_pred)
    print('Mean squared error: %.2f' % test_mse)
    test_evs = explained_variance_score(self.y_test, self.y_pred)
    print("Explained variance score: ", test_evs)

    mutual_info = mutual_info_regression(self.x_train, self.y_train)
    mutual_info = mutual_info / np.max(mutual_info)
    # The F-scores are reported raw (not normalised), with centering enabled.
    f_scores, p_values = f_regression(self.x_train, self.y_train, center=True)

    print("Mutual Information: ", mutual_info)
    print("f_regression: ", f_scores)
    print("pval: ", p_values)
    print("R2 score: ", self.r2_score)
    print("=========== END METRICS =========================")
    print(rule)
    print()
def run(names, X, Y, filepath):
    """Worker loop that computes feature rankings taken from a shared queue.

    Each queue item is a ``(key, model, coef, order)`` tuple.  For the key
    ``"Corr."`` the ranking is the normalised centered F-score of ``X``
    against ``Y``; for any other key ``model`` is fitted and the absolute
    value of its attribute named ``coef`` is normalised.  Results accumulate
    in ``FeatureProfile.ranking`` and are saved to a per-thread cache file.

    The loop never terminates on its own; presumably it runs in a daemon
    thread -- TODO confirm against the caller.
    """
    # One cache file per worker thread: "<name>.pkl" -> "<name>_<thread>.pkl".
    filepath = filepath.replace(".pkl", "_{}.pkl".format(current_thread().name))

    def glob_ranking(folder):
        # Merge every cache file whose path matches "<folder>*.pkl".
        ranking = {}
        for f in glob.iglob("{}*.pkl".format(folder)):
            ranking.update(load_cache(f))
        return ranking

    while True:
        timestamp_start = time.time()
        # Re-read the caches each iteration so finished keys are skipped.
        done_ranking = glob_ranking(filepath)
        key, model, coef, order = FeatureProfile.queue.get()
        if key in done_ranking:
            log("{} was done before".format(key), INFO)
        else:
            if key == "Corr.":
                f, pval = f_regression(X, Y, center=True)
                FeatureProfile.ranking[key] = FeatureProfile.normalization(f, names)
            else:
                model.fit(X, Y)
                FeatureProfile.ranking[key] = FeatureProfile.normalization(np.abs(getattr(model, coef)), names, order)
            # Save a deep copy of the shared ranking dict.
            save_cache(copy.deepcopy(FeatureProfile.ranking), filepath)
        timestamp_end = time.time()
        log("Cost {:.4f} secends to finish {}".format(time.time() - timestamp_start, key), INFO)
        FeatureProfile.queue.task_done()
def f_test(self, X, y, ci=0.9):
    """Return the significant columns of X in a linear regression against y.

    Each column of ``X`` is tested on its own with a univariate F-test and is
    kept when its p-value is below ``1 - ci``.

    Args:
        X: Data frame of candidate features.
        y: Data frame holding the target.
        ci: Confidence level; the significance threshold is ``1 - ci``.

    Returns:
        A data frame with one row per significant column and the columns
        ``feature``, ``p-value``, ``F score`` and ``r2`` (the R2 of a
        single-feature linear fit).
    """
    sig_cols = []
    pvals = []
    fscores = []
    r2_scores = []
    lm = linear_model.LinearRegression()
    for f in X.columns:
        # f_regression needs a 2-D feature matrix; X[f] would be a 1-D
        # Series, so select the column as a one-column frame.
        single = X[[f]]
        f_score, p_value = feature_selection.f_regression(single, y)
        if p_value[0] < (1 - ci):
            sig_cols.append(f)
            fscores.append(f_score[0])
            pvals.append(p_value[0])
            lm.fit(single, y)
            r2_scores.append(metrics.r2_score(y, lm.predict(single)))
    return pd.DataFrame({
        'feature': sig_cols,
        'p-value': pvals,
        'F score': fscores,
        'r2': r2_scores
    })
def best_features(n_features, X_train, y_train, X_test, y_test):
    """Score a linear model restricted to the ``n_features`` top F-score features.

    Returns the ``LinearRegression.score`` on the test split.

    NOTE(review): the F-scores are computed on the module-level ``train``
    frame (``continuous_cols`` against ``insurance_loss``), not on
    ``X_train`` / ``y_train``; the resulting positions are then applied to
    the columns of ``X_train`` and ``X_test`` -- confirm that the column
    orders agree.
    """
    f_scores, _ = f_regression(train[continuous_cols], train[insurance_loss])
    # Positions of the largest F-scores, best first.
    top_idx = np.argsort(-f_scores)[:n_features]

    train_subset = X_train.iloc[:, top_idx]
    test_subset = X_test.iloc[:, top_idx]

    model = LinearRegression()
    model.fit(train_subset, y_train)
    return model.score(test_subset, y_test)
def regression_feature_selection(train_features, train_labels, test_features, percent):
    """Keep the features with the highest mean F-score over all targets.

    Args:
        train_features: 2-D array, samples x features.
        train_labels: 2-D array, samples x targets.
        test_features: 2-D array with the same columns as ``train_features``.
        percent: Divisor of the feature count; ``int(n_features / percent)``
            features are kept.

    Returns:
        ``(new_train_features, new_test_features)`` as float arrays holding
        only the selected columns, ordered by ascending score.
    """
    n_features = train_features.shape[1]
    n_targets = train_labels.shape[1]

    ff = np.zeros((n_features, n_targets))
    for p in range(n_targets):
        ff[:, p], _ = f_regression(train_features, train_labels[:, p])
    ff = np.nanmean(ff, axis=1)

    # argsort places NaN last, which would rank features whose score is NaN
    # for every target as the *best* ones.  Rank them as the worst instead.
    ranking_scores = np.where(np.isnan(ff), -np.inf, ff)

    n_keep = int(n_features / percent)
    # When n_keep is 0 the slice [-0:] keeps every feature, as before.
    features_to_keep = np.argsort(ranking_scores)[-n_keep:]
    print(len(features_to_keep))

    # Fancy indexing replaces the column-by-column copy; the result stays a
    # fresh float64 array as before.
    new_train_features = train_features[:, features_to_keep].astype(np.float64)
    new_test_features = test_features[:, features_to_keep].astype(np.float64)
    return new_train_features, new_test_features
def correlations(self):
    """Print univariate F-test scores of movie attributes against box office.

    Reads the CSV at ``imdb_csv_path``, scores ``Year`` directly and every
    other entry of ``params`` except the last (``BoxOffice``) through the
    ``one_hot`` helper, printing the results.  Nothing is returned.
    """
    params = [
        "Title", "Year", "Genre", "Director", "Writer", "Actors", "Language",
        "Country", "Runtime", "BoxOffice"
    ]
    df = pd.DataFrame(pd.read_csv(imdb_csv_path))
    year = np.array(list(map(float, df['Year']))).reshape(-1, 1)
    boxoffice = np.array(list(map(float, df['BoxOffice'])))
    year_score = feature_selection.f_regression(year, boxoffice)
    print(year_score)

    def one_hot(string, boxoffice):
        # Score column `string` of df against `boxoffice`.
        enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
        # Fit the encoder on the distinct values of the column.
        enc.fit(np.array(list(set(df[string]))).reshape(-1, 1))
        try:
            feature = enc.transform(np.array(df[string]).reshape(
                -1, 1)).toarray()
        except ValueError:
            # Report the offending column name and terminate the program.
            print(string)
            exit()
        # Collapse each one-hot row into one string made of the digits found
        # in the row's str() representation, giving a single feature column.
        # NOTE(review): str() of a long array may be abbreviated by NumPy,
        # in which case digits would be lost -- verify.
        feature = np.array([
            ''.join(re.findall(r'\d+', g)) for g in list(map(str, feature))
        ]).reshape(-1, 1)
        score = feature_selection.f_regression(feature, boxoffice)
        return score

    # cal_scores
    score = dict()
    for p in params[:-1]:
        score[p] = one_hot(p, boxoffice)
    print(score)
def find_next_best(dt_kmer, y, selected_kmers, to_be_selected_kmers, consider_shift=True):
    """
    Stepwise model selection that avoids adding a motif similar to the ones
    already selected.

    At each step the candidate with the smallest F-test p-value is moved
    from ``to_be_selected_kmers`` (popped in place from the list passed in)
    to ``selected_kmers``.  Candidates too close to the chosen k-mer are
    discarded, the selected features are regressed out of the target, and
    the search continues on the residual until no candidate is left.
    Returns ``selected_kmers``.
    """
    def is_distinct(candidate, chosen):
        # A candidate is rejected when it is within Hamming distance 1 of
        # the chosen k-mer or, if shifts are considered, when it equals the
        # chosen k-mer shifted by one position in either direction.
        if hamming_distance(candidate, chosen) <= 1:
            return False
        if consider_shift:
            if hamming_distance(candidate[1:], chosen[:-1]) == 0:
                return False
            if hamming_distance(candidate[:-1], chosen[1:]) == 0:
                return False
        return True

    candidates = to_be_selected_kmers
    residual = y
    while True:
        _, pval = f_regression(dt_kmer[candidates], residual)
        best = candidates.pop(pval.argmin())
        selected_kmers.append(best)

        candidates = [ckmer for ckmer in candidates if is_distinct(ckmer, best)]
        if not candidates:
            return selected_kmers

        # regress out the new feature
        lm = LinearRegression()
        lm.fit(dt_kmer[selected_kmers], residual)
        residual = residual - lm.predict(dt_kmer[selected_kmers])
def linreg_evaluate_model(X: pd.DataFrame, y: pd.Series, y_pred: np.ndarray) -> None:
    """
    Print evaluation metrics for a linear regression model.

    Args:
        X: Feature frame used for the fit.
        y: Target, either a ``pd.Series`` or a one-column ``pd.DataFrame``.
        y_pred: Predictions for the rows of ``X``.
    """
    # A Series has no ``.columns``; fall back to its name so that both the
    # annotated Series and a one-column frame are accepted.
    y_label = y.columns[0] if hasattr(y, "columns") else y.name
    X_labels = X.columns

    print("Univariate Linear Regression Model Evaluation")

    meanse = mean_squared_error(y, y_pred)
    print(f"\tMean SE: {meanse:.3f}")

    meanae = mean_absolute_error(y, y_pred)
    print(f"\tMean AE: {meanae:.3f}")
    print()

    medianae = median_absolute_error(y, y_pred)
    print(f"\tMedian AE: {medianae:.3f}")
    print()

    r2 = r2_score(y, y_pred)
    print(f"\t{r2:.2%} of the variance in {y_label} can be explained "
          f"by {X_labels.tolist()}.")
    print()

    print("P-VALUE")
    # Only the p-value of the first feature is reported.
    _, p_vals = f_regression(X, y)
    print(f"\tTrain: {p_vals[0]:.3}")
def select(self, dataframe: 'pd.DataFrame', y_column: str) -> list:
    '''
    Select the most important columns by backward elimination.

    On every iteration the remaining features are scored against the target
    (``f_classif`` when ``self.classification`` is set, ``f_regression``
    otherwise).  While the largest p-value exceeds
    ``self.significance_level``, every feature holding that p-value is
    dropped.  Per-iteration scores are recorded in ``self.F_history`` and
    ``self.p_value_history``; features already dropped get ``-1``.

    :param dataframe: pandas DataFrame
        Data frame on which the algorithm is applied
    :param y_column: str
        The column name of the value that we want to predict
    :return: list
        The list of features that are selected by the algorithm as the best ones
    '''
    # Names of all columns except the predicted one
    X_columns = [col for col in dataframe.columns if col != y_column]

    # F and p-value history dictionaries, one list per feature
    self.F_history = {col: [] for col in X_columns}
    self.p_value_history = {col: [] for col in X_columns}

    # Feature states: 1 = still selected, 0 = eliminated
    feature_state = list(np.ones(len(X_columns)))

    while True:
        # Extracting the selected columns
        X_cols = self.bin_to_cols(feature_state, X_columns)
        if len(X_cols) == 0:
            # Every feature has been eliminated; the F-test cannot be run
            # on an empty matrix, so stop here instead of raising.
            break
        self.iter += 1

        X = dataframe[X_cols].values
        y = dataframe[y_column].values

        # Different test depending on whether it is classification or regression.
        if self.classification:
            F_vals, p_vals = f_classif(X, y)
        else:
            F_vals, p_vals = f_regression(X, y)

        # `index` walks the score arrays, which only cover the active columns.
        index = 0
        for col in X_columns:
            if col in X_cols:
                self.F_history[col].append(float(F_vals[index]))
                self.p_value_history[col].append(float(p_vals[index]))
                index += 1
            else:
                self.F_history[col].append(-1)
                self.p_value_history[col].append(-1)

        # Largest p-value among the active features
        max_PValue = max(p_vals)

        # Stop once even the weakest feature is significant.
        if max_PValue <= self.significance_level:
            break

        # Drop every column whose p-value equals the maximum.
        for j, col in enumerate(X_cols):
            if p_vals[j].astype(float) == max_PValue:
                feature_state[X_columns.index(col)] = 0

    # Returning the chosen columns.
    self.choosed_cols = self.bin_to_cols(feature_state, X_columns)
    return self.choosed_cols
def work_sequence(self):
    """Yield one zero-argument work item per left-out chromosome.

    The SNP data and phenotype are intersected once and a centered linear
    F-test is run on all SNPs.  For every leave-one-chromosome-out split a
    callable is yielded that invokes ``self.dowork`` with the split; only
    the first work item carries the result skeleton (placeholder p-values of
    -1, the linear p-values, SNP names and positions).

    Raises:
        Exception: If the number of chromosomes found differs from
            ``self.chrom_count``.
    """
    # is it OK to do the intersect and the linear regression 23 extra times?
    G, y, snp_name, _ = load_intersect(self.snp_reader, self.pheno_fn)

    # compute linear regression
    _, p_values_lin = f_regression(G, y, center=True)

    # get chr names/id
    chr_ids = self.snp_reader.pos[:, 0]

    loco = LeaveOneChromosomeOut(chr_ids, indices=True)

    # Compare by value: `is not` tests object identity and only happens to
    # work for small integers cached by CPython.
    if len(loco) != self.chrom_count:
        raise Exception("The snp reader has {0} chromosome, not {1} as specified".format(len(loco), self.chrom_count))

    for i, (train_snp_idx, test_snp_idx) in enumerate(loco):
        if i == 0:
            result = {"p_values": -np.ones(len(snp_name)),
                      "p_values_lin": p_values_lin,
                      "rs": snp_name,
                      "pos": self.snp_reader.pos}
        else:
            result = None
        # Default arguments bind the current loop values; a plain closure
        # would see only the values of the last iteration.
        yield lambda i=i, train_snp_idx=train_snp_idx, test_snp_idx=test_snp_idx, result=result, G=G, y=y: self.dowork(i, train_snp_idx, test_snp_idx, result, G, y)
def summary(X_vars, y_var):
    """Print several univariate relevance measures for the columns of X_vars.

    Printed, in order: the column names, max-normalised F-test scores, their
    p-values, the square roots of the normalised F-scores (labelled
    "t-test") with the same p-values, max-normalised mutual information, and
    the columns retained by an L1-penalised ``LinearSVC``.

    Mutual information (MI) between two random variables is a non-negative
    value which measures the dependency between the variables.  It is equal
    to zero if and only if the two variables are independent, and higher
    values mean higher dependency.
    """
    cols = X_vars.columns.tolist()

    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_vars, y_var)
    model = SelectFromModel(lsvc, prefit=True)
    # The former `if x` filter silently dropped column index 0 because 0 is
    # falsy; every supported index is a selected feature.
    labels = [cols[x] for x in model.get_support(indices=True)]

    mi = mutual_info_regression(X_vars, y_var)
    if np.max(mi) != 0:
        mi /= np.max(mi)

    f_test, f_test_pvals = f_regression(X_vars, y_var)
    if np.max(f_test) != 0:
        f_test /= np.max(f_test)
    t_test = [math.sqrt(elem) for elem in f_test]

    print(cols)
    print("F-test")
    print(list(f_test))
    print("F-test pvals")
    print(list(f_test_pvals))
    print("t-test")
    print(list(t_test))
    print("t-test pvals")
    print(list(f_test_pvals))
    print("mutual information")
    print(list(mi))
    print("Linear SVC Feature reduction")
    print(cols, "->", labels)
def fit(self, X, y=None):
    """Pick, for every column, the transform with the smallest F-test p-value.

    Only the first ``self.float_k`` columns of ``X`` are considered when
    ``self.float_k`` is set.  Each function in ``self.transform_funcs`` is
    applied to those columns and scored with ``f_regression`` against ``y``.
    If scoring a transform fails, the error is logged and that transform
    gets a p-value of 10000 for every column, so it is never preferred.

    Sets ``self.k_`` (number of columns considered) and
    ``self.bestTlist`` / ``self.best_T_`` (name of the winning transform per
    column).  Returns ``self``.
    """
    Xn = X[:, :self.float_k] if self.float_k is not None else X
    self.k_ = Xn.shape[1]
    self.logger.info(f'Xn.shape:{Xn.shape},Xn:{Xn}')

    pvals = []
    for fn in self.transform_funcs.values():
        TXn = fn(Xn)
        try:
            _, p = f_regression(TXn, y)
        except Exception:
            # `except Exception` rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.logger.exception('error doing f_regression')
            p = np.array([10000.] * TXn.shape[1])
        pvals.append(p[None, :])

    pval_stack = np.concatenate(pvals, axis=0)  # each row is a transform
    # First row of the column-wise argsort: index of the smallest p-value
    # per column (NaN sorts last).
    bestTloc = np.argsort(pval_stack, axis=0)[0, :]

    transform_names = list(self.transform_funcs.keys())
    self.bestTlist = [transform_names[i] for i in bestTloc]
    self.logger.info(f'bestTlist:{self.bestTlist},')
    # Same content under the second attribute name, as a separate list.
    self.best_T_ = list(self.bestTlist)
    return self
def SignificanceMatrix(data):
    """Build a symmetric matrix of pairwise association p-values.

    The test used for a pair depends on the column types reported by
    ``check_type``: ``f_regression`` for two continuous columns,
    ``chisq_independence`` for two columns of the same non-continuous type,
    and ``f_classif`` (continuous column as the feature) for a mixed pair.
    The diagonal is 1.  Rows with a missing value in either column are
    dropped for that pair.  Missing entries of the result are replaced by
    the string "NAN".
    """
    col = data.columns
    colTypes = [check_type(dtype) for dtype in data.dtypes]
    n_cols = len(col)

    relationMatrix = pd.DataFrame(index=col, columns=col)
    for i in range(n_cols):
        name_i, type_i = col[i], colTypes[i]
        relationMatrix.loc[name_i, name_i] = 1

        for j in range(i + 1, n_cols):
            name_j, type_j = col[j], colTypes[j]

            # TODO: warn when rows with missing data are removed here.
            pair = data[[name_i, name_j]].dropna(axis=0)
            first = pair[name_i]
            second = pair[name_j].ravel()

            if type_i == type_j:
                if type_i == "continuous":
                    scores = feature_selection.f_regression(pd.DataFrame(first), second)
                    pval = np.round(scores[1][0], 3)
                else:
                    pval = chisq_independence(pair[name_i], pair[name_j])
            elif type_i == "continuous":
                scores = feature_selection.f_classif(pd.DataFrame(first), second)
                pval = np.round(scores[1][0], 3)
            else:
                scores = feature_selection.f_classif(pd.DataFrame(second), first)
                pval = np.round(scores[1][0], 3)

            relationMatrix.loc[name_i, name_j] = pval
            relationMatrix.loc[name_j, name_i] = pval
    return relationMatrix.fillna("NAN")
def linear_trend(datay, *datax):
    """
    Fit a linear regression of ``datay`` on the ``datax`` variables.

    :param datay: 1-D array of target values
    :param datax: one array per explanatory variable
    :return: array holding the regression coefficients followed by the R2
        score and the significance level (F-test p-value of ``datay``
        against the fitted values).  If any input contains NaN, an all-NaN
        array of the same length (``len(datax) + 2``) is returned.
    """
    # Stack the explanatory variables side by side as columns.
    x_data = np.hstack([block.reshape(-1, 1) for block in datax])

    x_has_nan = np.isnan(x_data).any()
    y_has_nan = np.isnan(datay).any()
    if x_has_nan or y_has_nan:
        return np.full(len(datax) + 2, np.nan)

    regr = LinearRegression().fit(x_data, datay)
    y_pred = regr.predict(x_data)
    r2score = r2_score(datay, y_pred)
    pvalue = f_regression(datay.reshape(-1, 1), y_pred)[1][0]

    return np.array(list(regr.coef_) + [r2score, pvalue])
def _select_k_best_inorder(self, k_best, X_train, y_train, X_pred, ar_cutoff=52):
    """Keep the ``k_best`` columns with the highest F-score, best first.

    When ``ar_cutoff`` is not ``None`` the first ``ar_cutoff`` columns are
    excluded from the ranking and always kept, in their original order, in
    front of the selected columns.

    Returns:
        ``(X_train_selected, X_pred_selected)`` with identical column
        selection and ordering.
    """
    if ar_cutoff is not None:
        X_train_select = X_train[:, ar_cutoff:]
        X_pred_select = X_pred[:, ar_cutoff:]
    else:
        X_train_select = X_train
        X_pred_select = X_pred

    f_scores, _ = f_regression(X_train_select, y_train)
    # argsort puts NaN last, so after the reversal NaN scores (e.g. constant
    # columns) would come out on top.  Rank them as the worst instead.
    ranking_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)
    k_best_f_scores = np.argsort(ranking_scores)[::-1][:k_best]

    X_train_best_sorted = X_train_select[:, k_best_f_scores]
    X_pred_best_sorted = X_pred_select[:, k_best_f_scores]

    if ar_cutoff is not None:
        X_train_best_sorted = np.hstack(
            (X_train[:, :ar_cutoff], X_train_best_sorted))
        X_pred_best_sorted = np.hstack(
            (X_pred[:, :ar_cutoff], X_pred_best_sorted))
    return X_train_best_sorted, X_pred_best_sorted
def f_regression_feature_analyse(_df, store_item_nbrs):
    """Plot the sample-weighted average F-test importance of each feature.

    For every (store, item) pair except store 35, the F-scores of that
    pair's rows are converted to importances with ``get_importance`` and
    accumulated, weighted by the number of rows.  The weighted mean is
    passed to ``draw_total_average_importance``.
    """
    df = _df.copy()
    p.figure()
    X, y = ut.get_train_data(df)

    # Skips the first two columns, presumably store_nbr and item_nbr, which
    # are dropped below -- TODO confirm the column order.
    feature_list = X.columns.values[2:]
    importance_value = np.zeros(len(feature_list))
    total = 0

    for sno, ino in store_item_nbrs:
        if sno == 35:
            continue

        pair_mask = (X.store_nbr == sno) & (X.item_nbr == ino)
        X_1 = X[pair_mask].drop(['store_nbr', 'item_nbr'], axis=1)
        y_1 = y[X_1.index.values]
        n_rows = len(X_1.index)

        F, _ = f_regression(X_1.values, y_1.values)
        importance = get_importance(np.nan_to_num(F))
        print(importance)
        # To draw the picture of each (sno, ino) pair, uncomment the line below.
        # draw_feature_importance(importance, feature_list, sno, ino)

        importance_value += n_rows * np.array(importance)
        total = total + n_rows

    print(importance_value)
    importance_value = importance_value / total
    draw_total_average_importance(importance_value, feature_list)
def example_one():
    """Compare the F-test and mutual information on a synthetic data set.

    Three uniform features are drawn with a fixed seed.  The target depends
    linearly on the first, sinusoidally on the second and not at all on the
    third, plus Gaussian noise.  Each feature is scattered against the
    target with its max-normalised F-score and MI in the title.

    Returns:
        The three feature columns followed by the target.
    """
    np.random.seed(0)
    X = np.random.rand(1000, 3)
    signal = X[:, 0] + np.sin(6 * np.pi * X[:, 1])
    y = signal + 0.1 * np.random.randn(1000)

    f_test, _ = f_regression(X, y)
    f_test = f_test / np.max(f_test)

    mi = mutual_info_regression(X, y)
    mi = mi / np.max(mi)

    plt.figure(figsize=(15, 5))
    for i, column in enumerate(X.T):
        plt.subplot(1, 3, i + 1)
        plt.scatter(column, y, edgecolor='black', s=20)
        plt.xlabel("$x_{}$".format(i + 1), fontsize=14)
        if i == 0:
            # Only the left-most panel carries the y label.
            plt.ylabel("$y$", fontsize=14)
        plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]),
                  fontsize=16)
    plt.show()

    return X[:, 0], X[:, 1], X[:, 2], y
def FilterInLoading(np_genotype, np_phenotype):
    """Count the genotype features that pass the low-quality variant filters.

    Two filters are applied in sequence:

    1. Variance: features whose variance does not exceed
       ``0.95 * (1 - 0.95)`` are removed with ``VarianceThreshold``.
    2. Association: a univariate F-test (``f_regression``) is run against
       the last column of ``np_phenotype``; a feature passes when
       ``-log10(p) > 2``, i.e. its p-value is below 0.01.

    Args:
        np_genotype (ndarray): 2D array containing genotype data
        np_phenotype (ndarray): 2D array containing phenotype data; only the
            last column is used

    Returns:
        int: the number of features that pass both filters, or 0 if any
        step raises.
    """
    try:
        ### variance check
        sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95)))
        np_genotype = sk_variance.fit_transform(np_genotype)

        ### f regression feature selection
        _, p_values = f_regression(np_genotype.astype(int),
                                   np_phenotype[:, -1].astype(float))
        np_selectedIdx = -np.log10(p_values) > 2
        return int(np.count_nonzero(np_selectedIdx))
    except Exception:
        # Best effort: any failure counts as "no usable feature".
        # `except Exception` keeps KeyboardInterrupt / SystemExit out of it.
        return 0
def forward_select(dataframe, threshold=10):
    """Select the features whose univariate F-score reaches ``threshold``.

    Every column of ``dataframe`` except ``label`` is scored against
    ``label`` with ``f_regression``; the names of the columns whose F-score
    is at least ``threshold`` are returned.

    NOTE: despite its name, this is a one-pass univariate filter.  It does
    not run the iterative forward feature selection procedure (start from
    the single best feature and keep adding the one that improves the model
    most until the gain is negligible), which is considerably more
    expensive and only practical for data sets with few input variables.

    Args:
        dataframe: Data frame with the feature columns and a ``label`` column.
        threshold: Minimum F-score for a feature to be kept.

    Returns:
        list: names of the selected feature columns.
    """
    from sklearn.feature_selection import f_regression

    # Score and name the features from the same frame, so the result does
    # not depend on an outer `df` variable or on `label` being last.
    features = dataframe.drop(columns='label')
    # f_regression returns the F-scores and the matching p-values.
    f_scores, _ = f_regression(features, dataframe.label)

    return [
        column
        for column, score in zip(features.columns, f_scores)
        if score >= threshold
    ]
def run_knn_regressor(X_train, X_test, y_train, y_test, k=5,
                      weights='uniform', print_results=False):
    """Fit a k-nearest-neighbours regressor and record its statistics.

    Training statistics (R2, feature names, univariate F-scores and
    p-values), the test targets and predictions, and the test metrics are
    written into the module-level ``model_stats_dict``.  The prediction
    metrics are also printed when ``print_results`` is true.
    """
    train_stats = model_stats_dict['train_stats']
    test_data = model_stats_dict['test_data']
    test_stats = model_stats_dict['test_stats']

    # Train the model
    regressor = KNeighborsRegressor(n_neighbors=k, weights=weights)
    regressor.fit(X_train, y_train)
    f_stat, p_values = f_regression(X_train, y_train)

    train_stats['r-squared'] = [regressor.score(X_train, y_train)]
    train_stats['features'] = X_train.columns.tolist()
    train_stats['F-stat'] = f_stat
    train_stats['pval'] = p_values

    # Evaluate predictions on the test set and keep the data sets
    y_pred = regressor.predict(X_test)
    test_data['y_test'] = y_test
    test_data['y_pred'] = y_pred

    # Test statistics, each stored as a one-element list
    test_stats['r-squared'] = [regressor.score(X_test, y_test)]
    test_stats['mse'] = [mse(y_test, y_pred)]
    test_stats['rmse'] = [rmse(y_test, y_pred)]
    test_stats['mae'] = [mae(y_test, y_pred)]
    test_stats['mape'] = [mape(y_test, y_pred)]

    if print_results:
        print_prediction_metrics(y_test, y_pred)
def linearCoe_hq(X, y, cut=0.05):
    """Return a boolean mask of the features significant in a univariate F-test.

    Entry ``i`` of the mask is True when the centered ``f_regression``
    p-value of column ``i`` of ``X`` is strictly below ``cut``.
    """
    from sklearn.feature_selection import f_regression

    _, p_values = f_regression(X, y, center=True)
    significant = p_values < cut
    return significant
def getModel(**args):
    """Assemble the regression pipeline.

    Keyword arguments read:
        none_var: when equal to True, an item without a context topic is
            encoded as ``{"none": 1}`` instead of an empty dict.
        smoother: constant added to the follower count before the log.
        all_K: ``k`` of the ``SelectKBest`` step.

    The pipeline unions content, formatting and follower features, adds a
    binarised k-means cluster label next to the untouched features, keeps
    the ``all_K`` best features by uncentered F-score and fits a
    ``LinearSVR``.
    """
    none_dict = {"none": 1} if args["none_var"] == True else {}

    def topic_names(x):
        return {t["name"]: 1 for t in x["topics"]}

    def context_topic_name(x):
        if x["context_topic"]:
            return {x["context_topic"]["name"]: 1}
        return none_dict

    def log_followers(x):
        total = sum(t["followers"] for t in x["topics"])
        return [math.log(total + args["smoother"])]

    def uncentered_f_regression(X, y):
        return f_regression(X, y, center=False)

    formatting = Pipeline([
        ("other", Extractor(getFormattingFeatures)),
        ("scaler", StandardScaler()),
    ])
    question = Pipeline([
        ("extract", Extractor(getFirstWordDict)),
        ("counter", DictVectorizer()),
    ])
    topics = Pipeline([
        ("extract", Extractor(topic_names)),
        ("counter", DictVectorizer()),
    ])
    ctopic = Pipeline([
        ("extract", Extractor(context_topic_name)),
        ("counter", DictVectorizer()),
    ])
    content_union = FeatureUnion([
        ("question", question),
        ("topics", topics),
        ("ctopic", ctopic),
    ])
    topic_question = Pipeline([("content", content_union)])

    # A scaled "anonymous" flag feature used to be built here; it is disabled.
    followers = Pipeline([
        ("extract", Extractor(log_followers)),
        ("scaler", StandardScaler()),
    ])

    k_means = KMeans(n_clusters=96, random_state=20, n_init=3, max_iter=8,
                     tol=1e-3)
    label_binarizer = LabelBinarizer(sparse_output=True)
    svr = LinearSVR(C=0.04, loss="squared_epsilon_insensitive")

    feature_union = FeatureUnion([
        ("content", topic_question),
        ("formatting", formatting),
        ("followers", followers),
    ])
    transductive = Pipeline([
        ("k_means", PredictTransformer(k_means)),
        ("label_binarizer", OmitTargetTransformer(label_binarizer)),
    ])
    cluster_union = FeatureUnion([
        ("transductive", transductive),
        ("pass_through", PassThroughTransformer()),
    ])
    selector = SelectKBest(score_func=uncentered_f_regression, k=args["all_K"])

    model = Pipeline([
        ("union", feature_union),
        ("union2", cluster_union),
        ("f_sel", selector),
        ("svr", svr),
    ])
    return model
def getModel(**args):
    """Build and return the full prediction pipeline.

    ``args`` must provide ``none_var`` (whether a missing context topic is
    encoded as ``{"none": 1}``), ``smoother`` (added to the follower sum
    before taking the log) and ``all_K`` (number of features kept by the
    F-score selection).

    Stages: feature union (content / formatting / followers) -> union of a
    binarised k-means label with the untouched features -> SelectKBest on
    the uncentered F-score -> LinearSVR.
    """
    if args["none_var"] == True:
        none_dict = {"none" : 1}
    else:
        none_dict = {}

    def dict_pipeline(extract_fn):
        # Extractor producing dicts, followed by a DictVectorizer.
        return Pipeline([
            ("extract", Extractor(extract_fn)),
            ("counter", DictVectorizer())
        ])

    question = dict_pipeline(getFirstWordDict)
    topics = dict_pipeline(lambda x: {t["name"] : 1 for t in x["topics"]})
    ctopic = dict_pipeline(
        lambda x: {x["context_topic"]["name"] : 1} if x["context_topic"] else none_dict)

    topic_question = Pipeline([
        ("content", FeatureUnion([
            ("question", question),
            ("topics", topics),
            ("ctopic", ctopic)
        ])),
    ])
    formatting = Pipeline([
        ("other", Extractor(getFormattingFeatures)),
        ("scaler", StandardScaler())
    ])
    # (An "others" pipeline on the anonymous flag is intentionally left out.)
    followers = Pipeline([
        ("extract", Extractor(
            lambda x: [math.log(sum(t["followers"] for t in x["topics"]) + args["smoother"])])),
        ("scaler", StandardScaler())
    ])

    k_means = KMeans(n_clusters = 96, random_state = 20, n_init = 3,
                     max_iter = 8, tol = 1e-3)
    label_binarizer = LabelBinarizer(sparse_output = True)
    svr = LinearSVR(C = 0.04, loss = "squared_epsilon_insensitive")

    steps = [
        ("union", FeatureUnion([
            ("content", topic_question),
            ("formatting", formatting),
            ("followers", followers)
        ])),
        ("union2", FeatureUnion([
            ("transductive", Pipeline([
                ("k_means", PredictTransformer(k_means)),
                ("label_binarizer", OmitTargetTransformer(label_binarizer))
            ])),
            ("pass_through", PassThroughTransformer())
        ])),
        ("f_sel", SelectKBest(
            score_func = lambda X, y : f_regression(X, y, center = False),
            k = args["all_K"])),
        ("svr", svr)
    ]
    return Pipeline(steps)
def f_test_univariate_selection(X, y):
    """Print the F-test relevance of every feature of ``X`` with respect to ``y``.

    The scores come from an F-test based univariate feature selection
    (``f_regression``) and are divided by their maximum before printing.
    Nothing is returned.
    """
    raw_scores, _ = f_regression(X, y)
    scaled_scores = raw_scores / np.max(raw_scores)
    print('ranked features -- f-test', scaled_scores)
def getTopFeaturesF(df, predictor):
    """Score the numeric columns of ``df`` against the ``predictor`` column.

    Returns:
        A pair of dicts keyed by column name: the centered F-scores (NaN
        replaced by 0) and the corresponding p-values.
    """
    target = df[predictor]
    numeric = df.drop([predictor], axis=1)._get_numeric_data()
    names = numeric.columns

    f_scores, p_values = f_regression(numeric, target, center=True)
    f_scores = np.where(np.isnan(f_scores), 0, f_scores)

    score_by_name = dict(zip(names, f_scores))
    pval_by_name = dict(zip(names, p_values))
    return (score_by_name, pval_by_name)
def feature_correls(X, y, coords):
    """
    Compute correlations between each feature and the label, correcting
    for spatial auto-correlation.

    The F-scores come from ``f_regression``; every p-value is then replaced
    by the one returned by ``spatial.spatial_correlation`` for that feature.
    Returns ``(fvals, pvals)``.
    """
    fvals, pvals = f_regression(X, y)
    n_features = len(X[0])
    for col in range(n_features):
        # Only the p-value is used; the correlation itself is discarded.
        _, p_spatial = spatial.spatial_correlation(X[:, col], y, coords)
        pvals[col] = p_spatial
    return fvals, pvals
def linear_randomForest_Regression(data, target, network):
    """Compare linear regression and a random forest with 10-fold CV and plot.

    Both models are refit on every fold; the per-fold RMSE of each is
    collected. Afterwards the models (as fitted on the last fold) predict the
    whole of ``data``; the predictions are stored in ``network`` as the
    columns ``predicted_lr`` / ``predicted_rfr`` and several diagnostic
    figures are shown.

    Returns the list of per-fold RMSE values of the linear model.
    """
    lr = linear_model.LinearRegression(normalize=True)
    rfr = RandomForestRegressor(n_estimators=30, max_depth=12,
                                max_features='auto')
    folds = KFold(len(target), n_folds=10, shuffle=True, random_state=None)

    RMSE_LINEAR = []
    RMSE_RFR = []
    for fit_idx, held_idx in folds:
        data_train, data_test = data[fit_idx], data[held_idx]
        target_train, target_test = target[fit_idx], target[held_idx]

        lr.fit(data_train, target_train)
        rfr = rfr.fit(data_train, target_train)

        lr_error = lr.predict(data_test) - target_test
        RMSE_LINEAR.append(sqrt(np.mean(lr_error ** 2)))
        rfr_error = rfr.predict(data_test) - target_test
        RMSE_RFR.append(sqrt(np.mean(rfr_error ** 2)))

    # NOTE(review): the source had lost its indentation; these statements are
    # assumed to follow the loop (they use the last fold's test split).
    F, pval = f_regression(data_test, lr.predict(data_test))
    print(np.mean(RMSE_RFR))

    # Figure 1: per-fold RMSE of both models.
    test_times = np.arange(1, 11)
    plt.figure()
    plt.plot(test_times, RMSE_LINEAR,
             label="RMSE in linear regression with 10-fold cv")
    plt.plot(test_times, RMSE_RFR,
             label="RMSE in random forest regression with 10-fold cv")
    plt.ylim(0.0, 0.12)
    plt.xlabel("cross validation times")
    plt.ylabel("RMSE")
    plt.legend()

    lr_fitted = lr.predict(data)
    network['predicted_lr'] = lr_fitted
    network['predicted_rfr'] = rfr.predict(data)

    # Aggregate actual and predicted sizes per (week, weekday, start hour).
    time_keys = ["Week #", "Day of Week", "Backup Start Time - Hour of Day"]
    by_time = network.groupby(time_keys)
    network_time_target = by_time["Size of Backup (GB)"].sum()
    network_time_predict_lr = by_time["predicted_lr"].sum()
    network_time_predict_rfr = by_time["predicted_rfr"].sum()
    time = np.arange(1, len(network_time_target) + 1)

    # Figure 2: actual vs. linear-model values over time.
    plt.figure()
    plt.scatter(time, network_time_target, s=15, color='red',
                label="Actual values over time")
    plt.scatter(time, network_time_predict_lr, s=15, color='green',
                label="predicted values with linear model")
    plt.xlabel('Time')
    plt.ylabel('Size of backup(GB)')
    plt.ylim(-2, 12)
    plt.legend()

    # Figure 3: first 120 aggregated random-forest predictions.
    plt.figure()
    plt.plot(time[0:120], network_time_predict_rfr[0:120],
             label="predicted values with random forest tree model")
    plt.legend()

    # Figure 4: residuals of the linear model against its fitted values.
    plt.figure()
    plt.scatter(lr_fitted, lr_fitted - target,
                label="residual VS fitted values")
    plt.xlabel("fitted values")
    plt.ylabel("residual")
    plt.legend()
    plt.ylim(-0.8, 0.4)
    plt.show()
    return RMSE_LINEAR
def perform_univariate_linear_regression_tests(X, y):
    """Print max-normalised F-scores and p-values for the columns of X.

    The two report lines name nine features, so ``f_regression`` must yield
    at least nine values for the format calls to succeed.
    """
    separator = '+' * 79
    f_scores, p_values = f_regression(X, y)
    f_scores /= np.max(f_scores)

    f_line = ('F1-test on log duration: {0:.4f}, activeness: {1:.4f}, mean: {2:.4f}, std: {3:.4f}, '
              'min: {4:.4f}, 25th: {5:.4f}, median: {6:.4f}, 75th: {7:.4f}, max: {8:.4f}')
    p_line = ('F1 p-value on log duration: {0:.4f}, activeness: {1:.4f}, mean: {2:.4f}, std: {3:.4f}, '
              'min: {4:.4f}, 25th: {5:.4f}, median: {6:.4f}, 75th: {7:.4f}, max: {8:.4f}')

    print(separator)
    print(f_line.format(*f_scores))
    print(p_line.format(*p_values))
    print(separator)
def f_regression(df, dependent_variable, independent_variables, interaction_terms=None, model_limit=5):
    """Log the univariate F-test of the first constructed model.

    The first patsy formula returned by ``construct_models`` is turned into
    design matrices, scikit-learn's F-test is run on them, and both arrays it
    returns are logged at INFO level.

    Parameters:
        df: data frame the patsy formula is evaluated against.
        dependent_variable, independent_variables, interaction_terms:
            forwarded to ``construct_models``. ``interaction_terms`` defaults
            to an empty list.
        model_limit: accepted for interface compatibility; not used here.

    Returns None.
    """
    # This function has the same name as scikit-learn's f_regression. An
    # unqualified call would therefore resolve to this very function and fail
    # on the unexpected arguments, so the library function is imported under
    # a distinct local alias.
    from sklearn.feature_selection import f_regression as sklearn_f_regression

    # Avoid sharing one mutable default list across calls.
    if interaction_terms is None:
        interaction_terms = []

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables,
                         interaction_terms,
                         table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    f_test, r = sklearn_f_regression(X, y, center=True)
    logger.info(f_test)
    logger.info(r)
    return
def test_f_regression():
    """Check that the F test yields meaningful results on a simple simulated
    regression problem, and that dense and sparse inputs agree when
    centering is disabled.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    F, pv = f_regression(X, y)
    expectations = (
        F > 0,
        pv > 0,
        pv < 1,
        pv[:5] < 0.05,    # the five informative features
        pv[5:] > 1.e-4,   # the remaining, uninformative features
    )
    for expectation in expectations:
        assert_true(expectation.all())

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    X_sparse = sparse.csr_matrix(X)
    F_sparse, pv_sparse = f_regression(X_sparse, y, center=False)
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)
def test_f_regression(self):
    """ModelFrame's f_regression accessor must match the direct library call."""
    diabetes = datasets.load_diabetes()
    expected = fs.f_regression(diabetes.data, diabetes.target)

    frame = pdml.ModelFrame(diabetes)
    result = frame.feature_selection.f_regression()

    self.assertEqual(len(result), 2)
    for position in (0, 1):
        self.assert_numpy_array_almost_equal(result[position],
                                             expected[position])
def _k_best_indeces(data, targets, selection_method, k): """get indices for the k best features depending on the scores""" assert k > 0 if selection_method == 'linear': scores, _ = f_regression(data, targets) elif selection_method == 'forest': rfr_sel = RandomForestRegressor(compute_importances=True, random_state=0) scores = rfr_sel.fit(data, targets).feature_importances_ assert not (scores < 0).any() assert len(scores) >= k scores[np.isnan(scores)] = 0 return np.argsort(scores)[-k:]
def test_f_regression():
    """
    Test whether the F test yields meaningful results
    on a simple simulated regression problem
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)
    F, pv = f_regression(X, Y)

    informative_pv = pv[:5]
    noise_pv = pv[5:]

    assert (F > 0).all()
    # p-values must lie strictly inside (0, 1)
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (informative_pv < 0.05).all()
    assert (noise_pv > 1.0e-4).all()
def scalePCAcorrelate(df_numerical, df_w_mdata, metadata_cols, transformed):
    """Relate metadata columns to the leading principal components.

    ``df_numerical`` is transposed (and standardised unless ``transformed``
    is true), a 100-component PCA is fitted, and each metadata column is
    compared with every component's loading vector in two ways: Pearson
    correlation and the p-value of a univariate F-test.

    Parameters:
        df_numerical: numeric data frame; its rows become the PCA features.
        df_w_mdata: frame holding the metadata columns. Each column is
            compared element-wise with the component loadings, so it needs
            one value per row of ``df_numerical``.
        metadata_cols: names of the metadata columns to evaluate.
        transformed: if true, skip standardisation.

    Returns:
        ``(final_return, final_f_return)``: per metadata column, the best
        Pearson correlation (with component number, explained variance and
        p-value) and the smallest F-test p-value (with component number and
        explained variance). Rows whose score is null are dropped.
    """
    n_components = 100
    rv = df_numerical.shape[0]
    if transformed:
        X_std2 = df_numerical.values.T
    else:
        X_std2 = StandardScaler().fit_transform(df_numerical.values.T)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("\nPerforming PCA")
    pca2 = PCA(n_components=n_components, random_state=42)
    pca2.fit(X_std2)
    no1 = pca2.explained_variance_ratio_[0]
    no2 = pca2.explained_variance_ratio_[1]
    print("Top two components explain {} and {} of variance.".format(no1, no2))

    all_cors, p_comp_n, exp_vars, corr_ps = [], [], [], []
    all_pvals, p_comp_nF, exp_vars2 = [], [], []
    for mdata in metadata_cols:
        md_arr = np.array(df_w_mdata[mdata])

        # NOTE(review): the correlations are taken before non-finite values
        # are zero-filled, as in the original; columns with such values get
        # NaN correlations and are removed by the notnull() filter below.
        raw_corrs = [ss.pearsonr(pca2.components_[i, :], md_arr)
                     for i in range(n_components)]
        corrs, c_pvals = zip(*raw_corrs)

        not_finite = ~np.isfinite(md_arr)
        if not_finite.any():
            print("Replacing {} not finite # with 0".format(not_finite.sum()))
            md_arr[not_finite] = 0

        pvals = [f_regression(pca2.components_[i, :].reshape(rv, 1),
                              md_arr)[1][0]
                 for i in range(n_components)]

        corrs = np.array(corrs)
        pvals = np.array(pvals)
        all_pvals.append(pvals.min())
        all_cors.append(corrs.max())

        pca_comp_no = np.argmax(corrs)
        corr_ps.append(np.array(c_pvals)[pca_comp_no])
        pca_comp_no2 = np.argmin(pvals)

        # Components are reported 1-based.
        p_comp_n.append(pca_comp_no + 1)
        p_comp_nF.append(pca_comp_no2 + 1)
        exp_vars.append(pca2.explained_variance_ratio_[pca_comp_no])
        exp_vars2.append(pca2.explained_variance_ratio_[pca_comp_no2])

    data_ = np.vstack((all_cors, p_comp_n, exp_vars, corr_ps)).T
    data_2 = np.vstack((all_pvals, p_comp_nF, exp_vars2)).T
    colset = ['Correlation', 'Component', 'Explained Variance', 'P-value']
    colset2 = ['Pvalue', 'Component_F', 'Explained Variance_F']
    to_return = pd.DataFrame(data=data_, index=metadata_cols, columns=colset)
    f_to_return = pd.DataFrame(data=data_2, index=metadata_cols,
                               columns=colset2)
    f_to_return.sort_values(['Component_F', 'Pvalue'],
                            ascending=[True, True], inplace=True)
    to_return.sort_values(['Component', 'Correlation'],
                          ascending=[True, False], inplace=True)

    final_return = to_return[to_return.Correlation.notnull()]
    final_f_return = f_to_return[f_to_return.Pvalue.notnull()]
    return final_return, final_f_return
def process_window_dir(window_dir, model, features):
    """Load one window directory, report selected features and score a model.

    The data returned by ``load_files`` is scaled, the 20 best features
    according to ``f_regression`` are printed, and ``model`` (a class or
    factory called with no arguments) is evaluated with shuffled 10-fold
    cross-validation for every value in the local ``alphas`` list.

    Returns the list of fold scores, or None when the local
    ``only_feature_selection`` switch is enabled (in which case only the
    baselines and the per-feature F statistics are reported).
    """
    from sklearn.feature_selection import SelectKBest, f_regression

    # All messages are built as one string and printed through a
    # single-argument print call, which is valid and prints the same text on
    # both Python 2 and Python 3.
    print('Processing {}'.format(window_dir))
    alphas = [0.025]
    only_feature_selection = False

    X, y = load_files(window_dir, features)
    X = scale(X)

    featureSelector = SelectKBest(score_func=f_regression, k=20)
    featureSelector.fit(X, y)
    selected = [FEATURE_NAMES[i]
                for i in featureSelector.get_support(indices=True)]
    print('Selected features {}'.format(selected))

    if only_feature_selection:
        baseline_mean(X, y)
        baseline_zero(X, y)
        F, pval = f_regression(X, y)
        for i, f in enumerate(F):
            if i < len(FEATURES):
                name = FEATURES[i]
            else:
                # NOTE(review): the offset 11 presumably equals
                # len(FEATURES) -- confirm.
                name = NLP_FEATURES[i-11]
            print('F-Statistic for %s is %f with p-value %f'
                  % (name, f, pval[i]))
        return None

    scores = []
    print('sum of y is %d' % sum(y))

    # Quick sanity fit on the first 30 samples, scored on the next 30.
    clf = model()
    clf.fit(X[:30], y[:30])
    print('small score is %g' % clf.score(X[30:60], y[30:60]))

    # K-fold cross_validation
    kf = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for alpha in alphas:
            clf = model()
            clf.set_params(alpha=alpha)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            scores.append(score)
            print('{} score is {} when alpha is {}'.format(
                model.__name__, score, alpha))
    return scores
def linear_regression(data, target):
    """Evaluate a linear regression with shuffled 10-fold CV and plot it.

    The model is refit on each fold and the fold RMSE collected. The model as
    fitted on the last fold then predicts all of ``data`` for the
    actual-vs-fitted and residual figures.

    Returns the list of per-fold RMSE values.
    """
    lr = linear_model.LinearRegression(normalize=True)
    folds = KFold(len(target), n_folds=10, shuffle=True, random_state=None)

    RMSE_LINEAR = []
    for fit_idx, held_idx in folds:
        data_train, data_test = data[fit_idx], data[held_idx]
        target_train, target_test = target[fit_idx], target[held_idx]
        lr.fit(data_train, target_train)
        fold_error = lr.predict(data_test) - target_test
        RMSE_LINEAR.append(sqrt(np.mean(fold_error ** 2)))

    # NOTE(review): the source had lost its indentation; these statements are
    # assumed to follow the loop (they use the last fold's test split).
    F, pval = f_regression(data_test, lr.predict(data_test))
    print(pval)

    # Figure 1: RMSE per fold.
    test_times = np.arange(1, 11)
    plt.figure()
    plt.plot(test_times, RMSE_LINEAR,
             label="RMSE in linear regression with 10-fold cv")
    plt.xlabel("cross validation times")
    plt.ylabel("RMSE")
    plt.legend()

    # Figure 2: actual against fitted values, by sample index.
    predicted = lr.predict(data)
    index = np.arange(1, len(predicted) + 1)
    plt.figure()
    plt.scatter(index, target, s=15, color='red', label="Actual")
    plt.scatter(index, predicted, s=15, color='green', label="Fitted")
    plt.xlabel('Index')
    plt.ylabel('MEDV')
    plt.legend()

    # Figure 3: residuals against fitted values.
    residuals = predicted - target
    plt.figure()
    plt.scatter(predicted, residuals, label="residual VS fitted values")
    plt.xlabel("fitted values")
    plt.ylabel("residual")
    plt.legend()
    plt.show()
    return RMSE_LINEAR
def f_regression_select(X, y, maxf=300, pvals=True, names=None, verbose=0, old_idx_sel=None):
    """Select features using f_regression

    Parameters:
        X, y: feature matrix and target passed to ``f_regression`` (called
            with ``center=False``).
        maxf: maximum number of features to keep.
        pvals: if true, features with p-value >= 0.05 are dropped first.
        names: optional column names; defaults to ``f_1``, ``f_2``, ...
        verbose: 0 is silent, 1 prints the selection as a list on stderr,
            >1 prints it as a table on stderr.
        old_idx_sel: optional mapping from column position in ``X`` to an
            original feature index (for chained selections). None or an
            empty sequence means the identity mapping.

    Returns an integer array with the (mapped) indices of the selected
    features, best F-value first.
    """
    n_cols = X.shape[1]
    if names is None:
        names = ["f_%d" % (i+1) for i in range(n_cols)]
    # A truthiness test (``not old_idx_sel``) raises for NumPy arrays with
    # more than one element -- and an array is exactly what this function
    # returns -- so test for None / emptiness explicitly.
    if old_idx_sel is None or len(old_idx_sel) == 0:
        old_idx_sel = range(n_cols)

    f = f_regression(X, y, center=False)
    # (F-value, p-value, col, name)
    a = [(f[0][i], f[1][i], old_idx_sel[i], names[i]) for i in range(n_cols)]
    if pvals:
        a = [e for e in a if e[1] < 0.05]
    # Tuples sort on the F-value first, so this is "highest F first".
    a = sorted(a, reverse=True)
    idx_sel = [e[2] for e in a[:maxf]]

    if verbose > 0:
        b = a[:maxf]

        def out():
            # Compact form: the raw tuples, abbreviated beyond 100 entries.
            if min(maxf, len(b)) > 100:
                print("F_select(%d):" % len(b), b[:90], "...", b[-10:],
                      file=sys.stderr)
            else:
                print("F_select(%d):" % len(b), b[:maxf], file=sys.stderr)

        def out2():
            # Table form: one row per feature, abbreviated beyond 100 rows.
            print("F_select(%d):" % len(b), file=sys.stderr)

            def pr(m1, m2):
                for i in range(m1, m2):
                    row = b[i]
                    print("%10s %10.2f %15g %10d"
                          % (row[3], row[0], row[1], row[2]),
                          file=sys.stderr)

            n = min(len(b), maxf)
            m = 90 if n > 100 else n
            pr(0, m)
            if n > 100:
                print("...", file=sys.stderr)
                pr(len(b)-10, len(b))

        if verbose > 1:
            out2()
        else:
            out()
    return np.asarray(idx_sel, dtype=int)
def RankFeatures(X,Y,names):
    """Rank the features of X with several methods and print a comparison.

    Each method's scores are converted with ``rank_to_dict`` and stored under
    a method label; the per-feature mean over all methods is added under
    ``"Mean"``. The mean dict and a tab-separated table (one row per feature,
    one column per method) are printed. Nothing is returned.
    """
    Y = Y.reshape(len(Y),)
    ranks = {}

    def coefficient_ranks(estimator):
        # Fit the estimator and rank features by absolute coefficient.
        estimator.fit(X, Y)
        return rank_to_dict(np.abs(estimator.coef_), names)

    lr = LinearRegression(normalize=True)
    ranks["Linear.reg"] = coefficient_ranks(lr)
    ranks["Ridge"] = coefficient_ranks(Ridge(alpha=7))
    ranks["Lasso"] = coefficient_ranks(Lasso(alpha=.05))

    rlasso = RandomizedLasso(alpha=0.04)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), names)

    #stop the search when 5 features are left (they will get equal scores)
    rfe = RFE(lr, n_features_to_select=5)
    rfe.fit(X,Y)
    rfe_positions = [float(position) for position in rfe.ranking_]
    ranks["RFE"] = rank_to_dict(rfe_positions, names, order=-1)

    rf = RandomForestRegressor()
    rf.fit(X,Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    # Mean score per feature across every method gathered so far.
    r = {
        name: float(str(round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 3)))
        for name in names
    }
    print(r)

    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    print ("\t%s" % "\t".join(methods))
    for name in names:
        cells = [str(ranks[method][name]) for method in methods]
        print ("%s\t%s" % (name, "\t".join(cells)))
def provideSuggestion(self):
    """Suggest predictor variables for the target chosen in the combo box.

    Every column other than the target is a candidate. Object-dtype columns
    are dummy-encoded (dropping the first level); ``f_regression`` is run on
    the resulting matrix, and a variable is selected when any of its columns
    has p < 0.05.

    Results are stored in ``self.SelectedVariables`` and
    ``self.nonSelectedVariables`` (neither contains the target), printed, and
    the two tables are rebuilt. ``self.headerName`` is left untouched.
    """
    y = str(self.comboBox.currentText())
    # Work on a copy: removing the target from self.headerName itself would
    # shrink the shared header list on every call and make a second call
    # with the same target fail.
    xList = [header for header in self.headerName if header != y]
    ddf = self.data[xList]

    # vardict maps each model column back to the variable it came from.
    vardict = LazyDict()
    categorical = []
    nonCategorical = []
    for i in xList:
        if self.runnable:
            if ddf[i].dtype == "object":
                categorical.append(i)
            else:
                vardict.keylist([i], i)
                nonCategorical.append(i)

    # Copy so that adding dummy columns does not write into a slice of
    # self.data.
    df = self.data[nonCategorical].copy()
    for j in categorical:
        dummy_b = pd.get_dummies(ddf[j], prefix=j)
        # The first dummy level is dropped as the reference category.
        cols = list(dummy_b.columns[1:])
        vardict.keylist(cols, j)
        df[cols] = dummy_b[cols]

    variables = list(df.columns)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    X = df.values
    Y = self.data[y].values
    F, pval = feature_selection.f_regression(X, Y)

    final_variables = []
    for i in range(len(pval)):
        if pval[i] < 0.05:
            source_variable = vardict[variables[i]]
            if source_variable not in final_variables:
                final_variables.append(source_variable)

    self.SelectedVariables = final_variables
    self.nonSelectedVariables = [x for x in xList
                                 if x not in final_variables]
    print(self.SelectedVariables)
    print(self.nonSelectedVariables)
    self.createNonselectedTable()
    self.createSelectedTable()
def test_bias():
    """
    make sure we get the same result for setting C=unitvec
    """
    from sklearn.feature_selection import f_regression

    S, y = get_example_data()
    # A single all-ones covariate column.
    C = np.ones((len(y), 1))

    reference = f_regression(S, y, center=True)
    with_cov = f_regression_cov(S, C, y)
    with_cov_alt = f_regression_cov_alt(S, C, y)

    # make sure values are the same (index 0: F-values, index 1: p-values)
    check = np.testing.assert_array_almost_equal
    check(reference[0], with_cov[0])
    check(with_cov[0], with_cov_alt[0])
    check(reference[1], with_cov[1])
    check(with_cov[1], with_cov_alt[1])
def getTopFeatures(train_x, train_y, n_features=100):
    """Return the column indices of the best features by F-score.

    Parameters:
        train_x, train_y: training matrix and target for ``f_regression``.
        n_features: maximum number of indices to return.

    Returns a list of column indices ordered by descending F-value; NaN
    F-values count as 0.0. If there are fewer than ``n_features`` columns,
    all of them are returned.
    """
    f_val, _ = f_regression(train_x, train_y)
    scores = [0.0 if math.isnan(value) else value for value in f_val]
    # sorted() is stable, so equal scores keep their column order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return ranked[:n_features]
def select_bestwords(D, y, nmax = 100, is_classif=True):
    """
    Select the nmax words of D (list of dicts) best correlated with goal = y.

    The dicts are vectorized, each word is scored with ``f_classif`` (when
    ``is_classif``) or ``f_regression``, words with p-value >= 0.05 are
    discarded, and the highest-F remainder is returned as a set. An empty set
    is returned (with a warning logged) when vectorization raises ValueError.
    """
    y = np.asarray(y)
    vectorizer = DictVectorizer(sparse=True)
    try:
        X = vectorizer.fit_transform(D)
    except ValueError:
        logger.warning("===Except*** in select_bestwords D:%d y:%d",len(D),len(y))
        return set()

    scorer = f_classif if is_classif else f_regression
    f = scorer(X, y)
    names = vectorizer.get_feature_names()

    # (F-value, p-value, word), significant words only, best F first
    significant = []
    for position in range(len(names)):
        entry = (f[0][position], f[1][position], names[position])
        if entry[1] < 0.05:
            significant.append(entry)
    significant.sort(reverse=True)

    logger.debug("select_bestwords:%s",significant[:16])
    return {entry[2] for entry in significant[:nmax]}
def train_f_selection(features_filename, targets_filename, model_name):
    """ Does feature selection using an F-test and fits a Ridge model.

    For every output column, features are ordered by F-test p-value; for each
    candidate feature count ``k`` a Ridge model is tuned with 5-fold grid
    search over ``alpha``, and the best-scoring model's weights and intercept
    are kept. A checkpoint is saved every 100 outputs and the final model is
    saved as ``model_name + '.h5'``.

    Parameters:
        features_filename, targets_filename: passed to ``read_inputs``.
        model_name: prefix for the checkpoint and final model files.
    """
    print('Training model', model_name)

    # Read features and targets
    Xs, y = read_inputs(features_filename, targets_filename)

    # Set regularization parameters and number of features
    regularization_params = {'alpha': np.logspace(2.5, 5, 25)}
    ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000]

    # Initialize containers for weights and bias
    num_features = Xs.shape[1]
    num_outputs = y.shape[1]
    weights = np.zeros((num_outputs, num_features))
    bias = np.zeros(num_outputs)

    # And for bookkeeping
    best_models = {'R2': np.full(num_outputs, -np.inf),
                   'alpha': np.zeros(num_outputs),
                   'k': np.zeros(num_outputs)}

    # Over every output element
    for i in range(num_outputs):
        y_i = y[:, i]

        # Train feature selector
        _, p_values = f_regression(Xs, y_i)
        sorted_indices = p_values.argsort()

        # Number of features with p < 0.01 and p < 0.001
        p_value_ks = [(p_values < 0.01).sum(), (p_values < 0.001).sum()]
        p_value_ks = [x for x in p_value_ks if x != 0]

        # A k above the feature count selects every feature, so cap it and
        # drop duplicates: this avoids repeating identical grid searches and
        # keeps the recorded k equal to the number of features actually used.
        candidate_ks = sorted({min(int(k), num_features)
                               for k in ks + p_value_ks})

        # Train models with different number of features
        for k in candidate_ks:
            # Select best k features
            selected_features = sorted_indices[:k]
            Xs_select = Xs[:, selected_features]

            # Train model (searching for best regularization parameter)
            cv = GridSearchCV(Ridge(), regularization_params, cv=5,
                              n_jobs=-1).fit(Xs_select, y_i)
            model = cv.best_estimator_

            # If best model yet, store it
            if cv.best_score_ > best_models['R2'][i]:
                weights[i, :] = 0
                weights[i, selected_features] = model.coef_
                bias[i] = model.intercept_
                best_models['R2'][i] = cv.best_score_
                best_models['alpha'][i] = model.alpha
                best_models['k'][i] = k

        # Report and save checkpoint
        if i % 100 == 0:
            trained = i + 1
            print(trained, 'out of', num_outputs)
            # Slice up to and including the output that was just trained.
            print('R2:', best_models['R2'][:trained])
            print('Alphas:', best_models['alpha'][:trained])
            print('Ks:', best_models['k'][:trained])
            print('Saving checkpoint...')
            checkpoint_name = model_name + '_' + str(i) + '.h5'
            save_linear_model(checkpoint_name, weights, bias)

    # Print final cross-validation results
    print('Final results')
    # Use num_outputs rather than the loop variable, which does not exist
    # when there are no outputs.
    print(num_outputs, 'out of', num_outputs)
    print('R2:', best_models['R2'])
    print('Average R2:', best_models['R2'].mean())
    print('Alphas:', best_models['alpha'])
    print('Ks:', best_models['k'])
    print('Average K:', best_models['k'].mean())

    # Save model
    save_linear_model(model_name + '.h5', weights, bias)
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import svm

# Load the training window and the test sample.
train_feature, train_target = prj4.feature_seclection(
    file_str='./tweet_data/tweets_#patriots.txt',
    start_year=2015, start_month=1, start_day=17, start_hour=16,
    end_year=2015, end_month=2, end_day=1, end_hour=16)
test_feature, test_target, startmon, startday, starthour = \
    prj4.sample_seclection(file_str='./test_data/sample5_period1.txt')

# Keep the 10 training columns with the highest F-score.
selector = SelectKBest(f_regression, k=10).fit(train_feature, train_target)
new_train_feature = selector.transform(train_feature)
[F, p_value] = f_regression(train_feature, train_target)
[a, b] = np.shape(new_train_feature)
[m, n] = np.shape(train_feature)

# Ask the fitted selector which columns it kept. The earlier approach
# searched for value-equal columns and then always started the test matrix
# from column 0, whether or not column 0 had been selected.
index = list(selector.get_support(indices=True))

# Apply the same column selection to the test features.
new_test_feature = test_feature[:, index]
def _get_column_f_regression_scores(self):
    """Return the F-values of ``self.X_no_nan`` against ``self.y``.

    The computation is bracketed by ``misc.start`` / ``misc.stop`` calls
    labelled with this method's name (presumably timing markers -- confirm
    against ``misc``).
    """
    label = '_get_column_f_regression_scores'
    misc.start(label)
    f_and_p = feature_selection.f_regression(self.X_no_nan, self.y)
    scores = f_and_p[0]
    misc.stop(label)
    return scores
# A better choice of alpha.
# The user-warning output is silenced: it is not needed for the example,
# which is deliberately set up to be challenging.
with warnings.catch_warnings():
    for silenced in (UserWarning, ConvergenceWarning):
        warnings.simplefilter('ignore', silenced)
    lars_cv = LassoLarsCV(cv=6).fit(X, y)

# Run the RandomizedLasso on a path going down to .1*alpha_max, to avoid
# exploring the regime in which very noisy variables enter the model.
alpha_max = lars_cv.alphas_[0]
alphas = np.linspace(alpha_max, .1 * alpha_max, 6)
clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
trees = ExtraTreesRegressor(100).fit(X, y)

# Compare with F-score
F, _ = f_regression(X, y)

scored_methods = [
    ('F-test', F),
    ('Stability selection', clf.scores_),
    ('Lasso coefs', np.abs(lars_cv.coef_)),
    ('Trees', trees.feature_importances_),
]

plt.figure()
for name, score in scored_methods:
    precision, recall, thresholds = precision_recall_curve(coef != 0, score)
    # Scores are scaled to a maximum of 1 and floored at 1e-4 for the log axis.
    plt.semilogy(np.maximum(score / np.max(score), 1e-4),
                 label="%s. AUC: %.3f" % (name, auc(recall, precision)))

# Mark the truly relevant features just above the floor.
plt.plot(np.where(coef != 0)[0], [2e-4] * n_relevant_features, 'mo',
         label="Ground truth")
plt.xlabel("Features")
w = min(W,dx) image(temppath,imgx,imgy,width=w) imgy = imgy + dy + 20 os.remove(temppath) size(W, HEIGHT+dy+40) else: def pltshow(mplpyplot): mplpyplot.show() # nodebox section end np.random.seed(0) X = np.random.rand(1000, 3) y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + 0.1 * np.random.randn(1000) f_test, _ = f_regression(X, y) f_test /= np.max(f_test) mi = mutual_info_regression(X, y) mi /= np.max(mi) plt.figure(figsize=(15, 5)) for i in range(3): plt.subplot(1, 3, i + 1) plt.scatter(X[:, i], y, edgecolor='black', s=20) plt.xlabel("$x_{}$".format(i + 1), fontsize=14) if i == 0: plt.ylabel("$y$", fontsize=14) plt.title("F-test={:.2f}, MI={:.2f}".format(f_test[i], mi[i]), fontsize=16) # plt.show()
def score(self, X, y):
    """Return the per-feature F-values of X against y; p-values are dropped."""
    f_values, _p_values = skl_fss.f_regression(X, y)
    return f_values