def scaler_constructor(flags: list):
    """Assemble a scikit-learn scaler from a list of option flags.

    Recognised flags:

    * ``'sine_advanced'``  -- add an element-wise ``np.sin`` feature step.
    * ``'quad_advanced'``  -- add an element-wise ``np.square`` feature step
      (combined with the sine step through a ``FeatureUnion`` when both
      flags are present).
    * ``'norm_advanced'``  -- insert a ``StandardScaler`` before the final
      ``MinMaxScaler``.

    A single letter ('a'..'g') is printed for every branch taken; this is
    the original debug trace and is kept as is.

    Returns a ``Pipeline`` ending in a ``MinMaxScaler`` step named
    ``'final_operation'``, or a bare ``MinMaxScaler`` when none of the
    recognised flags is present.
    """
    wants_sine = 'sine_advanced' in flags
    wants_quad = 'quad_advanced' in flags
    wants_norm = 'norm_advanced' in flags

    # No feature expansion requested: optional standardisation + min-max only.
    if not (wants_sine or wants_quad):
        if wants_norm:
            scaler = Pipeline([
                ('norm', preprocessing.StandardScaler()),
                ('final_operation', preprocessing.MinMaxScaler()),
            ])
            print('f')
        else:
            scaler = preprocessing.MinMaxScaler()
            print('g')
        return scaler

    # Feature expansion step.
    if wants_sine and wants_quad:
        features = FeatureUnion([
            ("sine", preprocessing.FunctionTransformer(np.sin)),
            ("quadratic", preprocessing.FunctionTransformer(np.square)),
        ])
        print('a')
    elif wants_sine:
        features = preprocessing.FunctionTransformer(np.sin)
        print('b')
    else:
        features = preprocessing.FunctionTransformer(np.square)
        print('c')

    # Chain the expansion with the optional standardisation and the final
    # min-max scaling.
    steps = [('features', features)]
    if wants_norm:
        steps.append(('norm', preprocessing.StandardScaler()))
    steps.append(('final_operation', preprocessing.MinMaxScaler()))
    scaler = Pipeline(steps)
    print('d' if wants_norm else 'e')
    return scaler
def create_estimator(ml_obj, numeric_features, cat_features, date_features):
    """Wrap ``ml_obj`` in a pipeline that preprocesses three column groups.

    Each group is selected with ``data[:, <columns>]`` and then processed:

    * ``numeric_features`` -> ``StandardScaler(with_mean=0., with_std=1)``
    * ``cat_features``     -> ``OneHotEncoder(handle_unknown='ignore')``
    * ``date_features``    -> ``OneHotEncoder(handle_unknown='ignore')``

    The three results are concatenated by a ``FeatureUnion`` and passed to
    ``ml_obj`` as the final ``'Model_fitting'`` step.
    """

    def column_selector(columns):
        # Validating transformer that keeps only the requested columns.
        return preprocessing.FunctionTransformer(
            lambda data: data[:, columns], validate=True)

    numeric_branch = pipeline.Pipeline(steps=[
        ('selecting', column_selector(numeric_features)),
        ('scaling', preprocessing.StandardScaler(with_mean=0., with_std=1)),
    ])
    categorical_branch = pipeline.Pipeline(steps=[
        ('selecting', column_selector(cat_features)),
        ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ])
    date_branch = pipeline.Pipeline(steps=[
        ('selecting', column_selector(date_features)),
        ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown='ignore')),
    ])

    feature_processing = pipeline.FeatureUnion(transformer_list=[
        ('Numeric_features', numeric_branch),
        ('Categical_features', categorical_branch),
        ('Date_features', date_branch),
    ])

    return pipeline.Pipeline(steps=[
        ('Feature_processing', feature_processing),
        ('Model_fitting', ml_obj),
    ])
# TODO: make custom score
def sk_function_transformer():
    """Return a validating ``FunctionTransformer`` that squares its input."""

    def simple_preprocessor(numpy_x):
        # Element-wise square of whatever array-like is passed in.
        return numpy_x ** 2

    return sk_preprocessing.FunctionTransformer(
        simple_preprocessor, validate=True)
def define_pipeline():
    """Build the feature-engineering + gradient-boosting pipeline.

    Categorical and numerical columns are processed in separate branches,
    joined by a ``FeatureUnion``, converted to a dense matrix and fed to a
    ``GradientBoostingRegressor``.
    """
    categorical = ('season', 'holiday', 'workingday', )
    # Datetime isn't numerical, but needs to be in the numeric branch
    numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity',
                 'windspeed',)

    categorical_branch = Pipeline([
        ('select_cat', fe.SelectCols(cols=categorical)),
        ('onehot', OneHotEncoder()),
    ])

    numerical_branch = Pipeline([
        ('select_num', fe.SelectCols(cols=numerical)),
        ('prog_age', fe.AddProgAge()),
        ('date', fe.DateFormatter()),
        ('daily_max', fe.DailyGroup(func=np.max, cols=['weather'],
                                    rsuffix='_dailymax')),
        ('daily_mean', fe.DailyGroup(func=np.mean, cols=['temp'],
                                     rsuffix='_dailymean')),
        ('drop_datetime', fe.SelectCols(cols=('datetime', 'month'),
                                        invert=True)),
        ('temp', fe.ProcessNumerical(
            cols_to_square=('temp', 'atemp', 'humidity'),)),
        ('rollingweather', fe.RollingWindow(cols=('weather', ))),
        ('forecast', fe.WeatherForecast()),
        # Disabled steps, kept for reference:
        # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
        # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
        ('scale', StandardScaler()),
    ])

    # Process cat & num separately, then join back together
    union = FeatureUnion([
        ('categorical', categorical_branch),
        ('numerical', numerical_branch),
    ])

    model = Pipeline([
        ('union', union),
        ('to_dense', preprocessing.FunctionTransformer(
            lambda x: x.todense(), accept_sparse=True)),
        ('clf', GradientBoostingRegressor(n_estimators=100, random_state=2)),
    ])
    return model
def main():
    """Train and compare naive-Bayes colour classifiers on RGB and LAB input.

    The CSV named by ``sys.argv[1]`` must contain ``R``, ``G``, ``B`` and
    ``Label`` columns. The colour channels are divided by 255, the rows are
    split into train/test sets, and two models are fitted:

    * ``model_rgb`` -- ``GaussianNB`` on the scaled RGB values;
    * ``model_lab`` -- ``my_rgb2lab`` conversion followed by ``GaussianNB``.

    Each model's test accuracy is printed and its prediction plot is saved
    to ``predictions_rgb.png`` / ``predictions_lab.png``.
    """
    # Use the file given on the command line. The previous version read it
    # and then silently replaced it with a hard-coded "colour-data.csv".
    data = pd.read_csv(sys.argv[1])

    rgb_columns = ["R", "G", "B"]
    data[rgb_columns] = data[rgb_columns].values / 255
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        data[rgb_columns].values, data["Label"].values)

    # Model on raw (scaled) RGB values.
    model_rgb = GaussianNB()
    model_rgb = model_rgb.fit(X_train, Y_train)
    Y_predicted = model_rgb.predict(X_test)
    print(accuracy_score(Y_test, Y_predicted))

    # Same classifier, applied after the my_rgb2lab transformation.
    model_lab = pipeline.make_pipeline(
        preprocessing.FunctionTransformer(my_rgb2lab), GaussianNB())
    model_lab = model_lab.fit(X_train, Y_train)
    Y_predicted_lab = model_lab.predict(X_test)
    print(accuracy_score(Y_test, Y_predicted_lab))

    plot_predictions(model_rgb)
    plt.savefig('predictions_rgb.png')
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
def fit_encoder(encoding_method, df):
    """Build the encoder selected by ``encoding_method``.

    Parameters
    ----------
    encoding_method: {"OneHot", "OneHot_drop_binary", "Identity"} or None
        String indicating what encoding method to use. ``None`` is treated
        like ``"Identity"``.
    df: pd.DataFrame
        DataFrame containing only categorical data.

    Returns
    -------
    sklearn.base.BaseEstimator
        A one-hot encoder fitted on ``df``, or a pass-through
        ``FunctionTransformer`` for the identity case.

    Raises
    ------
    ValueError
        If ``encoding_method`` is not one of the supported values.
    """
    # TODO rather then passing a string this could accept a function
    if encoding_method == "OneHot":
        fitted_encoder = preprocessing.OneHotEncoder(
            handle_unknown="error", sparse=False).fit(df)
    elif encoding_method == "OneHot_drop_binary":
        fitted_encoder = preprocessing.OneHotEncoder(
            drop="if_binary", handle_unknown="error", sparse=False).fit(df)
    # Compare explicitly: the former `is None or "Identity"` was always
    # truthy, so unknown methods fell through to the identity transformer.
    elif encoding_method is None or encoding_method == "Identity":
        fitted_encoder = preprocessing.FunctionTransformer(
            func=None, inverse_func=None)
    else:
        raise ValueError("Encoding Method not known")
    return fitted_encoder
def main(infile):
    """Fit naive-Bayes colour classifiers on RGB and on RGB_LAB-converted data.

    ``infile`` is a CSV with ``R``, ``G``, ``B`` and ``Label`` columns. For
    each model the test-set accuracy is printed and the prediction plot is
    written to ``predictions_rgb.png`` / ``predictions_lab.png``.
    """
    frame = pd.read_csv(infile)
    # Shape (n, 3); divide by 255 so all components are in 0-1.
    X = frame[['R', 'G', 'B']] / 255
    # Shape (n,): the colour words.
    y = frame['Label']

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    # Classifier on the scaled RGB values.
    model_rgb = GaussianNB()
    model_rgb.fit(X_train, y_train)
    rgb_accuracy = model_rgb.score(X_test, y_test)
    print("The accuracy score of RGB is %.3g" % rgb_accuracy)
    plot_predictions(model_rgb)
    plt.savefig('predictions_rgb.png')

    # Same classifier after the RGB_LAB transformation.
    lab_converter = spr.FunctionTransformer(RGB_LAB)
    model_lab = sp.make_pipeline(lab_converter, GaussianNB(priors=None))
    model_lab.fit(X_train, y_train)
    lab_accuracy = model_lab.score(X_test, y_test)
    print("The accuracy score of LAB is %.3g" % lab_accuracy)
    plot_predictions(model_lab)
    plt.savefig('predictions_lab.png')
def log_transform(x_train_dum, scale_list):
    """Return a copy of ``x_train_dum`` with ``log1p`` applied to some columns.

    ``scale_list`` holds the positional indices of the columns to
    transform; the input frame itself is left untouched.
    """
    log1p_transformer = preprocessing.FunctionTransformer(np.log1p)
    transformed = x_train_dum.copy()
    for column_position in scale_list:
        source_column = x_train_dum.iloc[:, column_position]
        transformed.iloc[:, column_position] = log1p_transformer.transform(
            source_column)
    return transformed
def fit_scaler(scaling_method, df):
    """Build the scaler selected by ``scaling_method``.

    Parameters
    ----------
    scaling_method: {"MinMax", "Standard", "Identity"} or None
        String indicating what scaling method to use. ``None`` is treated
        like ``"Identity"``.
    df: pd.DataFrame
        DataFrame only containing continuous data.

    Returns
    -------
    sklearn.base.BaseEstimator
        A scaler fitted on ``df``, or a pass-through ``FunctionTransformer``
        for the identity case.

    Raises
    ------
    ValueError
        If ``scaling_method`` is not one of the supported values.
    """
    # TODO rather then passing a string this could accept a function
    if scaling_method == "MinMax":
        fitted_scaler = preprocessing.MinMaxScaler().fit(df)
    elif scaling_method == "Standard":
        fitted_scaler = preprocessing.StandardScaler().fit(df)
    # Compare explicitly: the former `is None or "Identity"` was always
    # truthy, so unknown methods fell through to the identity transformer.
    elif scaling_method is None or scaling_method == "Identity":
        fitted_scaler = preprocessing.FunctionTransformer(
            func=None, inverse_func=None)
    else:
        raise ValueError("Scaling Method not known")
    return fitted_scaler
def transform(type, train_frame, validation_frame, test_frame, columns):
    """Apply one feature transformation to the train/validation/test frames.

    Every frame must contain a ``'y'`` column; it is set aside, the
    remaining columns are transformed, and ``'y'`` is re-attached to the
    rebuilt frame.

    Parameters
    ----------
    type : {'log', 'bi', 'std'}
        ``'log'`` applies ``log01p`` through a ``FunctionTransformer``,
        ``'bi'`` binarises with threshold 0, ``'std'`` standardises with a
        ``StandardScaler`` fitted on the training features only.
    train_frame, validation_frame, test_frame : pd.DataFrame
        Input frames, each with a ``'y'`` column.
    columns : sequence
        Column labels given to the transformed feature columns.

    Returns
    -------
    list
        ``[train_set, validation_set, test_set]`` as new DataFrames.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values (this previously
        surfaced as an ``UnboundLocalError``).
    """
    frames = (train_frame, validation_frame, test_frame)
    feature_frames = [frame.drop(['y'], axis=1) for frame in frames]
    target_columns = [frame['y'] for frame in frames]

    if type == 'log':
        function = preprocessing.FunctionTransformer(log01p, validate=False)
    elif type == 'bi':
        function = preprocessing.Binarizer(threshold=0)
    elif type == 'std':
        # Fit on the training features only, so no statistics leak from the
        # validation/test sets.
        function = preprocessing.StandardScaler().fit(feature_frames[0])
    else:
        raise ValueError(
            "Unknown transform type {!r}; expected 'log', 'bi' or 'std'"
            .format(type))

    transformed_sets = []
    for frame_X, frame_Y in zip(feature_frames, target_columns):
        rebuilt = pd.DataFrame(function.transform(frame_X), columns=columns)
        rebuilt['y'] = frame_Y.values
        transformed_sets.append(rebuilt)
    return transformed_sets
def __init__(self, estimator, transform=None):
    """Bundle a trained estimator with the transform applied to its inputs.

    Input:
        estimator: a fitted sklearn estimator
        transform: a fitted sklearn transform; when omitted, a default
            ``FunctionTransformer()`` is used

    For example:
        >>> from sklearn.datasets import load_iris
        >>> data = load_iris()
        >>> d = MLData(*traintest(data.data[:,:3], data.data[:,3], .2))
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.preprocessing import StandardScaler
        >>> xfm = StandardScaler().fit(d.xtrain)
        >>> lnr = LinearRegression().fit(xfm.transform(d.xtrain), d.ytrain)
        >>> e = Estimator(lnr, xfm)
        >>> [e(*i) for i in d.xtest[:2]]
        [1.7802194778123053, 1.3775908988859642]
        >>> e.test(d.xtest)[:2].tolist()
        [1.7802194778123053, 1.3775908988859642]
        >>> d.ytest[:2].tolist()
        [1.8, 1.3]
        >>> e.score(d.xtest, d.ytest)
        0.9440222526291645
    """
    self.estimator = estimator
    if transform is None:
        # Imported lazily so sklearn is only needed when no transform is given.
        from sklearn.preprocessing import FunctionTransformer
        transform = FunctionTransformer()  #XXX: or StandardScaler ?
    self.transform = transform
    # Scalar convenience call: pack the positional arguments into a single
    # row, run it through `self.test`, and return the result as a float.
    self.function = lambda *x: float(
        self.test(np.array(x).reshape(1, -1)).reshape(-1))
def define_fuc(x):
    '''
    Put simply: feed the original features through a function and use the
    values that function returns as the new features. Here the function is
    ``np.log1p``.
    '''
    return preprocessing.FunctionTransformer(np.log1p).transform(x)
def make_log_plot(state_name, visible=True):
    """Return a line trace of log(confirmed) over date for one state.

    Rows are taken from the module-level ``states_data`` where the
    ``'state'`` column equals ``state_name``.
    """
    state_rows = states_data[states_data['state'] == state_name]

    # The validating transformer needs a 2-D column; flatten again for plotting.
    confirmed_column = state_rows['confirmed'].values.reshape(-1, 1)
    log_step = preprocessing.FunctionTransformer(np.log, validate=True)
    log_confirmed = log_step.fit_transform(confirmed_column)

    line_style = dict(color='cornflowerblue', width=1.5)
    return go.Scatter(
        x=state_rows['date'],
        y=log_confirmed.ravel(),
        mode='lines',
        line=line_style,
        name='Confirmed cases',
        visible=visible,
    )
def transform_normalize(self, X):
    """Apply ``log1p`` to ``X`` and rescale the result with ``MinMaxScaler``.

    The min-max scaler is fitted on the data passed in, so every call is
    scaled independently.
    """
    print("Transforming with log1p and scaling with MinMaxScaler")
    # log1p first - the data is right skewed
    log1p_step = preprocessing.FunctionTransformer(np.log1p, validate=True)
    # then normalize, to bring the features onto a similar scale
    minmax_step = preprocessing.MinMaxScaler()
    return minmax_step.fit_transform(log1p_step.transform(X))
def main():
    """Fit six colour classifiers, save their prediction plots, print scores.

    Reads the CSV named by ``sys.argv[1]`` (columns ``R``, ``G``, ``B``,
    ``Label``), scales the colours to 0-1 and holds out 30% for testing.
    Naive Bayes, k-NN and linear SVC are each trained on the RGB values and
    on the ``RGB_LAB``-converted values.
    """
    data = pd.read_csv(sys.argv[1])
    X = data[['R', 'G', 'B']].values / 255
    y = data['Label'].values
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.3)

    def with_lab(classifier):
        # Prefix a classifier with the RGB_LAB conversion step.
        return sp.make_pipeline(spr.FunctionTransformer(RGB_LAB), classifier)

    # (template key, model) pairs; the order fixes the plot file numbering.
    named_models = [
        ('bayes_rgb', GaussianNB(priors=None)),
        ('bayes_lab', with_lab(GaussianNB(priors=None))),
        ('knn_rgb', KNeighborsClassifier(n_neighbors=9)),
        ('knn_lab', with_lab(KNeighborsClassifier(n_neighbors=9))),
        ('svm_rgb', SVC(kernel='linear', C=3)),
        ('svm_lab', with_lab(SVC(kernel='linear', C=0.1))),
    ]

    # train each model and output image of predictions
    for index, (_, model) in enumerate(named_models):
        model.fit(X_train, y_train)
        plot_predictions(model)
        plt.savefig('predictions-%i.png' % (index, ))

    scores = {}
    for key, model in named_models:
        scores[key] = model.score(X_test, y_test)
    print(OUTPUT_TEMPLATE.format(**scores))
def gene_feature(data_pd):
    """Build the dense feature matrix for ``data_pd``.

    Columns are split into four groups by name; each group is selected
    with a boolean column mask and processed independently before the
    results are concatenated:

    * binary (``holiday``, ``workingday``)  -> one-hot encoded
    * categorical (``season``, ``weather``) -> one-hot encoded
    * numeric (``temp``, ``atemp``, ``humidity``, ``windspeed``)
      -> ``StandardScaler(with_mean=0)``
    * plain (``month``, ``day``, ``hour``)  -> passed through unchanged
    """

    def column_mask(names):
        # Boolean mask over data_pd.columns marking the requested names.
        return np.asarray([(col in names) for col in data_pd.columns],
                          dtype=bool)

    def selector(mask):
        # Transformer that keeps only the masked columns.
        return preprocessing.FunctionTransformer(lambda x: x[:, mask])

    # binary columns
    index_binary = column_mask(['holiday', 'workingday'])
    # cate columns
    index_cate = column_mask(['season', 'weather'])
    # numeric columns
    index_num = column_mask(['temp', 'atemp', 'humidity', 'windspeed'])
    # normal value
    normal_num = column_mask(['month', 'day', 'hour'])

    binary_branch = Pipeline(steps=[
        ('select', selector(index_binary)),
        ('transform', preprocessing.OneHotEncoder()),
    ])
    cate_branch = Pipeline(steps=[
        ('select', selector(index_cate)),
        ('transform', preprocessing.OneHotEncoder()),
    ])
    numeric_branch = Pipeline(steps=[
        ('select', selector(index_num)),
        ('transform', preprocessing.StandardScaler(with_mean=0)),
    ])
    normal_branch = Pipeline(steps=[
        ('select', selector(normal_num)),
    ])

    feature_union = FeatureUnion([
        ('binary_value', binary_branch),
        ('cate_value', cate_branch),
        ('numeric_value', numeric_branch),
        ('normal_value', normal_branch),
    ])
    return feature_union.fit_transform(data_pd).toarray()
def _log_transform(self):
    """Log-transform every column listed in ``self.num_feats``.

    For each column a ``log1p`` transformer (inverse ``expm1``) is fitted
    on ``self.df``, its output is written into ``self.output_df`` and the
    transformer is stored in ``self.log_transform`` under the column name.

    Returns the tuple ``(self.output_df, self.log_transform)``.
    """
    for column in self.num_feats:
        # The validating transformer needs a 2-D (n, 1) array.
        column_values = self.df[column].values.reshape(-1, 1)
        transformer = preprocessing.FunctionTransformer(
            np.log1p, inverse_func=np.expm1, validate=True)
        transformer.fit(column_values)
        self.output_df.loc[:, column] = transformer.transform(column_values)
        self.log_transform[column] = transformer
    return self.output_df, self.log_transform
def get_estimator(self):
    """Return a pipeline feeding binary and one-hot features to the regressor.

    Binary columns (``Model.binary_data_indices``) are passed through as
    they are; categorical columns (``Model.categorical_data_indices``) are
    one-hot encoded. Both parts are concatenated and handed to
    ``self.regressor``.
    """
    binary_selector = preprocessing.FunctionTransformer(
        lambda data: data[:, Model.binary_data_indices], validate=True)

    categorical_selector = preprocessing.FunctionTransformer(
        lambda data: data[:, Model.categorical_data_indices], validate=True)
    one_hot = preprocessing.OneHotEncoder(
        handle_unknown='ignore', sparse=False)
    categorical_branch = pipeline.Pipeline(steps=[
        ('selecting', categorical_selector),
        ('hot_encoding', one_hot),
    ])

    feature_union = pipeline.FeatureUnion(transformer_list=[
        ('binary_variables_processing', binary_selector),
        ('categorical_variables_processing', categorical_branch),
    ])

    return pipeline.Pipeline(steps=[
        ('feature_processing', feature_union),
        ('model_fitting', self.regressor),
    ])
def param_tuning_graphs(train_data, dev_data, train_label, pipeline, parameter, param_values):
    """Sweep one GradientBoostingRegressor hyper-parameter and plot dev RMSE.

    Parameters
    ----------
    train_data, dev_data : pd.DataFrame
        Training and development frames; the target column is read from
        both via ``train_label``.
    train_label : str
        Name of the target column.
    pipeline
        Accepted for interface compatibility only. NOTE(review): the value
        passed in is never used; a fixed feature-engineering pipeline is
        built inside this function, as before.
    parameter : {'n_estimators', 'learning_rate', 'max_depth', 'min_samples_leaf'}
        Hyper-parameter to vary; the others keep their default values.
    param_values : iterable
        Values to try for ``parameter``.

    Returns
    -------
    list
        Dev-set RMSE (rounded to 3 decimals) for each value, in order.

    Raises
    ------
    ValueError
        If ``parameter`` is not a supported hyper-parameter name
        (previously the model was simply left undefined).
    """
    # Defaults used for every hyper-parameter that is not being swept.
    default_params = {
        'n_estimators': 115,
        'learning_rate': 0.05,
        'max_depth': 10,
        'min_samples_leaf': 20,
    }
    if parameter not in default_params:
        raise ValueError(
            "Unknown parameter {!r}; expected one of {}".format(
                parameter, sorted(default_params)))

    categorical = ('season', 'holiday', 'workingday', )
    # Datetime isn't numerical, but needs to be in the numeric branch
    numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity',
                 'windspeed',)

    pipeline = Pipeline([
        # Process cat & num separately, then join back together
        ('union', FeatureUnion([
            ('categorical', Pipeline([
                ('select_cat', fe.SelectCols(cols=categorical)),
                ('onehot', OneHotEncoder()),
            ])),
            ('numerical', Pipeline([
                ('select_num', fe.SelectCols(cols=numerical)),
                ('date', fe.DateFormatter()),
                # Disabled steps, kept for reference:
                # ('drop_datetime', fe.SelectCols(cols = ('datetime'), invert = True)),
                ('temp', fe.ProcessNumerical(
                    cols_to_square=('temp', 'atemp', 'humidity'),)),
                # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
                # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
                ('scale', StandardScaler()),
            ])),
        ])),
        ('to_dense', preprocessing.FunctionTransformer(
            lambda x: x.todense(), accept_sparse=True)),
        # The regressor is fitted separately below, once per swept value.
        # ('clf', GradientBoostingRegressor(n_estimators=100,random_state=2)),
    ])

    # Run train and dev data through pipeline for feature engineering
    excluded = ['count', 'casual', 'registered', 'log_casual',
                'log_registered', 'prog_age']
    features = [c for c in train_data.columns if c not in excluded]
    fe_train_data = pipeline.fit_transform(train_data[features])
    fe_dev_data = pipeline.transform(dev_data[features])

    row_format = "{:>10}" * 6
    rmse_list = []
    for i in param_values:
        t0 = time()

        # Start from the defaults and override only the swept parameter.
        gb_params = dict(default_params, random_state=2)
        gb_params[parameter] = i
        gb = GradientBoostingRegressor(**gb_params)

        gb.fit(fe_train_data, train_data[train_label])
        predicted_y = gb.predict(fe_dev_data)
        rmse = get_RMSE(actual_values=dev_data[train_label],
                        predicted_values=predicted_y)
        rmse_list.append(round(rmse, 3))
        # Parenthesised single-argument print works on Python 2 and 3.
        print(row_format.format(parameter + ":", i, "RMSE:", round(rmse, 3),
                                "Runtime:", round((time() - t0), 3)))

    plt.plot(param_values, rmse_list)
    plt.show()
    return rmse_list
def main():
    """Fit six colour classifiers, save their prediction plots, print scores.

    Reads the CSV named by ``sys.argv[1]`` (columns ``R``, ``G``, ``B``,
    ``Label``) and scales the colour channels by 1/255. Naive Bayes, k-NN
    and SVC models are each trained on the RGB values and on the
    ``my_rgb2lab``-converted values. One plot per model is written to
    ``predictions-<i>.png`` and the test scores are printed through
    ``OUTPUT_TEMPLATE``.
    """
    data = pd.read_csv(sys.argv[1])

    # The unused `X = data` / `y = data` placeholders were removed; the real
    # feature and label arrays come from the split below.
    rgb_columns = ["R", "G", "B"]
    data[rgb_columns] = data[rgb_columns].values / 255
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        data[rgb_columns].values, data["Label"].values)

    bayes_rgb_model = GaussianNB()
    bayes_lab_model = pipeline.make_pipeline(
        preprocessing.FunctionTransformer(my_rgb2lab), GaussianNB())

    knn_rgb_model = KNeighborsClassifier(15)
    knn_lab_model = pipeline.make_pipeline(
        preprocessing.FunctionTransformer(my_rgb2lab),
        KNeighborsClassifier(15))

    svc_rgb_model = svm.SVC(C=30)
    svc_lab_model = pipeline.make_pipeline(
        preprocessing.FunctionTransformer(my_rgb2lab),
        svm.SVC(C=1.0, kernel="linear", decision_function_shape="ovr"))

    # train each model and output image of predictions
    models = [bayes_rgb_model, bayes_lab_model, knn_rgb_model,
              knn_lab_model, svc_rgb_model, svc_lab_model]
    for i, m in enumerate(models):
        m.fit(X_train, y_train)
        plot_predictions(m)
        plt.savefig('predictions-%i.png' % (i,))

    print(OUTPUT_TEMPLATE.format(
        bayes_rgb=bayes_rgb_model.score(X_test, y_test),
        bayes_lab=bayes_lab_model.score(X_test, y_test),
        knn_rgb=knn_rgb_model.score(X_test, y_test),
        knn_lab=knn_lab_model.score(X_test, y_test),
        svm_rgb=svc_rgb_model.score(X_test, y_test),
        svm_lab=svc_lab_model.score(X_test, y_test),
    ))
def generate_model(pred_vars, log_transform=True, one_hot_week=False, method="lm"):
    """
    Generate the model for transforming and predicting.

    Parameters
    ----------
    pred_vars : list
        Names of the predictor columns.
    log_transform : bool
        If True the selected numeric columns go through ``np.log``;
        otherwise a default ``FunctionTransformer()`` is used.
    one_hot_week : bool
        If True, ``week_num`` is one-hot encoded and ``pred_vars`` are
        used as the numeric columns. If False, the numeric columns are
        ``pred_vars`` plus ``'ca_prop'``.
    method : {'lm', 'poisson'}
        ``'lm'`` uses ``LinearRegression``; ``'poisson'`` uses
        ``PoissonRegressor(alpha=1e-12, max_iter=10000)``.

    Returns
    -------
    sklearn Pipeline with a ``"preprocessor"`` and a ``"regressor"`` step.
    """
    assert method in ['lm', 'poisson'], "method must be one of 'lm' or 'poisson'"

    # Transformer applied to the numeric columns.
    ft = (preprocessing.FunctionTransformer(np.log) if log_transform
          else preprocessing.FunctionTransformer())

    if one_hot_week:
        transformers = [
            ("onehot_categorical", preprocessing.OneHotEncoder(), ["week_num"]),
            ("num_scaler", ft, pred_vars),
        ]
    else:
        transformers = [("num_scaler", ft, pred_vars + ['ca_prop'])]
    model_prep = compose.ColumnTransformer(transformers, remainder="drop")

    if method == 'lm':
        regressor = linear_model.LinearRegression()
    elif method == 'poisson':
        regressor = linear_model.PoissonRegressor(alpha=1e-12, max_iter=10000)

    pipe = pipeline.Pipeline([
        ("preprocessor", model_prep),
        ("regressor", regressor),
    ])
    return pipe
def __init__(self, verbose=False, have_cache_data=False):
    """Initialise an untrained model holder.

    Sets up the target pipeline (``log1p`` with ``_inv_log1p`` as its
    inverse, followed by ``MinMaxScaler``) and a ``TreeFeaturizer``; the
    network, channel count and ``__n`` counter start out empty/zero.
    """
    self.__net = None
    self.__verbose = verbose

    # log1p first, then rescale with min-max.
    self.__pipeline = Pipeline([
        ("log", preprocessing.FunctionTransformer(
            np.log1p, _inv_log1p, validate=True)),
        ("scale", preprocessing.MinMaxScaler()),
    ])

    self.__tree_transform = TreeFeaturizer()
    self.__have_cache_data = have_cache_data
    self.__in_channels = None
    self.__n = 0
def make_transformer(func: Callable, **kwargs):
    """Make an sklearn transformer, to use with transform() function.

    Parameters
    ----------
    func : Callable
        function to perform the transform
    **kwargs
        extra keyword arguments, passed to the transformer as ``kw_args``

    Returns
    -------
    FunctionTransformer
        transformer wrapping ``func``
    """
    return skp.FunctionTransformer(func, kw_args=kwargs)
def define_pipeline():
    """Build the feature-engineering + gradient-boosting pipeline.

    Categorical and numerical columns are processed in separate branches,
    joined by a ``FeatureUnion``, converted to a dense matrix and fed to a
    ``GradientBoostingRegressor``.
    """
    categorical = ('season', 'holiday', 'workingday', )
    # Datetime isn't numerical, but needs to be in the numeric branch
    numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity',
                 'windspeed', )

    categorical_branch = Pipeline([
        ('select_cat', fe.SelectCols(cols=categorical)),
        ('onehot', OneHotEncoder()),
    ])

    numerical_branch = Pipeline([
        ('select_num', fe.SelectCols(cols=numerical)),
        ('date', fe.DateFormatter()),
        ('drop_datetime', fe.SelectCols(cols=('datetime', 'month'),
                                        invert=True)),
        ('fix_bad_vals', fe.FillData(cols=('windspeed', 'humidity'),
                                     threshold=1)),
        ('temp', fe.ProcessNumerical(
            cols_to_square=('temp', 'atemp', 'humidity'), )),
        ('rollingweather', fe.RollingWindow(cols=('weather', ))),
        ('forecast', fe.WeatherForecast()),
        ('scale', StandardScaler()),
    ])

    # Process cat & num separately, then join back together
    union = FeatureUnion([
        ('categorical', categorical_branch),
        ('numerical', numerical_branch),
    ])

    model = Pipeline([
        ('union', union),
        ('to_dense', preprocessing.FunctionTransformer(
            lambda x: x.todense(), accept_sparse=True)),
        ('clf', GradientBoostingRegressor(n_estimators=100, random_state=2)),
    ])
    return model
def fit(self, on_engine, velocities, accelerations, *args):
    """Fit the full and the base start/stop classifiers.

    When ``on_engine`` is true everywhere, both ``self.model`` and
    ``self.base`` are set to one shared ``DefaultStartStopModel``.
    Otherwise:

    * ``self.model`` -- depth-4 decision tree with ``SelectFromModel``
      feature selection, trained on velocities, accelerations and every
      extra array in ``args``;
    * ``self.base``  -- depth-3 decision tree restricted to the first two
      columns (velocities and accelerations).

    Returns ``self``.
    """
    if on_engine.all():
        default_model = DefaultStartStopModel()
        self.base = default_model
        self.model = default_model
        return self

    X = np.column_stack((velocities, accelerations) + args)

    full_tree = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=4)
    self.model = sk_pip.Pipeline([
        ('feature_selection', sk_fsel.SelectFromModel(full_tree)),
        ('classification', full_tree),
    ])
    self.model.fit(X, on_engine)

    base_tree = sk_tree.DecisionTreeClassifier(random_state=0, max_depth=3)
    first_two_columns = sk_prep.FunctionTransformer(lambda X: X[:, :2])
    self.base = sk_pip.Pipeline([
        ('feature_selection', first_two_columns),
        ('classification', base_tree),
    ])
    self.base.fit(X, on_engine)

    return self
# Demo of common scikit-learn preprocessing and feature-selection tools on
# the iris data (`iris_x`, `iris_y` are defined earlier in the script).

# Binarisation
print("二值化: \n", preprocessing.Binarizer(threshold=3).fit_transform(iris_x))
# One-hot (dummy) encoding of the labels
print("哑编码: \n",
      preprocessing.OneHotEncoder().fit_transform(iris_y.reshape(-1, 1)))

from numpy import vstack, array, nan

# `preprocessing.Imputer` was removed in scikit-learn 0.22; its replacement
# is `sklearn.impute.SimpleImputer` (available since 0.20). Fall back to the
# old class only when the new one cannot be imported.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    Imputer = preprocessing.Imputer

# Missing-value imputation: prepend an all-NaN row and fill it in
print(
    "填充缺失值:\n",
    Imputer().fit_transform(
        vstack((array([nan, nan, nan, nan]), iris_x))))
# Polynomial feature expansion
print("多项式变化:\n", preprocessing.PolynomialFeatures().fit_transform(iris_x))

from numpy import log1p

# Custom transformation function
print("自定义转换函数:\n",
      preprocessing.FunctionTransformer(log1p).fit_transform(iris_x))

from sklearn.feature_selection import VarianceThreshold

# Variance-threshold selection
print("方差选择法:\n", VarianceThreshold(threshold=3).fit_transform(iris_x))

from sklearn.feature_selection import SelectKBest, chi2

# Chi-squared test selection
print("卡方检验:\n", SelectKBest(chi2, k=2).fit_transform(iris_x, iris_y))

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination:
# `estimator` is the base model,
# `n_features_to_select` is the number of features to keep.
print(
    "递归特征消除法:\n",
    RFE(estimator=LogisticRegression(),
        n_features_to_select=2).fit_transform(iris_x, iris_y))
# Expand `matrix` with the polynomial transformer defined earlier in the
# script, then again with a degree-3 expansion without the bias column.
result = polynomial.fit_transform(matrix)
print(result)
polynomial = preprocessing.PolynomialFeatures(degree = 3, include_bias = False)
result = polynomial.fit_transform(matrix)
print(result)

# Applying a function
matrix = np.array([[100, 200], [300, 150]])
print(matrix)

# Add 100
def intconvert(x):
    """Return ``x + 100``."""
    return x + 100

transformer = preprocessing.FunctionTransformer(intconvert)
result = transformer.transform(matrix)
print(result)

# The same function applied to one column of `data` (defined earlier in the
# script) through `.apply`.
print(data['국어'])
print(data['국어'].apply(intconvert))

import numpy as np
import pandas as pd

# Function that takes an array and returns the data lying outside the
# z-score range (3 standard deviations from the mean)
def z_score_outlier(ar) :
    threshold = 3
    # Get the mean
'numerical', Pipeline([
    ('select_num', fe.SelectCols(cols=numerical)),
    ('date', fe.DateFormatter()),
    # NOTE(review): ('datetime') is a plain string, not a 1-tuple -- confirm
    # that SelectCols accepts a bare column name here.
    ('drop_datetime', fe.SelectCols(cols=('datetime'), invert=True)),
    ('temp', fe.ProcessNumerical(cols_to_square=('temp', 'atemp', 'humidity'))),
    # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
    # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
    ('scale', StandardScaler()),
])),
])),
# Densify the joined feature matrix before it reaches the regressor.
('to_dense', preprocessing.FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
('clf', GradientBoostingRegressor(n_estimators=100, random_state=2)),
])

#Helper function to calculate root mean squared error
def get_RMSE(actual_values, predicted_values):
    """Return the root mean squared error between log(predicted + 1) and log(actual + 1).

    The squared log differences are summed and divided by
    ``len(actual_values)`` before the square root is taken.
    """
    n = len(actual_values)
    RMSE = np.sqrt(
        np.sum(
            ((np.log(predicted_values + 1) - np.log(actual_values + 1))**2) / n))
    return RMSE

#create custom scorer
def Standard_Features(X_train, mode, X_test):
    '''
    Scale the source (train) and target (test) feature matrices.

    :param X_train: type: dataFrame or 2D-array, m X n, m rows, n columns: n features
    :param mode: how the feature values are processed:
        'zscore'      - z-score both sets with statistics of X_train
        'maxmin'      - min-max scale both sets with the range of X_train
        'zscore_t'    - z-score both sets with statistics of X_test
        'maxmin_t'    - min-max scale both sets with the range of X_test
        'log'         - log(1 + x) on both sets; negative values are rejected
        'scale'       - z-score each set separately
        'maxminscale' - min-max scale each set separately
        'logscale'    - log(1 + x) on each set separately (no sign check)
    :param X_test: type: dataFrame or 2D-array, m' X n', m' rows, n' columns: n' features.
        Required for every mode.
    :return: Standard_X_train: 2d Array, m X n, m rows, n columns: n features
             Standard_X_test: 2d Array, m' X n', m' rows, n' columns: n' features
    :raises ValueError: if mode is unknown, or if mode == 'log' and either
        data set contains a negative value.
    '''
    # Each project may use its own mean/variance (or no normalisation at
    # all) because the test set may not contain the extreme values; min-max
    # scaling is offered so its effect can be compared with standardisation.
    # Changing the data distribution is considered acceptable here, since a
    # cross-project target distribution differs from the source one anyway.

    # Standardisation: z-score, mean 0 and variance 1. Suitable when the
    # minimum and maximum of an attribute are unknown, or when there are
    # outliers beyond the expected value range.
    if mode == 'zscore':
        print(
            '* Do Z-score on source & target datasets according to source ...'
        )
        zscore_scaler = preprocessing.StandardScaler(
            copy=True, with_mean=True, with_std=True)
        zscore_scaler.fit(X_train)
        X_train_zscore = zscore_scaler.transform(X_train)
        X_test_zscore = zscore_scaler.transform(X_test)
        return X_train_zscore, X_test_zscore

    # Normalisation: max-min, a linear transformation of the original data
    # that maps the result into [0, 1]. Robust to features with very small
    # variance and preserves zero entries of sparse matrices.
    elif mode == 'maxmin':
        print(
            '* Do max-min on source & target datasets according to source ...'
        )
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train_minmax = min_max_scaler.fit_transform(X_train)
        X_test_minmax = min_max_scaler.transform(X_test)
        return X_train_minmax, X_test_minmax

    # Same as 'zscore', but the statistics come from the target data.
    elif mode == 'zscore_t':
        print(
            '* Do Z-score on source & target datasets according to target ...'
        )
        zscore_scaler = preprocessing.StandardScaler(
            copy=True, with_mean=True, with_std=True)
        zscore_scaler.fit(X_test)
        X_test_zscore = zscore_scaler.transform(X_test)
        X_train_zscore = zscore_scaler.transform(X_train)
        return X_train_zscore, X_test_zscore

    # Same as 'maxmin', but the range comes from the target data.
    elif mode == 'maxmin_t':
        print(
            '* Do max-min on source and target datasets according to target ...'
        )
        min_max_scaler = preprocessing.MinMaxScaler()
        X_test_minmax = min_max_scaler.fit_transform(X_test)
        X_train_minmax = min_max_scaler.transform(X_train)
        return X_train_minmax, X_test_minmax

    elif mode == 'log':
        # Element-wise sign check. The former `X.any() >= 0` / `X.all() >= 0`
        # compared a single boolean with 0 and was therefore always true,
        # so negative inputs were never rejected.
        if not np.all(np.asarray(X_train) >= 0):
            raise ValueError('training data exists negative values')
        if not np.all(np.asarray(X_test) >= 0):
            raise ValueError('test data exists negative values')
        log_scaler = preprocessing.FunctionTransformer(
            np.log1p, validate=True)  # log1p = log(1+x)
        X_train_log = log_scaler.fit_transform(X_train)
        X_test_log = log_scaler.transform(X_test)
        return X_train_log, X_test_log

    elif mode == 'scale':
        print(
            '* Do score(mean=0, std=1) on source and target separately....'
        )
        X_train_scaled = preprocessing.scale(X_train)
        X_test_scaled = preprocessing.scale(X_test)
        return X_train_scaled, X_test_scaled

    elif mode == 'maxminscale':
        print(
            '* Do max-min on source and target separately....'
        )
        X_train_minmaxscaled = preprocessing.MinMaxScaler().fit_transform(
            X_train)
        X_test_minmaxscaled = preprocessing.MinMaxScaler().fit_transform(
            X_test)
        return X_train_minmaxscaled, X_test_minmaxscaled

    elif mode == 'logscale':
        print(
            '* Do log(x+1) on source and target separately....'
        )
        X_train_logscaled = preprocessing.FunctionTransformer(
            np.log1p, validate=True).fit_transform(X_train)  # log1p = log(1+x)
        X_test_logscaled = preprocessing.FunctionTransformer(
            np.log1p, validate=True).fit_transform(X_test)
        return X_train_logscaled, X_test_logscaled

    else:
        raise ValueError('the value of mode is wrong, please check...')
# generate some sparse data xtrain = np.random.uniform(0, 100, size=(10, 4)) target = model(xtrain.T).T xtest = np.random.uniform(0, 100, size=(10, 4)) test = model(xtest.T).T # define some model constraints equations = """ 3*b + c > -0.75 4.5*b - d > 11.0 """ var = list('abcd') equations = simplify(equations, variables=var) cf = generate_constraint(generate_solvers(equations, variables=var)) if __name__ == '__main__': # build a kernel-transformed regressor ta = pre.FunctionTransformer(func=vectorize(cf, axis=1)) tp = pre.PolynomialFeatures(degree=3) e = lin.LinearRegression() # train and score, then test and score xtrain_ = tp.fit_transform(ta.fit_transform(xtrain)) assert 1.0 == e.fit(xtrain_, target).score(xtrain_, target) xtest_ = tp.fit_transform(ta.fit_transform(xtest)) assert 1 - e.score(xtest_, test) <= 1e-2 # EOF