class CustomScaler(TransformerMixin):

    def __init__(self, *args, **kwargs):
        self.scaler = StandardScaler(*args, **kwargs)
        self.cont_col_names = [
            'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
            'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
            'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
            'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageCars', 'GarageArea',
            'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

    # takes X_enc, a pandas dataframe where discrete vars are one hot encoded
    def fit(self, X, y=None):
        self.scaler.fit(X[self.cont_col_names], y)
        return self

    # takes X_enc, a pandas dataframe where discrete vars are one hot encoded
    def transform(self, X, y=None, copy=None):
        continuous_cols = self.scaler.transform(X[self.cont_col_names])
        discrete_cols = X.drop(columns=self.cont_col_names).values
        return np.concatenate([continuous_cols, discrete_cols], axis=1)

    def get_params(self, *args, **kwargs):
        return self.scaler.get_params(*args, **kwargs)
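# Usage sketch (added for illustration, not from the original source). Assumes
# the snippet's numpy/pandas/sklearn imports are in scope; the one-hot column
# names below are hypothetical stand-ins for the encoded discrete variables.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
demo_scaler = CustomScaler()
X_enc = pd.DataFrame(rng.random((5, len(demo_scaler.cont_col_names))),
                     columns=demo_scaler.cont_col_names)
X_enc['Neighborhood_A'] = 1   # fake one-hot column
X_enc['Neighborhood_B'] = 0   # fake one-hot column
X_out = demo_scaler.fit(X_enc).transform(X_enc)
print(X_out.shape)  # (5, 41): scaled continuous columns first, then the one-hot columns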
def standard_scaler(df: pd.DataFrame,
                    columns_to_scale: List[str]) -> LearnerReturnType:
    """
    Fits a standard scaler to the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to scale.
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """
    scaler = StandardScaler()
    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {'standard_scaler': {
        'standard_scaler': scaler.get_params(),
        'transformed_column': columns_to_scale}}

    return p, p(df), log
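# Usage sketch (added for illustration; assumes the snippet's own imports such
# as LearnerReturnType and learner_pred_fn_docstring are available). The tiny
# DataFrame below is hypothetical: the returned prediction function `p` reuses
# the mean/std fitted on the training frame.
train_df = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [10.0, 20.0, 30.0]})
p, scaled_train, log = standard_scaler(train_df, columns_to_scale=['x1', 'x2'])
new_df = p(pd.DataFrame({'x1': [4.0], 'x2': [40.0]}))  # scaled with the training statistics
print(log['standard_scaler']['transformed_column'])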
def trainAndEvaluate(dataset, dataset_test, n_components=40, dimReduction='PCA',
                     classifier="SVC", preproc_speaker=False):
    ## Pre processing
    if preproc_speaker:
        print("preprocessing has been computed previously")
    else:
        standard_scaler = StandardScaler()
        standard_scaler.fit(dataset['data'])
        dataset['data'] = standard_scaler.transform(dataset['data'])
        dataset_test['data'] = standard_scaler.transform(dataset_test['data'])

    # ## Dimensionality reduction
    # if dimReduction == 'LDA':
    #     dimRed = LinearDiscriminantAnalysis(n_components=n_components)
    #     dataset['data'] = dimRed.fit(dataset['data'], dataset['target']).transform(dataset['data'])
    #     dataset_test['data'] = dimRed.transform(dataset_test['data'])
    # else:
    #     if dimReduction == 'PCA':
    #         dimRed = PCA(n_components=n_components)
    #     elif dimReduction == 'FA':
    #         dimRed = FeatureAgglomeration(n_clusters=n_components)
    #     dataset['data'] = dimRed.fit_transform(dataset['data'])
    #     dataset_test['data'] = dimRed.transform(dataset_test['data'])

    ## Classifier initialisation
    if classifier == 'SVC':
        clf = SVC(C=1, class_weight='balanced', verbose=1, probability=True)
    elif classifier == 'kNN':
        clf = neighbors.KNeighborsClassifier(n_neighbors=10)
    elif classifier == 'tree':
        clf = DecisionTreeClassifier(class_weight='balanced', random_state=1)

    print("Training...")
    clf.fit(dataset['data'], dataset['target'])

    print("Predicting...")
    predicted = clf.predict(dataset_test['data'])

    report = classification_report(dataset_test['target'], predicted,
                                   target_names=dataset_test['target_names'])
    accuracy = np.mean(predicted == dataset_test['target'])
    cnf_matrix = confusion_matrix(dataset_test['target'], predicted)

    return accuracy, cnf_matrix, report
def stand():
    # Standardization
    data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
    standard = StandardScaler()
    temp = standard.fit_transform(data)
    print("Sample means:", standard.mean_)
    print("Sample variances:", standard.var_)
    print("Scaler parameters:", standard.get_params())
    print(temp)
    return None
def standard():
    """
    Method to load a zero-mean and unit-variance StandardScaler.

    RETURN: scaler
    """
    scaler = StandardScaler(copy=True)
    utils.display_get_params('StandardScaler Description', scaler.get_params())
    return scaler
class StandardScaler(FeatureTransformAlgorithm):
    r"""Implementation of feature standard scaling algorithm.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Standard Scaler'

    def __init__(self, **kwargs):
        r"""Initialize StandardScaler."""
        super(StandardScaler, self).__init__()
        self.__std_scaler = StdScaler()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__std_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transform the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """
        return self.__std_scaler.transform(x)

    def to_string(self):
        r"""User-friendly representation of the object.

        Returns:
            str: User-friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__std_scaler.get_params()))
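# Usage sketch for the wrapper above (illustrative only; assumes the niaaml
# imports used by the class, such as FeatureTransformAlgorithm and StdScaler,
# are available). `frame` is a hypothetical numeric DataFrame.
import pandas as pd

frame = pd.DataFrame({'f1': [1.0, 2.0, 3.0], 'f2': [10.0, 20.0, 30.0]})
ft = StandardScaler()       # the wrapper defined above, not sklearn's class
ft.fit(frame)
print(ft.transform(frame))  # standardized values produced by the wrapped sklearn scaler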
class ScalingImplementation(EncodedInvariantImplementation):
    """ Class for applying the scaling operation to data; only features that
    were not one-hot encoded from categorical variables are used.

    :param params: optional, dictionary with the arguments
    """

    def __init__(self, **params: Optional[dict]):
        super().__init__()
        if not params:
            # Default parameters
            self.operation = StandardScaler()
        else:
            self.operation = StandardScaler(**params)
        self.params = params

    def get_params(self):
        return self.operation.get_params()
def standard_scaler(df: pd.DataFrame,
                    columns_to_scale: List[str]) -> LearnerReturnType:
    """
    Fits a standard scaler to the dataset.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas' DataFrame with columns to scale.
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """
    scaler = StandardScaler()
    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {
        'standard_scaler': {
            'standard_scaler': scaler.get_params(),
            'transformed_column': columns_to_scale
        }
    }

    return p, p(df), log
def scale_data(trainX, testX):
    """
    Scale data 2D
    :param trainX: (array)
    :param testX: (array)
    :return:
        trainX: (array)
        testX: (array)
    """
    # remove overlap
    cut = int(trainX.shape[1] / 2)
    longX = trainX[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatTrainX = trainX.reshape(
        (trainX.shape[0] * trainX.shape[1], trainX.shape[2]))
    flatTestX = testX.reshape(
        (testX.shape[0] * testX.shape[1], testX.shape[2]))
    # standardize
    s = StandardScaler()
    # fit on training data
    s.fit(longX)
    # print("MEAN:")
    # print(s.mean_)
    # print("------------------------------------------")
    # print("VAR:")
    # print(s.var_)
    # print("------------------------------------------")
    # print("STD:")
    # print(s.scale_)
    print(s.get_params(True))
    # apply to training and test data
    longX = s.transform(longX)
    flatTrainX = s.transform(flatTrainX)
    flatTestX = s.transform(flatTestX)
    # reshape
    flatTrainX = flatTrainX.reshape(trainX.shape)
    flatTestX = flatTestX.reshape(testX.shape)
    return flatTrainX, flatTestX
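# Usage sketch with synthetic windowed data (added for illustration; the
# shapes are assumptions: samples x timesteps x channels, not from the
# original project).
import numpy as np

trainX = np.random.rand(100, 128, 9)   # 100 windows, 128 steps, 9 channels
testX = np.random.rand(20, 128, 9)
trainX_scaled, testX_scaled = scale_data(trainX, testX)
print(trainX_scaled.shape, testX_scaled.shape)  # (100, 128, 9) (20, 128, 9)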
def test_params(self):
    estimator = StandardScaler(with_mean=False)

    params = estimator.get_params()
    params.update(
        {'estimator': estimator, 'reshapes': None, 'sample_dim': None})

    # check params set in constructor
    wrapper = wrap(estimator)
    self.assertEqual(wrapper.get_params(), params)
    self.assertEqual(wrapper.with_mean, False)

    # check params set by attribute
    wrapper.with_std = False
    params.update({'with_std': False})
    self.assertEqual(wrapper.get_params(), params)

    # check params set with set_params
    wrapper.set_params(copy=False)
    params.update({'copy': False})
    self.assertEqual(wrapper.get_params(), params)
def normalize_xs(self):
    """
    Standardization of windowed 2D data
    """
    cut = int(self.x_train.shape[1] / 2)
    longX = self.x_train[:, -cut:, :]
    # flatten windows
    longX = longX.reshape(
        (longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatTrainX = self.x_train.reshape(
        (self.x_train.shape[0] * self.x_train.shape[1], self.x_train.shape[2]))
    flatTestX = self.x_test.reshape(
        (self.x_test.shape[0] * self.x_test.shape[1], self.x_test.shape[2]))
    # standardize
    s = StandardScaler()
    # fit on training data
    s.fit(longX)
    print("MEAN:")
    print(s.mean_)
    print("------------------------------------------")
    print("VAR:")
    print(s.var_)
    print("------------------------------------------")
    print("STD:")
    print(s.scale_)
    print(s.get_params(True))
    # apply to training and test data
    longX = s.transform(longX)
    flatTrainX = s.transform(flatTrainX)
    flatTestX = s.transform(flatTestX)
    # reshape
    self.x_train = flatTrainX.reshape(self.x_train.shape)
    self.x_test = flatTestX.reshape(self.x_test.shape)
def test_params(self):
    estimator = StandardScaler(with_mean=False)

    params = estimator.get_params()
    params.update({
        "estimator": estimator,
        "reshapes": None,
        "sample_dim": None
    })

    # check params set in constructor
    wrapper = wrap(estimator)
    self.assertEqual(wrapper.get_params(), params)
    self.assertEqual(wrapper.with_mean, False)

    # check params set by attribute
    wrapper.with_std = False
    params.update({"with_std": False})
    self.assertEqual(wrapper.get_params(), params)

    # check params set with set_params
    wrapper.set_params(copy=False)
    params.update({"copy": False})
    self.assertEqual(wrapper.get_params(), params)
                                    spatial_size=FeatureVectorConfig.SPATIALSIZE,
                                    hist_feat=FeatureVectorConfig.HISTOGRAMFEATURES,
                                    hist_bins=FeatureVectorConfig.HISTOGRAMBINS,
                                    hog_feat=FeatureVectorConfig.HOGFEATURES)
t2 = time.time()
print(round(t2 - t, 2), 'Seconds to extract HOG features...')

# Create an array stack of feature vectors
X = np.vstack((car_features, notcar_features)).astype(np.float64)
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
scaled_X = X_scaler.transform(X)

# save the scaler
print('X_scaler: ', X_scaler,
      ", get_params:", X_scaler.get_params(deep=True),
      ", mean:", X_scaler.mean_,
      ", std:", X_scaler.scale_)
print('saving scaler to: ', SCALERFILENAME)
# SaveAndRestoreClassifier.saveScalerFitX(X, SCALERFILENAME)
SaveAndRestoreClassifier.saveScaler(X_scaler, SCALERFILENAME)

# Define the labels vector
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))

# Split up data into randomized training and test sets
rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.2, random_state=rand_state)

print('Using:', FeatureVectorConfig.ORIENTATIONBINS, 'orientations',
      FeatureVectorConfig.PIXELSPERCELL, 'pixels per cell and',
      FeatureVectorConfig.CELLSPERBLOCK, 'cells per block')
print('Feature vector length:', len(X_train[0]))
def standardization_speaker(df_train, df_test):
    # Standardize features per speaker: fit a scaler on each speaker's
    # training rows and apply it to that speaker's test rows.
    non_feature_cols = ['name', 'target_names', 'language']
    scaled_train, scaled_test = [], []
    column_names = None

    for speaker in ("Sp1", "Sp2", "Sp3"):
        standard_scaler = StandardScaler()

        train_features = df_train[df_train.target_names == speaker].drop(columns=non_feature_cols)
        if column_names is None:
            column_names = train_features.columns.values.tolist()
        scaled_train.append(pd.DataFrame(
            data=standard_scaler.fit_transform(train_features),
            columns=column_names))

        test_features = df_test[df_test.target_names == speaker].drop(columns=non_feature_cols)
        scaled_test.append(pd.DataFrame(
            data=standard_scaler.transform(test_features),
            columns=column_names))

    # concat the per-speaker blocks and reindex
    df_train_fin = pd.concat(scaled_train)
    df_test_fin = pd.concat(scaled_test)
    df_train_fin.index = range(df_train_fin.shape[0])
    df_test_fin.index = range(df_test_fin.shape[0])

    # concat with final (non-feature) columns
    df_train_fin = pd.concat([df_train_fin,
                              df_train.loc[:, df_train.columns == 'language'],
                              df_train.loc[:, df_train.columns == 'name'],
                              df_train.loc[:, df_train.columns == 'target_names']], axis=1)
    df_test_fin = pd.concat([df_test_fin,
                             df_test.loc[:, df_test.columns == 'language'],
                             df_test.loc[:, df_test.columns == 'name'],
                             df_test.loc[:, df_test.columns == 'target_names']], axis=1)

    # randomize
    df_train = df_train_fin.sample(frac=1, random_state=1)
    df_test = df_test_fin.sample(frac=1, random_state=1)

    return df_train, df_test
scaler = StandardScaler()
means = np.mean(X_train)
std = np.std(X_train)
print(means[0])
scaler.mean_ = np.zeros(len(means))
scaler.scale_ = np.ones(len(means))
for i in range(len(means)):
    scaler.mean_[i] = means[i]
    scaler.scale_[i] = std[i]
print(scaler.mean_)
# scaler.mean_ =
# X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(scaler.get_params(deep=True))
print(scaler.mean_)
print(scaler.scale_)
sys.exit()

# Let's retrain a new model on the first subset, called the **training set**:

# In[15]:

from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.tree import DecisionTreeClassifier as DC

dt = DC(max_depth=3, min_samples_leaf=0.05 * len(X_train))
abc = ABC(dt, algorithm='SAMME', n_estimators=8, learning_rate=0.5)
def forecast_call_ml(zipcode):
    # url = "https://api.worldweatheronline.com/premium/v1/weather.ashx?"
    zipcode = zipcode
    response = requests.get(
        f"{url}key={lw_key}&q={zipcode}&num_of_days=7&tp=24&mca=no&aqi=yes&format=json"
    ).json()
    response = response["data"]

    weather_dict = {
        "Dates": [],
        "Cloudcover": [],
        "Humidity": [],
        "PrecipInch": [],
        "Pressure": [],
        "FeelsLike": [],
        "HeatIndex": [],
        "MaxTemp": [],
        "MinTemp": [],
        "SunHours": [],
        "UVIndex": [],
    }

    days = range(len(response["weather"]))
    weather_dict["Dates"] = [response["weather"][i]["date"] for i in days]
    weather_dict["Cloudcover"] = [response["weather"][i]["hourly"][0]["cloudcover"] for i in days]
    weather_dict["Humidity"] = [response["weather"][i]["hourly"][0]["humidity"] for i in days]
    weather_dict["PrecipInch"] = [response["weather"][i]["hourly"][0]["precipInches"] for i in days]
    weather_dict["Pressure"] = [response["weather"][i]["hourly"][0]["pressure"] for i in days]
    weather_dict["FeelsLike"] = [response["weather"][i]["hourly"][0]["FeelsLikeF"] for i in days]
    weather_dict["HeatIndex"] = [response["weather"][i]["hourly"][0]["HeatIndexF"] for i in days]
    weather_dict["MaxTemp"] = [response["weather"][i]["maxtempF"] for i in days]
    weather_dict["MinTemp"] = [response["weather"][i]["mintempF"] for i in days]
    weather_dict["SunHours"] = [response["weather"][i]["sunHour"] for i in days]
    weather_dict["UVIndex"] = [response["weather"][i]["hourly"][0]["uvIndex"] for i in days]

    weather_df = pd.DataFrame.from_dict(weather_dict, orient='index').transpose()
    weather_df = weather_df.apply(pd.to_numeric, errors='ignore')
    weather_df["TempDelta"] = weather_df.MaxTemp - weather_df.MinTemp
    weather_df["BarChange"] = weather_df["Pressure"].pct_change()
    weather_df["HeatChange"] = weather_df["HeatIndex"].pct_change()
    weather_df["HumChange"] = weather_df["Humidity"].pct_change()
    weather_df = weather_df.iloc[1:]
    new_migraine_df = weather_df.drop("Dates", axis=1)
    forecast_data = json.loads(weather_df.to_json(orient="records"))

    # Pull data from MongoDB
    collection = mongo.db.history
    history_df = pd.DataFrame(list(collection.find()))

    # Pre-processing of data
    hist_ml_df = history_df.drop(columns=["Dates", "_id", "index"])

    # Assign X (data) and y (target)
    X = hist_ml_df.drop("Migraine", axis=1)
    y = hist_ml_df["Migraine"]
    print(X.shape, y.shape)

    # Split our data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Fit on the training data, using StandardScaler
    X_scaler = StandardScaler().fit(X_train)
    X_scaler.get_params()
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    print(f"y_train value counts: {y_train.value_counts()}")
    print(f"y_test value counts: {y_test.value_counts()}")

    # Model creation
    model = SVC(kernel="linear")

    # Create the GridSearch estimator along with a parameter object containing the values to adjust
    param_grid = {'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]}
    grid = GridSearchCV(model, param_grid, verbose=3)

    # Fit the model using the grid search estimator. This will take the SVC model
    # and try each combination of parameters.
    grid.fit(X_train_scaled, y_train)

    # List the best parameters for this dataset
    print(grid.best_params_)

    # List the best score
    print(grid.best_score_)
    # print(grid.score(X_test_scaled, y_test))

    # Use the hypertuned model to make predictions
    predictions = grid.predict(X_test_scaled)

    # Run the model on forecast data to formulate predictions
    X_new_scaled = X_scaler.transform(new_migraine_df)
    forecast_predictions = grid.predict(X_new_scaled)
    print(f"forecast_predictions: {forecast_predictions}")

    lists = forecast_predictions.tolist()
    json_str = json.dumps(lists)
    print("Predictions inserted")
    print(f"Forecast_Data: {forecast_data}")

    # Render the results template and pass it the forecast data and predictions
    # return (f"We did it! Machine learning achieved! {forecast_data} {json_str}")
    return render_template("results_index.html",
                           forecast_predictions=json_str,
                           forecast_data=forecast_data)
# Reference: https://www.cnblogs.com/cola-1998/p/10218276.html
# Reference: https://blog.csdn.net/weixin_39175124/article/details/79463993
# Reference: https://blog.csdn.net/onthewaygogoing/article/details/79871559
from sklearn.preprocessing import StandardScaler

data = [[-1, 0], [1, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
scaler.fit(data)
print(scaler.mean_)   # per-feature mean
print(scaler.scale_)  # per-feature standard deviation

# =====================================================================
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # suppress a data-format warning

x_train = np.arange(10).reshape(5, 2)
x_test = np.arange(3, 7).reshape(2, 2)
y = [1, 0, 0, 0, 1]

# Instantiate StandardScaler with its default parameters. If copy=False, the
# standardized values overwrite the originals in place. For sparse CSR or CSC
# matrices, with_mean must be set to False, otherwise densifying can exhaust memory.
ss = StandardScaler(copy=True, with_mean=True, with_std=True)
print(x_train, x_test)

# fit_transform(x_train) is equivalent to ss.fit(x_train).transform(x_train):
# fit on x_train, then standardize it to zero mean and unit standard deviation.
z = ss.fit_transform(x_train)
# fit() estimates the mean, variance, etc.; its second argument is y (label data) and defaults to None.
w = ss.fit(x_train)
print(ss.n_samples_seen_, ss.mean_, ss.var_, ss.scale_)
# Attribute meanings: n_samples_seen_ is the number of samples, mean_ the per-feature mean,
# var_ the per-feature variance, scale_ the per-feature standard deviation.

x_train = w.transform(x_train)
# Standardize the test set with the parameters fitted on the training set;
# this assumes the training sample is representative enough.
x_test = w.transform(x_test)
print(z)
print(x_train, x_test)  # transformed training and test data
# If the original data is far from normally distributed, standardization may not work well.

print(ss.get_params(deep=True))  # returns the StandardScaler's configuration parameters
print(ss.inverse_transform(x_test, copy=True))  # the scaler stores its fitted parameters and can invert the transform
def scaling(data):
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    dict_labels = dict()
    dict_labels["scaler"] = scaler.get_params(deep=True)
    return data, dict_labels
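# Usage sketch for scaling() on a small synthetic matrix (added for
# illustration; the input array is hypothetical, not from the original project).
import numpy as np

X = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
X_scaled, labels = scaling(X)
print(X_scaled.mean(axis=0))  # roughly zero per column after standardization
print(labels["scaler"])       # the scaler's parameter dict, e.g. {'copy': True, ...}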
def test_set_params(self):
    scaler = StandardScaler()
    wrapper = SKLearnWrapper(module=scaler)
    self.assertEqual(scaler.get_params()["with_mean"], True)
    wrapper.set_params(with_mean=False)
    self.assertEqual(scaler.get_params()["with_mean"], False)
def test_get_params(self):
    scaler = StandardScaler()
    wrapper = SKLearnWrapper(module=scaler)
    self.assertEqual(wrapper.get_params(), scaler.get_params())
#!/usr/bin/env python
# encoding: utf-8

"""
@author: payneLi
@time: 2018-07-11 16:05
@email: [email protected]
"""

from sklearn.preprocessing import StandardScaler

data = [[1., -1., 3.],
        [2., 4., 2.],
        [4., 6., -1.]]

ss = StandardScaler()
target = ss.fit_transform(data)

mean = ss.mean_
variance = ss.var_
params = ss.get_params()

print("target:", target,
      "\nmean:", mean,
      "\nvariance:", variance,
      "\nparams:", params)
def parameter_runs(regressions, n_comp=60):
    start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    df = pandas.read_csv("training_set_ltv.csv")
    # df = df[df["bookings_pre_first_ride"] < 2]
    scales = ["StandardScaler"]  # , "MinMax", "Robust"]
    tf = data_prep(df)
    print(tf["non_cancelled_rides_after_first"].value_counts())
    class_names = ["return" if x > 0 else "one-time"
                   for x in tf["non_cancelled_rides_after_first"]]

    tf_train, tf_train2, tf_test = data_cleanup.train_validate_test_split(
        tf, train_percent=0.8, validate_percent=0.1, seed=20)

    # X_train, X_test, y_train, y_test, X_columns = data_cleanup.create_test_and_train_data(
    #     tf, "non_cancelled_rides_after_first")
    X_train, y_train, X_columns = data_cleanup.split_xyc(tf_train, "non_cancelled_rides_after_first")
    X_train2, y_train2, X_columns = data_cleanup.split_xyc(tf_train2, "non_cancelled_rides_after_first")
    X_test, y_test, X_columns = data_cleanup.split_xyc(tf_test, "non_cancelled_rides_after_first")

    scale = "StandardScaler"
    if scale == "StandardScaler":
        scaler = StandardScaler()
    elif scale == "MinMax":
        scaler = MinMaxScaler()
    elif scale == "Robust":
        scaler = RobustScaler()

    scaler.fit(X_train)  # Don't cheat - fit only on training data
    print(scaler.get_params())
    pickle.dump(scaler, open("data_scaler.pkl", "wb"))
    scaler_filename = "scaler.save"
    joblib.dump(scaler, scaler_filename)

    # And now to load...
    # print(set(y_train))
    scaler = joblib.load(scaler_filename)
    X_train = scaler.transform(X_train)

    # pca = PCA(n_components=n_comp, svd_solver='full')
    # pca.fit(X_train)
    # X_train = pca.transform(X_train)
    # X_test = pca.transform(scaler.transform(X_test))
    # X_train2 = pca.transform(scaler.transform(X_train2))

    selector = SelectKBest(k=kbest).fit(X_train, y_train)
    X_train = selector.transform(X_train)
    X_train2 = selector.transform(X_train2)
    X_test = selector.transform(X_test)
    # X_columns = X_columns[selector.get_support()]

    # create temporary dataframes to add predictions to
    X_train2_t = pandas.DataFrame(X_train2)  # , columns=X_columns)
    X_test_t = pandas.DataFrame(X_test)  # , columns=X_columns)

    for estimator_conf in regressions:
        input_name = "with_cc_"
        suffix = "_" + estimator_conf['name'].replace(" ", "_") + "_" + scale
        output_name = input_name + suffix
        with open("test_log_" + output_name + ".txt", "w") as test_log:
            print(output_name)
            # print(X_train)
            # print(len(X_train), len(X_train[0]))
            # print(len(X_test), len(X_test[0]), len(y_test))
            a = estimator_conf['instance'].fit(X_train, y_train)
            test_log.write(estimator_conf['name'] + "\n")
            output_name = input_name + "_" + estimator_conf['name'].replace(" ", "_")
            pickle.dump(a, open(output_name + ".pkl", "wb"))
            # classification_visualizer(a, X_test, y_test, output_name)
            test_log.write("score \t " + str(a.score(X_test, y_test)) + "\n")

            test_log.write("train performance\n")
            cnf_matrix = confusion_matrix(y_train, a.predict(X_train))
            test_log.write(str(cnf_matrix) + "\n")
            test_log.write(str(cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]) + "\n")

            test_log.write("test performance\n")
            cnf_matrix = confusion_matrix(y_test, a.predict(X_test))
            test_log.write(str(cnf_matrix) + "\n")
            test_log.write(str(cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]) + "\n")

            training_manager.add_values_to_table(
                "run_results",
                [(output_name,
                  start_time,                    # what time it was
                  str(a.score(X_test, y_test)),  # classification score
                  int(cnf_matrix[0][0]),
                  int(cnf_matrix[0][1]),
                  int(cnf_matrix[1][0]),
                  int(cnf_matrix[1][1]))],
                conn)

            np.set_printoptions(precision=2)

            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names,
                                  title='Confusion matrix, without normalization')
            plt.savefig("cnf_m_" + output_name + ".pdf")

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig("cnf_m_normed_" + output_name + ".pdf")

            # print(len(y_proba(a, X_train)))
            # print(len(X_train))
            # print(type(X_train))
            # X_train["prediction_1"] = pandas.Series(y_proba(a, X_train))
            # X_test["prediction_" + output_name] = y_proba(a, X_test)
            X_train2_t[output_name + "_prediction"] = y_proba(a, X_train2)
            X_test_t[output_name + "_prediction"] = y_proba(a, X_test)
            # y_test_dict[output_name] = y_proba(a, X_test)

    good_cols = ["bookings_pre_first_ride"]
    good_cols = X_columns
    print(len(X_train2[0]))
    X_train2 = clean_df(X_train2_t, good_cols)
    print(len(X_train2.columns))
    X_test = clean_df(X_test_t, good_cols)

    for estimator_conf in regressions:
        input_name = "second_with_email_"
        suffix = "_" + estimator_conf['name'].replace(" ", "_") + "_" + scale + "_pca_" + str(n_comp)
        output_name = input_name + suffix
        with open("test_log_" + output_name + ".txt", "w") as test_log:
            a = estimator_conf['instance'].fit(X_train2, y_train2)
            test_log.write(estimator_conf['name'] + "\n")
            output_name = input_name + "_" + estimator_conf['name'].replace(" ", "_")
            pickle.dump(a, open(output_name + ".pkl", "wb"))
            classification_visualizer(a, X_test, y_test, output_name)
            test_log.write("score \t " + str(a.score(X_test, y_test)) + "\n")

            test_log.write("train performance\n")
            cnf_matrix = confusion_matrix(y_train2, a.predict(X_train2))
            test_log.write(str(cnf_matrix) + "\n")
            test_log.write(str(cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]) + "\n")

            test_log.write("test performance\n")
            cnf_matrix = confusion_matrix(y_test, a.predict(X_test))
            test_log.write(str(cnf_matrix) + "\n")
            test_log.write(str(cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]) + "\n")

            training_manager.add_values_to_table(
                "run_results",
                [(output_name,
                  start_time,                    # what time it was
                  str(a.score(X_test, y_test)),  # classification score
                  int(cnf_matrix[0][0]),
                  int(cnf_matrix[0][1]),
                  int(cnf_matrix[1][0]),
                  int(cnf_matrix[1][1]))],
                conn)

            np.set_printoptions(precision=2)

            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names,
                                  title='Confusion matrix, without normalization')
            plt.savefig("cnf_m_" + output_name + ".pdf")

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig("cnf_m_normed_" + output_name + ".pdf")