def prepare_data(x_train, indexx):
    # Convert non-numeric data; indexx is the ID / primary-key column of the table
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    text_ = x_train.select_dtypes(include="object")
    text = text_.columns.values.tolist()
    ids = x_train[indexx].tolist()
    x_train = x_train.drop(indexx, axis=1)

    # Label-encode every object column
    encoder = LabelEncoder()
    for i in text:
        housing_cat = x_train[i]
        housing_cat_encoded = encoder.fit_transform(housing_cat.astype(str))
        x_train = x_train.drop(i, axis=1)
        x_train[i] = housing_cat_encoded

    # Create pipeline: impute missing values with the median, then standardize
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
    x_train_arry = num_pipeline.fit_transform(x_train)
    train_x = pd.DataFrame(x_train_arry, columns=x_train.columns)
    train_x[indexx] = ids
    return train_x
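# A minimal usage sketch (hypothetical file and column names): label-encode the
# object columns, median-impute and scale the numerics, keep the Id for later joins.
import pandas as pd
df = pd.read_csv("train.csv")      # assumed input with an "Id" primary-key column
train_x = prepare_data(df, "Id")
print(train_x.head())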
def rph_graph(X, y, columns):
    # Fit a gradient-boosting model and plot partial dependence for the first three predictors
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.impute import SimpleImputer
    from sklearn.inspection import plot_partial_dependence  # scikit-learn 0.22-1.1

    my_model = GradientBoostingRegressor()
    regression_columns = columns
    my_imputer = SimpleImputer()
    X_regression = my_imputer.fit_transform(X)
    my_model.fit(X_regression, y)
    my_plots = plot_partial_dependence(
        my_model,
        X=X_regression,                    # raw predictor data
        features=[0, 1, 2],                # column numbers of the plots we want to show
        feature_names=regression_columns,  # labels on the graphs
        grid_resolution=10)                # number of values to plot on the x axis
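# Usage sketch (train_X/train_y are assumed names for already-prepared data):
rph_graph(train_X, train_y, list(train_X.columns))
# Note: plot_partial_dependence was removed in scikit-learn 1.2; the modern
# equivalent is sklearn.inspection.PartialDependenceDisplay.from_estimator,
# which takes the same estimator, X, features and feature_names arguments.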
def rph_create_output_file(model, test_data, id_column, column_to_predict, output_file_path):
    # Write model predictions for test_data to a Kaggle-style submission CSV
    import pandas as pd
    from sklearn.impute import SimpleImputer

    test_X = test_data.drop(columns=[id_column])
    my_imputer = SimpleImputer()
    test_X = my_imputer.fit_transform(test_X)
    predictions = model.predict(test_X)
    my_submission = pd.DataFrame({
        id_column: test_data[id_column],
        column_to_predict: predictions
    })
    my_submission.to_csv(output_file_path, index=False)
    print("\nCSV created")
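# Usage sketch (assumed names): `model` is any fitted regressor and test.csv a
# hypothetical test file with the same feature columns plus an "Id" column.
test_data = pd.read_csv("test.csv")
rph_create_output_file(model, test_data, "Id", "SalePrice", "submission.csv")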
def rph_cross_validation(train_X, train_y):
    # Cross-validate an XGBoost regressor, using a held-out split for early stopping
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import train_test_split, cross_val_score
    from xgboost import XGBRegressor

    train_X, test_X, train_y, test_y = train_test_split(
        train_X, train_y, test_size=0.20, random_state=1)
    my_imputer = SimpleImputer()
    train_X = my_imputer.fit_transform(train_X)
    test_X = my_imputer.transform(test_X)

    early_stopping_rounds = 30
    xgb_model = XGBRegressor(n_estimators=600, learning_rate=0.06)
    fit_params = {'early_stopping_rounds': early_stopping_rounds,
                  'eval_metric': 'mae',
                  'verbose': False,
                  'eval_set': [[test_X, test_y]]}
    xgb_cv = cross_val_score(xgb_model, train_X, train_y, cv=5,
                             scoring='neg_mean_absolute_error',
                             fit_params=fit_params)
    xgb_model.fit(train_X, train_y,
                  early_stopping_rounds=early_stopping_rounds,
                  eval_set=[(test_X, test_y)],
                  verbose=False)
    return xgb_cv, xgb_model
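# Usage sketch: cross_val_score returns *negated* MAEs under
# 'neg_mean_absolute_error', so flip the sign when reporting.
scores, fitted_model = rph_cross_validation(train_X, train_y)
print("Mean MAE across folds: %.2f" % (-scores.mean()))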
def prepare_data(x_train, method):
    # Convert the Titanic data to numeric; method == "test" keeps PassengerId for the submission
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import LabelEncoder, StandardScaler

    if method == "test":
        ids = x_train['PassengerId'].tolist()
        x_train = x_train.drop('PassengerId', axis=1)

    # Label-encode the two categorical columns
    encoder = LabelEncoder()
    housing_cat = x_train["Sex"]
    housing_cat_encoded = encoder.fit_transform(housing_cat)
    x_train = x_train.drop("Sex", axis=1)
    x_train["Sex"] = housing_cat_encoded
    housing = x_train['Embarked'].astype(str)
    housing = encoder.fit_transform(housing)
    x_train = x_train.drop('Embarked', axis=1)
    x_train['Embarked'] = housing

    # Create pipeline: impute missing values with the median, then standardize
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])
    x_train_arry = num_pipeline.fit_transform(x_train)
    train_x = pd.DataFrame(x_train_arry, columns=x_train.columns)
    if method == "test":
        train_x['PassengerId'] = ids
    return train_x
# In[60]:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

# In[61]:

import sklearn
sklearn.__version__

# In[62]:
""" # importing libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd # importing the dataset dataset = pd.read_csv("Data.csv") X = dataset.iloc[:, :-1].values y = dataset.iloc[:, -1].values # Taking care of missing data: Not needed in template from sklearn.preprocessing import SimpleImputer missingvalues = SimpleImputer(missing_values=np.nan, strategy="mean", verbose=0) missingvalues = missingvalues.fit(X[:, 1:]) X[:, 1:] = missingvalues.transform(X[:, 1:]) # Encoding Categorical Data: Not needed in template from sklearn.preprocessing import OneHotEncoder from sklearn.compose import ColumnTransformer ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough') X = np.array(ct.fit_transform(X), dtype=np.float) from sklearn.preprocessing import LabelEncoder y = LabelEncoder().fit_transform(y)
strat_test_set = loaddata.loc[test_index]

for _set in (strat_train_set, strat_test_set):
    _set.drop("income_cat", axis=1, inplace=True)

train_y = strat_train_set['median_house_value']
train_x = strat_train_set.drop('median_house_value', axis=1)
test_y = strat_test_set['median_house_value']
test_x = strat_test_set.drop('median_house_value', axis=1)

num_col = [x for x in train_x.columns.values if x != 'ocean_proximity']
cat_col = ['ocean_proximity']

# Selector, CombinedAttributesAdder, encoding and timer are project helpers defined elsewhere
with timer('PipeLine'):
    num_pipe = Pipeline([('Selector', Selector(num_col)),
                         ('imputer', SimpleImputer(strategy='median')),
                         ('attribs_adder', CombinedAttributesAdder()),
                         ('std_scaler', StandardScaler())])
    cat_pipe = Pipeline([('Encoder', encoding(cat_col))])
    union = FeatureUnion(transformer_list=[('num_pipe', num_pipe),
                                           ('cat_pipe', cat_pipe)])
    housing_x = union.fit_transform(train_x)
    housing_x_t = union.transform(test_x)

min_score = 2147483647  # sentinel: any real score will be lower
min_param = None

with timer('parameter search'):
sample_incomplete_rows

# In[70]:

housing.describe()

# In[48]:

# Let's use Scikit-Learn's SimpleImputer class to fill missing values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')

# In[49]:

# Remove the text attribute because the median can only be calculated on numerical attributes
housing_num = housing.drop('ocean_proximity', axis=1)

# In[50]:

# Fit the imputer instance to the training data
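# In[ ]:

# A minimal sketch of the truncated cell above: fitting computes the per-column
# medians (exposed as imputer.statistics_); transform then fills the NaNs with them.
imputer.fit(housing_num)
imputer.statistics_
X = imputer.transform(housing_num)   # NumPy array with missing values filled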
    def transform(self, X):
        return X[self.selected_clmns]

# In[7]:

# Let's create a pipeline with two steps: select columns, then impute missing
# numeric values of the selected columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# num_pipeline = Pipeline([
#     ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
#     ("imputer", SimpleImputer(strategy="median")),
# ])

numeric_values_pipe = Pipeline([
    ('numeric_columns', DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ('imputer_for_missed', SimpleImputer(strategy='median')),
])

# In[38]:

# We selected only the 4 numeric columns and filled their missing values with the median
num = numeric_values_pipe.fit_transform(train)

# In[9]:

# Let's select the categorical data and create an imputer for their missing values.
# We build a class to fill missing categorical data with the most frequent value.
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Learn the most frequent value of each column
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
"xgbrg__verbose": False} searchCV = GridSearchCV( my_pipeline, cv=5, param_grid=param_grid, fit_params=fit_params ) searchCV.fit(train_X, train_y) # Alternative to impute as a preprocessor: from sklearn.compose import ColumnTransformer from sklearn.preprocessing SimpleImputer, OneHotEncoder # Preprocessing for numerical data numerical_transformer = SimpleImputer(strategy='constant') # Preprocessing for categorical data categorical_transformer = Pipeline( steps=[ ('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore')) ] ) # Pipeline return a transformer piped # Bundle preprocessing for numerical and categorical data preprocessor = ColumnTransformer( transformers=[ ('num', numerical_transformer, numerical_cols), ('cat', categorical_transformer, categorical_cols)
# Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Taking care of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
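# Quick check (sketch): columns 1-2 should hold no missing values after imputation
assert not pd.isnull(X[:, 1:3]).any()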