def data_prepartion():
    """Assemble the preprocessing pipeline for the salary dataset.

    Numeric columns are imputed then standardized; categorical columns are
    imputed then one-hot encoded; the two branches are concatenated by a
    FeatureUnion.  (Function-name typo 'prepartion' is kept: callers use it.)

    Returns:
        FeatureUnion: unfitted pipeline producing the full feature matrix.
    """
    numeric_cols = ["yearsExperience", "milesFromMetropolis"]
    categorical_cols = ["companyId", "jobType", "degree", "major", "industry"]

    numeric_branch = Pipeline([
        ('imputer', DataFrameImputer(numeric_cols)),
        ('std_scaler', StandardScaler()),
    ])
    categorical_branch = Pipeline([
        ('imputer', DataFrameImputer(categorical_cols)),
        ('label_binarizer', OneHotEncoder()),
    ])

    return FeatureUnion(transformer_list=[
        ("num_pipeline", numeric_branch),
        ("cat_pipeline", categorical_branch),
    ])
def transform_data(self, housing_data):
    """Build (but do not fit) the preprocessing pipelines for the housing data.

    Splits the columns of `housing_data` (minus the target column
    'median_house_value') into numeric and object-dtype groups and stores
    `num_pipeline`, `cat_pipeline` and the combined `full_pipeline` on self.

    Args:
        housing_data: DataFrame containing predictors and the
            'median_house_value' target column.
    """
    data = housing_data.drop('median_house_value', axis=1)
    self.housing_num = data.select_dtypes(include=[np.number])
    self.num_attribs = list(self.housing_num)
    # FIX: `np.object` was deprecated and removed in NumPy 1.24; the builtin
    # `object` selects the same object-dtype (string) columns.
    self.cat_attribs = list(data.select_dtypes(include=[object]))
    self.num_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        # NOTE(review): step label 'std_caller' looks like a typo for
        # 'std_scaler', but it is kept — callers may index named_steps by it.
        ('std_caller', StandardScaler()),
    ])
    self.cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])
    self.full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", self.num_pipeline),
        ("cat_pipeline", self.cat_pipeline),
    ])
# NOTE(review): this chunk begins mid-function — the enclosing `def`
# (presumably `load_titanic_data(fileName)`, judging by the calls below)
# lies before this view; the dangling `return` belongs to it.
    return pd.read_csv('datasets/'+fileName)

# Load the Titanic train/test splits from the datasets/ directory.
trainData = load_titanic_data('train.csv')
testData = load_titanic_data('test.csv')

# Numeric branch: pick the numeric columns and median-impute missing values.
num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", Imputer(strategy="median"))
])
num_pipeline.fit_transform(trainData)  # fitted here; result discarded

# Categorical branch: most-frequent imputation, then one-hot encoding (dense).
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])
cat_pipeline.fit_transform(trainData)  # fitted here; result discarded

# Concatenate both branches into one feature matrix.
preprocessed_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])
train_data_preprocessed = preprocessed_pipeline.fit_transform(trainData)
train_data_labels = trainData["Survived"]

# Fit a support-vector classifier on the preprocessed training data.
svm_classifier = SVC()
svm_classifier.fit(train_data_preprocessed, train_data_labels)
# Encode the categorical column as ordinal integer codes.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]  # notebook display expression (no effect in a script)

# In[62]:
ordinal_encoder.categories_  # notebook display: the learned category lists

# NOTE: earlier book editions used `LabelBinarizer`/`CategoricalEncoder`;
# `OneHotEncoder` is now preferred.  Pre-0.20 scikit-learn's OneHotEncoder
# only handled integer inputs, hence the import from the local
# `future_encoders.py` shim (see scikit-learn PR #10521); from 0.20 on it
# can be imported from `sklearn.preprocessing` directly.
# In[63]:
from future_encoders import OneHotEncoder
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot  # notebook display: sparse one-hot matrix

# By default OneHotEncoder returns a sparse array; `toarray()` densifies it.
# In[64]:
housing_cat_1hot.toarray()

# Alternatively, pass `sparse=False` to get a dense array directly.
# In[65]:
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
########################################################################################################################
# Preprocessing pipeline: numeric columns are imputed, augmented with the
# combined attributes, and scaled; the single categorical column is one-hot
# encoded; both branches are then concatenated.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

# Stitch the two branches side by side into one feature matrix.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)

########################################################################################################################
# Grid search on the data
######################################### grid search CV
# Work on a 99-row slice (rows 1..99) to keep the search fast.
dataExtract = housing_prepared[1:100]
labelExtract = housing_labels[1:100]
# Scale the (already imputed/augmented) numeric frame.
num_pipeline = Pipeline([('std_scalar', StandardScaler())])
housing_num_tr = num_pipeline.fit_transform(housing_tr)
# NOTE(review): the triple-quoted markers below are unbalanced within this
# view — this region appears to be commented-out code (copying the derived
# columns back into `housing`), but the exact string boundaries cannot be
# confirmed from here; verify against the original file.
'''
#%%
'''
housing['total_bedrooms'] = housing_tr['total_bedrooms']
housing['rooms_per_household'] = housing_tr['rooms_per_household']
housing['population_per_household'] = housing_tr['population_per_household']
housing['bedrooms_per_room'] = housing_tr['bedrooms_per_room']
'''
# All columns except the categorical one are numeric attributes.
num_attribs = list(housing)
num_attribs.remove('ocean_proximity')
cat_attribs = ['ocean_proximity']
# ColumnTransformer routes numeric columns through num_pipeline and
# one-hot encodes the categorical column.
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attribs),
                                   ('cat', OneHotEncoder(), cat_attribs)])
housing_prepared = full_pipeline.fit_transform(housing)
#%% Select and train a model
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
#%%
# Training-set error of the linear baseline.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
#%%
lin_mae = mean_absolute_error(housing_labels, housing_predictions)
#%%
#Taking care of missing data from sklearn.preprocessing import Imputer imputer=Imputer(strategy="median") #We want to replace each attribute's missing values with the median of that attribute housing_num=housing.drop("ocean_proximity",axis=1) #Copy withouth the text attribute imputer.fit(housing_num) # imputer.statistics_ #Shows the results (median for each attribute) X=imputer.transform(housing_num) #Transform the training set by replacing the missing values by the learned values housing_tr=pd.DataFrame(X,columns=housing_num.columns)#Convert the array to a DataFrame #Handling Text and Categorical Attributes housing_cat=housing["ocean_proximity"] #Categorical attribute. It is a serie housing_cat=pd.DataFrame(data=housing_cat) #DataFrame converts from serie --> DataFrame (includes the index) from future_encoders import OneHotEncoder #This function assign 1 for when the instance has that category cat_encoder=OneHotEncoder(sparse=False) housing_cat_1hot=cat_encoder.fit_transform(housing_cat) #Then, create a sparse matrix with the location of nonzeros. housing_cat_1hot cat_encoder.categories_ #Custom Transformer from sklearn.base import BaseEstimator,TransformerMixin rooms_ix,bedrooms_ix,population_ix,household_ix=3,4,5,6 #Columns (location) of each attribute class CombinedAttributesAdder(BaseEstimator,TransformerMixin): #The New class is using two base classes def __init__(self,add_bedrooms_per_room=True): #_init_ is the constructor for the class self.add_bedrooms_per_room=add_bedrooms_per_room def fit(self,X,y=None): return self #Nothing else to do
# Drop the stratification helper column from both splits, now that the
# stratified sampling is done.
for set_ in (train_set, test_set):
    set_.drop("income_cat", axis=1, inplace=True)

# Separate the predictors from the target.
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()

# All columns except 'ocean_proximity' are numeric.
num_attribs = list(housing.drop("ocean_proximity", axis=1))
cat_attribs = ["ocean_proximity"]

# Numeric branch: median imputation, derived attributes, standardization.
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Route numeric columns through num_pipeline, one-hot encode the rest.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)
print(housing_prepared)

# Baseline: linear regression, evaluated on the training set itself.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
print("RMSE: ", np.sqrt(lin_mse))
# Create attributes for indexing
num_attribs = list(housing.drop('ocean_proximity', axis=1))
cat_attribs = ['ocean_proximity']

# Build pipeline for pre-processing
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    # FIX: was `CombinedAttributesAddr()` — a NameError at runtime; the
    # transformer is defined as `CombinedAttributesAdder` elsewhere in this
    # file and is used under that name by the sibling pipelines.
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False)),  # Rem sparse see ln 19
])
full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

# Process data with pipeline
housing_prepared = full_pipeline.fit_transform(housing)

## Train some Models
# Declare instance for a support vector machine
svm_reg = SVR()
if __name__ == "__main__":
    # Load the dataset.
    df = load_data("Data/data.json")

    # Split the predictors from the target score.
    features = df.drop("score", axis=1)
    y = df["score"].copy()

    # Categorical columns are fixed; everything else is numeric.
    cat_attribs = ["mood", "weather", "activity"]
    numeric_values = features.drop(cat_attribs, axis=1)  # copy without the categorical columns
    num_attribs = list(numeric_values)

    # Numeric branch: select, median-impute, then standardize.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),  # project-local transformer
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    # Route numeric columns through the pipeline, one-hot encode the rest.
    full_pipeline = ColumnTransformer([
        ("num_pipline", num_pipeline, num_attribs),
        ("cat_pipline", OneHotEncoder(), cat_attribs),
    ])

    X = full_pipeline.fit_transform(features)

    # Fit a random-forest regressor on the prepared features.
    forest_reg = RandomForestRegressor()
    forest_reg.fit(X, y)
def transform(self, X, y=None): return X[self.attributes_names].values # pipeline to transform numeric data num_pipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)), #num col ('imputer', Imputer(strategy="median")), #missing value ('attribs_adder', AddAttributes()), #customize add col ('std_scaler', StandardScaler()), #feature scaling ]) #the first transformer Imputer: input is dataframe, output is numpy array #housing_num_tr = num_pipeline.fit_transform(housing_num) #pipeline to transform text data from future_encoders import OneHotEncoder, OrdinalEncoder cat_pipeline = Pipeline([ ('selector', DataFrameSelector(cat_attribs)), #text col ('cat_pipline', OneHotEncoder(sparse=False)), #turn text to num ]) #join the features by two pipeline full_pipeline = FeatureUnion(transformer_list=[ ('num_pipeline', num_pipeline), ('cat_pipeline', cat_pipeline), ]) #housing_prepared = cat_pipeline.fit_transform(housing) housing_prepared = num_pipeline.fit_transform(housing) #encoder=OrdinalEncoder() #OneHotEncoder(sparse=False) #b = housing[cat_attribs] #a = encoder.fit_transform(b) """ ------------ Train and Evaluate ------------------------- 4. Training 4.1 LinearRegression(): input: numpy arrary or dataframe, output: np array