Beispiel #1
0
# Fit an ordinal encoder on the categorical data and encode it in a single call.
# NOTE(review): `OrdinalEncoder` and `housing_cat` are defined outside this excerpt.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
# Bare expression: notebook cell output previewing the first 10 encoded rows.
housing_cat_encoded[:10]

# In[62]:

# Bare expression: notebook cell output showing the fitted encoder's `categories_`.
ordinal_encoder.categories_

# **Warning**: earlier versions of the book used the `LabelBinarizer` or
# `CategoricalEncoder` classes to convert each categorical value to a one-hot
# vector. It is now preferable to use the `OneHotEncoder` class. Right now it
# can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will
# also handle string categorical inputs (see
# [PR #10521](https://github.com/scikit-learn/scikit-learn/issues/10521)).
# So for now we import it from `future_encoders.py`, but when Scikit-Learn 0.20
# is released, you can import it from `sklearn.preprocessing` instead:

# In[63]:

from future_encoders import OneHotEncoder

# One-hot encode the categorical data using the encoder's default settings.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# Bare expression: notebook cell output showing the encoded result.
housing_cat_1hot

# By default, the `OneHotEncoder` class returns a sparse array, but we can convert it to a dense array if needed by calling the `toarray()` method:

# In[64]:

# Bare expression: notebook cell output showing the dense version of the result.
housing_cat_1hot.toarray()

# Alternatively, you can set `sparse=False` when creating the `OneHotEncoder`:

# In[65]:

# Rebinds `cat_encoder` and `housing_cat_1hot` from the previous cells; with
# `sparse=False` no separate `toarray()` call is made on the result.
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
# Bare expression: notebook cell output showing the encoded result.
housing_cat_1hot
Beispiel #2
0
# Taking care of missing data
# NOTE(review): `Imputer` was removed from `sklearn.preprocessing` in later
# scikit-learn releases (its replacement is `sklearn.impute.SimpleImputer`).
# The import is kept as-is because this file appears to target a pre-0.20
# scikit-learn (it still imports from `future_encoders`) - confirm before upgrading.
from sklearn.preprocessing import Imputer

# Replace each attribute's missing values with the median of that attribute.
imputer = Imputer(strategy="median")
# Work on a copy without the text attribute.
housing_num = housing.drop("ocean_proximity", axis=1)
# Learn the statistic (median) of every remaining attribute.
imputer.fit(housing_num)
# Bare expression: notebook cell output showing the learned median of each attribute.
imputer.statistics_
# Transform the training set by replacing the missing values with the learned values.
X = imputer.transform(housing_num)
# Convert the array back to a DataFrame. `index=` keeps the original row labels;
# without it pandas assigns a fresh 0..n-1 index, so the rows would no longer
# line up with `housing` by label.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

# Handling text and categorical attributes
# Pick the categorical column (a Series) and wrap it in a one-column DataFrame;
# the Series' index is carried over.
housing_cat = pd.DataFrame(data=housing["ocean_proximity"])

from future_encoders import OneHotEncoder

# One-hot encoding: assigns 1 where the instance has a given category.
# NOTE(review): `sparse=False` is requested, so presumably a dense array comes
# back here rather than a sparse matrix - confirm against `future_encoders`.
cat_encoder = OneHotEncoder(sparse=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot  # bare expression: notebook cell output
cat_encoder.categories_  # bare expression: the fitted encoder's categories


#Custom Transformer
from sklearn.base import BaseEstimator,TransformerMixin

# Column positions (locations) of the rooms, bedrooms, population and
# household attributes.
# NOTE(review): presumably these index into the array handed to
# CombinedAttributesAdder.transform - confirm against that method.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
household_ix = 6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):                             #The New class is using two base classes
    def __init__(self,add_bedrooms_per_room=True):                                           #_init_ is the constructor for the class
        self.add_bedrooms_per_room=add_bedrooms_per_room
    def fit(self,X,y=None):
        return self                                                                        #Nothing else to do
    def transform(self,X,y=None):                                                          #This function does the combination of parameters as it was done before