Beispiel #1
0
    def test_no_parent(self):
        """
        Check that the nested encoder matches the m-estimate encoder.

        With ``m_prior`` and ``m_parent`` both set to 0, the frame produced
        by ``NestedTargetEncoder`` is expected to equal the frame produced
        by ``MEstimateEncoder`` with ``m=0`` on the same data.
        """
        nested_encoder = sktools.NestedTargetEncoder(
            cols=self.col,
            feature_mapping={"col_1": self.parent_col},
            m_prior=0,
            m_parent=0,
        )
        reference_encoder = MEstimateEncoder(cols=self.col, m=0)

        # Fit and encode with each encoder on the same fixture data.
        nested_output = nested_encoder.fit_transform(self.X, self.y)
        reference_output = reference_encoder.fit_transform(self.X, self.y)

        pd.testing.assert_frame_equal(nested_output, reference_output)
Beispiel #2
0
 def MEstimate_Encoding(self,
                        m: float = 1.0,
                        sigma: float = 0.05,
                        randomized: bool = False):
     """
     Store an ``MEstimateEncoder`` for ``self.cols`` on ``self.encoder``.

     M-estimate encoding is a simplified version of target encoding.

     :param m: forwarded unchanged as the encoder's ``m`` argument
     :param sigma: forwarded unchanged as the encoder's ``sigma`` argument
     :param randomized: forwarded unchanged as the encoder's
         ``randomized`` argument
     :return: nothing; the encoder is kept on ``self.encoder``
     """
     encoder_options = {
         "cols": self.cols,
         "m": m,
         "sigma": sigma,
         "randomized": randomized,
     }
     self.encoder = MEstimateEncoder(**encoder_options)
    2)  # Mean of all prices
# Per-make price statistics, broadcast back onto every row of `cars`.
price_by_make = cars.groupby('make').price
cars['make_group_count'] = price_by_make.transform('count')
cars['make_group_mean'] = price_by_make.transform('mean').round(2)

# Weight n / (n + m) given to the per-make mean; the remainder goes to the
# overall mean.
cars['weight'] = cars['make_group_count'] / (cars['make_group_count'] + m)
cars['make_smooth_encoded'] = (
    cars['weight'] * cars['make_group_mean']
    + (1 - cars['weight']) * cars['overall_price_mean']
)
print(cars[[
    'make', 'make_group_count', 'make_group_mean', 'price',
    'overall_price_mean', 'weight', 'make_smooth_encoded'
]])

# category_encoders (a scikit-learn-contrib package) provides an m-estimate
# encoder; it is used below to encode the 'make' feature.
from category_encoders import MEstimateEncoder

# Create the encoder instance. Choose m to control noise.
encoder = MEstimateEncoder(cols=['make'], m=5.0)

# Split the target off a copy of the data, then fit the encoder and encode
# in a single step.
X = df.copy()
y = X.pop('price')
X_encoded = encoder.fit_transform(X, y)

# Per the original note, the encoded 'make' column should match
# 'make_smooth_encoded' computed above.
print(X_encoded)

# Use Cases for Target Encoding
# Target encoding is great for:
# High-cardinality features: A feature with a large number of categories can be
# troublesome to encode: a one-hot encoding would generate too many features
# and alternatives, like a label encoding, might not be appropriate for that
# feature. A target encoding derives numbers for the categories using the
# feature's most important property: its relationship with the target.
# Number of distinct values in each object-typed column.
df.select_dtypes(include=["object"]).nunique()

# Frequency of each SaleType value.
df["SaleType"].value_counts()

# Encoding split: a 20% sample of the rows, with the target popped off.
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("SalePrice")

# Training split: every row that was not drawn into the encoding split.
X_pretrain = df.drop(index=X_encode.index)
y_train = X_pretrain.pop("SalePrice")

# Create the MEstimateEncoder: pick the features to encode and a value for m.
encoder = MEstimateEncoder(cols=["Neighborhood"], m=1.0)

# Fit on the encoding split only, then encode the training split.
encoder.fit(X_encode, y_encode)
X_train = encoder.transform(X_pretrain, y_train)

feature = encoder.cols

# Overlay the target's density curve with a normalised histogram of the
# encoded feature.
plt.figure(dpi=90)
ax = sns.distplot(y_train, kde=True, hist=False)
ax = sns.distplot(
    X_train[feature],
    color='r',
    ax=ax,
    hist=True,
    kde=False,
    norm_hist=True,
)
ax.set_xlabel("SalePrice");
Beispiel #5
0
from sklearn.decomposition import PCA

# Fit PCA on the scaled features and project them onto the components.
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Wrap the component matrix in a dataframe with columns PC1, PC2, ...
component_names = [
    f"PC{number}" for number in range(1, X_pca.shape[1] + 1)
]
X_pca = pd.DataFrame(X_pca, columns=component_names)


## Target Encoding
from category_encoders import MEstimateEncoder

# Encoder for the Zipcode column. Choose m to control noise.
encoder = MEstimateEncoder(cols=["Zipcode"], m=5.0)

# Fit on the encoding split, then encode the Zipcode column of the
# remaining rows to obtain the final training data.
encoder.fit(X_encode, y_encode)
X_train = encoder.transform(X_pretrain)


## Preprocessing
# Two-step pipeline: impute with the most frequent value, then binary-encode.
mode_binary = Pipeline(steps=[
    ('encoder', SimpleImputer(strategy='most_frequent')),
    ('binary', BinaryEncoder()),
])

transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(handle_unknown = 'ignore'), [ 'hotel', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type']),
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a random forest with default hyper-parameters.
reg = RandomForestRegressor()
reg.fit(Xtrain, ytrain)
preds = reg.predict(Xval)
# Targets and predictions are exponentiated before scoring.
# NOTE(review): this suggests the model is trained on a log-scale target -
# confirm against where ytrain / yval are built.
print(
    f'The validation RMSLE error for baseline model is: {RMSLE(np.exp(yval), np.exp(preds))}'
)

# Predict the submission rows and write them out on the exponentiated scale.
sub_preds = reg.predict(test[cols])
submit['units_sold'] = np.exp(sub_preds)
submit.head(2)

submit.to_csv('sub_baseline_v1.csv', index=False)

from category_encoders import TargetEncoder, MEstimateEncoder

# Target-encode the two identifier columns, fitting on the training rows only.
for id_col, encoded_col in (('store_id', 'store_encoded'),
                            ('sku_id', 'sku_encoded')):
    # Name the column explicitly: with cols=None the encoder only picks up
    # string-like columns, so numeric ids would pass through unencoded.
    encoder = MEstimateEncoder(cols=[id_col])
    encoder.fit(train[[id_col]], train['units_sold'])
    train[encoded_col] = encoder.transform(train[[id_col]],
                                           train['units_sold'])[id_col]
    # No target is passed for the test rows: 'units_sold' is the quantity
    # being predicted for them, and the fitted mapping does not need it.
    test[encoded_col] = encoder.transform(test[[id_col]])[id_col]

skus = train.sku_id.unique()
print(skus[:2])

test_preds = test.copy()
test_preds.tail(2)

        return self

    def transform(self, X):
        """
        One-hot encode ``X`` and add any expected columns that are missing.

        ``X`` is passed through ``pd.get_dummies``; every name listed in
        ``self.cat_features`` that is not a column of the result is then
        appended as a column filled with 0.

        Returns the resulting dataframe.
        """
        encoded = pd.get_dummies(X)
        # Columns named in cat_features that the encoding did not produce.
        absent = [name for name in self.cat_features if name not in encoded]
        for name in absent:
            encoded[name] = 0
        return encoded
      
# choose cols to drop (each column is listed exactly once; the original
# list repeated 'PoolQC')
cols_to_drop = [
    'Id', 'PoolQC', 'MoSold', 'MiscFeature',
    'Alley', 'Fence', 'FireplaceQu', 'PoolArea', 'MiscVal',
    'LowQualFinSF', 'GarageYrBlt', 'GarageCond', 'GarageType',
    'GarageFinish', 'GarageQual', 'BsmtFinSF2', 'BsmtExposure',
    'BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrArea', 'MasVnrType',
]

# Assemble the preprocessing pipeline; the steps run in the order listed.
pipeline = Pipeline([
    ('feature_selector', FeatureSelector(cols_to_drop=cols_to_drop)),
    ('neighbor_encoder', MEstimateEncoder(cols=["Neighborhood"], m=5.0)),
    ('lf_imputer', LFImputer()),
    ('ordinal_transformer', OrdinalTransformer()),
    ('cardinal_transformer', CardinalTransformer(cat_features)),
    ('standard_scaler', StandardScaler()),
])
Beispiel #8
0
from matplotlib.pylab import rcParams 
rcParams['figure.figsize']=10,6

df1=np.log1p(df['UnitPrice'])

df.drop("UnitPrice", axis = 1, inplace = True)

!pip install category_encoders

from category_encoders import MEstimateEncoder

df=df.astype({'StockCode': 'category','Description': 'category'})

category_list=['StockCode','Description']

encoder_final=MEstimateEncoder()
encoder_final.fit(df[category_list], df1)

cat_enc = encoder_final.transform(df[category_list], df1)
continuous_train = df.drop(columns= category_list)
df = pd.concat([cat_enc,continuous_train],axis=1)

test_enc=encoder_final.transform(df2[category_list])
continuous_test=df2.drop(columns= category_list)
df2=pd.concat([test_enc,continuous_test],axis=1)

df2.head()

sns.distplot(np.log1p(df['Quantity']))

from scipy import stats