# Baseline: fit a regressor constructed with no arguments on the training
# split, then score its predictions on the validation split.
reg = RandomForestRegressor()
reg.fit(Xtrain, ytrain)
preds = reg.predict(Xval)
# Both the validation target and the predictions are passed through np.exp
# before scoring -- presumably the target was log-transformed upstream
# (TODO confirm; if log1p was used there, np.expm1 would be the matching
# inverse). RMSLE is defined outside this section.
print(
    f'The validation RMSLE error for baseline model is: {RMSLE(np.exp(yval), np.exp(preds))}'
)

# Predict on the test frame using the feature columns listed in `cols`
# (defined elsewhere) and apply the same np.exp back-transform.
sub_preds = reg.predict(test[cols])
submit['units_sold'] = np.exp(sub_preds)
# Notebook-style preview; the returned frame is discarded when run as a script.
submit.head(2)

# Write the submission without the index column.
submit.to_csv('sub_baseline_v1.csv', index=False)

from category_encoders import TargetEncoder, MEstimateEncoder
# Target-encode the id columns with an m-estimate encoder fitted on the
# training frame's 'units_sold' target.
encoder = MEstimateEncoder()
encoder.fit(train['store_id'], train['units_sold'])
train['store_encoded'] = encoder.transform(train['store_id'],
                                           train['units_sold'])
# The test frame is transformed WITHOUT a target: the mapping learned in
# fit() is all that is needed, and the test data is what 'units_sold' is being
# predicted for, so the column must not be read (or required) here.
test['store_encoded'] = encoder.transform(test['store_id'])

# The same encoder object is refit for sku_id; after this point `encoder`
# holds the sku_id mapping, not the store_id one.
encoder.fit(train['sku_id'], train['units_sold'])
train['sku_encoded'] = encoder.transform(train['sku_id'], train['units_sold'])
test['sku_encoded'] = encoder.transform(test['sku_id'])
skus = train.sku_id.unique()
print(skus[:2])

# Work on a copy so later per-SKU predictions do not modify `test` itself.
test_preds = test.copy()
# Notebook-style preview; the returned frame is discarded when run as a script.
test_preds.tail(2)


def sku_model(sku, cols_to_use, reg):
# Encoding split: a reproducible 20% sample of df (random_state=0) is held out
# to fit the target encoder. pop() removes "SalePrice" from X_encode and
# returns it as the target.
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("SalePrice")

# Training split: the rows of df that were not sampled into the encoding split
# (dropped by index label).
X_pretrain = df.drop(X_encode.index)
y_train = X_pretrain.pop("SalePrice")

# Create the MEstimateEncoder: only the "Neighborhood" column is encoded,
# with m=1.0.
encoder =  MEstimateEncoder(
    cols=["Neighborhood"],
    m=1.0,
)

# Fit the encoder on the encoding split
encoder.fit(X_encode, y_encode)

# Encode the training split
X_train = encoder.transform(X_pretrain, y_train)

# `cols` was passed as a one-element list above, so X_train[feature] below
# presumably selects a one-column frame rather than a Series -- TODO confirm.
feature = encoder.cols

# Overlay the density of the target (KDE curve) with the normalised histogram
# of the encoded feature (red) on the same axes.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 and scheduled for
# removal; kdeplot/histplot are the documented replacements.
plt.figure(dpi=90)
ax = sns.distplot(y_train, kde=True, hist=False)
ax = sns.distplot(X_train[feature], color='r', ax=ax, hist=True, kde=False, norm_hist=True)
ax.set_xlabel("SalePrice");

# Score the full un-encoded frame and the encoded training split with
# score_dataset (defined elsewhere). Note the two calls see different row sets:
# the baseline uses every row of df, the encoded score only the training split.
# Neither score is printed within this section.
X = df.copy()
y = X.pop("SalePrice")
score_base = score_dataset(X, y)
score_new = score_dataset(X_train, y_train)
# Example #3
0
# Set the default figure size for subsequent plots to the tuple (10, 6).
rcParams['figure.figsize']=10,6

# Keep the log1p-transformed UnitPrice as a separate Series; it is used below
# as the target when fitting the encoder.
df1=np.log1p(df['UnitPrice'])

# Remove the raw target column from df in place. After this line
# df['UnitPrice'] no longer exists; use df1 instead.
df.drop("UnitPrice", axis = 1, inplace = True)

!pip install category_encoders

from category_encoders import MEstimateEncoder

# Cast the two columns to be target-encoded to pandas 'category' dtype.
df=df.astype({'StockCode': 'category','Description': 'category'})

category_list=['StockCode','Description']

# Fit the m-estimate encoder on the categorical columns against the
# log1p(UnitPrice) target held in df1.
encoder_final=MEstimateEncoder()
encoder_final.fit(df[category_list], df1)

# Encode the training categoricals, then rebuild df as
# [encoded categoricals | remaining columns].
cat_enc = encoder_final.transform(df[category_list], df1)
continuous_train = df.drop(columns= category_list)
df = pd.concat([cat_enc,continuous_train],axis=1)

# Apply the fitted encoder to df2 without a target and rebuild it the same way.
# NOTE(review): df2 is defined elsewhere and is not cast to 'category' here as
# df was -- confirm its dtypes match what the encoder saw during fit.
test_enc=encoder_final.transform(df2[category_list])
continuous_test=df2.drop(columns= category_list)
df2=pd.concat([test_enc,continuous_test],axis=1)

# Notebook-style preview; the returned frame is discarded when run as a script.
df2.head()

# Distribution of log1p(Quantity).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 and scheduled for
# removal; histplot/displot are the documented replacements.
sns.distplot(np.log1p(df['Quantity']))

from scipy import stats 
# Target vector: log1p(UnitPrice). The 'UnitPrice' column was dropped from df
# in place earlier (and df was then rebuilt without it), so indexing
# df['UnitPrice'] here would raise KeyError. The identical transform was saved
# as df1 before the drop; copy it so y and df1 stay independent objects.
y = df1.copy()