Example no. 1
def get_svr_pipeline(countries_threshold=0.97, utc_threshold=0.95, log=False):
    preprocessing = Pipeline(steps=[
        ('countries',
         CategoricalThresholdTransformer(
             'country#cat', threshold=countries_threshold, log=log)),
        ('utc_offset',
         CategoricalThresholdTransformer(
             'utc_offset#cat', threshold=utc_threshold, log=log)),
        ('calculated_pop', CalculatedPopTransformer()),
    ])

    numeric_transformer = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('scale', MinMaxScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    transformers = ColumnTransformer(transformers=[
        ('numeric_log', numeric_transformer,
         selector(dtype_exclude=['object', 'category'])),
        ('categorical', categorical_transformer,
         selector(dtype_include=['object', 'category'])),
    ],
                                     remainder='passthrough')

    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('transformations', transformers),
        ('model', SVR(C=0.5, epsilon=0.01, gamma='scale', cache_size=1999)),
    ])
    return pipeline
Example no. 2
def preprocess():
    numeric_transformer = StandardScaler(with_mean=True, with_std=True)
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, selector(dtype_exclude=object)),
            ('cat', categorical_transformer, selector(dtype_include=object)),
        ],
        remainder='passthrough')
    return preprocessor
Example no. 3
def get_preprocessor():
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
    # Note: 'object' columns match neither selector below, so the default
    # remainder='drop' silently removes them.
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       selector(dtype_exclude=["category", "object"])),
                      ('cat', categorical_transformer,
                       selector(dtype_include=["category"]))])
    return preprocessor
Example no. 4
    def preprocess(self):
        """
        Preprocess the data through normalization of numeric variables and categorical transformations.
        """

        numeric_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, selector(dtype_exclude=object)),
                ('cat', categorical_transformer, selector(dtype_include=object)),
            ],
            remainder='passthrough')
Example no. 5
    def preprocessor(self):
        """Pipeline with numerical pipeline combined with categorical pipeline

        Returns
        -------
        Pipeline
            Pipeline with num and cat transformers
        """
        return ColumnTransformer(transformers=[
            ("num", self.numerical_transformer,
             selector(dtype_exclude="category")),
            ("cat", self.categorical_transformer,
             selector(dtype_include="category")),
        ])
Example no. 6
    def pre_encoder(x):
        str_encode = Pipeline(
            steps=[('miss', SimpleImputer()), ('strings', x)])
        num_encode = Pipeline(
            steps=[('miss', SimpleImputer()), ('scaler', StandardScaler())])

        pre_encode = ColumnTransformer(
            transformers=[('categoricals', str_encode,
                           selector(dtype_exclude=['float'])),
                          ('numericals', num_encode,
                           selector(dtype_include=['float']))])
        parameters = [{
            'pre_encode__categoricals': [str_encode],
            'pre_encode__categoricals__miss__strategy': ['most_frequent']
        }, {
            'pre_encode__numericals': [num_encode],
            'pre_encode__numericals__miss__strategy':
            ['mean', 'median', 'most_frequent']
        }]
        # return the preprocessor and its parameter grid
        return [parameters, pre_encode]
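# A minimal usage sketch for the snippet above (assumptions: `pre_encoder` is
# in scope at this level, and X/y stand for a pandas DataFrame and target
# Series). The parameter keys only resolve if the returned ColumnTransformer
# is mounted in a Pipeline under the step name 'pre_encode':
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

parameters, pre_encode = pre_encoder(OneHotEncoder(handle_unknown='ignore'))
pipe = Pipeline(steps=[('pre_encode', pre_encode),
                       ('clf', LogisticRegression(max_iter=1000))])
search = GridSearchCV(pipe, parameters, cv=5)
# search.fit(X, y)  # X, y assumed to be defined by the caller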
Example no. 7
def create_and_run_pipeline_GDCV(X,
                                 y,
                                 param_grid=None,
                                 num_cv=10,
                                 clf_obj=LogisticRegression(),
                                 random_state=42):

    # Reproduce the identical fit/score process
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Fall back to a default grid when none is supplied; the keys address the
    # nested pipeline steps ('preprocessor' -> 'num' -> 'imputer', 'classifier').
    if param_grid is None:
        param_grid = {
            'preprocessor__num__imputer__strategy': ['mean', 'median'],
            'classifier__C': [0.1, 1.0, 10, 100],
        }

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       selector(dtype_exclude="category")),
                      ('cat', categorical_transformer,
                       selector(dtype_include="category"))])

    clf = Pipeline(steps=[('preprocessor',
                           preprocessor), ('classifier', clf_obj)])

    grid_search = GridSearchCV(clf, param_grid, cv=num_cv)
    grid_search.fit(X_train, y_train)

    clf_name = str(clf_obj).split('(')[0]
    print(("best %s from grid search: %.3f" %
           (clf_name, grid_search.score(X_test, y_test))))
    # Return the fitted best pipeline; the bare `clf` template is never fitted.
    return grid_search.best_estimator_
Example no. 8
def get_linear_pipeline(alpha=1,
                        countries_threshold=0.97,
                        utc_threshold=0.95,
                        log=False):
    preprocessing = Pipeline(steps=[
        ('countries',
         CategoricalThresholdTransformer(
             'country#cat', threshold=countries_threshold, log=log)),
        ('utc_offset',
         CategoricalThresholdTransformer(
             'utc_offset#cat', threshold=utc_threshold, log=log)),
        ('calculated_pop', CalculatedPopTransformer()),
    ])

    numeric_transformer = Pipeline(steps=[
        ('log', LogTransformer(exclude_columns=[])),
        ('poli', PolynomialFeatures(2)),
        ('scale', MinMaxScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    transformers = ColumnTransformer(transformers=[
        ('numeric_log', numeric_transformer,
         selector(dtype_exclude=['object', 'category'])),
        ('categorical', categorical_transformer,
         selector(dtype_include=['object', 'category'])),
    ],
                                     remainder='passthrough')

    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('transformations', transformers),
        ('model', Ridge(alpha=alpha)),
    ])
    return pipeline
Example no. 9
    def transform(self, X, y=None):
        """Ordinal-encode the categorical features whose name length is
        less than self.threshold.
        """
        dX = X.copy()  # work on a copy so the input frame is untouched
        enc = OrdinalEncoder()
        cats = selector(dtype_include='object')(X)
        cats_to_encode = [c for c in cats if len(c) < self.threshold]
        for col in cats_to_encode:
            dX.loc[:, col] = enc.fit_transform(
                dX.loc[:, col].to_numpy().reshape(-1, 1)).astype('int')
        return dX
Example no. 10
def q4():
    # Return the result of question 4 here.
    X = df.drop(['Region','Country'],axis=1)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, selector(dtype_exclude="category"))
    ])

    preprocessor.fit(X)
    res = preprocessor.transform(df_test.drop(['Region','Country'],axis=1)).tolist()
    return round(res[0][9],3)
Example no. 11
    def transform(self, X, y=None):
        """Ordinal-encode the categorical features whose name length is
        less than self.threshold.
        """
        temp = pd.DataFrame(index=range(X.shape[0]))  # holds the encoded columns
        enc = OrdinalEncoder()
        cats = selector(dtype_include='object')(X)
        cats_to_encode = list(filter(lambda x: len(x) < self.threshold, cats))
        nums_to_keep = set(X.columns).difference(set(cats))
        for i in set(cats_to_encode):
            temp[i] = enc.fit_transform(X.loc[:, i].to_numpy().reshape(
                -1, 1)).astype('int')

        return pd.concat([
            temp,
            X.loc[:, [z for z in cats if z not in cats_to_encode]],
            X.loc[:, list(nums_to_keep)],
        ], axis=1)
Example no. 12
def seed_everything(seed=1903):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(seed=2020)

os.chdir('/kaggle/working')
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
sample_submission = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/sample_submission.csv')

select_numeric_features = selector(dtype_include='number')
numeric_features = select_numeric_features(train)
# Remember to scale features for linear models with regularization; without
# regularization, linear models do not need scaling simply for prediction.

train_id = train.loc[:, 'id']
test_id = test.loc[:, 'id']
train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))

cat_preprocessor = Pipeline(steps=[('oh', OneHotEncoder(
    handle_unknown='ignore')), ('ss', StandardScaler(with_mean=False))])
num_preprocessor = Pipeline(steps=[('pt', PowerTransformer(
Example no. 13
test_strat["Attrition"].value_counts(normalize=True)


# Sample data
train, test = train_test_split(ames, test_size=0.3, random_state=123)

# Extract features and response
features = train.drop(columns="Sale_Price")
label = train["Sale_Price"]



# scikit-learn does not automatically transform categorical features, so we need
# to apply a one-hot transformer. We will discuss this more thoroughly in the
# next chapter.
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('cat', categorical_transformer, selector(dtype_include="object"))])

knn_fit = Pipeline(steps=[('preprocessor', preprocessor),
                          ('knn', KNeighborsRegressor(metric='euclidean'))])

                          
# Specify resampling strategy
cv = RepeatedKFold(n_splits=10, n_repeats=5)

# Create grid of hyperparameter values
hyper_grid = {'knn__n_neighbors': range(3, 26)}

# Tune a knn model using grid search
grid_search = GridSearchCV(knn_fit, hyper_grid, cv=cv, scoring='neg_mean_squared_error')
results = grid_search.fit(features, label)
Example no. 14
# %%
X_train.info()

# %% [markdown]
# While some features are numeric, some have been tagged as `category`. These
# features need to be encoded such that our random forest can
# deal with them. The simplest solution is to use an `OrdinalEncoder`.
# Regarding the numerical features, we don't need to do anything. Thus, we
# will create preprocessing steps to take care of the encoding.

# %%
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_selector = selector(dtype_include="category")
preprocessor = make_column_transformer(
    (OrdinalEncoder(), categorical_selector),
    remainder="passthrough",
)

X_train_preprocessed = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=(
        categorical_selector(X_train) +
        [col for col in X_train.columns
         if col not in categorical_selector(X_train)]
    )
)
X_train_preprocessed.head()
Example no. 15
def wrapper_feature_transformer_ensembles_trees_clf_v2(
    X,
    y,
    X_test,
    y_test,
    n_estimator=10,
    transformer=None,
    clf_obj=LogisticRegression(max_iter=1000)):

    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # Note: this ColumnTransformer is built but never used below; the features
    # are scaled directly with a plain StandardScaler instead.
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       selector(dtype_exclude="category")),
                      ('cat', categorical_transformer,
                       selector(dtype_include="category"))])

    scaler = StandardScaler()
    scaler.fit(X)
    x_train_scaled = scaler.transform(X)
    x_test_scaled = scaler.transform(X_test)

    clf_name = str(clf_obj).split('(')[0]

    if transformer is not None:
        transformer.fit(x_train_scaled)
        x_train_scaled = transformer.transform(x_train_scaled)
        x_test_scaled = transformer.transform(x_test_scaled)

    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(x_train_scaled,
                                                                y,
                                                                test_size=0.5,
                                                                random_state=0)

    # Unsupervised transformation based on totally random trees
    rt = RandomTreesEmbedding(max_depth=3,
                              n_estimators=n_estimator,
                              random_state=0)
    rt_clf = sklearn.base.clone(clf_obj)
    pipeline = make_pipeline(rt, rt_clf)
    pipeline.fit(X_train, y_train)
    y_pred_rt = pipeline.predict(x_test_scaled)  # predict on the scaled test set
    fpr_rt_clf, tpr_rt_clf, _ = roc_curve(y_test, y_pred_rt)

    # Supervised transformation based on random forests
    rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
    rf_enc = OneHotEncoder()
    rf_clf = sklearn.base.clone(clf_obj)
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_clf.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

    y_pred_rf_clf = rf_clf.predict(rf_enc.transform(rf.apply(x_test_scaled)))
    fpr_rf_clf, tpr_rf_clf, _ = roc_curve(y_test, y_pred_rf_clf)

    # Supervised transformation based on gradient boosted trees
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_clf = sklearn.base.clone(clf_obj)
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train)[:, :, 0])
    grd_clf.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

    y_pred_grd_clf = grd_clf.predict(
        # grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))[:, 1]
        grd_enc.transform(grd.apply(x_test_scaled)[:, :, 0]))
    fpr_grd_clf, tpr_grd_clf, _ = roc_curve(y_test, y_pred_grd_clf)

    # The gradient boosted model by itself
    y_pred_grd = grd.predict(x_test_scaled)
    fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

    # The random forest model by itself
    y_pred_rf = rf.predict(x_test_scaled)  # predict on the scaled test set
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_clf, tpr_rt_clf, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_clf, tpr_rf_clf, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_clf, tpr_grd_clf, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve - {clf_name}')
    plt.legend(loc='best')
    plt.show()

    plt.figure(2)
    plt.xlim(0, 0.2)
    plt.ylim(0.8, 1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr_rt_clf, tpr_rt_clf, label='RT + LR')
    plt.plot(fpr_rf, tpr_rf, label='RF')
    plt.plot(fpr_rf_clf, tpr_rf_clf, label='RF + LR')
    plt.plot(fpr_grd, tpr_grd, label='GBT')
    plt.plot(fpr_grd_clf, tpr_grd_clf, label='GBT + LR')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title(f'ROC curve (zoomed in at top left) - {clf_name}')
    plt.legend(loc='best')
    plt.show()

Example no. 16
def get_pipeline(model):
    """
    Generates a scikit-learn modeling pipeline with model as the final step.

    :param model: instantiated model
    :returns: scikit-learn pipeline
    """
    # The five lower-bound clippers are identical apart from the feature name,
    # so build them in a loop instead of spelling each one out.
    clipper_steps = [
        (f'{feature}_clipper',
         FunctionTransformer(clip_feature_bounds,
                             validate=False,
                             kw_args={
                                 'feature': feature,
                                 'cutoff': 0,
                                 'new_amount': 0,
                                 'clip_type': 'lower'
                             }))
        for feature in ['mouse_movement', 'propensity_score',
                        'completeness_score', 'profile_score', 'average_stars']
    ]
    numeric_transformer = Pipeline(steps=clipper_steps + [
        ('ratio_creator',
         FunctionTransformer(create_ratio_column,
                             validate=False,
                             kw_args={
                                 'col1': 'profile_score',
                                 'col2': 'activity_score'
                             })),
        ('log_creator', TakeLog()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('feature_selector', SelectPercentile(f_classif)),
    ])

    categorical_transformer = Pipeline(steps=[
        ('date_transformer',
         FunctionTransformer(convert_column_to_datetime,
                             validate=False,
                             kw_args={'feature': 'acquired_date'})),
        ('month_extractor',
         FunctionTransformer(extract_month_from_date,
                             validate=False,
                             kw_args={'date_col': 'acquired_date'})),
        ('quarter_extractor',
         FunctionTransformer(convert_month_to_quarter,
                             validate=False,
                             kw_args={
                                 'month_col': 'month',
                                 'mapping_dict': MONTH_TO_QUARTER_DICT
                             })),
        ('year_extractor',
         FunctionTransformer(extract_year_from_date,
                             validate=False,
                             kw_args={'date_col': 'acquired_date'})),
        ('date_dropper',
         FunctionTransformer(drop_features,
                             validate=False,
                             kw_args={'feature_list': FEATURES_TO_DROP})),
        ('imputer',
         FunctionTransformer(fill_missing_values,
                             validate=False,
                             kw_args={'fill_value': CATEGORICAL_FILL_VALUE})),
        ('category_combiner', CombineCategoryLevels()),
        ('dict_creator', FeaturesToDict()),
        ('dict_vectorizer', DictVectorizer(sparse=False)),
        ('feature_selector', SelectPercentile(chi2)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[('numeric_transformer', numeric_transformer,
                       selector(dtype_include='number')),
                      ('categorical_transformer', categorical_transformer,
                       selector(dtype_exclude='number'))],
        remainder='passthrough',
    )

    pipeline = Pipeline(steps=[(
        'data_mapper',
        FunctionTransformer(ensure_features_are_standardized,
                            validate=False,
                            kw_args={'feature_mapping': FEATURE_DTYPE_MAPPING})
    ), ('preprocessor',
        preprocessor), ('variance_thresholder',
                        VarianceThreshold()), ('model', model)])

    return pipeline
Example no. 17
# %% Method 2 for selecting categorical features
from sklearn.compose import make_column_selector as selector


class dummyTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(
            X,
            columns=selector(dtype_include='object')(X))  # return pd.DataFrame


tr4 = ColumnTransformer(transformers=[('dum', dummyTransformer(),
                                       selector(dtype_include='object'))])
tr4.fit_transform(df)  # Note: ColumnTransformer returns an ndarray

# %%
# ANCHOR Custom transformers
# ------------------------------- EXPERIMENT 3 ------------------------------- #
import random
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.compose import make_column_selector as selector
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# %% [markdown]
# Then, we can create a preprocessor which will dispatch the categorical
# columns to the categorical pipeline and the numerical columns to the
# numerical pipeline.

# %%
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector

preprocessor_linear = make_column_transformer(
    (num_pipe, selector(dtype_include="number")),
    (cat_pipe, selector(dtype_include="category")),
    n_jobs=2,
)

# %% [markdown]
# Finally, we connect our preprocessor with our
# :class:`~sklearn.linear_model.LogisticRegression`. We can then evaluate our
# model.

# %%
from sklearn.linear_model import LogisticRegression

lr_clf = make_pipeline(preprocessor_linear, LogisticRegression(max_iter=1000))

# %%
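# A minimal evaluation sketch for the markdown note above (assumption: `data`
# and `target` hold the feature frame and labels, as in the other
# adult-census cells in this collection):
from sklearn.model_selection import cross_validate

cv_results = cross_validate(lr_clf, data, target, cv=5)
print(f"mean CV accuracy: {cv_results['test_score'].mean():.3f}")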
Example no. 19
def xgboost_model(line_file, trip_file, weather_file):
    df_route = pd.read_csv('./leavetimes_by_line/' + line_file,
                           keep_default_na=True,
                           sep=',\s+',
                           delimiter=';',
                           skipinitialspace=True)
    df_route = df_route.drop([
        'DATASOURCE', 'PLANNEDTIME_DEP', 'ACTUALTIME_DEP', 'PASSENGERS',
        'PASSENGERSIN', 'PASSENGERSOUT', 'DISTANCE', 'SUPPRESSED',
        'JUSTIFICATIONID', 'LASTUPDATE', 'NOTE'
    ], axis=1)

    df_trips = pd.read_csv(trip_file,
                           keep_default_na=True,
                           sep=',\s+',
                           delimiter=';',
                           skipinitialspace=True)
    df_trips = df_trips.drop([
        'DATASOURCE', 'TENDERLOT', 'SUPPRESSED', 'JUSTIFICATIONID', 'BASIN',
        'ACTUALTIME_ARR', 'ACTUALTIME_DEP', 'PLANNEDTIME_ARR', 'LASTUPDATE',
        'NOTE'
    ], axis=1)
    df_trips = df_trips.rename(
        columns={'PLANNEDTIME_DEP': 'TRIPS_PLANNEDTIME_DEP'})

    df_weather = pd.read_csv(weather_file)
    # Assign the result back: DataFrame.drop is not in-place by default.
    df_weather = df_weather.drop([
        'dt', 'timezone', 'city_name', 'lat', 'lon', 'temp_min', 'temp_max',
        'sea_level', 'grnd_level', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
        'weather_description', 'wind_deg', 'weather_icon'
    ], axis=1)

    def drop_UTC(s):  # avoid shadowing the built-in `str`
        return s.replace("+0000 UTC", "")

    df_weather['date'] = df_weather['dt_iso'].apply(drop_UTC)
    df_weather = df_weather.drop(['dt_iso'], axis=1)
    df_weather['date'] = pd.to_datetime(df_weather['date'])

    df_weather = df_weather[[
        'date', 'temp', 'feels_like', 'pressure', 'humidity', 'wind_speed',
        'clouds_all', 'weather_main'
    ]]

    df_weather['weather_main'] = df_weather['weather_main'].astype('category')

    df = pd.merge(df_route, df_trips, on=['DAYOFSERVICE', 'TRIPID', 'ROUTEID'])
    df['TRIPID'] = df['TRIPID'].astype('object')
    df = df[[
        'DAYOFSERVICE', 'LINEID', 'ROUTEID', 'DIRECTION', 'TRIPID',
        'PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR', 'ACTUALTIME_ARR',
        'VEHICLEID', 'TRIPS_PLANNEDTIME_DEP'
    ]]

    import re

    # Map three-letter month abbreviations to month numbers in date strings.
    month_map = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
    }

    def tidy_datetime(time_str):
        for name, number in month_map.items():
            if name in time_str:
                return re.sub(name, number, time_str)
        return time_str

    df['DAYOFSERVICE'] = df['DAYOFSERVICE'].apply(tidy_datetime)

    df['DAYOFSERVICE'] = pd.to_datetime(
        df['DAYOFSERVICE'],
        format='%d-%m-%y %H:%M:%S').dt.strftime('%Y-%m-%d %H:%M:%S')
    df['DAYOFSERVICE'] = pd.to_datetime(df['DAYOFSERVICE'])
    df['timestamp'] = df.apply(lambda x: x['DAYOFSERVICE'] + pd.Timedelta(
        seconds=x['TRIPS_PLANNEDTIME_DEP']),
                               axis=1)
    df = df.sort_values(["timestamp", "PROGRNUMBER"],
                        ascending=(True, True)).reset_index(drop=True)

    df['timestamp'] = pd.to_datetime(df['timestamp'])

    df['DAYOFWEEK'] = df['timestamp'].dt.dayofweek
    df['MONTH'] = df['timestamp'].dt.month
    df['DAY'] = df['timestamp'].dt.day
    df['date'] = df['timestamp'].dt.round('H')

    df = pd.merge(df, df_weather, on=['date'])
    df = df.drop(['date'], axis=1)

    holiday_list = [
        '2018-01-01', '2018-03-17', '2018-03-20', '2018-03-30', '2018-04-01',
        '2018-04-02', '2018-05-07', '2018-06-04', '2018-06-21', '2018-08-06',
        '2018-09-23', '2018-10-29', '2018-12-21', '2018-12-24', '2018-12-25',
        '2018-12-26', '2018-12-31'
    ]

    def holiday(time_str):
        if str(time_str) in holiday_list:
            return 1
        return 0

    df['HOLIDAY'] = df['DAYOFSERVICE'].dt.date.apply(holiday)

    df1 = df.reset_index(drop=True)

    df1 = df1.astype({
        'TRIPID': 'category',
        'STOPPOINTID': 'category',
        'VEHICLEID': 'category',
        'LINEID': 'category',
        'ROUTEID': 'category',
        'DIRECTION': 'category',
        'DAYOFWEEK': 'category',
        'MONTH': 'category',
        'DAY': 'category',
        'HOLIDAY': 'category',
        'weather_main': 'category',
        'PROGRNUMBER': 'int64',
        'clouds_all': 'float64',
    })

    df1 = df1[[
        'DAYOFSERVICE',
        'LINEID',
        'ROUTEID',
        'DIRECTION',
        'TRIPID',
        'PROGRNUMBER',
        'STOPPOINTID',
        'PLANNEDTIME_ARR',
        'ACTUALTIME_ARR',
        'VEHICLEID',
        'TRIPS_PLANNEDTIME_DEP',
        'timestamp',
        'DAYOFWEEK',
        'DAY',
        'HOLIDAY',
        'temp',
        'feels_like',
        # 'pressure',
        # 'humidity',
        # 'wind_speed',
        'clouds_all',
        'weather_main'
        #  'weather_id'
    ]]

    df_rev = df1.copy()
    df_rev = df_rev.drop([
        'DAYOFSERVICE', 'TRIPID', 'PLANNEDTIME_ARR', 'STOPPOINTID',
        'timestamp', 'DAY', 'VEHICLEID'
    ],
                         axis=1)
    numeric_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='median')), ('scaler',
                                                    StandardScaler())])

    categorical_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='missing')
                ), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer,
                       selector(dtype_exclude="category")),
                      ('cat', categorical_transformer,
                       selector(dtype_include="category"))])

    # X_train, X_test, y_train, y_test = train_test_split(df_rev.drop(['ACTUALTIME_ARR'], axis=1), df_rev['ACTUALTIME_ARR'], test_size=0.1, shuffle=False, stratify = None)

    param_grid = {
        'colsample_bytree': [0.1, 0.5, 0.8, 1],
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'max_depth': [5, 10, 15],
        'n_estimators': [50, 100, 150, 200]
    }

    grid_search = Pipeline(
        steps=[('preprocessor', preprocessor),
               ('grid_search',
                GridSearchCV(XGBRegressor(), param_grid, cv=5))])
    grid_search.fit(df_rev.drop(['ACTUALTIME_ARR'], axis=1),
                    df_rev['ACTUALTIME_ARR'])

    result = pd.DataFrame(grid_search['grid_search'].cv_results_).sort_values(
        'mean_test_score',
        ascending=False)[0:5].apply(lambda x: x.reset_index(drop=True))

    param_learning_rate = result.loc[0]['param_learning_rate']
    param_max_depth = result.loc[0]['param_max_depth']
    param_n_estimators = result.loc[0]['param_n_estimators']
    param_colsample_bytree = result.loc[0]['param_colsample_bytree']

    clf_XG = Pipeline(
        steps=[('preprocessor', preprocessor),
               ('classifier',
                XGBRegressor(colsample_bytree=param_colsample_bytree,
                             learning_rate=param_learning_rate,
                             max_depth=param_max_depth,
                             n_estimators=param_n_estimators))])

    # clf_XG = Pipeline(steps=[('preprocessor', preprocessor),
    #                     ('classifier', XGBRegressor(colsample_bytree = 1, learning_rate = 0.1,max_depth = 10, n_estimators = 200))])

    clf_XG.fit(df_rev.drop(['ACTUALTIME_ARR'], axis=1),
               df_rev['ACTUALTIME_ARR'])

    # print("model score: %.7f" % clf_XG.score(X_test, y_test))

    joblib.dump(clf_XG,
                './pickle_file_XG/XG_' + df_rev.iloc[0]['LINEID'] + '.pkl')
Example no. 20
# ```python
# categories = [data[column].unique()
#               for column in data[categorical_columns]]
# OrdinalEncoder(categories=categories)
# ```

# %%
import pandas as pd

df = pd.read_csv("../datasets/adult-census.csv")

# %%
target_name = "class"
target = df[target_name]
data = df.drop(columns=[target_name, "fnlwgt"])

# %%
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
data_categorical = data[categorical_columns]

# %%
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression

# Write your code here.
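# A possible solution sketch for the exercise above: ordinal-encode the
# categorical columns and cross-validate a logistic regression.
# `unknown_value=-1` guards against categories seen only in a validation split.
model = make_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    LogisticRegression(max_iter=500),
)
scores = cross_val_score(model, data_categorical, target)
print(f"mean accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")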
Example no. 21
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

df = pd.read_csv("../datasets/adult-census.csv")

# %%
target_name = "class"
target = df[target_name].to_numpy()
data = df.drop(columns=[target_name, "fnlwgt"])

# %%
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_include=["int", "float"])
categorical_columns_selector = selector(dtype_exclude=["int", "float"])
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

categories = [
    data[column].unique() for column in data[categorical_columns]]

# %% [markdown]
# ## Reference pipeline (no numerical scaling and integer-coded categories)
#
# First let's time the pipeline we used in the main notebook to serve as a reference:

# %%
# %%time
Example no. 22
    def transform(self, X):
        return pd.DataFrame(
            X,
            columns=selector(dtype_include='object')(X))  # return pd.DataFrame
Example no. 23
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "fnlwgt", "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough',
    sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([("preprocessor", preprocessor),
                  ("classifier",
                   HistGradientBoostingClassifier(random_state=42))])

# %% [markdown]
#
# Use the previously defined model (called `model`) and using two nested `for`
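# A sketch of the truncated instruction above, assuming the two nested `for`
# loops are meant to scan classifier hyperparameters (the parameter names and
# value grids below are illustrative):
from sklearn.model_selection import cross_val_score

for learning_rate in [0.01, 0.1, 1, 10]:
    for max_leaf_nodes in [3, 10, 30]:
        model.set_params(classifier__learning_rate=learning_rate,
                         classifier__max_leaf_nodes=max_leaf_nodes)
        scores = cross_val_score(model, data_train, target_train, cv=2)
        print(f"lr={learning_rate}, max_leaf_nodes={max_leaf_nodes}: "
              f"mean score {scores.mean():.3f}")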
Example no. 24
numeric_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)

complete_pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        (
            "estimator",
            DecisionTreeClassifier(min_samples_leaf=10, max_depth=4),
        ),
    ]
)

complete_pipeline.fit(X_train, y_train_true)
Example no. 25
import pandas as pd

df = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = df[target_name]

data = df.drop(columns=[target_name, "fnlwgt"])

# %% [markdown]
# We only keep numerical features

# %%
from sklearn.compose import make_column_selector as selector

numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(data)
numerical_columns

data_numeric = data[numerical_columns]

# %% [markdown]
# We do a train-test split for evaluation

# %%
from sklearn.model_selection import train_test_split

data_train, data_test, target_train, target_test = train_test_split(
    data_numeric, target, random_state=42)

# %% [markdown]
Example no. 26
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = [
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]

categorical_transformer = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    #('imputer', SimpleImputer(strategy='constant', fill_value='mode')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   selector(dtype_exclude='object')),
                  ('cat', categorical_transformer,
                   selector(dtype_include='object'))])

total_features = [
    'income', 'loan_amount', 'tract_minority_population_percent',
    'applicant_age', 'derived_sex', 'derived_race', 'derived_ethnicity',
    'loan_type', 'county_code', 'denial_reason-1'
]

# In[22]:

#Model Training and Testing

#Select Features for models
X = fin_data[[
Example no. 27
from sklearn.preprocessing import StandardScaler

categorical_preprocessor = OneHotEncoder()
numerical_preprocessor = StandardScaler()

# Subsequently, create a `ColumnTransformer` to redirect the specific columns
# to a preprocessing pipeline.

# %%

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object)),
     ('num-preprocessor', numerical_preprocessor,
      selector(dtype_include='number'))],
    remainder='passthrough',
    sparse_threshold=0)

# Finally, concatenate the preprocessing pipeline with a logistic regression.

# %%

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

model = make_pipeline(preprocessor, LogisticRegression())

# Use a `RandomizedSearchCV` to find the best set of hyperparameters by tuning
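# A sketch of the suggested search (assumptions: the distribution below is
# illustrative; with `make_pipeline` the regression step is named
# 'logisticregression', and data_train/target_train stand for a train split):
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'logisticregression__C': loguniform(0.001, 10),
}
search = RandomizedSearchCV(model, param_distributions, n_iter=10, cv=5)
# search.fit(data_train, target_train)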
Example no. 28
def ridgeCLF_objective(trial):
    seed_everything(seed=2020)

    TOGGLE_BAY_CAT_ENCODER = True
    if TOGGLE_BAY_CAT_ENCODER:
        temp = train_encoded.drop('target', axis=1).columns
        cat_features = [i for i in temp if i.startswith('cat') and not i.endswith('_code')]
        num_features = [i for i in temp if i not in cat_features and not i.endswith('_code')]
        enc_features = [i for i in temp if i.endswith('_code')]
    else:
        cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
        num_features = selector(dtype_include='number')(train.drop('target', axis=1))

    #categorical features zone
    cat_preprocessor = Pipeline(steps=[
        ('oh', OneHotEncoder(handle_unknown='ignore')),
        ('ss', StandardScaler(with_mean=False))
    ])

    # MAX_OF_CARDINALITY = trial.suggest_categorical('max_cardi', [100])
    # def get_low_cardinality_features(df):
    #     cols = df \
    #         .select_dtypes(['object', 'category']) \
    #         .apply(lambda col: col.nunique()) \
    #         .loc[lambda x: x <= MAX_OF_CARDINALITY] \
    #         .index.tolist()     
    #     return df.loc[:, cols]

    # cat_low_cardi_preprocessor = Pipeline([
    #     ('cat_low', FunctionTransformer(func=get_low_cardinality_features)),
    #     ('oh', OneHotEncoder(handle_unknown='ignore')),
    #     ('ss', StandardScaler(with_mean=False))        
    # ])    

    # def get_high_cardinality_features(df):
    #     cols = df \
    #         .select_dtypes(['object', 'category']) \
    #         .apply(lambda col: col.nunique()) \
    #         .loc[lambda x: x > MAX_OF_CARDINALITY] \
    #         .index.tolist()     
    #     return df.loc[:, cols]    

    # SMOOTHING = 0.2182996635284694 # trial.suggest_float('smooth', 0.001, 1.0)
    # cat_high_cardi_preprocessor = Pipeline([
    #     ('cat_high', FunctionTransformer(func=get_high_cardinality_features)),
    #     ('te', TargetEncoder(smoothing=SMOOTHING)),
    #     ('ss', StandardScaler(with_mean=False))        
    # ])    

    def generate_num_polynomial(X):
        cols = X.columns
        # pairwise interaction terms
        for i in range(len(cols) - 1):
            for j in range(i + 1, len(cols)):
                colname = cols[i] + '_' + cols[j]
                X[colname] = X[cols[i]] * X[cols[j]]
        # squared terms (range over all columns so the last one is included)
        for i in range(len(cols)):
            colname = cols[i] + '^2'
            X[colname] = X[cols[i]].pow(2)
        return X

    num_polynomial = Pipeline([
        ('interact', FunctionTransformer(func=generate_num_polynomial))
    ])        

    num_polynomial_switch = trial.suggest_categorical('ph', [True])

    # numerical features zone
    if num_polynomial_switch:
        num_preprocessor = Pipeline(steps=[ 
            ('ac', num_polynomial),
            ('pt', PowerTransformer(method='yeo-johnson')),
            ('ss', StandardScaler())                                   
        ])
    else:        
        num_preprocessor = Pipeline(steps=[ 
            ('pt', PowerTransformer(method='yeo-johnson')),
            ('ss', StandardScaler())                                   
        ]) 

    enc_preprocessor = Pipeline(steps=[
        ('pt', PowerTransformer(method='yeo-johnson')), # I think it doesn't make sense to transform probability values.
        ('ss', StandardScaler())          
    ])

    if TOGGLE_BAY_CAT_ENCODER:
        preprocessor = ColumnTransformer(transformers=[ 
            ('cat', cat_preprocessor, cat_features),
            ('enc', enc_preprocessor, enc_features),
            # ('cat_low', cat_low_cardi_preprocessor, cat_features),
            # ('cat_high', cat_high_cardi_preprocessor, cat_features),
            ('num', num_preprocessor, num_features)                                                       
        ])        
    else:
        preprocessor = ColumnTransformer(transformers=[ 
            ('cat', cat_preprocessor, cat_features),
            # ('cat_low', cat_low_cardi_preprocessor, cat_features),
            # ('cat_high', cat_high_cardi_preprocessor, cat_features),
            ('num', num_preprocessor, num_features)                                                       
        ])

    # To run hyperparameter tuning with Optuna, uncomment the suggest call on the next line.
    # alpha = trial.suggest_loguniform('clf_alpha', 0.001, 10.0) # [0.001, 10] the first 200 rounds lead to best para = 9.961215980791827. [10, 1e4] the first 60 rounds lead to 9983.72346180751. [1e4, 1e8] leads to 40482.85448271827. <<--- the best lambad so far.
    model = Pipeline(steps=[
        ('prep', preprocessor),
        ('clf', RidgeClassifier(class_weight='balanced', alpha=40482.85448271827, fit_intercept=False))
    ])

    if TOGGLE_BAY_CAT_ENCODER:
        X = train_encoded.drop('target', axis=1)
        y = train_encoded['target']        
    else:
        X = train.drop('target', axis=1)
        y = train['target']

    # Note: this StratifiedKFold is defined but unused; cross_val_score below uses cv=3.
    skf = StratifiedKFold(n_splits=2, shuffle=True)

    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=3, n_jobs=-1)  # drop n_jobs=-1 if you hit "Timeout or memory leak" errors
    return scores.mean()
Example no. 29
# can use this information to dispatch the categorical columns to the
# ``categorical_transformer`` and the remaining columns to the
# ``numerical_transformer``.

###############################################################################
# .. note:: In practice, you will have to handle yourself the column data type.
#    If you want some columns to be considered as `category`, you will have to
#    convert them into categorical columns. If you are using pandas, you can
#    refer to their documentation regarding `Categorical data
#    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_.

from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer,
                   selector(dtype_exclude="category")),
                  ('cat', categorical_transformer,
                   selector(dtype_include="category"))])
clf = Pipeline(steps=[('preprocessor',
                       preprocessor), ('classifier', LogisticRegression())])

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# The resulting score is not exactly the same as the one from the previous
# pipeline because the dtype-based selector treats the ``pclass`` column as
# a numeric feature instead of a categorical feature as previously:

selector(dtype_exclude="category")(X_train)

from sklearn.compose import make_column_selector as selector

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))
])

# Reproduce the identical fit/score process
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

###############################################################################
# Using the prediction pipeline in a grid search
###############################################################################
# Grid search can also be performed on the different preprocessing steps
# defined in the ``ColumnTransformer`` object, together with the classifier's
# hyperparameters as part of the ``Pipeline``.
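# A minimal sketch of such a grid (assumption: the numeric pipeline contains
# an 'imputer' step, as in the other examples in this collection):
from sklearn.model_selection import GridSearchCV

param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10],
}
grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search.fit(X_train, y_train)
print("best params from grid search:", grid_search.best_params_)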