Example No. 1
 def fit(self, X, y=None):
     self.encoder = OrdinalEncoder([self.order + list(self.undefined)])
     self.encoder.fit([[x] for x in self.order + list(self.undefined)])  # Argument irrelevant
     return self
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

scale = StandardScaler()

# The data to use are available in a GitHub repository
file = "https://raw.githubusercontent.com/fhernanb/datos/master/iris.txt"
datos = pd.read_csv(file, sep='\t')
datos.head()

# Select the variables of interest
datos = datos[["Species", "Sepal.Width", "Sepal.Length"]]

# Convert the Species variable to numeric
enc = OrdinalEncoder()
enc.fit(datos[["Species"]])
datos["y"] = enc.transform(datos[["Species"]])

# Explore the response variable
import seaborn as sns
sns.countplot(x='Species', data=datos)

# Create X and y
y = datos["y"]
X = datos[["Sepal.Length", "Sepal.Width"]]

# Scale the values of X
scaledX = scale.fit_transform(X)

# Create the train and test sets
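
# A minimal sketch of that split, assuming the scaled features `scaledX` and
# the labels `y` defined above:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    scaledX, y, test_size=0.3, random_state=42)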
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE, mean_absolute_error as MAE

import pickle

outback = pd.read_csv('../data/outback.csv')

ordinal = ['condition', 'title_status'] # Ordinal Encoder
categorical = ['cylinders', 'fuel', 'transmission', 'paint_color', 'model'] # OHE
numerical = ['year', 'miles']

y = outback.USD
X = outback.drop('USD', axis=1)

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer(transformers = [
                            ('ordinalEncoder', encoder, ordinal),
                            ('oneHotEncoder', ohe, categorical )
                            ],
                        remainder='passthrough')

regressor = RandomForestRegressor(n_estimators=20)

pipe = Pipeline(steps=[('preprocess', ct),
                        ('model', regressor)])

parameters = {
  'model__n_estimators':list(range(5,110,5)),
Example No. 4
# Alternative 3 - Fill missing values with the median of the data

median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median, inplace=True)

# The next step is to convert the text features to numbers, since that is what
# machine learning models work with; there are two ways to do this.

# First approach - assign an integer to each category.
# The problem is that most models assume that the farther apart two encoded
# numbers are, the less similar the corresponding categories are.

from sklearn.preprocessing import OrdinalEncoder

housing_cat = housing[["ocean_proximity"]]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
"""
print(housing_cat_encoded[:10])  
"""

# Now we use another method, OneHotEncoder: it creates a vector with one entry
# per category of the attribute, sets the entry for the matching category to
# one, and the rest to zero.

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot.toarray()
Example No. 5
def test_ordinal_encoder_raise_categories_shape():
    cats = ['Low', 'Medium', 'High']
    msg = ("Categories are expected to be either list or array-like, but ")

    with pytest.raises(TypeError, match=msg):
        _ = OrdinalEncoder(categories=cats)
Example No. 6
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV

#  data/default_of_credit_card_clients/default_of_credit_card_clients.xls
dataset = pd.read_excel(
    r'..\..\data\default_of_credit_card_clients\default_of_credit_card_clients.xls',
    skiprows=1)

dataset.pop('ID')
y = LabelEncoder().fit_transform(
    dataset.pop('default payment next month').values)

cat_si_step = ('si', SimpleImputer(strategy='constant',
                                   fill_value=-99))  # fill missing categoricals with a sentinel
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')
            )  # ignore categories unseen at transform time
oe_step = ('le', OrdinalEncoder())
num_si_step = ('si', SimpleImputer(strategy='median'))
sc_step = ('sc', StandardScaler())

cat_pipe = Pipeline([cat_si_step, ohe_step])
num_pipe = Pipeline([num_si_step, sc_step])
bin_pipe = Pipeline([oe_step])

transformers = [
    ('cat', cat_pipe, [
        'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
        'PAY_6'
    ]),
    ('num', num_pipe, [
        'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
        'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
Example No. 7
    return (jobs.reshape((len(jobs), 1)))


def sector(df, sector_dict):
    sectors = df.EmploymentSector.apply(lambda x: sector_dict[x]).to_numpy()
    return (sectors.reshape((len(sectors), 1)))


transformer_JobTitle = FunctionTransformer(lambda df: JobTitle(df, job_dict))
transformer_sector = FunctionTransformer(lambda df: sector(df, sector_dict))

# categorical columns with ordinal encoding
cat_cols = ["ManageStaff", "EmploymentStatus"]
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='constant'),
    OrdinalEncoder(),
)

transformer_nk = make_column_transformer(
    (cat_pipeline, cat_cols), (transformer_JobTitle, ["JobTitle"]),
    (transformer_sector, ["EmploymentSector"]))


def get_elements(key):
    L = key.split(",")
    Result = []
    i = 0
    while i < len(L):
        word = L[i]
        if word.find("(") != -1:
            while word.find(")") == -1:
Example No. 8
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

categorical_columns = [
    "weather",
    "season",
    "holiday",
    "workingday",
]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]
ordinal_encoder = OrdinalEncoder(categories=categories)

gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
    ),
    HistGradientBoostingRegressor(categorical_features=range(4), ),
)

# %%
#
# Let's evaluate our gradient boosting model with the mean absolute error of the
# relative demand averaged across our 5 time-based cross-validation splits:
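
# A minimal sketch of that evaluation, assuming `X` and `y` hold the demand
# features and target used with `gbrt_pipeline` above; the 5 time-based splits
# are approximated here with TimeSeriesSplit:
from sklearn.model_selection import TimeSeriesSplit, cross_validate

ts_cv = TimeSeriesSplit(n_splits=5)  # time-ordered folds, no shuffling
cv_results = cross_validate(
    gbrt_pipeline, X, y,
    cv=ts_cv,
    scoring="neg_mean_absolute_error",
)
mae = -cv_results["test_score"]
print(f"Mean Absolute Error: {mae.mean():.3f} +/- {mae.std():.3f}")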
               ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, '坏瓜'],
               ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, '坏瓜'],
               ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, '坏瓜'],
               ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, '坏瓜'],
               ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, '坏瓜'],
               ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, '坏瓜'],
               ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, '坏瓜']]
    # list of feature names
    labels = ['色泽', '根蒂', '敲击', '纹理', '脐部', '触感', '密度', '含糖率']
    # assemble the dataset and labels
    X = np.array(dataSet)[:, :8]
    Y = np.array(dataSet)[:, 8]

    # Encode the first six (categorical) columns of X
    from sklearn.preprocessing import OrdinalEncoder
    oriencode = OrdinalEncoder(categories='auto')
    oriencode.fit(X[:, :6])
    Xdata = oriencode.transform(X[:, :6])  # the encoded data
    print(oriencode.categories_)  # inspect the learned categories
    Xdata = np.hstack((Xdata, X[:, 6:].astype(float)))

    # Encode Y with a LabelEncoder
    from sklearn.preprocessing import LabelEncoder
    labelencode = LabelEncoder()
    labelencode.fit(Y)
    Ylabel = labelencode.transform(Y)  # the encoded labels
    labelencode.classes_  # inspect the class labels
    labelencode.inverse_transform(Ylabel)  # recover the labels before encoding
    X = pd.DataFrame(Xdata, columns=labels)
    Y = pd.Series(Ylabel)
Example No. 10
#     print ('Number of non-attacks: ', y.value_counts () [0])
#     print ('Number of attacks: ', y.value_counts () [1])
# else:
#     # Undersampling
#     ros = RandomUnderSampler (random_state=42)

#     X, y = ros.fit_resample (X, y)

#     print ('Number of non-attacks: ', y.value_counts () [0])
#     print ('Number of attacks: ', y.value_counts () [1])


###############################################################################
### Encode categorical features
print ('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder ()
df ['flg'] = my_encoder.fit_transform (df ['flg'].values.reshape (-1, 1))
df ['pr'] = my_encoder.fit_transform (df ['pr'].values.reshape (-1, 1))
print ('Objects:', list (df.select_dtypes ( ['object']).columns))


###############################################################################
## Quick sanity check
###############################################################################
display_general_information (df)


###############################################################################
## Split dataset into train and test sets
###############################################################################
### Dataset too big? Drop, uncomment the next lines.
Example No. 11
print('Dataset shape:', data_all.shape)

delete = ['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no','id_name', 'latest_query_time', 'source', 'loans_latest_time', 'first_transaction_time']
data_all = data_all.drop(delete,axis=1)
print('Shape after dropping unused columns:', data_all.shape)

data_all = data_all.drop(['student_feature'],axis=1)

from sklearn.impute import SimpleImputer
for i in range(data_all.shape[1]):
    feature = data_all.iloc[:,i].values.reshape(-1,1)  # feature matrices in sklearn must be 2-D
    imp_mode = SimpleImputer(strategy='most_frequent')
    data_all.iloc[:,i] = imp_mode.fit_transform(feature)

from sklearn.preprocessing import OrdinalEncoder
data_all['reg_preference_for_trad'] = OrdinalEncoder().fit_transform(data_all['reg_preference_for_trad'].values.reshape(-1,1))

# Find the column index of the 'status' label
for i in range(data_all.shape[1]):
    if data_all.columns[i] == 'status':
        print(i)

y = data_all.iloc[:,38]
X = data_all.drop(['status'],axis=1)



# Split the dataset
X = data_all.drop(['status'],axis=1)
y = data_all['status']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2018)
Example No. 12
def test_date_pipeline():
    df = create_dataset(num=0, cat=0, date=5, target=False, size=5000)
    train_df = df.iloc[:-1000]
    test_df = df.iloc[-1000:]

    date_pipeline = Pipeline(steps=[
        Step("date", DateFeatures()),
        Step(
            "derived_processing",
            ColumnsProcessor(branches=[
                Step("num_derived",
                     Wrap(StandardScaler()),
                     types=[VarType.NUM]),
                Step(
                    "cat_derived",
                    OrdCat(min_support=0, use_other=False),
                    types=[VarType.CAT],
                ),
            ]),
        ),
    ])
    train = date_pipeline.fit_transform(to_task_data(train_df))
    test = date_pipeline.transform(to_task_data(test_df))

    for data in [train, test]:
        assert data.column_types[:5] == [ColumnType(VarType.NUM)] * 5
        assert set([c.var_type
                    for c in data.column_types[5:]]) == set([VarType.CAT])
        assert all([c.level > 0 for c in data.column_types[5:]])

    date_features = DateFeatures()
    dates_train = date_features.fit_transform(to_task_data(train_df))
    dates_test = date_features.transform(to_task_data(test_df))

    num_train = take_columns(dates_train, types=[VarType.NUM])
    cat_train = take_columns(dates_train, types=[VarType.CAT])
    scaler = StandardScaler()
    enc = OrdinalEncoder()

    num_train = scaler.fit_transform(num_train.X)
    cat_train = enc.fit_transform(cat_train.X)
    cat_train = cat_train + 1

    assert np.all(
        np.isclose(num_train,
                   take_columns(train, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_train,
                   take_columns(train, types=[VarType.CAT]).X))

    num_test = take_columns(dates_test, types=[VarType.NUM])
    cat_test = take_columns(dates_test, types=[VarType.CAT])
    num_test = scaler.transform(num_test.X)
    cat_test = enc.transform(cat_test.X)
    cat_test = cat_test + 1

    assert np.all(
        np.isclose(num_test,
                   take_columns(test, types=[VarType.NUM]).X))
    assert np.all(
        np.isclose(cat_test,
                   take_columns(test, types=[VarType.CAT]).X))
Example No. 13
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
model4.compile('adam', 'binary_crossentropy')
model4.fit(xtrain.values, ytrain, validation_data=(xtest, ytest), epochs=100, batch_size=32)
eval_model(ytest, model4.predict_classes(xtest.values))

#%%
from sklearn.preprocessing import OrdinalEncoder
df['international_plan'] = df['international_plan'].map({'No':0, 'Yes':1})
df['voice_mail_plan'] = df['voice_mail_plan'].map({'No':0, 'Yes':1})
xtrain, xtest, ytrain, ytest = train_test_split(df.drop('churn', axis=1), df.churn)
xtrain_state = xtrain['state']
xtest_state = xtest['state']
oe = OrdinalEncoder()
xtrain_state = oe.fit_transform(xtrain_state.values.reshape(-1, 1))
xtest_state = oe.transform(xtest_state.values.reshape(-1, 1))
xtrain = xtrain.drop('state', axis=1)
xtest = xtest.drop('state', axis=1)

#%%
import keras
state_in = keras.Input(shape=(1,))
rest_in = keras.Input(shape=(18,))
emb = keras.layers.Embedding(51,5)(state_in)
emb_reshaped = keras.layers.Reshape((5,))(emb)
concat = keras.layers.Concatenate()([emb_reshaped, rest_in])
d1 = keras.layers.Dense(50, activation='relu')(concat)
d2 = keras.layers.Dense(25, activation='relu')(d1)
out = keras.layers.Dense(1, activation='sigmoid')(d2)
Example No. 14
 def fit(self, X, y=None):
     self.encoder = OrdinalEncoder([self.values + list(self.undefined)], dtype=int)
     self.encoder.fit([[x] for x in self.values + list(self.undefined)])  # Argument irrelevant
     return self
Example No. 15
 def encode_objects(X_train):
     oe = OrdinalEncoder()
     oe.fit(X_train)
     X_train_enc = oe.transform(X_train)
     return X_train_enc
Example No. 16
             'Native Hawaiian/Oth Pac Island': '5',
             'Not Applicable': '-1',
             'NHISP': '3',
             'not available': '-1',
             '2+RACE': '6',
             ' ': '-1'}

enc_ethnicity = ethnicity.copy()
for key, val in D_matching.items():
    enc_ethnicity[ethnicity == key] = val
enc_ethnicity = enc_ethnicity.to_numpy(dtype=int)[p]

employer = Xg[:,301:305]
enc_employer = (employer * np.arange(1,5)).sum(axis=1)

clean_jt_enc = OrdinalEncoder().fit_transform(clean_jt[:,None])[:,0]
Xg_matching = np.zeros((len(Xg), 4))
Xg_matching[:,0] = clean_jt_enc
Xg_matching[:,1] = Xg[:,300]
Xg_matching[:,2] = enc_employer
Xg_matching[:,3] = enc_ethnicity

Xg_fuzzy = Xg[:,:303].copy()
Xg_fuzzy[:,301] = enc_employer
Xg_fuzzy[:,302] = enc_ethnicity

# ------------- ATE estimation with machine-learning models ------------- #

def AIPW_estimator(ms, mg, Xs, Xg, ys, yg):
    
    """
Example No. 17
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(dtype=int)
transformed = oe.fit_transform(X_train_census[categorical_features])
transformed = pd.DataFrame(data=transformed,
                           columns=categorical_features,
                           index=X_train_census.index)
Example No. 18
# In[19]:

# Augment training data
print("Augment data...")
df_augm_x, augm_y = DuplicateData(BattleResults, y)

# In[22]:

# Define preprocessing pipeline and fit sklearn scalers to data
numerical_attributes = [
    'HP_1', 'Attack_1', 'Defense_1', 'Sp_Atk_1', 'Sp_Def_1', 'Speed_1', 'HP_2',
    'Attack_2', 'Defense_2', 'Sp_Atk_2', 'Sp_Def_2', 'Speed_2'
]
type_attributes = ['Type1_1', 'Type2_1', 'Type1_2', 'Type2_2']
TypeEncoder = OrdinalEncoder()

full_pipeline = Pipeline([
    ("Add Types", AddPokemonTypes()),
    (
        "Individual Feature Preprocessing",
        ColumnTransformer([
            ("Drop", "drop",
             ['Name_1', 'Name_2', 'Price_1', 'Price_2', 'BattleResult']),
            ("Numerical Attributes", StandardScaler(),
             numerical_attributes),  # Faulty when not using augmented data
            ("Boolean", "passthrough", ['Legendary_1', 'Legendary_2']),
            ("Level", LevelScaler(), ['Level_1', 'Level_2']),
            ("Weather", OrdinalEncoder(), ['WeatherAndTime']),
            ("Types", TypeEncoder, type_attributes)
        ])),
Example No. 19
def main():
    config = Configuration()  # Get config for data directory

    checker = ConfigChecker(config, None, 'preprocessing', training=None)
    checker.pre_init_checks()

    config.import_timestamps()
    number_data_sets = len(config.datasets)

    # list of all examples
    examples: [np.ndarray] = []
    labels_of_examples: [str] = []
    failure_times_of_examples: [str] = []
    window_times_of_examples: [str] = []

    attributes = None

    for i in range(number_data_sets):
        print('\n\nImporting dataframe ' + str(i) + '/' +
              str(number_data_sets - 1) + ' from file')

        # read the imported dataframe from the saved file
        path_to_file = config.datasets[i][0] + config.filename_pkl_cleaned

        with open(path_to_file, 'rb') as f:
            df: pd.DataFrame = pickle.load(f)

        # cleaning moved to separate script because of computational demands
        # df = clean_up_dataframe(df, config)

        # split the dataframe into the configured cases
        cases_df, labels_df, failures_df = split_by_cases(df, i, config)
        print("cases_df: ", len(cases_df))
        print("labels_df: ", len(labels_df))
        print("failures_df: ", len(failures_df), ": ", failures_df)

        if i == 0:
            attributes = np.stack(df.columns, axis=0)

        del df
        gc.collect()

        # split the case into examples, which are added to the list of all examples
        number_cases = len(cases_df)
        for y in range(number_cases):
            df = cases_df[y]

            if len(df) <= 0:
                print(i, y, 'empty')
                print(
                    "df: ",
                    df,
                )
                continue

            start = df.index[0]
            end = df.index[-1]
            secs = (end - start).total_seconds()
            print('\nSplitting case', y, '/', number_cases - 1,
                  'into examples. Length:', secs, " start: ", start, " end: ",
                  end)
            split_into_examples(df, labels_df[y], examples, labels_of_examples,
                                config.time_series_length,
                                config.interval_in_seconds, config,
                                failure_times_of_examples, failures_df[y],
                                window_times_of_examples, y, i)
        del cases_df, labels_df, failures_df
        gc.collect()

    # convert lists of arrays to numpy array
    examples_array = np.stack(examples, axis=0)
    labels_array = np.stack(labels_of_examples, axis=0)
    failure_times_array = np.stack(failure_times_of_examples, axis=0)
    window_times_array = np.stack(window_times_of_examples, axis=0)

    del examples, labels_of_examples, failure_times_of_examples, window_times_of_examples
    gc.collect()

    # print("config.use_over_lapping_windows: ", config.use_over_lapping_windows)
    if config.use_over_lapping_windows:
        print('\nExecute train/test split with failure case consideration')
        # define groups for GroupShuffleSplit
        enc = OrdinalEncoder()
        enc.fit(failure_times_array.reshape(-1, 1))
        failure_times_array_groups = enc.transform(
            failure_times_array.reshape(-1, 1))
        # print("groups: ",failure_times_array_groups)
        # group_kfold = GroupKFold(n_splits=2)

        gss = GroupShuffleSplit(n_splits=1,
                                test_size=config.test_split_size,
                                random_state=config.random_seed)

        for train_idx, test_idx in gss.split(examples_array, labels_array,
                                             failure_times_array_groups):
            print("TRAIN:", train_idx, "TEST:", test_idx)
        # split_idx in gss.split(examples_array, labels_array, failure_times_array_groups)
        # train_idx = split_idx[0]
        # test_idx = split_idx[1]
        # print("train_idx:",train_idx)

        x_train, x_test = examples_array[train_idx], examples_array[test_idx]
        y_train, y_test = labels_array[train_idx], labels_array[test_idx]
        failure_times_train, failure_times_test = failure_times_array[
            train_idx], failure_times_array[test_idx]
        window_times_train, window_times_test = window_times_array[
            train_idx], window_times_array[test_idx]

        print("X_train: ", x_train.shape, " X_test: ", x_test.shape)
        print("Y_train: ", y_train.shape, " Y_test: ", y_test.shape)
        print("Failure_times_train: ", failure_times_train.shape,
              " Failure_times_test: ", failure_times_test.shape)
        print("Window_times_train: ", window_times_train.shape,
              " Window_times_test: ", window_times_test.shape)
        print("Classes in the train set: ", np.unique(y_train))
        print("Classes in the test set: ", np.unique(y_test))
        # print("Classes in train and test set: ", np.unique(np.concatenate(y_train, y_test)))

    else:
        # split into train and test data set
        print('\nExecute train/test split')
        x_train, x_test, y_train, y_test = train_test_split(
            examples_array,
            labels_array,
            test_size=config.test_split_size,
            random_state=config.random_seed)

    # Sort both datasets by the cases for easier handling
    '''
    x_train = x_train[y_train.argsort()]
    y_train = np.sort(y_train)

    x_test = x_test[y_test.argsort()]
    y_test = np.sort(y_test)
    '''

    print('Training data set shape: ', x_train.shape)
    print('Training label set shape: ', y_train.shape)
    print('Test data set shape: ', x_test.shape)
    print('Test label set shape: ', y_test.shape, '\n')

    # normalize each sensor stream to contain values in [0,1]
    x_train, x_test = normalise(x_train, x_test, config)

    x_train, x_test = x_train.astype('float32'), x_test.astype('float32')

    # save the np arrays
    print('\nSave to np arrays in ' + config.training_data_folder)

    print('Step 1/5')
    np.save(config.training_data_folder + 'train_features_4_.npy', x_train)
    print('Step 2/5')
    np.save(config.training_data_folder + 'test_features_4_.npy', x_test)
    print('Step 3/5')
    np.save(config.training_data_folder + 'train_labels_4_.npy', y_train)
    print('Step 4/5')
    np.save(config.training_data_folder + 'test_labels_4_.npy', y_test)
    print('Step 5/5')
    np.save(config.training_data_folder + 'feature_names_4_.npy', attributes)
    print()

    if config.use_over_lapping_windows:
        print('Saving additional data if overlapping windows are used')

        # Contains the associated time of a failure (if not no failure) for each example
        print('Step 1/4')
        np.save(config.training_data_folder + 'train_failure_times_4_.npy',
                failure_times_train)
        print('Step 2/4')
        np.save(config.training_data_folder + 'test_failure_times_4_.npy',
                failure_times_test)
        print('Step 3/4')
        # Contains the start and end time stamp for each training example
        np.save(config.training_data_folder + 'train_window_times_4_.npy',
                window_times_train)
        print('Step 4/4')
        np.save(config.training_data_folder + 'test_window_times_4_.npy',
                window_times_test)
Example No. 20
# %%
# We define a predictive model based on a random forest. Therefore, we will make
# the following preprocessing steps:
#
# - use :class:`~sklearn.preprocessing.OrdinalEncoder` to encode the
#   categorical features;
# - use :class:`~sklearn.impute.SimpleImputer` to fill missing values for
#   numerical features using a mean strategy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                     unknown_value=-1,
                                     encoded_missing_value=-1)
numerical_pipe = SimpleImputer(strategy="mean")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", numerical_pipe, numerical_columns),
    ],
    verbose_feature_names_out=False,
)

rf = Pipeline([
    ("preprocess", preprocessing),
    ("classifier", RandomForestClassifier(random_state=42)),
])
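
# %%
# A hypothetical usage sketch, assuming `X_train`, `X_test`, `y_train`, and
# `y_test` come from an earlier train/test split that is not shown in this snippet:
rf.fit(X_train, y_train)
print("Test accuracy:", rf.score(X_test, y_test))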
Example No. 21
def encode_feature(df, feature):
    df[feature] = df[feature].fillna("Unknown")
    ord_enc = OrdinalEncoder()
    return ord_enc.fit_transform(df[[feature]]).reshape(
        (len(df[[feature]]), 1))
adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name, "education-num"])

data_train, data_test, target_train, target_test = train_test_split(
    data, target, train_size=0.2, random_state=42)

# %%
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)
preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor,
      selector(dtype_include=object))],
    remainder='passthrough', sparse_threshold=0)

# This line is currently required to import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", HistGradientBoostingClassifier(random_state=42))
])
Example No. 23
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0], [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
hist_one_hot = make_pipeline(one_hot_encoder,
                             HistGradientBoostingRegressor(random_state=42))

# %%
# Gradient boosting estimator with ordinal encoding
# -------------------------------------------------
# Next, we create a pipeline that will treat categorical features as if they
# were ordered quantities, i.e. the categories will be encoded as 0, 1, 2,
# etc., and treated as continuous features.

from sklearn.preprocessing import OrdinalEncoder
import numpy as np

ordinal_encoder = make_column_transformer(
    (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),
     make_column_selector(dtype_include='category')),
    remainder='passthrough')

hist_ordinal = make_pipeline(ordinal_encoder,
                             HistGradientBoostingRegressor(random_state=42))

# %%
# Gradient boosting estimator with native categorical support
# -----------------------------------------------------------
# We now create a :class:`~ensemble.HistGradientBoostingRegressor` estimator
# that will natively handle categorical features. This estimator will not treat
# categorical features as ordered quantities.
#
# Since the :class:`~ensemble.HistGradientBoostingRegressor` requires category
# values to be encoded in `[0, n_unique_categories - 1]`, we still rely on an
Example No. 25
# Subset training and testing maps
X_train = X[1000:3000, 1000:3000, 0:X.shape[2]]
y_train = y[1000:3000, 1000:3000]
X_test = X[3000:4000, 1000:3000, 0:X.shape[2]]
y_test = y[3000:4000, 1000:3000]

print("X_train shape {}".format(X_train.shape))
print("y_train shape {}".format(y_train.shape))
print("X_test shape {}".format(X_test.shape))
print("y_test shape {}".format(y_test.shape))

# In[ ]:

# Encode response arrays
y_train = OrdinalEncoder().fit_transform(y_train)
y_test = OrdinalEncoder().fit_transform(y_test)

# In[ ]:

# Plot training map
plt.rcParams['figure.figsize'] = [6.4, 4.8]
plt.rcParams['figure.dpi'] = 144

plt.subplot(121)
plt.imshow(X_train[:, :, 1], cmap=plt.cm.Greys_r)
plt.title('X (training)')

plt.subplot(122)
plt.imshow(y_train, cmap=plt.cm.get_cmap('magma'))
plt.title('y (training)')
Example No. 26
data_categorical = data[categorical_columns]
data_categorical.head()

# %%
print(f"The dataset is composed of {data_categorical.shape[1]} features")

# %% [markdown]
# ### Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such a manner.

# %%
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()
data_encoded = encoder.fit_transform(data_categorical)

print(f"The encoded dataset contains {data_encoded.shape[1]} features")
data_encoded[:5]

# %% [markdown]
# We can see that the categories have been encoded for each feature (column)
# independently. We can also note that the number of features before and after
# the encoding is the same.
#
# However, one has to be careful when using this encoding strategy. The integer
# representation can lead downstream models to assume that the categories are
# ordered: 0 is smaller than 1, which is smaller than 2, etc.
#
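
# %% [markdown]
# When a meaningful order is known, it can be passed explicitly. Below is a
# minimal sketch with a hypothetical `education` column, using the `categories`
# parameter so the integer codes follow the chosen order rather than the
# alphabetical default.

# %%
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

education = pd.DataFrame(
    {"education": ["HS-grad", "Bachelors", "Masters", "HS-grad"]})
encoder = OrdinalEncoder(categories=[["HS-grad", "Bachelors", "Masters"]])
encoder.fit_transform(education)  # HS-grad -> 0.0, Bachelors -> 1.0, Masters -> 2.0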
Example No. 27
#plt.show()

# Use z-score to handle outlier
idx_bmi = find_outliers(data['bmi'])
data = data.loc[idx_bmi == False]
#print(data.info())

#print(data.head())


print("\n############ Before encoding ############")
print(data.head())
print(data.info())

# Encoding : OrdinalEncoder
ordinalencoder = OrdinalEncoder()

# gender
# ['Female', 'Male', 'Other'] = [0,1,2]
gender = data[["gender"]]
gender_encod = ordinalencoder.fit_transform(gender)
data['gender']=gender_encod


# ever_married
# ['No', 'Yes'] = [0,1]
married = data[["ever_married"]]
married_encod = ordinalencoder.fit_transform(married)
data['ever_married']=married_encod

# work_type
Example No. 28
                                                                             2]

titanic["Fare_cat"] = pd.cut(titanic.Fare_p_person,
                             bins=[0, 13, 30, 513],
                             include_lowest=True)

titanic["Age_cat"] = pd.cut(titanic["Age"].astype(int),
                            bins=[-1, 20, 40, 60, 90])

sex_pip = Pipeline([
    ("one_hot", OneHotEncoder(categories=[pd.Series.unique(titanic.Sex)]))
])

fare_cat_pip = Pipeline([
    ("fare_std",
     OrdinalEncoder(categories=[pd.Series.unique(titanic.Fare_cat)]))
])

pclass_pip = Pipeline([("pclass_std", StandardScaler())])

age_cat_pip = Pipeline([
    ("age_cat_std",
     OrdinalEncoder(categories=[pd.Series.unique(titanic.Age_cat)]))
])

fsize_pip = Pipeline([("fsize_std", StandardScaler())])

cabin_pip = Pipeline([
    ("cabin_1hot",
     OneHotEncoder(categories=[pd.Series.unique(titanic.Cabin_letter)]))
])
Example No. 29
test_original = mlib.csv_to_df(path)
test_df = test_original.copy()

# # Create list of features desired for training
feature_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_list = ['Survived']

# Define Numeric Pipeline
num_pipe = Pipeline([('imputer_mean', SimpleImputer(strategy='mean')),
                     ('std_scalar', StandardScaler())])

# Define Categorical Pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    #('ohe' , OneHotEncoder()),
    ('oe', OrdinalEncoder())
])

#Combining Pipes into full pipeline - Train Data
full_pipeline, train_features, target_features, post_trans_train_feature = mlib.Full_PipeLine(
    train_df, feature_list, target_list, num_pipe, cat_pipe)

# Combining Pipes into full pipeline - Test Data
full_pipeline_test, test_features, empty, empty = mlib.Full_PipeLine(
    test_df, feature_list, [], num_pipe, cat_pipe)

# Transform data using final combined pipeline - Train
train_features_prep = full_pipeline.fit_transform(train_features)

# Transform data using final combined pipeline - Test (transform only, no refitting on test data)
test_features_prep = full_pipeline.transform(test_features)
Example No. 30
 def fit_transform(self, X):
     self.encoder = OrdinalEncoder()
     self.n_categorical = (X.dtypes == "category").sum()
     self.encoder.fit(X[X.columns[:self.n_categorical]])
     return self.transform(X)