Code Example #1
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
Code Example #2
def test_ordinal_encoder_raise_categories_shape():

    X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T
    cats = ['Low', 'Medium', 'High']
    enc = OrdinalEncoder(categories=cats)
    msg = ("Shape mismatch: if categories is an array,")

    with pytest.raises(ValueError, match=msg):
        enc.fit(X)
Code Example #3
def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
Code Example #4
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
Code Example #5
def test_ordinal_encoder_raise_missing(X):
    ohe = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.fit_transform(X)

    ohe.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        ohe.transform(X)
Code Example #6
File: lazy.py  Project: Crispae/Project
CLASSIFIERS.append(("XGBClassifier", xgboost.XGBClassifier))
CLASSIFIERS.append(("LGBMClassifier", lightgbm.LGBMClassifier))
# CLASSIFIERS.append(('CatBoostClassifier',catboost.CatBoostClassifier))

numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(
    strategy="mean")), ("scaler", StandardScaler())])

categorical_transformer_low = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoding", OneHotEncoder(handle_unknown="ignore", sparse=False)),
])

categorical_transformer_high = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # 'OrdinalEncoder' raises a ValueError when it encounters an unknown value. Check https://github.com/scikit-learn/scikit-learn/pull/13423
    ("encoding", OrdinalEncoder()),
])

# Helper function


def get_card_split(df, cols, n=11):
    """
    Splits categorical columns into 2 lists based on cardinality (i.e. number of unique values)
    Parameters
    ----------
    df : Pandas DataFrame
        DataFrame from which the cardinality of the columns is calculated.
    cols : list-like
        Categorical columns to split by cardinality
    n : int, optional (default=11)
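# The listing is cut off after the docstring. A minimal sketch of how such a
# cardinality-based split might be implemented, based only on the docstring
# above (the names card_low/card_high are assumptions, not the project's code):
def get_card_split_sketch(df, cols, n=11):
    nunique = df[cols].nunique()
    card_low = [col for col in cols if nunique[col] < n]    # low cardinality -> one-hot encode
    card_high = [col for col in cols if nunique[col] >= n]  # high cardinality -> ordinal encode
    return card_low, card_high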
Code Example #7
# 1. sklearn.preprocessing.OrdinalEncoder - Takes an array-like of strings or integers and creates an
#                                           encoder to transform the data into an array of integer categories.
# sklearn.preprocessing.OneHotEncoder - Takes nominal data in an array-like and encodes into a binary array with
#                                       one place per feature.

#Extended Exercise

#1. Unsure, though it looks like if you fit() a dataset whose categories are NOT already in the desired order,
#  the encoder will still assign integer codes, just not necessarily in the order you want (by default the
#  categories are sorted, e.g. alphabetically for strings); see the small sketch below.
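# A quick illustration of that point (the values here are made up, not part of the exercise):
# by default OrdinalEncoder sorts the categories, while an explicit `categories`
# list controls which integer each category gets.
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

sizes = np.array([['Medium'], ['Low'], ['High'], ['Low']])
print(OrdinalEncoder().fit(sizes).categories_)
# [array(['High', 'Low', 'Medium'], dtype='<U6')]  (alphabetical, not semantic)
print(OrdinalEncoder(categories=[['Low', 'Medium', 'High']]).fit_transform(sizes))
# Low=0, Medium=1, High=2, so the output is [[1.], [0.], [2.], [0.]]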

#2. Using Customer survey data for value of primary residence from University of Michigan.
%run setup.ipy

import quandl
import my_secrets
quandl.ApiConfig.api_key = my_secrets.QUANDL_API_KEY

housing_prices = quandl.get("UMICH/SOC22-University-of-Michigan-Consumer-Survey-Current-Market-Value-of-Primary-Residence")
housing_prices = housing_prices.iloc[:,0:6] #Trim data down to price categories only
housing_prices

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

ord_enc = OrdinalEncoder()
ord_enc.fit(housing_prices)
ord_prices = ord_enc.transform(housing_prices)

hot_enc = OneHotEncoder(categories='auto')
hot_enc.fit(housing_prices)
hot_enc.transform(housing_prices).toarray()
Code Example #8
    #Transform and pipeline
    onehot_transformer = Pipeline(
        steps=[('imputer',
                SimpleImputer(strategy='constant', fill_value='None')
                ), ('encode', OneHotEncoder(handle_unknown='ignore'))])

    numeric_transformer = Pipeline(
        steps=[('imputer', IterativeImputer(KNeighborsRegressor(
            n_neighbors=3))), ('scaler', StandardScaler())])

    preprocess = ColumnTransformer(transformers=[(
        'onehot', onehot_transformer,
        onehot_columns), (
            'num', numeric_transformer,
            numeric_columns), ('binary', OrdinalEncoder(), binary_columns)])

    pipeline = Pipeline(steps=[('preprocess', preprocess), ('mod', model)])

    #Fit the current model
    cv = RepeatedKFold(n_splits=5, n_repeats=10)

    full_pipeline = GridSearchCV(pipeline,
                                 param_grid=params,
                                 cv=cv,
                                 n_jobs=-1,
                                 verbose=1)

    print('Fitting', name,
          'model **************************************************')
    full_pipeline.fit(X, Y)
Code Example #9
    def __init__(
        self,
        regs_path: Path,
        chag_path: Path,
        regs_min: Optional[int] = 4,
    ) -> None:
        # Read dataframe
        regs_df = pd.read_csv(regs_path)

        # Print information
        print(f"Read dataset in {regs_path}")
        print(f"Original regs shape: {regs_df.shape}")

        # get counting information
        regs_counts = regs_df['registant'].value_counts()
        chag_counts = regs_df['challengeId'].value_counts()
        print(f"Original registants size: {regs_counts.size}")
        print(f"Original challenges size: {chag_counts.size}")

        # remove sparse item in counts
        regs_counts = regs_counts[regs_counts >= regs_min]

        # Remove sparse item
        regs_df = regs_df[regs_df['registant'].isin(regs_counts.index)]
        print(f"Filter dataframe shape: {regs_df.shape}")

        # Add previous and period columns
        regs_df = regs_df.sort_values(by=['registant', 'date'])
        regs_df['previousId'] = regs_df['challengeId']
        regs_df['period'] = regs_df['date'].str[:7]

        # Shift previous column
        regs_df['previousId'] = regs_df['previousId'].shift(
            periods=1).fillna(0).astype('int64')

        # Mark each user's first record by setting its previousId to -1
        regs_df = regs_df.sort_values(by=['registant', 'date'])
        first_mask = regs_df.duplicated(subset=['registant'], keep='first')
        regs_df['previousId'] = regs_df['previousId'].where(first_mask, -1)

        # Read attr dataframe
        chag_df: pd.DataFrame = regs_df[['challengeId', 'period']]
        chag_df = chag_df.drop_duplicates(subset=['challengeId'])
        attr_df = pd.read_csv(chag_path,
                              converters={
                                  'technologies': literal_eval,
                                  'platforms': literal_eval
                              })
        chag_df = pd.merge(left=chag_df,
                           right=attr_df,
                           how='inner',
                           on=['challengeId'])
        # Add default row
        print(chag_df.columns)
        chag_df.loc[-1] = (-1, '2005-01', '2005-01-01', 0, [], [])
        chag_df = chag_df.sort_values(by=['date'])

        # Add encoder
        chag_encoder = OneHotEncoder(categories='auto',
                                     handle_unknown='ignore')
        regs_encoder = OneHotEncoder(categories='auto', handle_unknown='error')
        period_encoder = OrdinalEncoder(categories='auto')
        tech_binarizer = MultiLabelBinarizer(sparse_output=True)
        plat_binarizer = MultiLabelBinarizer(sparse_output=True)

        chag_encoder.fit(regs_df[['challengeId']])
        regs_encoder.fit(regs_df[['registant']])
        period_encoder.fit(chag_df[['period']])
        tech_binarizer.fit(chag_df['technologies'].tolist())
        plat_binarizer.fit(chag_df['platforms'].tolist())

        # Split dataset to train, valid, test
        regs_df = regs_df.sort_values(by=['date'])

        last_mask = regs_df.duplicated(subset=['registant'], keep='last')
        remain_df = regs_df[last_mask]
        test_df = regs_df[~last_mask]

        last_mask = remain_df.duplicated(subset=['registant'], keep='last')
        train_df = remain_df[last_mask]
        valid_df = remain_df[~last_mask]

        # Add default config
        self.config_db()
        self._chag_df = chag_df
        self._df_dict: Dict[str, pd.DataFrame] = {
            'train': train_df,
            'valid': valid_df,
            'test': test_df
        }

        self._regs_encoder = regs_encoder
        self._chag_encoder = chag_encoder
        self._period_encoder = period_encoder
        self._tech_binarizer = tech_binarizer
        self._plat_binarizer = plat_binarizer

        regs_size = regs_encoder.categories_[0].size
        chag_size = chag_encoder.categories_[0].size
        seq_size = tech_binarizer.classes_.size + plat_binarizer.classes_.size

        self.feat_dim = regs_size + 2 * chag_size + 2 * seq_size
        self.user_size = regs_size
Code Example #10
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0], [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
Code Example #11
test_original = mlib.csv_to_df(path)
test_df = test_original.copy()

# # Create list of features desired for training
feature_list = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
target_list = ['Survived']

# Define Numeric Pipeline
num_pipe = Pipeline([('imputer_mean', SimpleImputer(strategy='mean')),
                     ('std_scalar', StandardScaler())])

# Define Categorical Pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # ('ohe' , OneHotEncoder()),
    ('oe', OrdinalEncoder())
])

#Combining Pipes into full pipeline - Train Data
full_pipeline, train_features, target_features, post_trans_train_feature = mlib.Full_PipeLine(
    train_df, feature_list, target_list, num_pipe, cat_pipe)

# Combining Pipes into full pipeline - Test Data
full_pipeline_test, test_features, empty, post_transform_test_features = mlib.Full_PipeLine(
    test_df, feature_list, [], num_pipe, cat_pipe)

# Transform data using final combined pipeline - Train
train_features_prep = full_pipeline.fit_transform(train_features)

# Transform data using final combined pipeline - Test (reuse the pipeline already fitted on the training data)
test_features_prep = full_pipeline.transform(test_features)
Code Example #12
def transform_columns(X_train,
                      X_test,
                      column_dict,
                      cat_trans="onehot_encoding",
                      num_trans="standard_scaling"):
    """
    Transforms categorical and numerical features based on user input.

    Arguments
    ---------
    X_train: pandas.core.frame.DataFrame
        A pandas dataframe for training set
    X_test: pandas.core.frame.DataFrame
        A pandas dataframe for test set
    column_dict: dictionary
        A dictionary with keys = 'numeric','categorical' and
        values = a list of columns that fall into each respective category.
    cat_trans: str
        transformation method for categorical features
        (default - 'onehot_encoding')
    num_trans: str
        transformation method for numerical features
        (default - 'standard_scaling')

    Returns
    -------
    dict
        A python dictionary with transformed training and
        test set with keys X_train and X_test respectively
    Examples
    --------
    df_train = pd.DataFrame({'a':[1, 2, 3],
                             'b':[1.2, 3.4, 3.0],
                             'c':['A','B','C']})
    df_test = pd.DataFrame({'a':[6, 2],
                            'b':[0.5, 9.2],
                            'c':['B', 'B']})
    transform_columns(df_train, df_test, {'numeric':['a', 'b'],
    'categorical':['c']})
    """

    # checking user inputs

    # assertions for test and train set inputs
    assert isinstance(X_train, pd.DataFrame), "X_train should be a DataFrame"
    assert isinstance(X_test, pd.DataFrame), "X_test should be a DataFrame"
    assert not isinstance(X_train.columns, pd.RangeIndex), \
        "column names must be strings"
    assert not isinstance(X_test.columns, pd.RangeIndex), \
        "column names must be strings"

    # assertions for dictionary input
    assert isinstance(column_dict, dict),\
        "column_dict should be a python dictionary"
    assert len(column_dict) == 2, \
        "column_dict should have 2 keys - 'numeric' and 'categorical'"

    for key in column_dict.keys():
        assert key in ['numeric', 'categorical'],\
            "column_dict keys can be only 'numeric' and 'categorical'"

    # assertions for transformation inputs
    assert isinstance(num_trans, str), "num_trans should be a string"
    assert isinstance(cat_trans, str), "cat_trans should be a string"
    assert num_trans == "standard_scaling" or num_trans == "minmax_scaling",\
        "transformation method for numeric columns can only" \
        " be 'minmax_scaling' or 'standard_scaling'"
    assert cat_trans == "onehot_encoding" or cat_trans == "label_encoding",\
        "transformation method for categorical columns can only be" \
        " 'label_encoding' or 'onehot_encoding'"

    # Check train set and test set columns are the same
    assert np.array_equal(X_train.columns, X_test.columns),\
        "X_train and X_test must have the same columns"

    for key, values in column_dict.items():
        for column in values:
            assert column in X_train.columns,\
                "columns in dictionary must be in dataframe"

    numeric = column_dict['numeric']
    categorical = column_dict['categorical']

    if cat_trans == 'onehot_encoding':

        if num_trans == "standard_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("stand_scaler", StandardScaler(), numeric),
                ("ohe", OneHotEncoder(drop='first'), categorical)
            ],
                                             sparse_threshold=0)

        if num_trans == "minmax_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("minmax_scaler", MinMaxScaler(), numeric),
                ("ohe", OneHotEncoder(drop='first'), categorical)
            ],
                                             sparse_threshold=0)
            # print(2)

        # Applying transformations to training data set
        X_train = pd.DataFrame(
            preprocessor.fit_transform(X_train),
            index=X_train.index,
            columns=numeric +
            list(preprocessor.named_transformers_['ohe'].get_feature_names(
                categorical)))

        # applying transformations to test set
        X_test = pd.DataFrame(preprocessor.transform(X_test),
                              index=X_test.index,
                              columns=X_train.columns)

    if cat_trans == "label_encoding":

        if num_trans == "standard_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("stand_scaler", StandardScaler(), numeric),
                ("ordinal", OrdinalEncoder(), categorical)
            ],
                                             sparse_threshold=0)
            # print(3)

        if num_trans == "minmax_scaling":
            preprocessor = ColumnTransformer(transformers=[
                ("minmax_scaler", MinMaxScaler(), numeric),
                ("ordinal", OrdinalEncoder(), categorical)
            ],
                                             sparse_threshold=0)
            # print(4)

        # ## Applying transformations to training data set
        X_train = pd.DataFrame(preprocessor.fit_transform(X_train),
                               index=X_train.index,
                               columns=numeric + categorical)

        # applying transformations to test set
        X_test = pd.DataFrame(preprocessor.transform(X_test),
                              index=X_test.index,
                              columns=X_train.columns)

    transformed_dict = {'X_train': X_train, 'X_test': X_test}

    return transformed_dict
Code Example #13
# - use a `RandomizedSearchCV` to find the best set of hyper-parameters by
#   tuning the following parameters: `learning_rate`, `l2_regularization`,
#   `max_leaf_nodes`, and `min_samples_leaf`.

# %%
ordinal_encoding_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'native-country', 'sex'
]

categories = [
    data[column].unique() for column in data[ordinal_encoding_columns]
]

preprocessor = ColumnTransformer(
    [('ordinal-encoder', OrdinalEncoder(categories=categories),
      ordinal_encoding_columns)],
    remainder='passthrough',
    sparse_threshold=0)

model = Pipeline([('preprocessor', preprocessor),
                  ('gbrt', HistGradientBoostingClassifier(max_iter=50))])
param_distributions = {
    'gbrt__learning_rate': expon(loc=0.001, scale=0.5),
    'gbrt__l2_regularization': uniform(loc=0, scale=0.5),
    'gbrt__max_leaf_nodes': randint(5, 30),
    'gbrt__min_samples_leaf': randint(5, 30)
}
model_grid_search = RandomizedSearchCV(model,
                                       param_distributions=param_distributions,
                                       n_iter=10,
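# The listing is truncated in the middle of the RandomizedSearchCV call above.
# A hedged sketch of how the search might be completed and run; the remaining
# arguments and the data_train/target_train names are assumptions, not the
# notebook's actual code:
model_random_search = RandomizedSearchCV(model,
                                         param_distributions=param_distributions,
                                         n_iter=10,
                                         cv=5,
                                         n_jobs=-1)
model_random_search.fit(data_train, target_train)
print(model_random_search.best_params_)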
Code Example #14
File: census9.py  Project: leocjj/0123
 def __init__(self):
     self.age = Pipeline([("scale", MinMaxScaler()),
                          ("input",
                           SimpleImputer(strategy="constant",
                                         fill_value=-1,
                                         add_indicator=True))])
     self.fnlwgt = Pipeline([("scale", MinMaxScaler()),
                             ("input",
                              SimpleImputer(strategy="constant",
                                            fill_value=-1,
                                            add_indicator=True))])
     self.education_num = Pipeline([("scale", MinMaxScaler()),
                                    ("input",
                                     SimpleImputer(strategy="constant",
                                                   fill_value=-1,
                                                   add_indicator=True))])
     self.capital_net = Pipeline([("scale", MinMaxScaler()),
                                  ("input",
                                   SimpleImputer(strategy="constant",
                                                 fill_value=-1,
                                                 add_indicator=True))])
     self.hours_per_week = Pipeline([("scale", MinMaxScaler()),
                                     ("input",
                                      SimpleImputer(strategy="constant",
                                                    fill_value=-1,
                                                    add_indicator=True))])
     self.workclass = Pipeline([("encode", OrdinalEncoder(dtype=np.int)),
                                ("input",
                                 SimpleImputer(strategy="constant",
                                               fill_value=-1,
                                               add_indicator=True)),
                                ("scale", MinMaxScaler())])
     self.marital_status = Pipeline([("encode",
                                      OrdinalEncoder(dtype=np.int)),
                                     ("input",
                                      SimpleImputer(strategy="constant",
                                                    fill_value=-1,
                                                    add_indicator=True)),
                                     ("scale", MinMaxScaler())])
     self.occupation = Pipeline([("encode", OrdinalEncoder(dtype=np.int)),
                                 ("input",
                                  SimpleImputer(strategy="constant",
                                                fill_value=-1,
                                                add_indicator=True)),
                                 ("scale", MinMaxScaler())])
     self.relationship = Pipeline([("encode", OrdinalEncoder(dtype=np.int)),
                                   ("input",
                                    SimpleImputer(strategy="constant",
                                                  fill_value=-1,
                                                  add_indicator=True)),
                                   ("scale", MinMaxScaler())])
     self.race = Pipeline([("encode", OrdinalEncoder(dtype=np.int)),
                           ("input",
                            SimpleImputer(strategy="constant",
                                          fill_value=-1,
                                          add_indicator=True)),
                           ("scale", MinMaxScaler())])
     self.sex = Pipeline([("encode", OrdinalEncoder(dtype=np.int)),
                          ("input",
                           SimpleImputer(strategy="constant",
                                         fill_value=-1,
                                         add_indicator=True)),
                          ("scale", MinMaxScaler())])
     self.native_country = Pipeline([("encode",
                                      OrdinalEncoder(dtype=np.int)),
                                     ("input",
                                      SimpleImputer(strategy="constant",
                                                    fill_value=-1,
                                                    add_indicator=True)),
                                     ("scale", MinMaxScaler())])
Code Example #15
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "OTHER"
    print(agg_labels)
    return agg_labels


"""For *AG: only NAG, CAG and OAG focus labels are valid
For *GEN: only NGEN and GEN focus labels are valid, but the redefining of 
labels might not be necessary"""
focus_label = 'CAG'
agg_labels = redifine_labels(agg_labels, focus_label)

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

agg_labels_encoded = ordinal_encoder.fit_transform(agg_labels)

#%%
print(agg_labels_encoded[:10])
print(ordinal_encoder.categories_)

#%%

from pprint import pprint
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import NuSVC
Code Example #16
# models will be covered in more detail in a future module.
#
# For tree-based models, the handling of numerical and categorical variables is
# simpler than for linear models:
# * we do **not need to scale the numerical features**
# * using an **ordinal encoding for the categorical variables** is fine even if
#   the encoding results in an arbitrary ordering
#
# Therefore, for `HistGradientBoostingClassifier`, the preprocessing pipeline
# is slightly simpler than the one we saw earlier for the `LogisticRegression`:

# %%
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.preprocessing import OrdinalEncoder

categorical_preprocessor = OrdinalEncoder(handle_unknown="use_encoded_value",
                                          unknown_value=-1)

preprocessor = ColumnTransformer(
    [('categorical', categorical_preprocessor, categorical_columns)],
    remainder="passthrough")

model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

# %% [markdown]
# Now that we created our model, we can check its generalization performance.

# %%
# %%time
_ = model.fit(data_train, target_train)
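# %% [markdown]
# A hedged sketch of that generalization check, assuming `data_test` and
# `target_test` come from an earlier train_test_split not shown in this excerpt:

# %%
test_score = model.score(data_test, target_test)
print(f"Test accuracy: {test_score:.3f}")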

# %%
Code Example #17
def test(housing):
    # Look at the basic shape of the data; this can be combined with visualization
    housing.info()
    housing.describe()
    # The data shows a long-tailed distribution
    # housing.hist(bins=50, figsize=(20, 15))
    train_set, test_set = train_test_split(housing,
                                           test_size=0.2,
                                           random_state=42)

    # Convert income into categorical labels (bins)
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                                   labels=[1, 2, 3, 4, 5])
    # housing["income_cat"].hist()

    # Split the data (stratified by income category)
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]

    for set_ in (strat_train_set, strat_test_set):
        set_.drop("income_cat", axis=1, inplace=True)

    housing = strat_train_set.copy()
    '''
    # Note: when plotting, the columns to use are selected via x and y
    # housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    # marker radius s = population, color c = house value
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                 s=housing["population"] / 100, label="population", figsize=(10, 7),
                 c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
                 )
    plt.legend()
    '''

    # Correlations
    corr_matrix = housing.corr()
    corr_matrix["median_house_value"].sort_values(ascending=False)

    attributes = [
        "median_house_value", "median_income", "total_rooms",
        "housing_median_age"
    ]
    # scatter_matrix(housing[attributes], figsize=(12, 8))
    # housing.plot(kind="scatter", x="median_income", y="median_house_value",alpha=0.1)

    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    # Data cleaning
    # Handling missing values (NA)
    housing.dropna(subset=["total_bedrooms"])  # option 1
    housing.drop("total_bedrooms", axis=1)  # option 2
    median = housing["total_bedrooms"].median()  # option 3
    housing["total_bedrooms"].fillna(median, inplace=True)

    # sklearn's imputer for filling missing values
    imputer = SimpleImputer(strategy="median")
    housing_num = housing.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num)
    imputer.statistics_
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    # Encoding: OrdinalEncoder is essentially a multi-feature version of LabelEncoder and can encode several features at once
    housing_cat = housing[["ocean_proximity"]]
    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)

    # ColumnTransformer applies transformers to selected pandas columns
    # Note: list(housing_num) returns the column names of housing_num
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    # Note: the DataFrame column names (attributes) are passed in so each transformer is applied to the right columns
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    # The one-hot + numeric output here is a dense (non-sparse) matrix; each category becomes a 0/1 column.
    # The parts could also be transformed separately and assembled with sparse.hstack, as was done for the Titanic data
    housing_prepared = full_pipeline.fit_transform(housing)

    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("Predictions:", lin_reg.predict(some_data_prepared))
    print("Labels:", list(some_labels))

    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    # The error here is 0 because there is no held-out set; the decision tree is severely overfitting
    print(tree_rmse)

    # With K-fold cross-validation the decision tree performs worse than before; cv sets the number of folds
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    print(tree_rmse_scores)

    # cross_val_score uses a score that is the negative of the usual loss (higher is better)
    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)

    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)

    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)

    forest_rmse_scores = np.sqrt(-forest_scores)
    display_scores(forest_rmse_scores)

    # Automatic hyperparameter search
    param_grid = [
        {
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)

    # 95% confidence interval
    # The usual t statistic is t = (estimate - hypothesized value) / standard error; it follows a t distribution with (n-2) degrees of freedom
    confidence = 0.95
    squared_errors = (final_predictions - y_test)**2
    np.sqrt(
        stats.t.interval(confidence,
                         len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
Code Example #18
#
# df = df[df['APIKEY'].isin(api_list)]

df_train['LABEL'] = df_train['LABEL'].apply(lambda x: 0
                                            if x == 'NOT ANAMOLY' else 1)
df_test['LABEL'] = df_test['LABEL'].apply(lambda x: 0
                                          if x == 'NOT ANAMOLY' else 1)

cat = ['APIKEY', 'DAY', 'TIMEBIN']
cont = list(set(df_train.columns) - set(cat) - set(['ANAMOLYDISTNUM']))

## transformations of df_train
df_cat_train = df_train[cat]
df_cont_train = df_train[cont]

enc = OrdinalEncoder()
cat_transformed = enc.fit_transform(df_cat_train)
df_cat_train = pd.DataFrame(cat_transformed.astype(int),
                            columns=df_cat_train.columns)

df = pd.DataFrame()
for col in df_cat_train.columns:
    df[col] = df_cat_train[col]

for col in df_cont_train.columns:
    df[col] = df_cont_train[col]

df = df.dropna()
df_train = df[list(set(df.columns) - set(['LABEL']))]

## transformations of df_test
Code Example #19

KNN = KNeighborsClassifier(n_neighbors = 4).fit(train[["price","latitude"]],train["room_type"])


# In[37]:


# predict new data
newdata = KNN.predict([[23,1.45],[18,1.31]])
print(newdata)
print()
csf = KNN.predict(test[["price","latitude"]])
accuracy = accuracy_score(test["room_type"],csf)
array = data[['room_type']]
array = OrdinalEncoder().fit_transform(array)

print("ACC : %.2f"%accuracy)


# In[25]:


n = 30
accuracy = np.zeros((n-1))
for i in range(1, n):    
    KNN = KNeighborsClassifier(n_neighbors = i).fit(train[["price", "latitude"]], train["room_type"])  
    classification = KNN.predict(test[["price", "latitude"]])
    accuracy[i - 1] = accuracy_score(test["room_type"], classification)
    
print("Best  ACC : %.2f" % accuracy.max(), ", with k = ", accuracy.argmax() + 1)
Code Example #20
def encode_categoricals(
        data: pd.DataFrame,
        group_cols: List[str]) -> Tuple[pd.DataFrame, OrdinalEncoder]:
    enc = OrdinalEncoder()
    data[group_cols] = enc.fit_transform(data[group_cols].values)
    return data, enc
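# A short usage sketch for the helper above; the DataFrame and column names
# are made up for illustration, not taken from the original project:
import pandas as pd

frame = pd.DataFrame({'store': ['a', 'b', 'a'],
                      'item': ['x', 'x', 'y'],
                      'sales': [3, 5, 2]})
frame, enc = encode_categoricals(frame, group_cols=['store', 'item'])
print(frame[['store', 'item']])  # categorical columns are now numeric codes
print(enc.categories_)           # per-column category order learned by the encoder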
Code Example #21

# The above graph shows the age of the passengers, their class and whether they survived. From the graph,
# it is noticed that there is a lower proportion of third-class survivors than 1st and 2nd class.

# ## Step 4: Pre-Processing the Data
# 
# The data with words, such as the gender/sex, is converted to numbers. 
# The data is then standardized and the features are scaled.
# Logistic regression is then used to determine which features are the most useful in predicting survival.

# In[12]:


from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
Gender_encoded = ordinal_encoder.fit_transform(X_train[["Sex"]])
Gender_encoded[:5]

X_train["Sex"] = Gender_encoded
X_train


# In[13]:


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_train[["Sex","Age","Siblings/Spouses Aboard","Parents/Children Aboard","Fare","Pclass"]])
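# A hedged sketch of the logistic-regression step described above; `y_train`
# (the 'Survived' labels for X_train) is assumed from earlier in the notebook
# and is not shown in this excerpt:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X, y_train)
# On standardized features, coefficient magnitudes give a rough idea of which
# features are most useful for predicting survival.
print(dict(zip(["Sex", "Age", "Siblings/Spouses Aboard",
                "Parents/Children Aboard", "Fare", "Pclass"],
               log_reg.coef_[0].round(3))))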
Code Example #22
data = df.drop(columns=target_name)

df_train, df_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'race', 'native-country', 'sex'
]

categories = [data[column].unique() for column in data[categorical_columns]]

categorical_preprocessor = OrdinalEncoder(categories=categories)

preprocessor = ColumnTransformer(
    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],
    remainder='passthrough',
    sparse_threshold=0)

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline

model = make_pipeline(preprocessor,
                      HistGradientBoostingClassifier(random_state=42))

# %% [markdown]
# TODO: write your solution here
Code Example #23
verbs = textfeatures(verbs)
named_ents = textfeatures(named_ents)
adjs = textfeatures(adjs)

# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
nouns = pd.DataFrame(tfidf.fit_transform(nouns).toarray())
# verbs = pd.DataFrame(tfidf.fit_transform(verbs).toarray())
# named_ents = pd.DataFrame(tfidf.fit_transform(named_ents).toarray())
# adjs = pd.DataFrame(tfidf.fit_transform(adjs).toarray())
# noun_phrases = pd.DataFrame(tfidf.fit_transform(noun_phrases).toarray())

## ordinal encoder
from sklearn.preprocessing import OrdinalEncoder
labels = pd.DataFrame(OrdinalEncoder().fit_transform(labels.to_numpy().reshape(
    -1, 1)))
print("encoded")

## joining
X = pd.concat([
    count_nouns,
    count_named_ents,
    count_verbs,
    count_adjs,
    count_noun_phrases,
    unique_words,
    stopwords,
    articles,
    punc,
    mean_word_len,
    non_voc,
Code Example #24
        drops.append(col)
        # print(col, '=> min:', min(dataset[col].values), '--  max:', max(dataset[col].values), end='')
        # print(' --  values count', len(dataset[col].value_counts()))
    elif len(dataset[col].value_counts()) == 2:
        bins.append(col)
    else:
        cats.append(col)
else:
    dataset.drop(drops, axis=1, inplace=True)

cat_si_step = ('si', SimpleImputer(strategy='constant', fill_value=-99))  # constant-fill imputation for missing categorical values
ohe_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))  # 'ignore' guards against categories unseen at training time
num_si_step = ('si', SimpleImputer(strategy='constant'))
sc_step = ('sc', StandardScaler())

bin_oe_step = ('le', OrdinalEncoder())
bin_si_step = ('si', SimpleImputer(strategy='most_frequent'))

cat_pipe = Pipeline([cat_si_step, ohe_step])
num_pipe = Pipeline([num_si_step, sc_step])
bin_pipe = Pipeline([bin_si_step, bin_oe_step])

transformers = [
    ('cat', cat_pipe, []),
    ('num', num_pipe, cats),
    ('bin', bin_pipe, bins),
]
ct = ColumnTransformer(transformers=transformers)
# X_transformed = ct.fit_transform(dataset)
# print(X_transformed)
Code Example #25
    num for num in X_test.columns if X_test[num].dtype in ['int64', 'float64']
]
cat_col_train = [cat for cat in X.columns if X[cat].dtype == 'object']
cat_col_test = [cat for cat in X_test.columns if X_test[cat].dtype == 'object']

X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      train_size=0.8,
                                                      random_state=1)

cat_col = ['Sex']
X_test2 = X_test.copy()
X_train2 = X_train.copy()
X_valid2 = X_valid.copy()
encoder = OrdinalEncoder()
# Fit the encoder on the training data only, then reuse it for the validation and test sets
X_train2[cat_col] = encoder.fit_transform(X_train[cat_col])
X_valid2[cat_col] = encoder.transform(X_valid[cat_col])
X_test2[cat_col] = encoder.transform(X_test[cat_col])

X_test1 = X_test2.copy()
X_train1 = X_train2.copy()
X_valid1 = X_valid2.copy()
num = ['Fare']
simple = SimpleImputer()
# Impute with statistics learned from the training data only
X_train1[num] = pd.DataFrame(simple.fit_transform(X_train2[num]))
X_valid1[num] = pd.DataFrame(simple.transform(X_valid2[num]))
X_test1[num] = pd.DataFrame(simple.transform(X_test2[num]))
X_test1[num].columns = X_test[num].columns
X_train1[num].columns = X_train[num].columns
X_valid1[num].columns = X_valid[num].columns
Code Example #26
def _encode_categorical_values(features: pd.DataFrame,
                               category_names: Iterable[str],
                               is_ohe: bool = False) -> Tuple[pd.DataFrame, Iterable[str]]:
    encoder = OneHotEncoder(dtype='uint8', sparse=False) if is_ohe else OrdinalEncoder(dtype='uint8')
    mask_category = features.columns.isin(category_names)
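    # The listing stops here. A possible continuation is sketched below; this is
    # an assumption based on the signature above, not the project's actual code.
    category_cols = features.columns[mask_category]
    encoded = encoder.fit_transform(features[category_cols])
    encoded_names = (encoder.get_feature_names(category_cols) if is_ohe
                     else category_cols)
    encoded_df = pd.DataFrame(encoded, columns=encoded_names, index=features.index)
    features = pd.concat([features.loc[:, ~mask_category], encoded_df], axis=1)
    return features, encoded_names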
Code Example #27
#input variables encoding

# float variables: NEP_score, GEK_score
# categorical variables: Sex, Age, Marital_status, Place_of_residence, Education_level, Current_occupation, Monthly_net_income, 
# Household_size, Financial_situation, Satisfacion_with_life, Diet, Social_media, Religious_practices, Next_election 


# numeric features - standardization
numeric_features = ['NEP_score', 'GEK_score']
#change integer to float
for f in numeric_features:
    data_df[f] = data_df[f].astype(np.float64)

scaler = StandardScaler()
ordinal = OrdinalEncoder()
onehot = OneHotEncoder()

################ rework this part so that a Pipeline containing the model is not needed (i.e. the Pipeline
################ should at most transform the variables), or so that the transformers are swapped inside loops or similar

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])


# categorical features - transformation into binary
categorical_features = ['Sex', 'Marital_status', 'Current_occupation', 'Diet', 'Religious_practices', 'Next_election']

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])
Code Example #28
def eval_wrapper(setting_name,
                 proj_root,
                 train_files,
                 test_files=[],
                 label_column='bin_label',
                 text_vars=[],
                 categ_vars=[],
                 num_vars=[],
                 bin_vars=[],
                 nn_settings=None):
    print(f"/// New config run: '{setting_name}' ///")
    # >  Load data
    # |- Train Data
    times = {}
    times['start'] = time.time()
    print("> Loading TRAINING data...")
    train_loader = BalancedData(proj_root=proj_root, file_paths=train_files)
    train_loader.load_data()
    train_df = train_loader.get_all_data_df()

    train_df['bin_label'] = train_df.apply(lambda x: NNA.binarize_label(x),
                                           axis=1)
    times['train-data-loaded'] = time.time()
    # |- Test Data
    if len(test_files) > 0:
        print("> Loading TEST data...")
        test_loader = BalancedData(proj_root=proj_root, file_paths=test_files)
        test_loader.load_data()
        test_df = test_loader.get_all_data_df()

        test_df['bin_label'] = test_df.apply(lambda x: NNA.binarize_label(x),
                                             axis=1)
    else:
        print("> Splitting df into test and train set.")
        train_df, test_df = train_test_split(train_df,
                                             test_size=0.3,
                                             random_state=42)
        train_df = train_df.copy()  # Avoid SettingWithCopy Warning from pandas
        test_df = test_df.copy()
    times['test-data-loaded'] = time.time()

    N_CLASSES = len(train_df[label_column].unique())
    print(f"  Found {N_CLASSES} classes to predict.")

    # >  Preprocess data
    # |- Encode labels
    print("> Encoding labels...")
    oe = OrdinalEncoder()
    #oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    np_labels = train_df[label_column].to_numpy().reshape(-1, 1)
    oe.fit(np_labels)
    train_df[label_column] = oe.transform(np_labels)
    test_df[label_column] = oe.transform(
        test_df[label_column].to_numpy().reshape(-1, 1))

    test_df.head()

    # |- Create TF datasets
    print("> Creating tf.DataSets...")
    train_ds = NNA.df_to_dataset(train_df, label_column, batch_size=128)
    test_ds = NNA.df_to_dataset(test_df, label_column, batch_size=128)

    times['data-ready'] = time.time()
    # >  Setup model
    print("> Setting up model...")
    # |- Combine features
    encoded_features, all_inputs = NNA.combine_input_pipelines(
        categ_vars, text_vars, num_vars, bin_vars, train_ds, nn_settings)

    # |- Create model
    model = NNA.create_model(encoded_features,
                             all_inputs,
                             N_CLASSES,
                             n_units=nn_settings['classifier']['n_units'],
                             n_layers=nn_settings['classifier']['n_layers'],
                             dropout=nn_settings['classifier']['dropout'])
    times['model-ready'] = time.time()
    # >  Train model
    print("> Training model...")
    NNA.fit_model(model,
                  train_ds,
                  test_ds,
                  epochs=nn_settings['fitting']['n_epochs'],
                  early_stopping=True)
    times['model-trained'] = time.time()
    # >  Save model
    print("> Saving model...")
    model.save(path.join(proj_root, "models", setting_name))
    times['model-saved'] = time.time()

    # >  Eval model on test data
    print("> Evaluating model on test data...")
    eval_dict = NNA.create_eval_dict(test_df, label_column, oe, model)
    times['model-tested'] = time.time()

    eval_dict['times'] = times
    # >  Save eval metrics
    eval_path = path.join(proj_root, "models", f"{setting_name}_eval.json")
    print(f"> Saving eval metrics under {eval_path}...")
    with open(eval_path, "w") as f:
        json.dump(eval_dict, f, indent=4, default=convert)

    print(f"## Work done for current setting '{setting_name}' ##")
    return
Code Example #29
def prepare_inputs(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc
Code Example #30
def encode_catogory_features(X_train, X_valid, columns):
    oe = OrdinalEncoder()
    oe.fit(X_train[columns])
    X_train.loc[:, columns] = oe.transform(X_train[columns])
    X_valid.loc[:, columns] = oe.transform(X_valid[columns])
    return X_train, X_valid
Code Example #31
# %% [markdown]
# ## Strategies to encode categories
#
# ### Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such manner.
# We will start by encoding a single column to understand how the encoding
# works.

# %%
from sklearn.preprocessing import OrdinalEncoder

education_column = data_categorical[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

# %% [markdown]
# We see that each category in `"education"` has been replaced by a numeric
# value. We could check the mapping between the categories and the numerical
# values by checking the fitted attribute `categories_`.

# %%
encoder.categories_

# %% [markdown]
# Now, we can check the encoding applied on all categorical features.

# %%
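# A hedged sketch of that check, assuming `data_categorical` holds all the
# categorical columns selected earlier in the notebook:
data_encoded = encoder.fit_transform(data_categorical)
data_encoded[:5]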
Code Example #32
File: pipeline.py  Project: CaioMar/mlops
         reference_variable=config.model_config.drop_features,
     ),
 ),
 (
     "rare_label_encoder",
     RareLabelCategoricalEncoder(
         tol=config.model_config.rare_label_tol,
         n_categories=config.model_config.rare_label_n_categories,
         variables=config.model_config.categorical_vars,
     ),
 ),
 (
     "categorical_encoder",
     pp.SklearnTransformerWrapper(
         variables=config.model_config.categorical_vars,
         transformer=OrdinalEncoder(),
     ),
 ),
 (
     "drop_features",
     pp.DropUnecessaryFeatures(
         variables_to_drop=config.model_config.drop_features,
     ),
 ),
 (
     "gb_model",
     GradientBoostingRegressor(
         loss=config.model_config.loss,
         random_state=config.model_config.random_state,
         n_estimators=config.model_config.n_estimators,
     ),
Code Example #33
plot_scaling_result(data, stand_scaled, 'Standard Scaling', (-7, 7))

# %%
from sklearn.preprocessing import RobustScaler

robust_scaled = RobustScaler().fit_transform(data)
plot_scaling_result(data, robust_scaled, 'Robust Scaling', (-7, 7))

# %%
property_type = np.array(['House', 'Unit', 'Townhouse', 'House',
                          'Unit']).reshape(-1, 1)

# %%
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder().fit(property_type)
labels = enc.transform(property_type)
labels.flatten()

# %%
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False).fit(property_type)
one_hots = enc.transform(property_type)
one_hots

# %%
n_rooms = np.array([1, 2, 1, 4, 6, 7, 12, 20])
pd.cut(n_rooms, bins=[0, 3, 8, 100], labels=["small", "medium", "large"])

# %%
Code Example #34
File: xgb_bank.py  Project: positron1/amlb
def noprep(dataset,
           dirt,
           numeric_features,
           categorical_features,
           delim=',',
           indexdrop=False):
    index_features = ['_dmIndex_', '_PartInd_']
    data = pd.read_csv(dirt + dataset + '.csv',
                       delimiter=delim)  # panda.DataFrame
    print(data.columns)
    data = data.astype({'_dmIndex_': 'int', '_PartInd_': 'int'})
    numeric_features = list(
        set(data.select_dtypes(include=["number"])) - set(index_features) -
        set(['y']))
    categorical_features = list(
        set(data.select_dtypes(exclude=["number"])) - set(['y']))
    ###############################
    index_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-1))])
    y_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant',fill_value=-1)),\
                                   ('orden', OrdinalEncoder())])
    numeric_transformer = Pipeline(steps=[('imputer',
                                           SimpleImputer(strategy='median'))])

    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\
        ('onehot', OneHotEncoder(sparse=False))])

    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),\
         ('cat', categorical_transformer, categorical_features), ('y',y_transformer,['y']),('index',index_transformer, index_features)])

    ######################################################################
    data = preprocessor.fit_transform(data)
    data = pd.DataFrame(data)
    col = data.columns.values
    print(col)
    X = data.drop(col[-3:], axis=1)
    X_train = data[data[col[-1]] > 0].drop(
        col[-3:], axis=1)  #pd.DataFrame(X).to_csv('X_vanilla.csv')
    X_test = data[data[col[-1]] == 0].drop(
        col[-3:], axis=1)  #pd.DataFrame(X).to_csv('X_vanilla.csv')
    print(data.shape)

    ####################################################################
    #y= data["y"]
    #lb = preprocessing.LabelBinarizer()
    #y= lb.fit_transform(y)
    #data["y"]=data.where(data["y"]=='yes',1)
    #data["y"]=data.where(data["y"]=='no',0)
    y = data[col[-3]]
    y_train = data[data[col[-1]] > 0][col[-3]]
    y_test = data[data[col[-1]] == 0][col[-3]]
    ##########################################################
    ##################################################################
    feat_type = []  #dict()
    xcol = X.columns.values
    for cl in xcol:
        if cl in categorical_features:
            feat_type.append(1)
        else:
            feat_type.append(0)
    features = numeric_features + categorical_features
    #X_train, X_test, y_train, y_test = \
    #sklearn.model_selection.train_test_split(X, y,test_size=0.2, random_state=1)
    return data, X, y, X_train, y_train, X_test, y_test, feat_type, features
Code Example #35
---

# Predictive Analysis

We will now begin the predictive analysis, using machine learning techniques to estimate whether a passenger could have survived this tragedy or not.

The algorithm selected for this analysis is Random Forest.

I chose this algorithm because it is simple to implement, works well for classification, and also because it offers an "explanation" of the decisions made and of the most important features (we can inspect the feature importances and see which features carry the most weight in the decisions).

To start, let's convert the categorical data to numeric, since machines like numbers!
"""

# DATASET/TRAINING - Converting categorical data to numeric
df_treino['sexo'] = OrdinalEncoder().fit_transform(
    df_treino['sexo'].values.reshape((-1, 1)))
df_treino['poltrona'] = OrdinalEncoder().fit_transform(
    df_treino['poltrona'].values.reshape((-1, 1)))
df_treino['local_de_embarque'] = OrdinalEncoder().fit_transform(
    df_treino['local_de_embarque'].values.reshape((-1, 1)))
df_treino['sobrevivente'] = OrdinalEncoder().fit_transform(
    df_treino['sobrevivente'].values.reshape((-1, 1)))

# Converting float to int
#
# Age will be kept as float, since estimated ages are in the xx.5 format
df_treino['sexo'] = df_treino['sexo'].apply(lambda x: int(x))
df_treino['poltrona'] = df_treino['poltrona'].apply(lambda x: int(x))
df_treino['local_de_embarque'] = df_treino['local_de_embarque'].apply(
    lambda x: int(x))
df_treino['sobrevivente'] = df_treino['sobrevivente'].apply(lambda x: int(x))
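# The Random Forest step described at the start of this example is not shown
# in the listing. A hedged sketch of how it might look, assuming 'sobrevivente'
# is the target and the remaining columns of df_treino are numeric features:
from sklearn.ensemble import RandomForestClassifier

X = df_treino.drop(columns=['sobrevivente'])
y = df_treino['sobrevivente']

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Inspect the feature importances mentioned above
for name, importance in zip(X.columns, rf.feature_importances_):
    print(name, round(importance, 3))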