Example #1
def test_ordinal_encoder(X):
    enc = OrdinalEncoder()
    exp = np.array([[0, 1, 0],
                    [1, 0, 0]], dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp.astype('float64'))
    enc = OrdinalEncoder(dtype='int64')
    assert_array_equal(enc.fit_transform(X), exp)
Example #2
def test_ordinal_encoder_raise_missing(X):
    enc = OrdinalEncoder()

    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.fit(X)

    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.fit_transform(X)

    enc.fit(X[:1, :])

    with pytest.raises(ValueError, match="Input contains NaN"):
        enc.transform(X)
Example #3
def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)
Example #4
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    enc = OrdinalEncoder(categories=cats)
    exp = np.array([[0.], [1.]])
    assert_array_equal(enc.fit_transform(X), exp)
    assert list(enc.categories[0]) == list(cats[0])
    assert enc.categories_[0].tolist() == list(cats[0])
    # manually specified categories should have same dtype as
    # the data when coerced from lists
    assert enc.categories_[0].dtype == cat_dtype

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    enc = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        enc.fit(X2)
Example #5
def get_neighborhood_colors(column: str = "Neighborhood"):
    # Could also do column = 'MSZoning'
    x: pd.Series = features[column]
    enc = OrdinalEncoder()
    colors = enc.fit_transform(x.to_numpy().reshape([len(x), 1]))
    return colors
Example #6
housing_tr = pd.DataFrame(X,
                          columns=housing_num.columns,
                          index=housing_num.index)

# The housing_cat variable is the only categorical variable, so the book examines it and then converts it into
# a one-hot encoding of the various choices. I think it would be better to make it ordinal

# In[55]:

housing[["ocean_proximity"]].head(10)

# In[56]:

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

housing_cat_encoded = ordinal_encoder.fit_transform(
    housing[["ocean_proximity"]])

# In[57]:

housing_cat_encoded[:10]

# In[58]:

ordinal_encoder.categories_

# The book really wants us to use a one-hot array with individual values for each of the categories.
#
# There's no built-in way to renumber the categories after fitting; you can
# remap the values yourself after the encoder has run, or fix the ordering up
# front with the `categories` parameter, as sketched below.
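# Editorial sketch (not from the original notebook): passing an explicit
# `categories` list controls which integer each category gets. The category
# strings below assume the usual values of "ocean_proximity" in this dataset.

ordered_encoder = OrdinalEncoder(
    categories=[["INLAND", "<1H OCEAN", "NEAR OCEAN", "NEAR BAY", "ISLAND"]])
housing_cat_ordered = ordered_encoder.fit_transform(housing[["ocean_proximity"]])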

# In[59]:
Example #7
    #Since we already stored the incomplete rows in
    #"sample_incomplete_rows", we're just checking to ensure those values were replaced with the median

    #Recall: the ".loc" locates values in a Pandas DataFrame  <-- see documentation
    print(housing_tr.loc[sample_incomplete_rows.index.values])

    #NOTE: For pushing "bare" repo to Github: $ git remote add origin https://github.com/MSilberberg0619/Machine_Learning_Practice.git

    #"ocean_proximity" was left out because it's a text attribute and so the median can't be computed
    #To fix, convert these categories from text to numbers using Scikit-Learn's OrdinalEncoder class
    housing_cat = housing[["ocean_proximity"]]
    print(housing_cat.head(10))

    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    print(housing_cat_encoded)

    #One-hot encoding gives each category its own binary attribute, so no two
    #categories look artificially "closer" to each other than the rest
    #We don't want the model to assume some natural ordering to the data --> could result in poor performance or
    #unexpected results
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
    print(housing_cat_1hot)
    housing_cat_1hot.toarray()
    print(housing_cat_1hot)

    #List of categories using the encoder's categories instance variable
    print(cat_encoder.categories_)
Example #8
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)
housing_tr

### Data Type Conversion

- Categorical to Ordinal

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder(categories='auto')

ocean_proximity_cat = housing[["ocean_proximity"]]
ocean_proximity_cat.head(10)

ocean_proximity_ordinal = ordinal_encoder.fit_transform(ocean_proximity_cat)
ocean_proximity_ordinal[:10]

ordinal_encoder.categories_

pd.DataFrame(ocean_proximity_ordinal).value_counts()

- One-hot encoding

from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder()
ocean_proximity_onehot = onehot_encoder.fit_transform(ocean_proximity_cat)

ocean_proximity_onehot

ocean_proximity_onehot.toarray()
Example #9
def redifine_labels(agg_labels, focus_label):
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "ANOTHER"
    print(agg_labels)
    return agg_labels


agg_labels_train = redifine_labels(agg_labels_train, focus_label)
agg_labels_dev = redifine_labels(agg_labels_dev, focus_label)


from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder_train = OrdinalEncoder()

agg_labels_train_encoded = ordinal_encoder_train.fit_transform(agg_labels_train)

#%%
print(agg_labels_train_encoded[:10])
print(ordinal_encoder_train.categories_)

ordinal_encoder_dev = OrdinalEncoder()

agg_labels_dev_encoded = ordinal_encoder_dev.fit_transform(agg_labels_dev)

#%%
print(agg_labels_dev_encoded[:10])
print(ordinal_encoder_dev.categories_)

#%%
from time import time
Example #10
def main():
    path = '../data/persons'
    data = pd.read_csv(f'{path}/person_data_clean.csv', header=0)

    cat_cols = [
        'person_type', 'trafficway_type', 'manner_of_collision', 'body_type',
        'seating_position', 'ejection', 'safety_equipment_use'
    ]
    binary_cols = ['sex', 'land_use_urban', 'rollover', 'air_bag_deployed']
    numeric_cols = ['age']

    data[cat_cols] = data[cat_cols].apply(lambda x: x.astype('category'))

    labels = data['fatality']
    features = data[cat_cols + binary_cols + numeric_cols]

    # features = pd.get_dummies(features, columns=cat_cols)
    # features.rename(columns={'manner_of_collision_Not Collision with Motor Vehicle in Transport (Not Necessarily in Transport for\n2005-2009)': 'manner_of_collision_Not Collision with Motor Vehicle in Transport'},
    #                 inplace=True)
    feature_names = features.columns

    oe = OrdinalEncoder()
    features = oe.fit_transform(features)

    scaler = StandardScaler()
    features = scaler.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=2020)
    print('Class Balance')
    print(y_test.value_counts())
    print()

    models = {
        'Random Forest': (RandomForestClassifier(n_estimators=100,
                                                 min_samples_leaf=5,
                                                 class_weight='balanced',
                                                 random_state=2020), 'rf'),
        'Logistic Regression': (LogisticRegressionCV(cv=5,
                                                     scoring='f1',
                                                     class_weight='balanced',
                                                     max_iter=500,
                                                     random_state=2020), 'lr')
    }

    for name, (model, suffix) in models.items():
        print(name)
        print('-' * 20)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]

        utils.print_metrics(y_test, y_pred)
        utils.roc_curve(y_test, y_probs, name, suffix)
        utils.feature_importance(model, feature_names, name, suffix)
        utils.permutation_importances(model, X_test, y_test, feature_names,
                                      name, suffix)
        # utils.permutation_importances(model, X_train, y_train, feature_names, name, suffix + '_ohe', dataset='train')
        print('#' * 50)
Example #11
def encode_categoricals(
        data: pd.DataFrame,
        group_cols: List[str]) -> (pd.DataFrame, OrdinalEncoder):
    enc = OrdinalEncoder()
    data[group_cols] = enc.fit_transform(data[group_cols].values)
    return data, enc
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}

# Convert the feature to integers
dataframe["Score"].replace(scale_mapper)

dataframe = pd.DataFrame({
    "Score":
    ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"]
})

scale_mapper = {"Low": 1, "Medium": 2, "Barely More Than Medium": 3, "High": 4}

dataframe["Score"].replace(scale_mapper)

scale_mapper = {
    "Low": 1,
    "Medium": 2,
    "Barely More Than Medium": 2.1,
    "High": 3
}
dataframe["Score"].replace(scale_mapper)

from sklearn.preprocessing import OrdinalEncoder

features = np.array([["Low", 10], ["High", 50], ["Medium", 3]])

ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit_transform(features)

ordinal_encoder.categories_
Example #14
# Shuffling users
user_vector = shuffled_ratings['user_emb_id'].values
print('Users:', user_vector, ', shape =', user_vector.shape)

# Shuffling movies
movie_vector = shuffled_ratings['movie_emb_id'].values
print('Movies:', movie_vector, ', shape =', movie_vector.shape)

# Shuffling ratings
rating_vector = shuffled_ratings['rating'].values
print('Ratings:', rating_vector, ', shape =', rating_vector.shape)

enc = OrdinalEncoder()
ratings[['age_desc',
         'occ_desc']] = enc.fit_transform(ratings[['age_desc', 'occ_desc']])

features = ratings[['age_desc', 'occ_desc', 'movie_id']].values
labels = ratings['rating'].values  # 1-D target for the regressor

model = RandomForestRegressor()
model.fit(features, labels)

# Show the RMSE (computed here on the training data)
y_pred = model.predict(features)

val_loss = np.sqrt(metrics.mean_squared_error(labels, y_pred))
print('Minimum RMSE {:f}'.format(val_loss))

save_obj(enc, model_path, 'feature_encoder')
save_obj(model, model_path, 'rf_recommender')
Example #15
# %% [markdown]
# ## Encoding ordinal categories
#
# The most intuitive strategy is to encode each category with a different
# number. The `OrdinalEncoder` will transform the data in such a manner.
# We will start by encoding a single column to understand how the encoding
# works.

# %%
from sklearn.preprocessing import OrdinalEncoder

education_column = data_categorical[["education"]]

encoder = OrdinalEncoder()
education_encoded = encoder.fit_transform(education_column)
education_encoded

# %% [markdown]
# We see that each category in `"education"` has been replaced by a numeric
# value. We could check the mapping between the categories and the numerical
# values by checking the fitted attribute `categories_`.

# %%
encoder.categories_
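# %% [markdown]
# Editorial aside: the mapping can be made explicit by pairing each category
# with its integer code, since the code is simply the category's position in
# `categories_`. This assumes `encoder` was fitted on `education_column` above.

# %%
{category: code for code, category in enumerate(encoder.categories_[0])}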

# %% [markdown]
# Now, we can check the encoding applied to all categorical features.

# %%
data_encoded = encoder.fit_transform(data_categorical)
Example #16
cat_indices = [
    i for i, column in enumerate(train.columns)
    if not is_numeric_dtype(train[column])
]

# Encode categorical data
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value",
                                 unknown_value=np.nan)

X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

for col in categorical:
    X_train[col].fillna("NaN", inplace=True)
    X_test[col].fillna("NaN", inplace=True)

X_train[categorical] = ordinal_encoder.fit_transform(X_train[categorical])
X_test[categorical] = ordinal_encoder.transform(X_test[categorical])

X_train[X_train_copy.isnull()] = np.nan
X_test[X_test_copy.isnull()] = np.nan
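# Editorial note: filling with the "NaN" placeholder lets the encoder fit
# without errors; restoring np.nan afterwards lets
# HistGradientBoostingRegressor handle the missing entries natively.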

# Tune max_iters (number of trees)
params = {
    "categorical_features": cat_indices,
    "scoring": "neg_root_mean_squared_error"
}
max_iters_param_grid = {"max_iter": range(20, 120, 10)}
hgbr_1 = HistGradientBoostingRegressor(**params)
g_search = GridSearchCV(hgbr_1, max_iters_param_grid)
_ = g_search.fit(X_train, y_train)
Example #17
print(label)                                       # inspect the resulting labels

print(le.fit_transform(y))                         # fit_transform also works in a single step
print(le.inverse_transform(label))                 # inverse_transform reverses the encoding

# In[]:
from sklearn.preprocessing import OrdinalEncoder

y = data_.iloc[:,-1].values.reshape(-1,1)
## the categories_ attribute is the exact counterpart of LabelEncoder's classes_
enc = OrdinalEncoder()
enc.fit(y)
print(enc.categories_)
data_.iloc[:,-1] = enc.transform(y)

data_.iloc[:,1:-1] = enc.fit_transform(data_.iloc[:,1:-1]) # one step with fit_transform

# In[]:
from sklearn.preprocessing import OneHotEncoder

X = data_.iloc[:,1:-1]
 
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
 
#fit_transform would do this in one step; it is split into three steps here to show the model attributes
OneHotEncoder(categories='auto').fit_transform(X).toarray()
 
#the encoding can still be inverted
pd.DataFrame(enc.inverse_transform(result))
 
Example #18
def main():
    np.set_printoptions(
        threshold=10)  # Ndarray display threshold to avoid hiding some columns
    print('HOUSING_PATH=', HOUSING_PATH)
    print('HOUSING_URL=', HOUSING_URL)
    fetch_housing_data(HOUSING_URL, HOUSING_PATH)
    print('After fetch_housing_data')
    housing = load_housing_data(HOUSING_PATH)
    print('After load_housing_data')
    print(housing.head())

    # INFO statement
    print("\nINFO statement:")
    print(housing.info())

    # Value counts
    print("\nValue counts:")
    print(housing["ocean_proximity"].value_counts())

    # "describe" statement for summary
    print("\nDESCRIBE statement:")
    print(housing.describe())

    # Plot data
    #housing.hist(bins=50,figsize =(20,15))
    #plt.show()

    # Test set sampling - random vs stratification
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                                   labels=[1, 2, 3, 4, 5])
    housing["income_cat"].hist()
    #plt.show()

    # Random test set
    rand_train_set, rand_test_set = train_test_split(housing,
                                                     test_size=0.2,
                                                     random_state=42)

    # Stratification of data
    print("\nStratify housing data:")
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    print(split.split(housing, housing["income_cat"]))
    print(len(list(split.split(housing, housing["income_cat"]))))

    ic = 0
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        ic += 1
        print("ic = ", ic)
        print(len(train_index), train_index)
        print(len(test_index), test_index)
        #sys.exit()
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
        strat_full_set = housing

    #rts = (rand_test_set["income_cat"].value_counts()/len(rand_test_set)).sort_index()
    #sts = (strat_test_set["income_cat"].value_counts()/len(strat_test_set)).sort_index()
    #sfs = (strat_full_set["income_cat"].value_counts()/len(strat_full_set)).sort_index()
    #print('rand_test:  \n{0}'.format(rts))
    #print('strat_test: \n{0}'.format(sts))
    #print('strat_full: \n{0}'.format(sfs))

    # Separate predictors and labels
    print("\nRevert training set:")
    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    housing_cat = housing[["ocean_proximity"]]
    housing_cat_head = housing_cat.head(10)

    print("housing_cat.head(10) = {}".format(housing_cat.head(10)))
    #print("housing_cat.head(10) = {}".format(housing_cat_head))

    # Simple imputer
    imputer = SimpleImputer(strategy="median")
    housing_num_only = housing.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num_only)
    print("imputer.statistics_ = {0}".format(imputer.statistics_))
    print("housing_num_only.median() = {0}".format(housing_num_only.median()))
    X = imputer.transform(housing_num_only)
    housing_tr = pd.DataFrame(X, columns=housing_num_only.columns)
    print('housing_tr.info() : ')
    print(housing_tr.info())
    ''' Encoding '''
    # Ordinal encoder : replace categorical attributes into numbers
    # Issue with this method is the "distance" between the numerical values
    print("\nOrdinal encoder:")
    ordinal_encoder = OrdinalEncoder()
    housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
    print("housing_cat_encoded = {0}".format(housing_cat_encoded[:10]))
    print("ordinal_encoder.categories_ = {0}".format(
        ordinal_encoder.categories_))

    # One-hot encoder: split the category into binary indicator columns
    # This avoids the "distance" problem of the ordinal encoder
    # Output is a SciPy sparse matrix. Use toarray() to convert to a numpy array
    print("\nOne-hot encoder:")
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
    print("housing_cat_1hot = {0}".format(housing_cat_1hot))
    print("housing_cat_1hot.toarray() = {0}".format(
        housing_cat_1hot.toarray()))

    # Attribute adder
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Transformation pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    print("num_pipeline = {0}".format(type(num_pipeline)))
    housing_num_tr = num_pipeline.fit_transform(housing_num_only)
    print("housing_num_tr = {0}".format(housing_num_tr))

    num_attribs = list(housing_num_only)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    housing_prepared = full_pipeline.fit_transform(housing)

    return
Example #19
df = df[cat_cols].astype("category")
print("Revised column data types ", df.dtypes)

# revalue column values
# df['salary'] = df['salary'].map({'>50K': "above_50K", '<=50K': "less_equal_50K"})
# df['salary'].replace(to_replace={'>50K': 'above_50K', '<=50K': 'less_eq_50K'}, inplace=True)
# df['salary'].replace('>50K','above_50K', inplace=True)
# df['salary'].replace('<=50K','less_eq_50K', inplace=True)
# df['salary'].replace(['>50K','<=50K'],['above_50K','less_eq_50K'],inplace=True)
# print(df.head)
# check for high correlated variables

# use ordinal encoder to convert categorical to numbers

enc = OrdinalEncoder()
df[cat_cols] = enc.fit_transform(df[cat_cols])
print(df.head)
print(enc.categories_)

# Model Building
# train_set, validate_set, test_set = train_validate_test_split(df)
# print("Train set: ", train_set.shape)
# print("Test set: ", test_set.shape)
# print("Validate set: ", validate_set.shape)
#
# print("Dataframe cols: ", df.columns)

# X_train_data = train_set[['workclass', 'education', 'marital-status', 'occupation',
#        'relationship', 'race', 'sex', 'native-country']].copy()
# y_train_label = train_set[['salary']].copy()
Example #20
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
print(X_train_minmax)
print("mean", np.mean(X_train_minmax, axis=0))
print("SD", np.std(X_train_minmax, axis=0))

X_test = np.array([[-3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

#Task 3
import pandas as pd

import numpy as np

#convert categorical features to a numerical representation.
X = pd.DataFrame(
    np.array([
        'M', 'O-', 'medium', 'M', 'O-', 'high', 'F', 'O+', 'high', 'F', 'AB',
        'low', 'F', 'B+', 'NA'
    ]).reshape((5, 3)))
X.columns = ['sex', 'blood_type', 'edu_level']

print(X)

from sklearn.preprocessing import OrdinalEncoder
#requires sklearn >= 0.20 -- check your version!
encoder = OrdinalEncoder()
X.edu_level = encoder.fit_transform(X.edu_level.values.reshape(-1, 1))
print(X)
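# Editorial sketch: by default OrdinalEncoder sorts the categories
# lexicographically ('NA' < 'high' < 'low' < 'medium' in ASCII order), which
# ignores the semantic order of edu_level. The order can be enforced on the raw
# strings instead (placing 'NA' first is an assumption):
ordered = OrdinalEncoder(categories=[['NA', 'low', 'medium', 'high']])
edu_codes = ordered.fit_transform(
    np.array(['medium', 'high', 'high', 'low', 'NA']).reshape(-1, 1))
print(edu_codes)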
Example #21
def compute_spatial_distribution(
    test_features,
    test_labs_true,
    test_labs_pred,
    base_features,
    base_labs,
    numerical_dist_metric=None,
    categorical_dist_metric=None,
    summary="mean",
):
    """Compute a summary of the pairwise distances between points.

    This computes pairwise distances between the test points and base points in
    feature space (ie. how far each test point is from each base point), and
    returns a summary of the distance for each test point relative to each base
    class.

    Parameters
    ----------
    test_features : DataFrame
        Feature values for the test dataset.
    test_labs_true : Series
        True labels for the test dataset.
    test_labs_pred : Series
        Labels predicted by a model for the test dataset.
    base_features:  DataFrame
        Feature values for the base dataset.
    base_labs : Series
        True labels for the base dataset.
    numerical_dist_metric : dict
        The metrics to use to measure distance between numerical
        (continuous)-valued columns. This should be a dict mapping column names
        to strings, each a named metric as accepted by
        `sklearn.metrics.pairwise_distances` appropriate for continuous data
    categorical_dist_metric : dict
        The metrics to use to measure distance between categorical
        (discrete)-valued columns. This should be a dict mapping column names to
        strings, each a named metric as accepted by
        `sklearn.metrics.pairwise_distances` appropriate for discrete data
    summary : str
        An aggregation function to apply to a Pandas Grouped object.

    Only columns listed in the distance metric dicts will be included in the
    distance computation.

    Returns
    -------
    SpatialDistributionResult
    """
    # Compute a DF of pairwise distances between base datapoints (rows)
    # and test datapoints (cols).
    pairwise_dist = None
    if numerical_dist_metric:
        # Normalize numeric features to reduce the effect of different scales
        num_cols = list(numerical_dist_metric.keys())
        scaler = StandardScaler()
        base_scaled = DataFrame(
            scaler.fit_transform(base_features[num_cols]), columns=num_cols
        )
        test_scaled = DataFrame(
            scaler.transform(test_features[num_cols]), columns=num_cols
        )
        pairwise_dist = _pairwise_dist(base_scaled, test_scaled, numerical_dist_metric)
    if categorical_dist_metric:
        categ_cols = list(categorical_dist_metric.keys())
        encoder = OrdinalEncoder()
        base_encoded = DataFrame(
            encoder.fit_transform(base_features[categ_cols]), columns=categ_cols
        )
        test_encoded = DataFrame(
            encoder.transform(test_features[categ_cols]), columns=categ_cols
        )
        pairwise_dist_categ = _pairwise_dist(
            base_encoded, test_encoded, categorical_dist_metric
        )
        if pairwise_dist is None:
            pairwise_dist = pairwise_dist_categ
        else:
            pairwise_dist += pairwise_dist_categ
    df_dist = DataFrame(
        pairwise_dist,
        index=base_labs.index,
        columns=test_labs_true.index,
    )

    # Summarize distances within each base dataset class separately for each
    # test datapoint.
    # Result is a m x k DF with 1 row for each test datapoint and 1 column for
    # each base class.
    df_summ = df_dist.groupby(base_labs).agg(summary).transpose()
    # Add the test labels to the index for easy reference.
    df_summ = df_summ.set_index(
        MultiIndex.from_arrays([test_labs_true, test_labs_pred, df_summ.index])
    )

    return SpatialDistributionResult(
        vals=df_summ,
        dist_metrics_num=numerical_dist_metric,
        dist_metrics_categ=categorical_dist_metric,
        summary=summary,
    )
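A hypothetical usage sketch (editorial; it assumes this module's private
helpers such as _pairwise_dist are importable, and the toy column names and
metric names below are illustrative only):

from pandas import DataFrame, Series

base_features = DataFrame({"age": [20, 35, 50, 65],
                           "color": ["r", "g", "r", "b"]})
base_labs = Series([0, 0, 1, 1])
test_features = DataFrame({"age": [30, 60], "color": ["g", "b"]})

result = compute_spatial_distribution(
    test_features,
    test_labs_true=Series([0, 1]),
    test_labs_pred=Series([0, 0]),
    base_features=base_features,
    base_labs=base_labs,
    numerical_dist_metric={"age": "euclidean"},
    categorical_dist_metric={"color": "hamming"},
)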
Example #22
class NumericTransformer(object):
    """General purpose numeric conversion for pandas dataframes.

    All categorical data and levels must be passed to .fit().
    If new categorical series or levels are present in .transform() it won't work!

    Currently datetimes cannot be inverse_transformed back to datetime

    Args:
        na_strings (list): list of strings to replace as pd.NA
        categorical_fillna (str): how to fill NaN for categorical variables (numeric NaN are unaltered)
            "ffill" - uses forward and backward filling to supply na values
            "indicator" or anything else currently results in all missing replaced with str "missing_value"
        handle_unknown (str): passed through to scikit-learn OrdinalEncoder
        verbose (int): greater than 0 to print some messages
    """

    def __init__(
        self,
        na_strings: list = ['', ' '],  # 'NULL', 'NA', 'NaN', 'na', 'nan'
        categorical_fillna: str = "ffill",
        handle_unknown: str = 'use_encoded_value',
        verbose: int = 0,
    ):
        self.na_strings = na_strings
        self.verbose = verbose
        self.categorical_fillna = categorical_fillna
        self.handle_unknown = handle_unknown
        self.categorical_flag = False
        self.needs_transformation = True

    def _fit(self, df):
        """Fit categorical to numeric."""
        # test if any columns aren't numeric
        if not isinstance(df, pd.DataFrame):  # basically just Series inputs
            df = pd.DataFrame(df)

        if df.shape[1] == df.select_dtypes(include=np.number).shape[1]:
            self.needs_transformation = False
            if self.verbose > 2:
                print("All data is numeric, skipping NumericTransformer")

        if self.needs_transformation:
            # replace some common nan datatypes from strings to nan
            df.replace(self.na_strings, np.nan, inplace=True)  # pd.NA in future

            # convert series to numeric which can be readily converted.
            df = df.apply(pd.to_numeric, errors='ignore')

            # record which columns are which dtypes
            self.column_order = df.columns
            self.numeric_features = df.select_dtypes(
                include=[np.number]
            ).columns.tolist()
            self.categorical_features = list(
                set(df.columns.tolist()) - set(self.numeric_features)
            )

            if len(self.categorical_features) > 0:
                self.categorical_flag = True
            if self.categorical_flag:
                from sklearn.preprocessing import OrdinalEncoder

                df_enc = df[self.categorical_features]
                if self.categorical_fillna == "ffill":
                    df_enc = df_enc.fillna(method='ffill').fillna(method='bfill')
                df_enc = df_enc.fillna('missing_value')
                self.cat_transformer = OrdinalEncoder(
                    handle_unknown=self.handle_unknown, unknown_value=np.nan
                )
                # the + 1 makes it compatible with remove_leading_zeroes
                df_enc = self.cat_transformer.fit_transform(df_enc) + 1
                # df_enc = self.cat_transformer.transform(df_enc) + 1

                self.cat_max = df_enc.max(axis=0)
                self.cat_min = df_enc.min(axis=0)
                if self.verbose > 0:
                    print("Categorical features converted to numeric")
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        return df.astype(float)

    def fit(self, df):
        """Learn behavior of data to change.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        self._fit(df)
        return self

    def fit_transform(self, df):
        """Fits and Returns *Magical* DataFrame.

        Args:
            df (pandas.DataFrame): input dataframe
        """
        return self._fit(df)

    def transform(self, df):
        """Convert categorical dataset to numeric."""
        if self.needs_transformation:
            if not isinstance(df, pd.DataFrame):
                df = pd.DataFrame(df)
            df.replace(self.na_strings, np.nan, inplace=True)
            df = df.apply(pd.to_numeric, errors='ignore')
            if self.categorical_flag:
                df_enc = (df[self.categorical_features]).fillna(method='ffill')
                df_enc = df_enc.fillna(method='bfill').fillna('missing_value')
                df_enc = self.cat_transformer.transform(df_enc) + 1
                df = pd.concat(
                    [
                        pd.DataFrame(
                            df[self.numeric_features], columns=self.numeric_features
                        ),
                        pd.DataFrame(
                            df_enc, columns=self.categorical_features, index=df.index
                        ),
                    ],
                    axis=1,
                )[self.column_order]
        try:
            df = df.astype(float)
        except ValueError as e:
            raise ValueError(
                f"NumericTransformer.transform() could not convert data to float. {str(e)}."
            )
        return df

    def inverse_transform(self, df, convert_dtypes: bool = False):
        """Convert numeric back to categorical.
        Args:
            df (pandas.DataFrame): df
            convert_dtypes (bool): whether to use pd.convert_dtypes after inverse
        """
        if self.categorical_flag:
            if not isinstance(df, pd.DataFrame):  # basically just Series inputs
                df = pd.DataFrame(df)
            df_enc = (
                df[self.categorical_features].clip(
                    upper=self.cat_max, lower=self.cat_min, axis=1
                )
                - 1
            )
            df_enc = self.cat_transformer.inverse_transform(df_enc)
            df = pd.concat(
                [
                    pd.DataFrame(
                        df[self.numeric_features], columns=self.numeric_features
                    ),
                    pd.DataFrame(
                        df_enc, columns=self.categorical_features, index=df.index
                    ),
                ],
                axis=1,
            )[self.column_order]
        if convert_dtypes:
            df = df.convert_dtypes()
        return df
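A quick usage sketch (editorial, not from the source): round-tripping a small
mixed-type frame through the transformer defined above.

import numpy as np
import pandas as pd

df = pd.DataFrame({"num": [1.0, 2.0, np.nan], "cat": ["a", "b", ""]})
nt = NumericTransformer(verbose=1)
df_num = nt.fit_transform(df)           # all-float frame; "" treated as missing
df_back = nt.inverse_transform(df_num)  # category labels restored from codes
print(df_num)
print(df_back)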
Example #23
from sklearn import tree


# Test w/ Iris dataset using my class
dataset = load_iris()
X, y = dataset.data, dataset.target
clf_iris = Decision_Tree(max_depth = 5)
# Test to make target class strings instead of integers
y = ["one" if val == 1 or val == 2 else "zero" for val in y]
y = np.array(y)
# Need to ordinally encode strings to integers
if "int" not in str(y.dtype):
    # Reshape y array so it works w/ ordinal encoder
    y = y.reshape(-1, 1)
    encoder = OrdinalEncoder()
    y = encoder.fit_transform(y)
y = y.astype(int)
y = y.reshape(y.size,)

clf_iris.fit(X, y)
temp1 = np.array([[3, 2, 1, .5]])
temp2 = np.array([[4, 2.9, 1.3, .2]])
temp3 = np.array([[3.8, 3, 1.4, .4]])
temp4 = np.array([[7.7, 2.8, 6.7, 2]])


#temp1
print("------------------------------------------------------")
print(f"My Iris prediction for {temp1}:\n", clf_iris.predict(temp1))
print("------------------------------------------------------")
# Test w/ Iris dataset using sklearn
Example #24
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df,  save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)   # Features we don't want to see in clustering
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    clusters_df = clusters_df.set_index('CONTRACT_ACCOUNT')  # set_index returns a new frame

    # Get centroids of clusters
    cluster_centroids = np.empty((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = k_prototypes.cluster_centroids_[0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[1]  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
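    # Editorial note: the loop below maps ordinal codes back to category names,
    # mirroring what ordinal_encoder.inverse_transform would do per column.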
    for i in range(len(cat_feats)):
        ordinal_dict = {j: ordinal_encoder.categories_[i][j] for j in range(len(ordinal_encoder.categories_[i]))}
        centroids_df[cat_feats[i]] = centroids_df[cat_feats[i]].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False, index=False)
    return k_prototypes
Example #25
print('Removing redundant columns:', redundant_columns)
print('Removing useless targets:', other_targets)
print('Removing misc columns:', misc_columns)
columns_to_remove = redundant_columns + other_targets + misc_columns
df.drop(axis='columns', columns=columns_to_remove, inplace=True)

###############################################################################
### Remove NaN columns (with a lot of NaN values)
df, log = remove_nan_columns(df, 1 / 2, verbose=False)
print(log)

###############################################################################
### Encode categorical features
print('Encoding categorical features (ordinal encoding).')
my_encoder = OrdinalEncoder()
df['flgs'] = my_encoder.fit_transform(df['flgs'].values.reshape(-1, 1))
df['proto'] = my_encoder.fit_transform(df['proto'].values.reshape(-1, 1))
df['sport'] = my_encoder.fit_transform(df['sport'].astype(str).values.reshape(
    -1, 1))
df['dport'] = my_encoder.fit_transform(df['dport'].astype(str).values.reshape(
    -1, 1))
df['state'] = my_encoder.fit_transform(df['state'].values.reshape(-1, 1))
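# Editorial note: my_encoder is refitted by every fit_transform call above, so
# its categories_ afterwards describe only the last column ('state'); keep one
# encoder per column (or encode all columns in a single call) if the encoding
# needs to be inverted later.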
print('Objects:', list(df.select_dtypes(['object']).columns))

# In[5]:

###############################################################################
## Quick sanity check
###############################################################################
display_general_information(df)
Example #26
df = pd.read_csv(filename, encoding='gbk')
# print(df.columns.tolist())

# 2. Feature engineering
# 1) Identify / select features
intCols = ['年龄', '收入', '家庭人数', '开通月数']
catCols = ['居住地', '婚姻状况', '教育水平', '性别']

target = '套餐类型'
y = df[target]

# 2) Encode the categorical variables as integers
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(dtype='int')
X_ = enc.fit_transform(df[catCols])

dfCats = pd.DataFrame(X_, columns=catCols)

# 3) Merge
X = pd.concat([dfCats, df[intCols]], axis=1)
cols = X.columns.tolist()

# 3. Train the model
from xgboost import XGBClassifier

model = XGBClassifier(
    learning_rate=0.01,
    # n_estimators=3000,
    max_depth=4,
    min_child_weight=5,
Example #27
 def transform(self, X):
     X = pd.DataFrame(X, columns=self.column_names)
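     # Editorial note: a fresh OrdinalEncoder is fitted on every transform()
     # call, so the codes are only consistent within a single batch; fitting it
     # once (e.g. in fit()) and reusing it keeps train/test encodings aligned.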
     enc = OrdinalEncoder()
     X[self.categorical_cols] = enc.fit_transform(X[self.categorical_cols])
     return X
Example #28
csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_TRAIN.csv")
pd.DataFrame(tsr_train_y).to_csv(csv_save, index=False)

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_VALIDATION.csv")
pd.DataFrame(tsr_validation_y).to_csv(csv_save, index=False)

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_y_TEST.csv")
pd.DataFrame(tsr_test_y).to_csv(csv_save, index=False)

## scale G_X_train
scaler = MinMaxScaler()
tsr_train_x[continuous] = scaler.fit_transform(tsr_train_x[continuous])

encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=99)
tsr_train_x[ordinal_features] = encoder.fit_transform(tsr_train_x[ordinal_features])
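# With handle_unknown="use_encoded_value" above, categories unseen during fit
# are encoded as 99 in later splits.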

ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
nominal_train = ohe.fit_transform(tsr_train_x[nominal_features])
tsr_train_x = pd.concat([tsr_train_x, pd.DataFrame(nominal_train)], axis=1)
tsr_train_x = tsr_train_x.drop(nominal_features, axis=1)
tsr_train_x.columns = column_names

csv_save = os.path.join("..", "data", "LINKED_DATA", "TSR_ALL", "TSR_ALL1", "TSR_ALL1_X_TRAIN.csv")
tsr_train_x.to_csv(csv_save, index=False)

## scale G_X_validation
tsr_validation_x[continuous] = scaler.transform(tsr_validation_x[continuous])

tsr_validation_x[ordinal_features] = encoder.transform(tsr_validation_x[ordinal_features])
    "Temperature",
    axis=1)  # Opción 2, eliminamos el atributo que contiene valores nulos
mean_temp = dataframe["Temperature"].mean()
dataframe_op3 = dataframe["Temperature"].fillna(
    mean_temp)  # Opción 3, asignamos el valor medio en los valores nulos
"""Iniciamos el preprocesamiento de los atributos con valores de texto"""

color_cat = dataframe[['Color']]
spectral_cat = dataframe[['Spectral_Class']]
print(color_cat.head(10))
print(spectral_cat.head(10))
"""Importamos la funcionalidad de Scikit-learn"""

from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
color_cat_encoded = ordinal_encoder.fit_transform(color_cat)
print(color_cat_encoded[:10])

print(ordinal_encoder.categories_)
"""Importamos lo necesario para realizar el One Hot Encoding"""

from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
color_cat_one_hot = one_hot_encoder.fit_transform(color_cat)
print(color_cat_one_hot)
print(color_cat_one_hot.toarray().shape)
print(color_cat_one_hot.toarray())
"""Ejemplos de normalización de valores de atributos

"""
Example #30
import pandas as pd
from tensorflow import keras
import streamlit as st
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

df = pd.read_csv('2020_chennai.csv')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
enc = OrdinalEncoder()
df[['Wind']] = enc.fit_transform(df[['Wind']])


def changesymbol(x):
    symbol = ['F', '%', 'mph', 'in']
    for i in symbol:
        if i in x:
            return x[:x.find(i)]


columns = [
    'Temperature', 'Dew Point', 'Humidity', 'Wind Speed', 'Wind Gust',
    'Pressure'
]
for clmn in columns:
    df[clmn] = df[clmn].apply(changesymbol)
df[['Temperature', 'Dew Point', 'Humidity', 'Wind', 'Wind Speed',
    'Pressure']] = df[[
        'Temperature', 'Dew Point', 'Humidity', 'Wind', 'Wind Speed',
        'Pressure'
Example #31
#read csv
def load_data():
    df = pd.read_csv("LifeExpectancy.csv")
    target = df[["Life Expectancy"]]
    features = df[[
        "Gender", "Residential", "Physical Activity (times per week)",
        "Happiness"
    ]]
    return features, target


features, target = load_data()
#create an encoder object to convert the non-numeric values in the csv into numbers
ordinal_encoder = OrdinalEncoder()
features[["Gender", "Residential"]] = ordinal_encoder.fit_transform(
    features[["Gender", "Residential"]])

#normalize the data
scaler = MinMaxScaler()
features = scaler.fit_transform(features)

#convert the non-numeric target in the csv into numbers
one_hot_encoder = OneHotEncoder(sparse=False)  #so the result is a dense array
target = one_hot_encoder.fit_transform(target)

#split into train and test data
train_data, test_data, train_target, test_target = train_test_split(
    features, target, test_size=0.2)

#model architecture
layer = {
Example #32
def HP_regress(data, target, outdir, dataset, ABType=True):

################################################################################
# preprocessing
################################################################################

    df = pd.read_csv(data, index_col=0)

    X = df.iloc[:,:7]
    y = df[target]

    if ABType == True:
        #ohe = OneHotEncoder()
        #X = ohe.fit_transform(X)
        ore = OrdinalEncoder(categories=[["H1","H2","H3","H4"],["A","B"],\
        [8,16,32,64],[10,100,1000],[1,2,4],["fixed","max"],["L0","L2","L1"]])
        X = ore.fit_transform(X)

    else:
        X = X.drop(columns=["ABType"])
        #ohe = OneHotEncoder()
        #X = ohe.fit_transform(X)
        ore = OrdinalEncoder(categories=[["H1","H2","H3","H4"],\
        [8,16,32,64],[10,100,1000],[1,2,4],["fixed","max"],["L0","L2","L1"]])
        X = ore.fit_transform(X)
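    # Editorial note: the dtypes in these category lists must match the raw
    # column dtypes (e.g. 8 vs "8"); if the CSV columns were read as strings,
    # the integer entries above would raise "Found unknown categories".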

################################################################################
# RF
################################################################################

    gsc = GridSearchCV(estimator=RandomForestRegressor(),\
    param_grid={"max_depth": range(5,11), \
    "n_estimators": (500, 1000)},\
    cv=5, scoring="neg_mean_absolute_error", verbose=0, n_jobs=-1)

    gs_result = gsc.fit(X, y)
    best_params = gs_result.best_params_
    print(best_params)

    rfr = RandomForestRegressor(max_depth=best_params["max_depth"],\
    n_estimators=best_params["n_estimators"], random_state=42)

    cv_scores = cross_val_score(rfr, X, y, cv=10, \
    scoring="neg_mean_absolute_error")

    rfr.fit(X, y)

################################################################################
# feature importances
################################################################################

    importances = rfr.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rfr.estimators_],\
    axis=0)
    imp_data = [tree.feature_importances_ for tree in rfr.estimators_]
    #ci = stats.sem(imp_data) * stats.t.ppf(1.95/2., len(imp_data)-1)

    indices = np.argsort(importances)[::-1]

    if ABType == True:
        xlab = np.where(indices==0, "ont. constr.", indices)
        xlab = np.where(xlab=="1", "rel. norm.", xlab)
        xlab = np.where(xlab=="2", "dimension", xlab)
        xlab = np.where(xlab=="3", "learn. rate", xlab)
        xlab = np.where(xlab=="4", "margin", xlab)
        xlab = np.where(xlab=="5", "magnitude", xlab)
        xlab = np.where(xlab=="6", "method", xlab)
    else:
        xlab = np.where(indices==0, "ont. constr.", indices)
        xlab = np.where(xlab=="1", "dimension", xlab)
        xlab = np.where(xlab=="2", "learn. rate", xlab)
        xlab = np.where(xlab=="3", "margin", xlab)
        xlab = np.where(xlab=="4", "magnitude", xlab)
        xlab = np.where(xlab=="5", "method", xlab)

    fig1, ax1 = plt.subplots(figsize=(3.6,3.6))
    ax1.set_title("10-fold CV NMAE = %.2f, std = %.2f" % (np.mean(cv_scores), np.std(cv_scores)), size=10)
    ax1.bar(range(X.shape[1]), importances[indices], color="#777777",\
    yerr=std[indices], align="center")
    ax1.set_xticks(list(range(X.shape[1])))
    ax1.set_xticklabels(xlab, size=9)
    ax1.tick_params(axis="x", rotation=45)
    ax1.set_xlim([-1, X.shape[1]])
    ax1.set_ylabel("relative feature importance")
    ax1.spines["top"].set_visible(False)
    ax1.spines["bottom"].set_visible(False)
    ax1.spines["left"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    ax1.grid(linestyle=":", color="#777777")
    fig1.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

################################################################################
# partial dependence plot
################################################################################

    X_df = pd.DataFrame(X)
    df2 = pd.concat([X_df, y], axis=1, sort=False)
    if ABType == True:
        df2.columns = ["HType", "ABType", "dimension", "learnFac", "margin", "constr", "LType", target]
        mod_feats = ["HType", "ABType", "dimension", "learnFac", "margin", "constr", "LType"]
    else:
        df2.columns = ["HType", "dimension", "learnFac", "margin", "constr", "LType", target]
        mod_feats = ["HType", "dimension", "learnFac", "margin", "constr", "LType"]

    # HType

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="HType")

    fig2, ax2 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="ontology constraints", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax2["pdp_ax"].set_xticklabels(["H+T", "H+TI", "H+TID", "H+TIDF"])
    ax2["pdp_ax"].set_xticks([0, 1, 2, 3])
    if target != "mrank":
        ax2["pdp_ax"].set_ylim([0, 1])
    ax2["pdp_ax"].set_ylabel("predicted metric score")
    ax2["pdp_ax"].grid(color="#777777")
    fig2.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # dimension

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="dimension")

    fig3, ax3 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="dimension $k$", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax3["pdp_ax"].set_xticklabels(["8", "16", "32", "64"])
    ax3["pdp_ax"].set_xticks([0, 1, 2, 3])
    if target != "mrank":
        ax3["pdp_ax"].set_ylim([0, 1])
    ax3["pdp_ax"].set_ylabel("predicted metric score")
    ax3["pdp_ax"].grid(color="#777777")
    fig3.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # learnFac

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="learnFac", \
    num_grid_points=3)

    fig4, ax4 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="learning rate \u03BB", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax4["pdp_ax"].set_xticklabels(["0.1", "0.01", "0.001"])
    ax4["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax4["pdp_ax"].set_ylim([0, 1])
    ax4["pdp_ax"].set_ylabel("predicted metric score")
    ax4["pdp_ax"].grid(color="#777777")
    fig4.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # margin

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="margin", \
    num_grid_points=3)

    fig5, ax5 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="margin $\gamma$", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax5["pdp_ax"].set_xticklabels(["1", "2", "4"])
    ax5["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax5["pdp_ax"].set_ylim([0, 1])
    ax5["pdp_ax"].set_ylabel("predicted metric score")
    ax5["pdp_ax"].grid(color="#777777")
    fig5.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # constr

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="constr")

    fig6, ax6 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="regularisation magnitude", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax6["pdp_ax"].set_xticklabels(["surface", "space"])
    ax6["pdp_ax"].set_xticks([0, 1])
    if target != "mrank":
        ax6["pdp_ax"].set_ylim([0, 1])
    ax6["pdp_ax"].set_ylabel("predicted metric score")
    ax6["pdp_ax"].grid(color="#777777")
    fig6.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # LType

    pdp_rfr = pdp.pdp_isolate(model=rfr, \
    dataset=df2, \
    model_features=mod_feats, \
    feature="LType", \
    num_grid_points=3)

    fig7, ax7 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
    feature_name="training method", \
    center=False, plot_lines=False, \
    plot_pts_dist=False, figsize = (3.6,3.6))
    ax7["pdp_ax"].set_xticklabels(["linear", "projection", "hybrid"])
    ax7["pdp_ax"].set_xticks([0, 1, 2])
    if target != "mrank":
        ax7["pdp_ax"].set_ylim([0, 1])
    ax7["pdp_ax"].set_ylabel("predicted metric score")
    ax7["pdp_ax"].grid(color="#777777")
    fig7.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
    hspace=0.2, wspace=0.2)

    # ABType (if true)

    if ABType == True:
        pdp_rfr = pdp.pdp_isolate(model=rfr, \
        dataset=df2, \
        model_features=mod_feats, \
        feature="ABType")

        fig8, ax8 = pdp.pdp_plot(pdp_isolate_out=pdp_rfr,
        feature_name="relation normalisation", \
        center=False, plot_lines=False, \
        plot_pts_dist=False, figsize = (3.6,3.6))
        ax8["pdp_ax"].set_xticklabels(["False", "True"])
        ax8["pdp_ax"].set_xticks([0, 1])
        if target != "mrank":
            ax8["pdp_ax"].set_ylim([0, 1])
        ax8["pdp_ax"].set_ylabel("predicted metric score")
        ax8["pdp_ax"].grid(color="#777777")
        fig8.subplots_adjust(top=0.9, bottom=0.2, left=0.2, right=0.9, \
        hspace=0.2, wspace=0.2)

    #plt.show()

################################################################################
# save figures
################################################################################

    subdir = "/".join([outdir, dataset, target])
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    fig1_path = "/".join([subdir, "_".join([dataset, target, "RF_importance.png"])])
    fig1.savefig(fig1_path)

    fig2_path = "/".join([subdir, "_".join([dataset, target, "PDP_HType.png"])])
    fig2.savefig(fig2_path)

    fig3_path = "/".join([subdir, "_".join([dataset, target, "PDP_dimension.png"])])
    fig3.savefig(fig3_path)

    fig4_path = "/".join([subdir, "_".join([dataset, target, "PDP_learnFac.png"])])
    fig4.savefig(fig4_path)

    fig5_path = "/".join([subdir, "_".join([dataset, target, "PDP_margin.png"])])
    fig5.savefig(fig5_path)

    fig6_path = "/".join([subdir, "_".join([dataset, target, "PDP_constr.png"])])
    fig6.savefig(fig6_path)

    fig7_path = "/".join([subdir, "_".join([dataset, target, "PDP_LType.png"])])
    fig7.savefig(fig7_path)

    if ABType == True:
        fig8_path = "/".join([subdir, "_".join([dataset, target, "PDP_ABType.png"])])
        fig8.savefig(fig8_path)
Example #33
def redifine_labels(agg_labels, focus_label):
    for i in range(len(agg_labels)):
        if agg_labels[i] != focus_label:
            agg_labels[i] = "OTHER"
    print(agg_labels)
    return agg_labels


focus_label = 'OAG'
agg_labels = redifine_labels(agg_labels, focus_label)

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

agg_labels_encoded = ordinal_encoder.fit_transform(agg_labels)

#%%
print(agg_labels_encoded[:10])
print(ordinal_encoder.categories_)

#%%

from pprint import pprint
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#%%
Example #34
# In[15]:

#filling the missing values and removing the non-numeric attribute
X[X == np.inf] = np.nan
X.fillna(X.mean(), inplace=True)

# In[16]:

X.head()

# In[17]:

from sklearn.preprocessing import OrdinalEncoder
ord_encode = OrdinalEncoder()
X_cat = X[["IsHoliday_y"]]
X_cat_encoded = ord_encode.fit_transform(X_cat)
X_cat_encoded

# In[18]:

X_num = X.drop("IsHoliday_y", axis=1)
X_num.head()

# In[20]:

X["IsHoliday"] = X_cat_encoded

# In[21]:

X = X.drop("IsHoliday_y", axis=1)
X.head()