Example #1
def fit(self, data):
    '''
    Fits the categorical encoder: coerces the input to a pandas DataFrame,
    then saves the input and feature names.
    :param data: a pandas DataFrame or list
    :return: nothing; the fitted encoder is saved as self.encoder
    '''
    from category_encoders import OneHotEncoder
    ohe = OneHotEncoder(return_df=self.return_df, handle_unknown=self.handle_unknown)
    x = self.replace_infrequent_df(data)  # coerce to a DataFrame and group rare levels
    self.input_names = x.columns
    ohe.fit(x)
    self.encoder = ohe
    self.feature_names_from_cat_encoder()
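A minimal self-contained sketch of the same pattern (fit a category_encoders OneHotEncoder, remember the input columns, keep the fitted encoder). The class name and the DataFrame coercion below are illustrative stand-ins, since the snippet does not show the surrounding class:

import pandas as pd
from category_encoders import OneHotEncoder

class SimpleCatEncoder:  # illustrative stand-in for the class this method belongs to
    def __init__(self, return_df=True, handle_unknown='value'):
        self.return_df = return_df
        self.handle_unknown = handle_unknown

    def fit(self, data):
        x = pd.DataFrame(data)          # coerce lists to a DataFrame
        self.input_names = x.columns    # save the input column names
        self.encoder = OneHotEncoder(return_df=self.return_df,
                                     handle_unknown=self.handle_unknown)
        self.encoder.fit(x)
        # output feature names; get_feature_names() exists in category_encoders 2.x
        self.feature_names = self.encoder.get_feature_names()

enc = SimpleCatEncoder()
enc.fit([{'color': 'red'}, {'color': 'blue'}, {'color': 'red'}])
print(enc.feature_names)  # e.g. ['color_1', 'color_2']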
Example #2
    def categoricals(self,
                     model_name='onehot_model.pkl',
                     cols=None,
                     owr=False,
                     model_bin=None):
        """Onehot encoder on categoricals."""

        self.log('Apply onehot encoder on categorical')
        model_path = os.path.join(self.model_path, model_name)
        if cols is None:
            cols = self.data.cat_cols

        if ((not os.path.isfile(model_path)) or owr) and (model_bin is None):
            self.log('\nTrain model\n')
            model_bin = OneHotEncoder(
                cols=cols,
                use_cat_names=True,
                handle_unknown='error',
                drop_invariant=False,
                impute_missing=False)  # deprecated in newer category_encoders; handle_missing replaces it
            model_bin.fit(self.data._X)
            self.data._X = model_bin.transform(self.data._X)
            setattr(model_bin, 'data_schema', self.data._X.columns.values)

            # Save model
            if self.auto_save:
                joblib.dump(model_bin, model_path)

        elif os.path.isfile(model_path):
            # File exists/prediction:
            model_bin = joblib.load(model_path)
            self.data._X = model_bin.transform(self.data._X)
            self.data.check_schema(model_bin, '_X')

        else:
            # Prediction in pipeline
            self.data._X = model_bin.transform(self.data._X)
            self.data.check_schema(model_bin, '_X')

        return model_bin
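The heart of this method is a fit-then-persist / load-then-transform round trip. A minimal self-contained sketch of that pattern (the file name and the data_schema attribute mirror the snippet; the data is illustrative):

import joblib
import pandas as pd
from category_encoders import OneHotEncoder

X = pd.DataFrame({'city': ['NY', 'SF', 'NY']})

# Training run: fit, transform, attach the output schema, and persist.
enc = OneHotEncoder(cols=['city'], use_cat_names=True)
X_t = enc.fit_transform(X)
enc.data_schema = X_t.columns.values    # same trick as the setattr above
joblib.dump(enc, 'onehot_model.pkl')

# Prediction run: load the pickle, transform, and check the schema.
enc2 = joblib.load('onehot_model.pkl')
X_new = enc2.transform(pd.DataFrame({'city': ['SF', 'NY']}))
assert list(X_new.columns) == list(enc2.data_schema)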
Example #3
from category_encoders import OneHotEncoder
# pickle_obj / unpickle_obj are project helpers for persisting objects (not shown here)

def encode_low_cardinality_categorical_df(dataframe, fit=False):
    """
    Encode low-cardinality categorical features using one-hot encoding, dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), low-cardinality categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train a new encoder or load the persisted one
    if fit:
        encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)

        pickle_obj(encoder, 'low_card_categorical_encoder')
    else:
        encoder = unpickle_obj('low_card_categorical_encoder')

    # Transform the data
    return encoder.transform(dataframe)
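A short usage sketch for this function, assuming the project's pickle_obj / unpickle_obj helpers are importable; the data is illustrative:

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue'], 'size': ['S', 'M']})

# First pass (training): fit a new encoder and persist it via pickle_obj.
encoded_train = encode_low_cardinality_categorical_df(df, fit=True)

# Later passes (scoring): reuse the persisted encoder via unpickle_obj.
encoded_test = encode_low_cardinality_categorical_df(df, fit=False)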
Example #4
from typing import Any, List

import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder


def fit_onehot(input_df: pd.DataFrame, cols: List[str], na_value: Any = None):
    """
    Creates the one-hot encoder by fitting it on the given DataFrame.
    NaN values, and the special value specified via `na_value`, are treated as unseen values
    (their `_nan` indicator columns are dropped).
    Args:
        input_df: DataFrame used to fit the encoder
        cols: List of categorical columns to be encoded
        na_value: Default null value for DataFrame

    Returns:
        result_df: encoded input_df DataFrame
        model : encoder model to be passed to `transform_onehot` method
    """
    df = input_df.copy()

    if na_value is not None:
        for col in cols:
            df[col] = df[col].replace({na_value: np.nan})

    drop_cols = ["{}_nan".format(col) for col in cols]

    encoder = OneHotEncoder(cols=cols, use_cat_names=True)
    encoder = encoder.fit(df)

    result_df = encoder.transform(df)

    for drop_col in drop_cols:
        if drop_col in result_df.columns:
            result_df = result_df.drop(columns=[drop_col])

    model = {
        "encoder": encoder,
        "cols": cols,
        "na_value": na_value,
        "drop_cols": drop_cols,
    }
    return result_df, model
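A short usage sketch for fit_onehot; the data and the 'MISSING' sentinel are illustrative:

import pandas as pd

df = pd.DataFrame({'grade': ['A', 'B', None, 'MISSING']})

# 'MISSING' is mapped to NaN, and the resulting grade_nan indicator is dropped,
# so both null and sentinel rows end up all-zero across the grade_* columns.
encoded, model = fit_onehot(df, cols=['grade'], na_value='MISSING')
print(encoded.columns.tolist())  # ['grade_A', 'grade_B']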
Example #5
import seaborn as sns  # plotting import; the train/test DataFrames are loaded earlier in the notebook

train['type'].unique(), train['color'].unique()
sns.violinplot(x='bone_length', y='type', data=train)
sns.boxplot(x='hair_length', y='type', data=train)
sns.pairplot(train)
from category_encoders import OneHotEncoder

encoder = OneHotEncoder(cols=['color'], use_cat_names=True)

train = encoder.fit_transform(train)
test = encoder.transform(test)  # transform only; refitting on test would yield mismatched columns
train.head()
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit(train['type'])

print(encoder.classes_)

train['type_no'] = encoder.transform(train['type'])
train.head()
sns.heatmap(train.corr(), xticklabels=list(train), yticklabels=list(train))
target = train['type_no'] # for visualizations
target_string = train['type'] # for final predictions

del train['type']
del train['type_no']

target.head()
from sklearn.model_selection import train_test_split
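The snippet stops at the import; a plausible continuation splitting off a validation set (the split sizes and stratification are assumptions, not from the original notebook):

X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=42, stratify=target)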
Example #6
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from category_encoders import OneHotEncoder
from category_encoders.utils import get_obj_cols


class RFEncoder(BaseEstimator, TransformerMixin):
    def __init__(self,
                 cols=None,
                 handle_missing='value',
                 handle_unknown='value',
                 use_cat_names=False,
                 return_df=True,
                 max_subsets=None,
                 max_depth=3,
                 n_estimators=100,
                 min_count=1,
                 n_jobs=1):
        self.cols = cols
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown
        self.use_cat_names = use_cat_names
        self.return_df = return_df
        self.max_subsets = max_subsets
        self.max_depth = max_depth
        self.n_estimators = n_estimators
        self.n_jobs = n_jobs
        self.min_count = min_count
        self._dim = None  # set in fit(); transform() uses it to detect an unfitted encoder

    def fit(self, X, y=None):
        self._dim = X.shape[1]

        if self.cols is None:
            self.cols = get_obj_cols(X)

        self.dummy_encoder = OneHotEncoder(cols=self.cols,
                                           handle_unknown='value',
                                           handle_missing='value')

        self.dummy_encoder = self.dummy_encoder.fit(X)
        self.mapping = self.generate_mapping(X, y)

        X_temp = self.transform(X, override_return_df=True)
        self.feature_names = list(X_temp.columns)

        return self

    def generate_mapping(self, X, y):
        X = self.dummy_encoder.transform(X.copy(deep=True))
        y = y.copy(deep=True)

        mapping = []

        for switch in self.dummy_encoder.mapping:
            col = switch.get('col')
            values = switch.get('mapping').copy(deep=True)

            if isinstance(self.max_depth, int):
                max_depth = self.max_depth
            elif isinstance(self.max_depth, float):
                max_depth = round(self.max_depth * values.shape[1])
            else:
                max_depth = min(self.max_depth[1],
                                round(self.max_depth[0] * values.shape[1]))
            if max_depth == 0:
                continue

            forest = RandomForestClassifier(
                max_depth=max_depth,
                n_estimators=self.n_estimators,
                n_jobs=self.n_jobs,
            )

            forest.fit(X[values.columns], y)

            subsets = self.get_subsets(forest.decision_path(values))
            subset_df = pd.DataFrame(data=subsets,
                                     index=values.index,
                                     columns=[
                                         '{col}_subset_{i}'.format(col=col,
                                                                   i=i)
                                         for i in range(subsets.shape[1])
                                     ])

            base_df = values.join(subset_df)

            mapping.append({'col': col, 'mapping': base_df})

        return mapping

    def get_subsets(self, decision_path):
        subset_sizes = np.asarray(decision_path[0].sum(axis=0))[0]
        subsets = decision_path[0][:, subset_sizes != 1].toarray()

        subsets, count = np.unique(subsets, return_counts=True, axis=1)

        subsets = subsets[:, count >= self.min_count]
        count = count[count >= self.min_count]

        subsets = subsets[:, np.argsort(-count)]

        subset_sizes = subsets.sum(axis=0)
        subsets = subsets[:, np.argsort(subset_sizes)]

        if self.max_subsets is not None:
            subsets = subsets[:, :self.max_subsets]

        return subsets

    def transform(self, X, override_return_df=False):
        if self.handle_missing == 'error':
            if X[self.cols].isnull().any().any():
                raise ValueError('Columns to be encoded can not contain null')

        if self._dim is None:
            raise ValueError(
                'Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (
                X.shape[1],
                self._dim,
            ))

        if not list(self.cols):
            return X if self.return_df else X.values

        X = self.dummy_encoder.ordinal_encoder.transform(X)

        if self.handle_unknown == 'error':
            if X[self.cols].isin([-1]).any().any():
                raise ValueError(
                    'Columns to be encoded can not contain new values')

        X = self.get_dummies(X)

        if self.return_df or override_return_df:
            return X
        else:
            return X.values

    def get_dummies(self, X_in):
        X = X_in.copy(deep=True)

        cols = X.columns.values.tolist()

        for switch in self.mapping:
            col = switch.get('col')
            mod = switch.get('mapping')

            base_df = mod.reindex(X[col])
            base_df = base_df.set_index(X.index)
            X = pd.concat([base_df, X], axis=1)

            old_column_index = cols.index(col)
            cols[old_column_index:old_column_index + 1] = mod.columns

        X = X.reindex(columns=cols)

        return X

    def get_feature_names(self):
        if not isinstance(self.feature_names, list):
            raise ValueError(
                'Must transform data first. Affected feature names are not known before.'
            )
        else:
            return self.feature_names
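A minimal fit/transform sketch for RFEncoder on toy data (the column, target, and hyperparameters are illustrative):

import pandas as pd

X = pd.DataFrame({'city': ['NY', 'SF', 'LA', 'NY', 'SF', 'LA'] * 10})
y = pd.Series([0, 1, 0, 1, 0, 1] * 10)

enc = RFEncoder(cols=['city'], max_depth=2, n_estimators=10)
X_enc = enc.fit(X, y).transform(X)
print(enc.get_feature_names())  # one-hot columns plus derived city_subset_* columns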
Example #7
review_desc = X.review_description   # separate the review text before encoding
X = X.drop("review_description", axis=1)

X.head()

# OneHotEncoder from the category_encoders package
one_hot = OneHotEncoder(cols=["user_name", "country", "hint_variety"], use_cat_names=True)
one_hot.fit(X)

X = one_hot.transform(X)

X.columns

# Word2vec implementation
Example #8
import numpy as np
import featuretools as ft
from category_encoders import OneHotEncoder as OneHot
# OneHotEnc (a single-label one-hot primitive) and logger come from the surrounding project


class OneHotEncoder():
    """Maps each categorical value to several columns using one-hot encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
        top_n: int
            Number of unique category values to encode (determines the number of resulting columns).
            Values are selected by frequency of occurrence; defaults to 15.
            `None` will result in all unique values being encoded.
    """
    name = 'one_hot'

    def __init__(self, cols=None, top_n=15):
        self.encoder = OneHot(cols=cols)
        self.matrix = None
        self.top_n = top_n

    def fit(self, X, features, y=None):
        """Fits encoder to data table.
        returns self
        """
        self.encoder.fit(X, y=None)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and updates features accordingly.
        returns encoded matrix (dataframe)
        """
        # Note: returns the matrix built during fit; X is not re-encoded here.
        assert (self.matrix is not None), "Check that the encoder is fitted."
        return self.matrix

    def fit_transform(self, X, features=None, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe)
        """
        return self.fit(X, features, y).transform(X)

    def get_mapping(self, category):
        """Gets the mapping for the one-hot encoder.
        returns mapping (dict)
        """
        if isinstance(category, str):
            for map in self.encoder.mapping:
                if map['col'] == category:
                    return map['mapping']
        return self.encoder.mapping[category]['mapping']

    def encode_features_list(self, X, features):
        X_new = X.copy()
        feature_list = []
        for f in features:
            if f.number_output_features > 1:
                logger.warning(
                    "Feature %s has multiple columns. One-Hot Encoder may not properly encode."
                    "Consider using another encoding method or the `encoder` property value assigned "
                    "to this OneHotEncoder class instance." % (f))
            if f.get_name() in self.encoder.cols:
                val_counts = X[f.get_name()].value_counts().to_frame()
                val_counts = val_counts.sort_values(f.get_name(), ascending=False)  # sort_values is not in-place
                if self.top_n is None:
                    self.top_n = len(val_counts)
                unique = val_counts.head(self.top_n).index.tolist()

                index = X_new.columns.get_loc(f.get_name())
                for label in unique:
                    add = ft.Feature([f], primitive=OneHotEnc(label))
                    feature_list.append(add)
                    X_new.insert(index,
                                 add.get_name(),
                                 (X_new[f.get_name()] == label).astype(int),
                                 allow_duplicates=True)
                    index += 1
                has_unknown = X[f.get_name()].isnull().values.any()
                if has_unknown:
                    unknown = ft.Feature([f], primitive=OneHotEnc(np.nan))
                    feature_list.append(unknown)
                    X_new.insert(
                        index,
                        unknown.get_name(),
                        (~X_new[f.get_name()].isin(unique)).astype(int),
                        allow_duplicates=True)
                X_new.drop([f.get_name()], axis=1, inplace=True)
            else:
                feature_list.append(f)
        self.matrix = X_new
        return feature_list

    def get_features(self):
        return self.features

    def get_name(self):
        return self.name
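This wrapper fits a featuretools-style workflow: X is a feature matrix and features is the matching list of feature definitions, e.g. both returned by ft.dfs. A hedged usage sketch, with es standing in for an existing EntitySet and 'country' for an assumed categorical column (neither is shown in the snippet):

import featuretools as ft

# feature_matrix / feature_defs as produced by ft.dfs on an EntitySet `es`
feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity='customers')

enc = OneHotEncoder(cols=['country'], top_n=10)
fm_encoded = enc.fit_transform(feature_matrix, features=feature_defs)
encoded_defs = enc.get_features()  # definitions including the OneHotEnc primitives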
Example #9
df['bathrooms_text'].value_counts()

df = df[df['bathrooms_text'] != '']

df[['host_is_superhost', 'bathrooms_text', 'has_availability', 'instant_bookable']] = df[
    ['host_is_superhost', 'bathrooms_text', 'has_availability', 'instant_bookable']].astype(float)

df.head()

# pip install category_encoders  (shell/notebook command, not Python)

# Instantiate transformer - one-hot encoder
from category_encoders import OneHotEncoder
transformer = OneHotEncoder(use_cat_names=True)

# Fit on the training data
transformer.fit(df)

# Transform the training data
df = transformer.transform(df)

X = df.drop('price', axis=1)
y = df['price']

X = X.astype(float)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from keras.models import Sequential
from keras.layers import BatchNormalization, Dropout
import keras

model = Sequential([
Example #10
import pandas as pd
data = pd.read_csv('139394485_T_T100D_MARKET_ALL_CARRIER.csv')

# Define col names for the parameters of the network
pred_vars = ['MONTH', 'ORIGIN', 'DEST', 'DISTANCE']
target_var = 'PASSENGERS'
keep = pred_vars + [target_var]  # build a copy so pred_vars itself is not mutated

# Subset only what's needed
data = data[keep]

# Encode the source and target nodes using a category encoder
from category_encoders import OneHotEncoder
ce = OneHotEncoder()
ce.fit(data)

# transform the encoded data
data_encoded = ce.transform(data)
labels = data[target_var]
data_encoded.drop(columns=[target_var], inplace=True)

# split out a final eval set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_encoded,
                                                    labels,
                                                    random_state=0,
                                                    test_size=.25)

# convert to xgb data format
import xgboost as xgb
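The snippet ends at the xgboost import; the usual next step, sketched here since the original is cut off, is to wrap the splits in DMatrix objects and train a regressor (the parameters are illustrative):

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {'objective': 'reg:squarederror', 'max_depth': 6}
booster = xgb.train(params, dtrain, num_boost_round=100,
                    evals=[(dtest, 'eval')])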
Example #11
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


def get_score(model, X, y, X_test, y_test):
    model.fit(X, y)
    y_pred = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_pred)
    return score


######### Creating objects for 2 classification models.
logit = LogisticRegression(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)

###################################################################################################
######### Apply One Hot Encoding
from category_encoders import OneHotEncoder
onehot_enc = OneHotEncoder(cols=X_Columns)
print('Original number of features: \n', X_train.shape[1], "\n")
data_ohe_train = onehot_enc.fit_transform(X_train, y_train)  # fit once, on the training data only
data_ohe_test = onehot_enc.transform(X_test)
print('Features after OHE: \n', data_ohe_train.shape[1])

######### Logistic Regression
onehot_logit_score = get_score(logit, data_ohe_train, y_train, data_ohe_test,
                               y_test)
print('Logistic Regression score with One hot encoding:', onehot_logit_score)

######### Random Forest
onehot_rf_score = get_score(rf, data_ohe_train, y_train, data_ohe_test, y_test)
print('Random Forest score with One hot encoding:', onehot_rf_score)