コード例 #1
0
class DFLeaveOneOutEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-preserving wrapper around ``LeaveOneOutEncoder``.

    The selected columns are removed from the input frame and the encoded
    versions returned by the wrapped encoder are concatenated back on the
    right-hand side, so the output column order is "untouched columns first,
    encoded columns last".

    Parameters:
        columns: optional collection of column names to encode. ``None``
            means "every column of the frame passed to ``fit``".
        **kwargs: forwarded unchanged to ``LeaveOneOutEncoder``.
    """

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = LeaveOneOutEncoder(**kwargs)
        # Set by fit()/fit_transform(); None marks the estimator as unfitted.
        self.transform_cols = None

    def _select_columns(self, X):
        """Return the columns of ``X`` to encode, in ``X``'s column order.

        The constructor argument ``self.columns`` is deliberately left
        untouched so that refitting on a different frame re-resolves the
        ``columns=None`` default instead of reusing a stale column list.
        """
        requested = X.columns if self.columns is None else self.columns
        return [col for col in X.columns if col in requested]

    def fit(self, X, y):
        """Fit the wrapped encoder on the selected columns of ``X``.

        Returns ``self``.
        """
        self.transform_cols = self._select_columns(X)
        self.model.fit(X[self.transform_cols], y)

        return self

    def transform(self, X):
        """Encode the fitted columns of ``X`` and return a new DataFrame.

        Raises:
            NotFittedError: if neither ``fit`` nor ``fit_transform`` was
                called before.
        """
        return self.__transform(X)

    def __transform(self, X, y=None):
        """Shared implementation of ``transform`` and ``fit_transform``.

        With ``y`` given, the wrapped encoder's ``fit_transform`` is used;
        otherwise its plain ``transform``.
        """
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        to_encode = X[self.transform_cols]
        if y is None:
            encoded = self.model.transform(to_encode)
        else:
            encoded = self.model.fit_transform(to_encode, y)

        untouched = X.drop(columns=self.transform_cols)
        return pd.concat([untouched, encoded], axis=1)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the encoded training frame."""
        # NOTE: Result of fit_transform() is different from fit() + transform()
        # The wrapped encoder is fitted exactly once, inside __transform(),
        # instead of once in fit() and again in model.fit_transform().
        self.transform_cols = self._select_columns(X)
        return self.__transform(X, y)
コード例 #2
0
class MineFeatureManager(FeatureManager):
    """Feature manager that derives and encodes car-listing features.

    ``get_model_features`` must be called on the training frame first: it
    fits the leave-one-out encoder and records ``general_model``, both of
    which ``transform_feature`` relies on.
    """

    def __init__(self, num_config=None, categorical_config=None):
        self.num_features = [
            'power', 'kilometer', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_11',
            'v_12', 'v_13', 'v_14', 'carAge', 'v_10_1', 'v_10_2', 'v_10_3', 'nameEncode',
            'modelEncode', 'regionCodeEncode', 'gearbox', 'notRepairedDamage', 'seller',
            'offerType'
        ]
        self.categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'createMon']
        # Columns that additionally get a leave-one-out encoded "<name>Encode" twin.
        self.encoded_cates = ['name', 'model', 'regionCode']
        self.cate_encoder = LeaveOneOutEncoder(cols=self.encoded_cates)
        # Index of 'model' values kept as-is; populated by get_model_features().
        self.general_model = None
        super().__init__(self.num_features, self.categorical_features, num_config, categorical_config)

    def _feature_engien(self, features):
        """Derive engineered columns and blank out-of-range values.

        Works on the frame returned by ``DataFrame.replace`` (a new object)
        and returns it. Requires ``self.general_model`` to be set.
        """
        # Zeros in these columns are treated as missing.
        zero_na = {0: np.nan}
        features = features.replace({'power': zero_na, 'v_5': zero_na, 'v_6': zero_na})

        features['carAge'] = (features['creatDate'] - features['regDate']).apply(lambda x: x.days)
        features['createMon'] = features['creatDate'].dt.month
        features['notRepairedDamage'] = features['notRepairedDamage'].replace('-', np.nan).astype(float)

        # Values above these thresholds are replaced by NaN
        # (presumably outliers -- thresholds are taken as given).
        features.loc[features['power'] > 600, 'power'] = np.nan
        features['power'] = np.log(features['power'])
        features.loc[features['v_7'] > 0.5, 'v_7'] = np.nan
        features.loc[features['v_11'] > 10, 'v_11'] = np.nan
        features.loc[features['v_13'] > 7.5, 'v_13'] = np.nan
        features.loc[features['v_14'] > 7.5, 'v_14'] = np.nan

        # Split v_10 into three range-specific columns; rows outside a range
        # stay NaN in that column.
        v10 = features['v_10']
        low_range = v10 <= 0
        mid_range = (v10 >= 0) & (v10 < 6)
        high_range = v10 > 8
        features.loc[low_range, 'v_10_1'] = v10[low_range]
        features.loc[mid_range, 'v_10_2'] = v10[mid_range]
        features.loc[high_range, 'v_10_3'] = v10[high_range]

        # Any 'model' value not recorded in general_model becomes NaN.
        features.loc[~features['model'].isin(self.general_model), 'model'] = np.nan
        return features

    def get_model_features(self, features):
        """Fit the encoders on the training frame and build model features.

        ``features`` must contain a 'logPrice' column, which is used as the
        encoding target. The input frame is copied, not modified.
        """
        features = features.copy()
        # Use the frame that was passed in (not a global ``df``) and count once.
        model_counts = features['model'].value_counts()
        self.general_model = model_counts[model_counts < 2000].index
        encoded_cate = self.cate_encoder.fit_transform(features[self.encoded_cates], features['logPrice'])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)

    def transform_feature(self, features):
        """Build model features for new data using the already-fitted state.

        Call ``get_model_features`` first; otherwise the encoder is unfitted
        and ``general_model`` is still ``None``.
        """
        features = features.copy()
        encoded_cate = self.cate_encoder.transform(features[self.encoded_cates])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)
コード例 #3
0
class LeaveOneOutEncoder():
    """Maps each categorical value to one column using LeaveOneOut encoding.

    Parameters:
        cols: [str]
            list of column names to encode.
    """
    name = 'leave_one_out'

    def __init__(self, cols=None):
        self.encoder = LeaveOneOut(cols=cols)

    def fit(self, X, features, y):
        """Fits encoder to data table and records the encoded feature list.
        returns self
        """
        # The encoder is fitted first because encode_features_list() reads
        # self.encoder.cols.
        self.encoder.fit(X, y)
        self.features = self.encode_features_list(X, features)
        return self

    def transform(self, X):
        """Encodes matrix and renames its columns after the stored features.
        returns encoded matrix (dataframe)
        """
        X_new = self.encoder.transform(X)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def fit_transform(self, X, features, y=None):
        """First fits, then transforms matrix.
        returns encoded matrix (dataframe)
        """
        # A single fit_transform() both fits the encoder and encodes X; the
        # separate fit() that used to precede it only repeated the fitting.
        X_new = self.encoder.fit_transform(X, y)
        self.features = self.encode_features_list(X, features)
        X_new.columns = self._rename_columns(self.features)
        return X_new

    def get_mapping(self, category):
        """Gets the mapping for the LeaveOneOut encoder. Only takes strings of the column name, not the index number.
        returns mapping (dict)
        """
        return self.encoder.mapping[category]

    def encode_features_list(self, X, features):
        """Wraps every feature whose name is an encoded column in a
        LeaveOneOutEnc primitive; other features are passed through as-is.
        ``X`` is accepted for interface compatibility and is not used.
        returns list of features, in input order
        """
        feature_list = []
        for f in features:
            if f.get_name() in self.encoder.cols:
                f = ft.Feature([f],
                               primitive=LeaveOneOutEnc(self, f.get_name()))
            feature_list.append(f)
        return feature_list

    def _rename_columns(self, features):
        """Flattens the output column names of all features into one list."""
        return [
            fname
            for feature in features
            for fname in feature.get_feature_names()
        ]

    def get_features(self):
        """returns the feature list stored by fit / fit_transform"""
        return self.features

    def get_name(self):
        """returns the encoder's registry name"""
        return self.name
コード例 #4
0
# Impute missing lab values: mean for HB and UREA, mode for CREATININE.
# Assigning the result back (instead of calling fillna(inplace=True) on the
# selected column) keeps this working under pandas copy-on-write.
df['HB'] = df['HB'].fillna(df['HB'].mean())

df['CREATININE'] = df['CREATININE'].fillna(df['CREATININE'].mode()[0])

df['UREA'] = df['UREA'].fillna(df['UREA'].mean())

df_drop = df.drop(['SL.', 'PAST MEDICAL HISTORY CODE'], axis=1)

# Every non-numeric column is treated as categorical.
cat_col = df_drop.select_dtypes(exclude=np.number).columns

# Leave-one-out encoder, fitted against the cost column.
# NOTE: the column name really does end with a space.
target_col = 'TOTAL COST TO HOSPITAL '
le = LeaveOneOutEncoder()

df_drop[cat_col] = le.fit_transform(X=df_drop[cat_col],
                                    y=df_drop[target_col])
# Split into feature matrix and target.
X = df_drop.drop(target_col, axis=1)
y = df_drop[target_col]

# Page layout: a sidebar plus two equally wide main columns.

col1 = st.sidebar
# st.beta_columns was renamed to st.columns; prefer the current name and fall
# back to the old one on Streamlit versions that predate the rename.
_make_columns = getattr(st, 'columns', None) or st.beta_columns
col2, col3 = _make_columns((1, 1))

# Empty frame with the model's feature columns.
empty = pd.DataFrame(columns=X.columns)

# Manual input widgets (sidebar).
age_val = col1.slider('Age of the Patient', 0, 120, 30)
gen_val = col1.selectbox(' Select Gender of Patient', ('Male', 'Female'))
mar_val = col1.radio(' Select Marital Status', ('Married', 'Unmarried'))
コード例 #5
0
#pd.get_dummies(X[["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"]])

# One-hot encode the first five nominal columns and swap them in for the
# originals.
low_card_cols = ["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"]
X = X.drop(low_card_cols, axis=1).join(pd.get_dummies(X[low_card_cols]))


from sklearn.feature_extraction import FeatureHasher

# Disabled feature-hashing variant for nom_5..nom_9, kept for reference:
#h = FeatureHasher(input_type='string', n_features=1000)
#X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].values
#hash_X = h.fit_transform(X[['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']].values)
#hash_X = pd.DataFrame(hash_X.toarray())

from category_encoders import LeaveOneOutEncoder

# Leave-one-out encode the remaining nominal columns against the target y.
high_card_cols = ["nom_5", "nom_6", "nom_7", "nom_8", "nom_9"]
loo_encoder = LeaveOneOutEncoder(cols=high_card_cols)
loo_X = loo_encoder.fit_transform(X[high_card_cols], y)
X = X.drop(high_card_cols, axis=1).join(loo_X)

# Map the ordered string levels to their rank. The result is assigned back
# rather than mutated with inplace=True through attribute access, which is a
# chained in-place update and is not reliable under pandas copy-on-write.
X["ord_1"] = X["ord_1"].replace(
    to_replace=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    value=[0, 1, 2, 3, 4])

X["ord_2"] = X["ord_2"].replace(
    to_replace=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    value=[0, 1, 2, 3, 4, 5])


from sklearn.preprocessing import LabelEncoder
for i in ["ord_3", "ord_4"]:
    le = LabelEncoder()
    # LabelEncoder works on 1-D input, so pass the column (Series) rather
    # than a one-column DataFrame.
    X[i] = le.fit_transform(X[i])

コード例 #6
0
def _fill_missing_with_mode(frame):
    """Fill NaNs, in place, with the mode of each column that has any."""
    for col in frame.columns[frame.isna().sum() > 0]:
        modes = frame[col].mode()
        if modes.empty:
            # Column is entirely NaN: there is no mode to impute with.
            continue
        # Assign back instead of fillna(inplace=True) on the selected column,
        # which is a chained in-place update.
        frame[col] = frame[col].fillna(modes[0])


def _expand_date_recorded(frame):
    """Replace 'date_recorded' with 'year_recorded' / 'month_recorded', in place."""
    frame['date_recorded'] = pd.to_datetime(frame['date_recorded'])
    frame['year_recorded'] = frame['date_recorded'].dt.year
    frame['month_recorded'] = frame['date_recorded'].dt.month
    frame.drop(columns='date_recorded', inplace=True)


def DataCleaner(values_df, labels_df, test_df):
    """Clean and encode the training and test frames.

    Parameters:
        values_df: training features; must contain an 'id' column.
        labels_df: training labels; must contain 'id' and 'status_group'.
        test_df: test features. It is copied, so the caller's frame is not
            modified. If ``None``, 'test_set_values.csv' is read from the
            working directory instead.

    Returns:
        (df, test_df): the cleaned training frame (with 'status_group'
        mapped to 2 / 1 / 0) and the cleaned test frame. In both, every
        column listed in ``te_everything`` is replaced by a
        '<column>_encoded' column.
    """
    # Training set
    df = pd.merge(values_df, labels_df, on='id')

    # Test set: use the frame that was passed in. Previously the argument was
    # always overwritten by a hard-coded CSV read; that read is kept only as
    # the fallback for callers that pass None.
    if test_df is None:
        test_df = pd.read_csv('test_set_values.csv')
    else:
        test_df = test_df.copy()

    to_drop = [
        'funder', 'num_private', 'subvillage', 'region_code', 'recorded_by',
        'source_type', 'waterpoint_type', 'scheme_name', 'payment_type',
        'quantity_group'
    ]

    # Each frame is imputed with its own column modes.
    _fill_missing_with_mode(df)
    df.drop(columns=to_drop, inplace=True)
    # Targets to 0, 1, 2
    df['status_group'] = df['status_group'].map({
        'functional': 2,
        'functional needs repair': 1,
        'non functional': 0
    })
    _expand_date_recorded(df)

    _fill_missing_with_mode(test_df)
    test_df.drop(columns=to_drop, inplace=True)
    _expand_date_recorded(test_df)

    # Target encode
    target = 'status_group'

    #encoder = TargetEncoder()
    encoder = LeaveOneOutEncoder()

    te_everything = [
        'wpt_name', 'basin', 'region', 'district_code', 'lga', 'ward',
        'scheme_management', 'installer', 'source', 'extraction_type',
        'extraction_type_group', 'extraction_type_class', 'management',
        'payment', 'water_quality', 'management_group', 'quality_group',
        'quantity', 'source_class', 'waterpoint_type_group'
    ]

    # The encoder is refitted per column on the training data, then applied
    # to the same column of the test data.
    for c in te_everything:
        encoded_name = str(c) + '_encoded'
        df[encoded_name] = encoder.fit_transform(
            df[c].values, df[target])  # TRAINING SET
        test_df[encoded_name] = encoder.transform(
            test_df[c].values)  # TEST SET

    # The raw columns are no longer needed once their encoded twins exist.
    df.drop(columns=te_everything, inplace=True)
    test_df.drop(columns=te_everything, inplace=True)

    # NOTE: an alternative one-hot encoding of the extraction/management/
    # payment/quality/quantity/source_class/waterpoint_type_group columns was
    # sketched here but is disabled; those columns are target-encoded above.

    return df, test_df