def fit(X, y, output_dir, **kwargs):
    """
    Train the encoder for this task and persist it as an artifact.

    DataRobot runs this hook when the task is trained inside a blueprint.
    A ``ce.GLMMEncoder`` (binomial target, randomized) is fitted on every
    column of ``X`` against ``y`` and then pickled to ``binomialglm.pkl``
    inside ``output_dir`` so it can be re-used in transform().

    Parameters
    ----------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
        All of its columns are handed to the encoder via ``cols=X.columns``.
    y: pd.Series
        Project's target column, used as the encoder's fitting target.
    output_dir: str
        Path to the folder the pickled encoder is written into.
    **kwargs
        Accepted for hook compatibility; not used by this function.

    Returns
    -------
    None
        Nothing is returned. NOTE: when ``output_dir`` does not exist or is
        not a directory, the fitted encoder is discarded and no artifact is
        written.
    """
    # Fit the GLMM encoder on all columns of the training frame.
    glmm = ce.GLMMEncoder(cols=X.columns, binomial_target=True, randomized=True)
    glmm.fit(X, y)

    # Only persist when the destination folder is really there.
    artifact_dir = Path(output_dir)
    if not artifact_dir.exists() or not artifact_dir.is_dir():
        return

    # Dump the trained encoder so it can be loaded again at scoring time.
    artifact_path = "{}/binomialglm.pkl".format(output_dir)
    with open(artifact_path, "wb") as fp:
        pickle.dump(glmm, fp)
def _fit_glm(self, df, y, target, parameter):
    """
    Fit a GLMM encoder on ``df[target]`` and register it in ``self.trans_ls``.

    Parameters
    ----------
    df
        Frame holding both the column(s) to encode and the label column.
    y
        Key of the label column inside ``df`` (used as ``df[y]``).
    target
        Key of the column(s) inside ``df`` to encode; the values are passed
        through ``to_str`` before fitting.
    parameter
        Not used by this method.

    Returns
    -------
    None
        The tuple ``('glm', output_names, target, encoder)`` is appended to
        ``self.trans_ls``.
    """
    encoder = ce.GLMMEncoder()
    encoder.fit(df[target].map(to_str), df[y])

    # Derive the output column names from the names the encoder reports.
    output_names = []
    for raw_name in encoder.get_feature_names():
        stripped = remove_continuous_discrete_prefix(raw_name)
        output_names.append('continuous_' + stripped + '_glm')

    self.trans_ls.append(('glm', output_names, target, encoder))
def test_binary(self):
    """Fit a binomial-target GLMMEncoder on ``X``/``np_y`` and check the
    transformed output with ``th.verify_numeric``."""
    encoder = encoders.GLMMEncoder(
        cols=[
            'unique_str',
            'underscore',
            'extra',
            'none',
            'invariant',
            321,  # non-string column label is part of the fixture
            'categorical',
            'na_categorical',
            'categorical_int',
        ],
        binomial_target=True,
    )
    encoder.fit(X, np_y)

    transformed = encoder.transform(X)
    th.verify_numeric(transformed)
def feat_tsf_dataset(self):
    """
    Encode the feature dataset with a cross-validated GLMM encoder.

    A ``ce.GLMMEncoder`` over ``self.cat_index`` is wrapped in a
    ``NestedCVWrapper`` (5 folds, shuffled, ``random_state=7``) and
    fit-transformed on ``self.features_dataset`` against
    ``self.labels_dataset``.

    Returns
    -------
    The value returned by ``NestedCVWrapper.fit_transform`` for the training
    features (presumably the encoded feature table -- confirm the exact type
    against the wrapper's documentation).
    """
    train_features = self.features_dataset
    train_labels = self.labels_dataset

    # Nested CV keeps each row's encoding from seeing its own label.
    nested_encoder = NestedCVWrapper(
        feature_encoder=ce.GLMMEncoder(cols=self.cat_index),
        cv=5,
        shuffle=True,
        random_state=7,
    )

    return nested_encoder.fit_transform(train_features, train_labels)
def test_binary(self):
    """Construct a binomial-target GLMMEncoder over a mixed column list.

    Only construction is exercised here; nothing is fitted or asserted.
    """
    encoded_columns = [
        'unique_str',
        'underscore',
        'extra',
        'none',
        'invariant',
        321,  # non-string column label is part of the fixture
        'categorical',
        'na_categorical',
        'categorical_int',
    ]
    encoders.GLMMEncoder(cols=encoded_columns, binomial_target=True)
# One-hot branch: fill missing values with a constant, then one-hot encode,
# ignoring categories that were not seen during fitting.
oh_pipeline = make_pipeline(
    SimpleImputer(strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'),
)


def select_hc_features(df):
    """Return the names of high-cardinality categorical columns of ``df``.

    A column qualifies when its dtype is ``object`` or ``category`` and its
    number of distinct values is strictly greater than
    ``MAX_OH_CARDINALITY``.
    """
    categorical = df.select_dtypes(['object', 'category'])
    cardinality = categorical.apply(lambda col: col.nunique())
    too_wide = cardinality.loc[cardinality > MAX_OH_CARDINALITY]
    return too_wide.index.tolist()


# High-cardinality branch: GLMM-encode instead of one-hot encoding.
hc_pipeline = make_pipeline(ce.GLMMEncoder())

# Route each column group to its pipeline; columns matched by no selector
# are dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, select_numeric_features),
        ('oh_pipeline', oh_pipeline, select_oh_features),
        ('hc_pipeline', hc_pipeline, select_hc_features),
    ],
    n_jobs=n_threads,
    remainder='drop',
)
def predict(self, size, propertyType, district, status, rooms, bathrooms):
    """
    Predict with ``self.model_fit`` for a single property description.

    The inputs are packed into a one-row array, encoded with a
    cross-validated GLMM encoder that is re-fitted on the stored training
    data on every call, and passed to the fitted model.

    Parameters
    ----------
    size
        Placed in the feature row as-is.
    propertyType
        Placed in the feature row as-is.
    district : str (category)
    status : str (category)
    rooms : int
        Converted with ``self.roomsCategory`` before use.
    bathrooms : int
        Converted with ``self.bathroomsCategory`` before use.

    Returns
    -------
    Prediction : first element of ``self.model_fit.predict`` on the encoded row
    """
    # Former optional inputs, currently disabled in both the signature and
    # the feature row: box_posto_auto, hasGarden, hasTerrace, hasSwimmingPool.
    #
    # Disabled average-price-per-zone lookup, kept for reference:
    #   avg_price_zone_df = self.dataset_preprocessed[['district','avgPriceZone']]
    #   avg_price_zone_df = avg_price_zone_df.drop_duplicates()
    #   avgPriceZone = avg_price_zone_df.loc[
    #       avg_price_zone_df['district']==district]['avgPriceZone'].values[0]

    # Bucket the raw counts into their categories.
    rooms_bucket = self.roomsCategory(rooms)
    bathrooms_bucket = self.bathroomsCategory(bathrooms)

    # Single-row feature matrix, in the column order used below.
    # NOTE(review): mixing numbers and strings in one np.array yields a
    # string-typed array -- confirm the encoder/model expect that.
    row = [size, propertyType, district, status, rooms_bucket, bathrooms_bucket]
    sample = np.array(row).reshape(1, -1)

    # Same encoder configuration as used for the training features.
    # NOTE(review): the encoder is re-fitted on every call -- confirm that
    # this cost is acceptable.
    nested_encoder = NestedCVWrapper(
        ce.GLMMEncoder(cols=self.cat_index),
        cv=5,
        shuffle=True,
        random_state=7,
    )

    train_features = self.features_dataset
    train_labels = self.labels_dataset

    # With a third argument, element [1] of the result is used as the
    # encoded version of `sample` (presumably (train, test) -- confirm).
    encoded = nested_encoder.fit_transform(train_features, train_labels, sample)

    return self.model_fit.predict(encoded[1])[0]