Example #1
import pickle
from pathlib import Path

import category_encoders as ce


def fit(X, y, output_dir, **kwargs):
    """ This hook defines how DataRobot will train this task. Even transform tasks need to be trained to learn/store information from training data
    DataRobot runs this hook when the task is being trained inside a blueprint.
    As an output, this hook is expected to create an artifact containg a trained object [in this example - median of each numeric column], that is then used to transform new data.
    The input parameters are passed by DataRobot based on project and blueprint configuration.

    Parameters
    -------
    X: pd.DataFrame
        Training data that DataRobot passes when this task is being trained.
    y: pd.Series
        Project's target column (None is passed for unsupervised projects).
    output_dir: str
        A path to the output folder; the artifact [in this example - containing median of each numeric column] must be saved into this folder to be re-used in transform().

    Returns
    -------
    None
        fit() doesn't return anything, but must output an artifact (typically containing a trained object) into output_dir
        so that the trained object can be used during scoring inside transform()
    """

    # Encode categorical columns numerically with a generalized linear
    # mixed model (GLMM) encoder, using a binomial (binary) target
    encoder_glm = ce.GLMMEncoder(cols=X.columns,
                                 binomial_target=True,
                                 randomized=True)
    encoder_glm.fit(X, y)

    # Dump the trained encoder into an artifact (binomialglm.pkl)
    # and save it into output_dir so that it can be used later
    # to transform new data
    output_dir_path = Path(output_dir)
    if output_dir_path.exists() and output_dir_path.is_dir():
        with open(output_dir_path / "binomialglm.pkl", "wb") as fp:
            pickle.dump(encoder_glm, fp)
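
The docstring above notes that the artifact saved by fit() is re-used in transform(). The original snippet does not include that hook, but a minimal sketch of the counterpart, assuming DataRobot passes the deserialized artifact as the second argument (here named transformer), might look like this:

import pandas as pd

def transform(data, transformer):
    # Hypothetical counterpart to fit() above: `transformer` is assumed to be
    # the GLMM encoder deserialized from binomialglm.pkl
    return pd.DataFrame(transformer.transform(data))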
Example #2
    def _fit_glm(self, df, y, target, parameter):
        # Fit a GLMM encoder: `target` names the categorical column(s) to encode
        # and `y` is the name of the label column in df
        glm_encoder = ce.GLMMEncoder()
        glm_encoder.fit(df[target].map(to_str), df[y])

        # Build output feature names and store the fitted encoder for later use
        name = ['continuous_' + remove_continuous_discrete_prefix(x) + '_glm' for x in
                glm_encoder.get_feature_names()]
        self.trans_ls.append(('glm', name, target, glm_encoder))
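
The method stores each fitted encoder in self.trans_ls instead of applying it right away. A hypothetical counterpart that replays the stored encoders on new data (the name _transform_glm and the column handling are assumptions, not part of the original source) could look like:

    def _transform_glm(self, df):
        # Sketch: apply every encoder stored by _fit_glm to new data
        for method, names, target, encoder in self.trans_ls:
            if method == 'glm':
                encoded = encoder.transform(df[target].map(to_str))
                encoded.columns = names
                df = df.join(encoded)
        return df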
Example #3
    def test_binary(self):
        # X, np_y, and the test helper `th` come from the surrounding test suite;
        # note the integer column name 321 among the string names
        cols = [
            'unique_str', 'underscore', 'extra', 'none', 'invariant', 321,
            'categorical', 'na_categorical', 'categorical_int'
        ]
        enc = encoders.GLMMEncoder(cols=cols, binomial_target=True)
        enc.fit(X, np_y)
        th.verify_numeric(enc.transform(X))
Example #4
    def feat_tsf_dataset(self):
        """
        Returns
        -------
        The features dataset transformed with a GLMM category encoder,
        fitted via nested cross-validation to avoid target leakage.
        """
        features = self.features_dataset
        labels = self.labels_dataset

        # GLMM encoder over the categorical column indices
        encoder = ce.GLMMEncoder(cols=self.cat_index)

        # Wrap the encoder in nested CV so each fold is encoded by a model
        # trained on the remaining folds
        cv_encoder = NestedCVWrapper(feature_encoder=encoder,
                                     cv=5,
                                     shuffle=True,
                                     random_state=7)

        # Fit the wrapper and transform the full feature dataset
        feat_tsf = cv_encoder.fit_transform(features, labels)

        return feat_tsf
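
NestedCVWrapper comes from category_encoders.wrapper. A minimal self-contained sketch of the same pattern (the toy data and the 'city' column are made up for illustration):

import pandas as pd
import category_encoders as ce
from category_encoders.wrapper import NestedCVWrapper

# Toy data; the column name and values are made up
X = pd.DataFrame({'city': ['a', 'b', 'a', 'c', 'b', 'a'] * 10})
y = pd.Series([0, 1, 0, 1, 1, 0] * 10)

wrapper = NestedCVWrapper(feature_encoder=ce.GLMMEncoder(cols=['city']),
                          cv=5, shuffle=True, random_state=7)

# Each row is encoded by an encoder fitted on the other CV folds,
# so a row's own target value never leaks into its encoding
X_encoded = wrapper.fit_transform(X, y)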
Example #5
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# numeric_pipeline, select_numeric_features, select_oh_features,
# MAX_OH_CARDINALITY and n_threads are defined elsewhere in the original source

# One-hot pipeline for low-cardinality categorical features
oh_pipeline = make_pipeline(SimpleImputer(strategy='constant'),
                            OneHotEncoder(handle_unknown='ignore'))


def select_hc_features(df):
    # Select categorical columns whose cardinality exceeds MAX_OH_CARDINALITY
    hc_features = (
        df
        .select_dtypes(['object', 'category'])
        .apply(lambda col: col.nunique())
        .loc[lambda x: x > MAX_OH_CARDINALITY]
        .index
        .tolist()
    )
    return hc_features


# GLMM target encoding for high-cardinality categorical features
hc_pipeline = make_pipeline(ce.GLMMEncoder())

column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, select_numeric_features),
        ('oh_pipeline', oh_pipeline, select_oh_features),
        ('hc_pipeline', hc_pipeline, select_hc_features),
    ],
    n_jobs=n_threads,
    remainder='drop')
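
A hedged usage sketch: X_train, y_train and the RandomForestRegressor stage are hypothetical stand-ins, not part of the original source:

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('preprocess', column_transformer),
    ('regressor', RandomForestRegressor()),
])
model.fit(X_train, y_train)  # y is forwarded to the GLMM encoder during fit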
Example #6
    def predict(
        self,
        size,
        propertyType,
        district,
        status,
        rooms,
        bathrooms,
        #box_posto_auto,
        #hasGarden,
        #hasTerrace,
        #hasSwimmingPool
    ):
        """
        
        Parameters
        ----------
        district : str (category)
        status : str (category)
        rooms : int
        bathrooms : int
        box_posto_auto : Bool(1,0)
        garden : Bool(1,0)
        terrace : Bool(1,0)
        hasSwimmingPool : Bool(1,0)

        Returns
        -------
        Prediction : Best Model Prediction

        """
        """
        #Avg Price Zone
        avg_price_zone_df = self.dataset_preprocessed[['district','avgPriceZone']]

        avg_price_zone_df = avg_price_zone_df.drop_duplicates()       
        
        avgPriceZone = avg_price_zone_df.loc[
            avg_price_zone_df['district']==district]['avgPriceZone'].values[0]
        """

        # Map raw room and bathroom counts to the categories used in training
        roomsCat = self.roomsCategory(rooms)
        bathroomsCat = self.bathroomsCategory(bathrooms)

        # Build a single-row feature array for prediction
        array = np.array([
            size,
            propertyType,
            district,
            status,
            roomsCat,
            bathroomsCat,
            #box_posto_auto,
            #hasGarden,
            #hasTerrace,
            #hasSwimmingPool
        ]).reshape(1, -1)

        # GLMM encoder over the categorical column indices
        encoder = ce.GLMMEncoder(cols=self.cat_index)

        # Nested CV wrapper, as in feat_tsf_dataset(), to avoid target leakage
        cv_encoder = NestedCVWrapper(encoder,
                                     cv=5,
                                     shuffle=True,
                                     random_state=7)

        # Training data used to fit the encoder
        features = self.features_dataset
        labels = self.labels_dataset

        # With a third argument, fit_transform returns a pair:
        # (encoded training data, encoded extra data)
        feat_tsf = cv_encoder.fit_transform(features, labels, array)

        # Predict on the encoded single-row array
        prediction = self.model_fit.predict(feat_tsf[1])[0]

        return prediction
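
A hypothetical call, assuming estimator is an instance of the surrounding class; every argument value is made up for illustration:

predicted_price = estimator.predict(
    size=85,
    propertyType='apartment',
    district='Centro',
    status='good',
    rooms=3,
    bathrooms=2,
)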