Example #1
 def fit(self, data: pd.DataFrame):
     log.info("TargetEncode fit: %s", self.targets)
     for target in self.targets:
         self.encoders["enc_{}".format(target)] = TargetEncoder(
             cols=self.cols, handle_missing="return_nan")
         log.info("Target encoding fit for target: %s", target)
         self.encoders["enc_{}".format(target)].fit(data[self.cols],
                                                    data[target])
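The constructor arguments above match category_encoders.TargetEncoder; assuming that is the encoder in use, a minimal standalone sketch of the same fit/transform API (column and target names below are made up for illustration):

    import pandas as pd
    from category_encoders import TargetEncoder

    df = pd.DataFrame({"city": ["a", "b", "a", "c"],
                       "clicked": [1, 0, 1, 0]})
    # cols and handle_missing mirror the arguments passed in fit() above
    enc = TargetEncoder(cols=["city"], handle_missing="return_nan")
    enc.fit(df[["city"]], df["clicked"])
    encoded = enc.transform(df[["city"]])  # "city" replaced by target statistics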
Example #2
 def run(self):
     results_all_splits = []
     data = self.data_provider.get_data()
     split_num = 0
     for split in self.splitter.split(
             data, self.data_provider.config.load_config):
         results = self._run_split(data, split, split_num)
         results_all_splits.append(results)
         split_num += 1
     log.info("results for all split: %s", results_all_splits)
Example #3
def main():
    args = get_args()

    # TODO: if the config file lives in cloud storage, download it to a
    # local path before loading.
    pipeline_config = load_config(args.config_file, config_factory)
    log.info("config loaded: %s", pipeline_config)

    if pipeline_config.pipeline_type == "training_evaluation_pipeline":
        run_training_evaluation_pipeline(pipeline_config)
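get_args is not shown in this snippet; a plausible minimal sketch, assuming a single --config-file flag (the real CLI may accept more or differently named options):

    import argparse

    def get_args():
        # Hypothetical sketch of the argument parser used by main().
        parser = argparse.ArgumentParser(description="Run a pipeline from a config file")
        parser.add_argument("--config-file", required=True,
                            help="Path (local or cloud) to the pipeline config file")
        return parser.parse_args()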
Example #4
 def transform(self, data: pd.DataFrame) -> pd.DataFrame:
     log.info("LabelEncode transform: %s", self.cols)
     for col in self.cols:
         # Categories never seen during fit() would make LabelEncoder.transform
         # raise a ValueError, so map them to the sentinel value -1 first.
         data.loc[~data[col].isin(self.label_encoders[col].classes_),
                  col] = -1
         # Encode only the rows whose category was seen during fit().
         data.loc[data[col].isin(self.label_encoders[col].classes_),
                  col] = self.label_encoders[col].transform(
                      data[col][data[col].isin(
                          self.label_encoders[col].classes_)])
         # Naive version that fails on unseen categories:
         # data[col] = self.label_encoders[col].transform(data[col])
     return data
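The -1 assignment exists because scikit-learn's LabelEncoder raises on categories it did not see during fit; a toy illustration of the failure the guard avoids:

    from sklearn import preprocessing

    le = preprocessing.LabelEncoder().fit(["a", "b"])
    le.transform(["a", "b"])    # array([0, 1])
    # le.transform(["a", "c"])  # ValueError: y contains previously unseen labels
    # transform() above therefore maps unseen categories to the sentinel -1 first.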
Example #5
 def train(self, train_data: pd.DataFrame, val_data: pd.DataFrame,
           features: list, labels: list):
     log.info("training")
     eval_set = [(val_data[features], val_data[labels])]
     self._model.fit(X=train_data[features],
                     y=train_data[labels],
                     eval_set=eval_set,
                     eval_names=['val'],
                     feature_name=features,
                     categorical_feature='auto',
                     **self._fit_config)
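The fit keywords (eval_set, eval_names, feature_name, categorical_feature) match LightGBM's scikit-learn interface; assuming the wrapped model is an LGBMClassifier and _fit_config is empty, a toy end-to-end call looks like:

    import pandas as pd
    import lightgbm as lgb

    train = pd.DataFrame({"f1": [1, 2, 3, 4, 5, 6, 7, 8],
                          "f2": [0, 1, 0, 1, 0, 1, 0, 1],
                          "y":  [0, 1, 0, 1, 0, 1, 0, 1]})
    val = train.copy()
    features, label = ["f1", "f2"], "y"

    model = lgb.LGBMClassifier(n_estimators=10)
    model.fit(X=train[features], y=train[label],
              eval_set=[(val[features], val[label])],
              eval_names=["val"],
              feature_name=features,
              categorical_feature="auto")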
Example #6
 def transform(self, data: pd.DataFrame) -> pd.DataFrame:
     log.info("TargetEncode transform: %s", self.targets)
     for target in self.targets:
         encoded_features_values = self.encoders["enc_{}".format(
             target)].transform(data[self.cols], data[target])
         encoded_features = {}
         # Suffix each encoded column with the target name so encodings for
         # different targets do not overwrite each other.
         for encoded_feature in encoded_features_values:
             encoded_features["{}_{}".format(
                 encoded_feature,
                 target)] = encoded_features_values[encoded_feature]
         data = data.assign(**encoded_features)
     return data
Example #7
 def _transform_data(self):
     log.info("cols before data transform: %s", self.train_data.columns)
     log.info("transforming training data")
     self.train_data = self.transformer_pipeline.fit_transform(
         self.train_data)
     # self.train_data = self.transformer_pipeline.transform(self.train_data)
     log.info("cols after data transform: %s", self.train_data.columns)
     log.info("transforming validation data")
     self.validation_data = self.transformer_pipeline.transform(
         self.validation_data)
     return {
         "train_data": self.train_data,
         "val_data": self.validation_data,
         "transformer_pipeline": self.transformer_pipeline
     }
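Note the asymmetry: the pipeline is fit on the training split only and then reused, already fitted, on the validation split, so no validation statistics leak into the fit. The same pattern with a scikit-learn transformer, purely for illustration:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    train = pd.DataFrame({"x": [1.0, 2.0, 3.0]})
    val = pd.DataFrame({"x": [4.0, 5.0]})

    scaler = StandardScaler()
    train_t = scaler.fit_transform(train)  # learns mean/std from train only
    val_t = scaler.transform(val)          # reuses the training statistics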
Example #8
 def _run_split(self, data: pd.DataFrame, split: Split, split_num: int):
     train_set, test_set = split.data(data)
     log.info("train_set shape: %s", train_set.shape)
     log.info("test_set shape: %s", test_set.shape)
     training_pipeline = self._run_training_pipeline(train_set)
     transformer_pipeline = training_pipeline.get_transformer_pipeline()
     model = training_pipeline.get_model()
     ypred, ytrue = self._run_prediction_pipeline(test_set,
                                                  transformer_pipeline,
                                                  model)
     scores_for_split = self._score(ypred, ytrue)
     log.info("results for split %s: %s", split_num, scores_for_split)
     return scores_for_split
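Split is an internal type; from its use here it only needs a data(frame) method that returns a (train, test) pair. A hypothetical stand-in built on scikit-learn's KFold that satisfies that contract:

    import pandas as pd
    from sklearn.model_selection import KFold

    class SimpleSplit:  # hypothetical stand-in for the internal Split class
        def __init__(self, train_idx, test_idx):
            self.train_idx, self.test_idx = train_idx, test_idx

        def data(self, df: pd.DataFrame):
            return df.iloc[self.train_idx], df.iloc[self.test_idx]

    def split(df: pd.DataFrame, n_splits: int = 3):
        for train_idx, test_idx in KFold(n_splits=n_splits).split(df):
            yield SimpleSplit(train_idx, test_idx)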
Example #9
def batch_read_dask(raw_data_path):
    """raw_data_path can contain wildcard
    e.g. "data/part-*.csv"
    """
    log.info("Dask reading")
    df = dd.read_csv(raw_data_path,
                     assume_missing=True,
                     dtype={
                         "reply_timestamp": str,
                         "retweet_timestamp": str,
                         "retweet_with_comment_timestamp": str,
                         "like_timestamp": str
                     })
    log.info("Converting to pandas dataframe")
    df_pd = df.compute().reset_index(drop=True)
    log.info("dtypes: %s", df_pd.dtypes)
    return df_pd
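A self-contained usage sketch; the two toy CSV parts written below (and their values) are fabricated so the wildcard read has something to pick up:

    import pandas as pd

    cols = ["reply_timestamp", "retweet_timestamp",
            "retweet_with_comment_timestamp", "like_timestamp"]
    pd.DataFrame([["1", "", "", "2"]], columns=cols).to_csv("part-0.csv", index=False)
    pd.DataFrame([["", "3", "4", ""]], columns=cols).to_csv("part-1.csv", index=False)

    df_pd = batch_read_dask("part-*.csv")  # the glob matches both toy parts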
Example #10
 def fit(self, data: pd.DataFrame):
     log.info("LabelEncode fit: %s", self.cols)
     for col in self.cols:
         self.label_encoders[col] = preprocessing.LabelEncoder()
         self.label_encoders[col] = self.label_encoders[col].fit(data[col])
Example #11
 def transform(self, data: pd.DataFrame) -> pd.DataFrame:
     data_config = self._config
     # set() removes duplicate column names; it does not preserve column order.
     cols = (data_config.normal_features +
             data_config.categorical_features +
             data_config.label_cols)
     log.info("Column Selection. cols: %s", cols)
     data = data[list(set(cols))]
     return data
Example #12
 def transform(self, data: pd.DataFrame) -> pd.DataFrame:
     log.info("LabelTransform transform: %s", self.cols)
     for col in self.cols:
         data[col] = data[col].map(convert_str_to_bool)
     return data
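convert_str_to_bool is defined elsewhere in the repository; given that the label columns in these examples look like optional timestamp strings, a plausible, purely hypothetical implementation (the real one may differ) maps non-empty values to True:

    import pandas as pd

    def convert_str_to_bool(value) -> bool:
        # Hypothetical helper; the real mapping in the source may differ.
        return not (pd.isna(value) or str(value).strip() == "")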
Example #13
 def fit(self, data: pd.DataFrame):
     log.info("LabelTransform fit: pass")
Example #14
 def predict(self, data: pd.DataFrame, features: list):
     log.info("predicting")
     ypred = self._model.predict(data[features])
     return ypred
Example #15
    def _process_data(self, data):
        log.info("data shape before column selection: %s", data.shape)
        data = self._column_selector.transform(data)
        log.info("data shape after column selection: %s", data.shape)

        return data
Example #16
 def run(self):
     self._generate_train_val_data()
     self._transform_data()
     log.info(self.transformer_pipeline.get_updated_features())
     self._train()