def fit(self, data: pd.DataFrame):
    """Fit one TargetEncoder per target column.

    Each target gets its own encoder over ``self.cols``, stored in
    ``self.encoders`` under the key ``enc_<target>``.
    """
    log.info("TargetEncode fit: %s", self.targets)
    for target in self.targets:
        # Unseen categories at transform time become NaN rather than
        # raising ("return_nan").
        encoder = TargetEncoder(cols=self.cols, handle_missing="return_nan")
        self.encoders["enc_{}".format(target)] = encoder
        log.info("Target encoding fit for target: %s", target)
        encoder.fit(data[self.cols], data[target])
def run(self):
    """Evaluate the model over every data split.

    Pulls the full dataset once, iterates the splitter's splits, runs
    train/predict/score per split, and logs the collected results.
    """
    data = self.data_provider.get_data()
    splits = self.splitter.split(data, self.data_provider.config.load_config)
    results_all_splits = []
    for split_num, split in enumerate(splits):
        scores = self._run_split(data, split, split_num)
        results_all_splits.append(scores)
    log.info("results for all split: %s", results_all_splits)
def main():
    """Entry point: load the pipeline config and dispatch on its type."""
    args = get_args()
    # If it is a cloud config file, add the function to download the
    # file to local first.
    pipeline_config = load_config(args.config_file, config_factory)
    log.info("config loaded: %s", pipeline_config)
    if pipeline_config.pipeline_type == "training_evaluation_pipeline":
        run_training_evaluation_pipeline(pipeline_config)
def transform(self, data: pd.DataFrame) -> pd.DataFrame:
    """Label-encode the configured columns, mapping unseen values to -1.

    sklearn's LabelEncoder raises on categories it never saw during
    fit, so unseen values are first replaced with the sentinel -1 and
    only the known values are run through the encoder.

    Args:
        data: Frame containing every column in ``self.cols``;
            modified in place.

    Returns:
        The same frame with ``self.cols`` label-encoded.
    """
    log.info("LabelEncode transform: %s", self.cols)
    for col in self.cols:
        encoder = self.label_encoders[col]
        # Hoist the membership masks: the original recomputed the
        # O(n) isin() three times per column. The second mask is
        # deliberately computed AFTER the -1 substitution so behavior
        # is unchanged even if -1 happens to be a fitted class.
        unseen = ~data[col].isin(encoder.classes_)
        data.loc[unseen, col] = -1
        known = data[col].isin(encoder.classes_)
        data.loc[known, col] = encoder.transform(data[col][known])
    return data
def train(self, train_data: pd.DataFrame, val_data: pd.DataFrame,
          features: list, labels: list):
    """Fit the underlying model on the train set with a validation eval set.

    Args:
        train_data: Training frame containing ``features`` and ``labels``.
        val_data: Validation frame used as the eval set named 'val'.
        features: Feature column names fed to the model.
        labels: Label column names.
    """
    log.info("training")
    X_train = train_data[features]
    y_train = train_data[labels]
    eval_set = [(val_data[features], val_data[labels])]
    self._model.fit(X=X_train,
                    y=y_train,
                    eval_set=eval_set,
                    eval_names=['val'],
                    feature_name=features,
                    categorical_feature='auto',
                    **self._fit_config)
def transform(self, data: pd.DataFrame) -> pd.DataFrame:
    """Append target-encoded copies of the feature columns.

    For each target, the fitted encoder's output columns are renamed
    with a ``_<target>`` suffix and attached to the frame, leaving the
    original columns untouched.
    """
    log.info("TargetEncode transform: %s", self.targets)
    for target in self.targets:
        encoder = self.encoders["enc_{}".format(target)]
        encoded = encoder.transform(data[self.cols], data[target])
        suffixed = {
            "{}_{}".format(col, target): encoded[col]
            for col in encoded
        }
        data = data.assign(**suffixed)
    return data
def _transform_data(self):
    """Fit the transformer pipeline on train data and apply it to both sets.

    Returns:
        Dict with the transformed train/validation frames and the
        fitted transformer pipeline.
    """
    log.info("cols before data transform: %s", self.train_data.columns)
    log.info("transforming training data")
    # fit_transform: the pipeline learns its state from train data only.
    self.train_data = self.transformer_pipeline.fit_transform(self.train_data)
    log.info("cols after data transform: %s", self.train_data.columns)
    log.info("transforming validation data")
    self.validation_data = self.transformer_pipeline.transform(
        self.validation_data)
    return {
        "train_data": self.train_data,
        "val_data": self.validation_data,
        "transformer_pipeline": self.transformer_pipeline,
    }
def _run_split(self, data: pd.DataFrame, split: Split, split_num: int):
    """Train on one split's train set and score predictions on its test set.

    Args:
        data: Full dataset the split indexes into.
        split: Split object providing the train/test partition.
        split_num: Ordinal of this split, used only for logging.

    Returns:
        The score dict produced by ``self._score``.
    """
    train_set, test_set = split.data(data)
    log.info("train_set shape: %s", train_set.shape)
    log.info("test_set shape: %s", test_set.shape)
    training_pipeline = self._run_training_pipeline(train_set)
    ypred, ytrue = self._run_prediction_pipeline(
        test_set,
        training_pipeline.get_transformer_pipeline(),
        training_pipeline.get_model())
    scores_for_split = self._score(ypred, ytrue)
    log.info("results for split %s: %s", split_num, scores_for_split)
    return scores_for_split
def batch_read_dask(raw_data_path):
    """Read CSV part-files with dask and return one pandas DataFrame.

    ``raw_data_path`` may contain a wildcard, e.g. ``"data/part-*.csv"``.
    """
    log.info("Dask reading")
    # The timestamp columns are read as plain strings; assume_missing
    # lets integer-looking columns hold NaNs.
    timestamp_cols = (
        "reply_timestamp",
        "retweet_timestamp",
        "retweet_with_comment_timestamp",
        "like_timestamp",
    )
    df = dd.read_csv(raw_data_path,
                     assume_missing=True,
                     dtype={col: str for col in timestamp_cols})
    log.info("Converting to pandas dataframe")
    df_pd = df.compute().reset_index(drop=True)
    log.info("dtypes: %s", df_pd.dtypes)
    return df_pd
def fit(self, data: pd.DataFrame):
    """Fit one sklearn LabelEncoder per configured column.

    Args:
        data: Frame containing every column in ``self.cols``.
    """
    log.info("LabelEncode fit: %s", self.cols)
    for col in self.cols:
        # LabelEncoder.fit returns the encoder itself, so one chained
        # assignment replaces the original's redundant
        # create-then-reassign pair.
        self.label_encoders[col] = preprocessing.LabelEncoder().fit(data[col])
def transform(self, data: pd.DataFrame) -> pd.DataFrame:
    """Restrict the frame to the configured feature and label columns.

    Returns:
        ``data`` with only the de-duplicated union of normal features,
        categorical features, and label columns, in configured order.
    """
    data_config = self._config
    cols = data_config.normal_features + data_config.categorical_features + data_config.label_cols
    log.info("Column Selection. cols: %s", cols)
    # dict.fromkeys de-duplicates while preserving the configured
    # order; the original list(set(cols)) produced a nondeterministic
    # column order (hash randomization) that could shuffle feature
    # positions between runs.
    data = data[list(dict.fromkeys(cols))]
    return data
def transform(self, data: pd.DataFrame) -> pd.DataFrame:
    """Map the configured string label columns through convert_str_to_bool."""
    log.info("LabelTransform transform: %s", self.cols)
    for column in self.cols:
        converted = data[column].map(convert_str_to_bool)
        data[column] = converted
    return data
def fit(self, data: pd.DataFrame):
    """No-op: the label transform is stateless, so there is nothing to fit."""
    log.info("LabelTransform fit: pass")
def predict(self, data: pd.DataFrame, features: list):
    """Run model inference over the given feature columns and return predictions."""
    log.info("predicting")
    return self._model.predict(data[features])
def _process_data(self, data):
    """Apply column selection to raw data, logging the shape before and after."""
    log.info("data shape before column selection: %s", data.shape)
    selected = self._column_selector.transform(data)
    log.info("data shape after column selection: %s", selected.shape)
    return selected
def run(self):
    """Orchestrate training: build the datasets, transform them, then train."""
    self._generate_train_val_data()
    self._transform_data()
    log.info(self.transformer_pipeline.get_updated_features())
    self._train()