def predict(self): """ use a pre-fitted model to make predictions and save them as csv @return: None """ try: model = self._load_model(f=self.model_path) x_val = self._prepare_predict_data( ) # the same is used for clustering y_pred = model.predict(x_val) y_pred = _reshape(y_pred) logger.info( f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}" ) logger.info(f"predict on targets: {self.target}") df_pred = pd.DataFrame.from_dict({ self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred for i in range(len(self.target)) }) logger.info(f"saving the predictions to {self.prediction_file}") df_pred.to_csv(self.prediction_file) except Exception as e: logger.exception(f"Error while preparing predictions: {e}")
def _get_predictions(self, **kwargs): """ use a pre-fitted model to generate predictions @return: None """ try: model = self._load_model(f=self.model_path) x_val = ( self._prepare_predict_data() ) # the same is used for clustering y_pred = model.predict(x_val) y_pred = _reshape(y_pred) logger.info( f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}" ) logger.info(f"predict on targets: {self.target}") if not self.target: self.target = ["result"] df_pred = pd.DataFrame.from_dict( { self.target[i]: y_pred[:, i] if len(y_pred.shape) > 1 else y_pred for i in range(len(self.target)) } ) return df_pred except Exception as e: logger.exception(f"Error while preparing predictions: {e}")
def _process_data(self, fit=True): """ read and return data as x and y @return: list of separate x and y """ assert isinstance(self.target, list), "provide target(s) as a list in the yaml file" assert len(self.target) > 0, "please provide at least a target to predict" try: dataset = pd.read_csv(self.data_path) logger.info(f"dataset shape: {dataset.shape}") attributes = list(dataset.columns) logger.info(f"dataset attributes: {attributes}") # handle missing values in the dataset preprocess_props = self.dataset_props.get('preprocess', None) if preprocess_props: # preprocessing strategy: mean, median, mode etc.. strategy = preprocess_props.get('missing_values') if strategy: dataset = handle_missing_values(dataset, strategy=strategy) if any(col not in attributes for col in self.target): raise Exception("chosen target(s) to predict must exist in the dataset") y = pd.concat([dataset.pop(x) for x in self.target], axis=1) x = _reshape(dataset.to_numpy()) y = _reshape(y.to_numpy()) logger.info(f"y shape: {y.shape} and x shape: {x.shape}") if not fit: return x, y split_options = self.dataset_props.get('split') test_size = split_options.get('test_size') shuffle = split_options.get('shuffle') stratify = split_options.get('stratify') x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=shuffle, stratify=None if not stratify or stratify.lower() == "none" else stratify) return x_train, y_train, x_test, y_test except Exception as e: logger.exception(f"error occured while preparing the data: {e.args}")
def _prepare_predict_data(self): """ read and return x_pred @return: x_pred """ try: x_val = pd.read_csv(self.data_path) logger.info(f"shape of the prediction data: {x_val.shape}") return _reshape(x_val) except Exception as e: logger.exception(f"exception while preparing prediction data: {e}")
def _process_data(self, target='fit'): """ read and return data as x and y @return: list of separate x and y """ assert isinstance(self.target, list), "provide target(s) as a list in the yaml file" if self.model_type != "clustering": assert len( self.target) > 0, "please provide at least a target to predict" try: read_data_options = self.dataset_props.get('read_data_options', None) dataset = pd.read_csv( self.data_path) if not read_data_options else pd.read_csv( self.data_path, **read_data_options) logger.info(f"dataset shape: {dataset.shape}") attributes = list(dataset.columns) logger.info(f"dataset attributes: {attributes}") # handle missing values in the dataset preprocess_props = self.dataset_props.get('preprocess', None) if preprocess_props: # handle encoding encoding = preprocess_props.get('encoding') if encoding: encoding_type = encoding.get('type', None) column = encoding.get('column', None) if column in attributes: dataset, classes_map = encode( df=dataset, encoding_type=encoding_type.lower(), column=column) if classes_map: self.dataset_props[ 'label_encoding_classes'] = classes_map logger.info( f"adding classes_map to dataset props: \n{classes_map}" ) logger.info( f"shape of the dataset after encoding => {dataset.shape}" ) # preprocessing strategy: mean, median, mode etc.. strategy = preprocess_props.get('missing_values') if strategy: dataset = handle_missing_values(dataset, strategy=strategy) logger.info( f"shape of the dataset after handling missing values => {dataset.shape}" ) if target == 'predict' or target == 'fit_cluster': x = _reshape(dataset.to_numpy()) if not preprocess_props: return x scaling_props = preprocess_props.get('scale', None) if not scaling_props: return x else: scaling_method = scaling_props.get('method', None) return normalize(x, method=scaling_method) if any(col not in attributes for col in self.target): raise Exception( "chosen target(s) to predict must exist in the dataset") y = pd.concat([dataset.pop(x) for x in self.target], axis=1) x = _reshape(dataset.to_numpy()) y = _reshape(y.to_numpy()) logger.info(f"y shape: {y.shape} and x shape: {x.shape}") # handle data scaling if preprocess_props: scaling_props = preprocess_props.get('scale', None) if scaling_props: scaling_method = scaling_props.get('method', None) scaling_target = scaling_props.get('target', None) if scaling_target == 'all': x = normalize(x, method=scaling_method) y = normalize(y, method=scaling_method) elif scaling_target == 'inputs': x = normalize(x, method=scaling_method) elif scaling_target == 'outputs': y = normalize(y, method=scaling_method) if target == 'evaluate': return x, y split_options = self.dataset_props.get('split', None) if not split_options: return x, y, None, None test_size = split_options.get('test_size') shuffle = split_options.get('shuffle') stratify = split_options.get('stratify') x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=test_size, shuffle=shuffle, stratify=None if not stratify or stratify.lower() == "default" else stratify) return x_train, y_train, x_test, y_test except Exception as e: logger.exception( f"error occured while preparing the data: {e.args}")