def predict_pipeline(training_pipeline_params):
    with open(training_pipeline_params.pretrained_model_path, "rb") as model_file:
        model = pickle.load(model_file)
    logger.info(f"pretrained model {model} extracted")
    logger.info(f"start predict pipeline with params {training_pipeline_params}")

    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(data)
    pred_features = make_features(transformer, data)

    predicts = predict_model(
        model,
        pred_features,
        training_pipeline_params.feature_params.use_log_trick,
    )

    predictions_path = training_pipeline_params.predictions_path
    pd.DataFrame(predicts, columns=["predictions"]).to_csv(
        predictions_path, index=False, mode="w"
    )
    logger.info(f"predictions are written to {predictions_path}")
    return predicts
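
# The use_log_trick flag passed to predict_model above suggests the target was
# modeled on a log scale. A minimal sketch of what such a helper might look like,
# assuming np.log1p/np.expm1 transforms and a scikit-learn style estimator; the
# body below is an assumption for illustration, not the repository's implementation.
import numpy as np


def predict_model(model, features, use_log_trick: bool = False) -> np.ndarray:
    """Hypothetical sketch: run inference and optionally undo a log1p target transform."""
    predicts = model.predict(features)
    if use_log_trick:
        # assumption: the target was transformed with np.log1p during training
        predicts = np.expm1(predicts)
    return predicts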
def train_pipeline(params: PipelineParams):
    logger.info(f"Start train with params {params}.")
    data = read_data(params.train_data_path)
    logger.info(f"Data shape is {data.shape}")

    data_train, data_val = split_train_val_data(data, params.split_params)
    logger.info(f"Train data shape is {data_train.shape}")
    logger.info(f"Validation data shape is {data_val.shape}")

    target_train = extract_target(data_train, params.features_params)
    data_train = data_train.drop(columns=["target"])

    transformer = build_transformer(params.features_params)
    transformer.fit(data_train)
    features_train = make_features(transformer, data_train)
    logger.info(f"Train features shape is {features_train.shape}")

    target_val = extract_target(data_val, params.features_params)
    data_val = data_val.drop(columns=["target"])
    features_val = make_features(transformer, data_val)
    logger.info(f"Validation features shape is {features_val.shape}")

    model = train_model(features_train, target_train, params.train_params)
    predicts = predict_model(model, features_val)
    metrics = evaluate_model(predicts, target_val)

    with open(params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Metrics are: {metrics}")

    path_to_model = dump_model(model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")

    with open(params.transformer_path, "wb") as tr:
        pickle.dump(transformer, tr)
    logger.info(f"Feature transformer saved at {params.transformer_path}")

    logger.info("Finished.")
    return path_to_model, metrics
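
# dump_model is not shown in this section. A minimal sketch, assuming it simply
# pickles the fitted model and returns the path it wrote to (the name and
# behavior are an assumption, mirroring how the transformer is saved above).
import pickle


def dump_model(model, output_path: str) -> str:
    """Hypothetical sketch: serialize the fitted model with pickle and return the output path."""
    with open(output_path, "wb") as f:
        pickle.dump(model, f)
    return output_path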
def predict_pipeline(self, data: pd.DataFrame) -> np.ndarray:
    logger.info("Start prediction.")
    features = make_features(self.pipeline, data)
    logger.info(f"Test features shape: {features.shape}")
    predictions = predict_model(features, self.model)
    logger.info("Prediction done")
    return predictions
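
# make_features appears in every pipeline here as the step that applies an
# already-fitted transformer to a DataFrame. A minimal sketch under the
# assumption of a scikit-learn transformer with dense output; the actual helper
# in the repository may differ.
import pandas as pd


def make_features(transformer, data: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical sketch: apply a fitted transformer and wrap the result in a DataFrame."""
    # assumption: transformer.transform returns a dense array
    return pd.DataFrame(transformer.transform(data))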
def train_pipeline(
    training_pipeline_params: TrainingPipelineParams,
    model: SklearnClassifierModel,
):
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params
    )
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")

    if train_df.shape[0] < NOT_ENOUGH_DATA_THRESHOLD:
        msg = "Not enough data to build a good model"
        logger.warning(msg)
        warning_logger.warning(msg)

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(train_df)
    train_features = make_features(transformer, train_df)
    train_target = extract_target(train_df, training_pipeline_params.feature_params)
    logger.info(f"train_features.shape is {train_features.shape}")

    model = train_model(train_features, train_target, model)

    val_features = make_features(transformer, val_df)
    val_target = extract_target(val_df, training_pipeline_params.feature_params)
    logger.info(f"val_features.shape is {val_features.shape}")

    predicts = predict_model(
        model,
        val_features,
        training_pipeline_params.feature_params.use_log_trick,
    )
    metrics = evaluate_model(
        predicts,
        val_target,
        use_log_trick=training_pipeline_params.feature_params.use_log_trick,
    )

    with open(training_pipeline_params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(model, training_pipeline_params.output_model_path)
    return path_to_model, metrics
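
# The paired use_log_trick flags in predict_model and evaluate_model imply the
# stored validation target is still on a log scale when metrics are computed.
# A sketch of how such an evaluate_model could be written with scikit-learn
# metrics; the metric choice and the log1p assumption are illustrative, not the
# repository's actual code.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def evaluate_model(predicts, target, use_log_trick: bool = False) -> dict:
    """Hypothetical sketch: compare predictions against the target on the original scale."""
    if use_log_trick:
        # assumption: the extracted target was log1p-transformed, so undo it first
        target = np.expm1(target)
    return {
        "rmse": float(np.sqrt(mean_squared_error(target, predicts))),
        "mae": float(mean_absolute_error(target, predicts)),
        "r2": float(r2_score(target, predicts)),
    }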
def predict_pipeline(params: PipelineParams):
    logger.info(f"Start predict pipeline with params {params}")
    data = pd.read_csv(params.data_for_pred_path)
    logger.info(f"Data shape is {data.shape}")

    with open(params.model_path, "rb") as m:
        model = pickle.load(m)
    logger.info(f"Model {model} loaded.")

    with open(params.transformer_path, "rb") as t:
        transformer = pickle.load(t)
    logger.info("Transformer loaded.")

    features = make_features(transformer, data)
    logger.info(f"Features shape is {features.shape}")

    predictions = predict_model(model, features)
    logger.info(f"Predictions shape is {predictions.shape}")

    data["pred_target"] = predictions
    data.to_csv(params.predictions_path)
    logger.info(f"Predictions saved to {params.predictions_path}")
    logger.info("Finished.")
def _get_project_root() -> Path:
    return Path(__file__).parent.parent


root_dir = str(_get_project_root())
training_data = "/data/churn_train.csv"
test_data = "/data/churn_test.csv"

if __name__ == "__main__":
    # Load data
    df = load_data(root_dir, training_data)
    df_final = run_data_pipeline(df)
    X, y = create_variables(df_final)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=11
    )

    # Create models and predict data
    y_pred_lr, y_pred_proba_lr, lr_model = predict_model(
        X_train, y_train, X_test, LogisticRegression
    )
    y_pred_rf, y_pred_proba_rf, rf_model = predict_model(
        X_train, y_train, X_test, RandomForestClassifier
    )
    y_pred_gb, y_pred_proba_gb, gb_model = predict_model(
        X_train, y_train, X_test, GradientBoostingClassifier
    )

    # Visualize results
    print_metrics(y_test, y_pred_lr, "Logistic Regression")
    print_metrics(y_test, y_pred_rf, "Random Forest Classifier")
    print_metrics(y_test, y_pred_gb, "Gradient Boosting Classifier")
    plot_roc_curve(
        y_test,
        y_pred_proba_lr,
        y_pred_proba_rf,
        y_pred_proba_gb,
        "LR",
        "RF",
        "GB",
    )

    # Plot feature importance for gradient boosting classifier model
    plot_feature_importance_chart(gb_model, X_train)
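
# Unlike the pipelines above, this script's predict_model takes an estimator
# class rather than a fitted model and returns hard predictions, positive-class
# probabilities, and the fitted estimator. A sketch of how such a helper could
# look, assuming default scikit-learn constructors and a binary target; the body
# is an assumption, not the script's actual implementation.
def predict_model(X_train, y_train, X_test, model_class):
    """Hypothetical sketch: fit the given estimator class on the training split,
    then return predictions, positive-class probabilities, and the fitted model."""
    model = model_class()  # assumption: default hyperparameters
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    return y_pred, y_pred_proba, model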