def train_pipeline(
        training_pipeline_params: TrainingPipelineParams) -> Tuple[str, dict]:
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    train_df, test_df = split_train_test_data(
        data, training_pipeline_params.splitting_params)
    feature_extractor = FeaturesExtractor(
        training_pipeline_params.feature_params)
    train_features = feature_extractor.fit_transform(train_df)
    train_target = extract_target(train_df,
                                  training_pipeline_params.feature_params)
    logger.info(f"train_features.shape is {train_features.shape}")
    logger.info("features and target created")
    model = train_model(train_features, train_target,
                        training_pipeline_params.train_params)
    test_features = feature_extractor.transform(test_df)
    test_target = extract_target(test_df,
                                 training_pipeline_params.feature_params)
    logger.info(f"test_features.shape is {test_features.shape}")
    predicts = predict_model(model, test_features)
    metrics = evaluate_model(predicts, test_target)
    save_metrics(metrics, training_pipeline_params.metric_path)
    path_to_model = save_model(model,
                               training_pipeline_params.output_model_path)
    logger.info(f"metrics: {metrics}")
    return path_to_model, metrics
def train_pipeline(params: TrainingPipelineParams):
    logger.info("start training pipeline")
    data = read_data(params.input_data_path)
    logger.info(f"data read from {params.input_data_path}")
    train_df, val_df = split_to_train_val(data, params.splitting_params)
    logger.debug(
        f"data split; train_df size: {train_df.shape}, val_df size: {val_df.shape}"
    )
    feature_extractor = FeaturesExtractor(params.feature_params)
    X_train = feature_extractor.fit_transform(train_df)
    X_val = feature_extractor.transform(val_df)
    y_train = extract_target(train_df, params.feature_params)
    y_val = extract_target(val_df, params.feature_params)
    logger.info("features and target extracted")
    logger.debug(
        f"X_train size: {X_train.shape}, y_train size: {y_train.shape}, "
        f"X_val size: {X_val.shape}, y_val size: {y_val.shape}")
    model = train_model(X_train, y_train, params.train_params)
    logger.info(f"model {params.train_params.model_type} trained")
    y_pred = predict_model(model, X_val)
    logger.debug(f"prediction done; y_pred size: {y_pred.shape}")
    metrics = evaluate_model(y_pred, y_val)
    logger.info(f"evaluation done; accuracy: {metrics['accuracy_score']}")
    path_to_model = save_model(model, params.output_model_path)
    logger.info(f"model saved to {path_to_model}")
    save_metrics(metrics, params.metric_path)
    logger.info(f"metrics saved to {params.metric_path}")
    return path_to_model, metrics
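# A minimal sketch of an entry point that could invoke train_pipeline(). It
# assumes read_training_pipeline_params() (used in main() further below) loads a
# TrainingPipelineParams from a YAML config; the default config path and the
# run_train name are illustrative assumptions, not part of the original code.
def run_train(config_path: str = "configs/train_config.yaml"):
    params = read_training_pipeline_params(config_path)
    path_to_model, metrics = train_pipeline(params)
    logger.info(f"pipeline finished; model at {path_to_model}, metrics: {metrics}")


if __name__ == "__main__":
    import sys

    run_train(sys.argv[1] if len(sys.argv) > 1 else "configs/train_config.yaml")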
def predict_pipeline(
    training_pipeline_params: TrainingPipelineParams,
):
    model = load_model(training_pipeline_params.output_model_path)
    logger.info(
        f"loaded model from {training_pipeline_params.output_model_path} for prediction")
    df = read_data(training_pipeline_params.input_data_path)
    # note: the feature extractor is re-fit on the prediction data here rather
    # than loaded from the training run
    extracted_features = FeaturesExtractor(
        training_pipeline_params.feature_params).fit_transform(df)
    logger.info(
        f"features extracted; extracted_features size: {extracted_features.shape}")
    predict = predict_model(model, extracted_features)
    logger.info(f"prediction done; prediction size: {predict.shape}")
    pd.DataFrame(predict, columns=['target']).to_csv(
        training_pipeline_params.output_predict_path, index=False)
def predict(predict_config):
    test_df = read_data(to_absolute_path(predict_config.test_data_path))
    test_df = test_df.drop(predict_config.feature_params.target_col, axis=1)
    model_path = to_absolute_path(predict_config.output_model_path)
    model = load_model(model_path)
    transformer = load_transformer(
        to_absolute_path(predict_config.feature_transformer_path))
    test_features = make_features(transformer, test_df)
    y_pred = pd.DataFrame(model.predict_proba(test_features)[:, 1],
                          columns=["target"])
    y_pred.to_csv(to_absolute_path(predict_config.predict_path), index=False)
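# A minimal sketch of how predict() could be wired up with Hydra, which is what
# the to_absolute_path() calls above suggest. The config directory, config name
# and the run_predict wrapper are assumptions for illustration only.
import hydra
from omegaconf import DictConfig


@hydra.main(config_path="../configs", config_name="predict_config")
def run_predict(predict_config: DictConfig) -> None:
    # Hydra changes the working directory at runtime, hence to_absolute_path()
    # around every path in predict().
    predict(predict_config)


if __name__ == "__main__":
    run_predict()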
def train_pipeline(training_pipeline_params: Params):
    # train, val data
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.train_data_path)
    logger.info(f"data.shape is {data.shape}")
    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params)
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")

    # features extraction
    train_target = extract_target(train_df,
                                  training_pipeline_params.feature_params)
    transformer = Features_transformer(training_pipeline_params.feature_params)
    transformer.fit(
        train_df.drop(
            columns=training_pipeline_params.feature_params.target_col))
    train_features = make_features(
        transformer,
        train_df.drop(
            columns=training_pipeline_params.feature_params.target_col))
    logger.info(f"train_features.shape is {train_features.shape}")
    val_target = extract_target(val_df,
                                training_pipeline_params.feature_params)
    val_features = make_features(
        transformer,
        val_df.drop(
            columns=training_pipeline_params.feature_params.target_col))
    logger.info(f"val_features.shape is {val_features.shape}")

    # train and score
    model = train_model(train_features, train_target,
                        training_pipeline_params.train_params)
    predicts = predict_model(model, val_features)
    metrics = evaluate_model(predicts, val_target)
    logger.info(f"metrics: {metrics}")

    # save
    path_to_feature_transformer = serialize_features_transformer(
        transformer, training_pipeline_params.features_transformer_path)
    path_to_model = serialize_model(model, training_pipeline_params.model_path)
    path_to_metrics = serialize_metrics(metrics,
                                        training_pipeline_params.metric_path)
    logger.info("transformer, model and metrics were saved")
    return path_to_feature_transformer, path_to_model, path_to_metrics, metrics
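# serialize_model() and load_model() are used throughout but not shown; a
# minimal sketch, assuming a plain pickle-based implementation. The project's
# actual serialization format may differ.
import pickle


def serialize_model(model, output_path: str) -> str:
    # write the fitted estimator to disk and return the path for logging
    with open(output_path, "wb") as f:
        pickle.dump(model, f)
    return output_path


def load_model(model_path: str):
    # restore a previously serialized estimator
    with open(model_path, "rb") as f:
        return pickle.load(f)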
def main():
    warnings.filterwarnings('ignore')
    params = read_training_pipeline_params(TRAIN_CONFIG_PATH)

    # load data
    data = read_data(params.input_data_path)
    X = FeaturesExtractor(params.feature_params).fit_transform(data)
    y = extract_target(data, params.feature_params)

    # define models and their hyperparameter grids
    logreg = LogisticRegression()
    logreg_grid_params = {
        "fit_intercept": [True, False],
        "max_iter": [100, 500, 1000],
        "C": np.logspace(-2, 1, 30),
    }
    trees = RandomForestClassifier()
    trees_grid_params = {
        "n_estimators": np.linspace(10, 100, 5).astype(int),
        "criterion": ["gini", "entropy"],
        "max_features": ["sqrt", "log2", .5, None],
        "min_samples_leaf": [1, 3, 5],
    }
    knn = KNeighborsClassifier()
    knn_grid_params = {"n_neighbors": [1, 3, 5, 7, 9, 11], "p": [1, 2, 3]}

    # unite model entities
    search_entities = [
        ('logreg', logreg, logreg_grid_params),
        ('trees', trees, trees_grid_params),
        ('knn', knn, knn_grid_params),
    ]

    # run grid search and write best params and scores to file
    best_params_path = PATH_TO_BEST_MODEL_PARAMS
    with open(best_params_path, 'w') as fout:
        fout.write("model_name\tbest_params\taccuracy\n")
        for mname, model, grid_params in search_entities:
            best_params, score = best_model_params(model, grid_params, X, y)
            print(
                f"{mname}, {best_params}, {score}",
                file=sys.stderr,
            )
            fout.write(f"{mname}\t{best_params}\t{score}\n")
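# best_model_params() is called above but not shown; a minimal sketch of what it
# might look like, assuming a scikit-learn GridSearchCV with cross-validated
# accuracy. The cv and scoring settings are assumptions, not taken from the
# original code.
from sklearn.model_selection import GridSearchCV


def best_model_params(model, grid_params: dict, X, y, cv: int = 5):
    """Grid-search grid_params for model and return (best_params, best_score)."""
    search = GridSearchCV(model, grid_params, cv=cv, scoring="accuracy", n_jobs=-1)
    search.fit(X, y)
    return search.best_params_, search.best_score_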
def inference_pipeline(inference_pipeline_params: Params):
    # source data
    logger.info(
        f"start inference pipeline with params {inference_pipeline_params.inference_params}")
    data = read_data(
        inference_pipeline_params.inference_params.source_data_path)
    logger.info(f"data.shape is {data.shape}")

    # features extraction
    transformer = load_transformer(
        inference_pipeline_params.features_transformer_path)
    data_features = make_features(transformer, data)
    logger.info(f"data_features.shape is {data_features.shape}")

    # predict
    model = load_model(inference_pipeline_params.model_path)
    predicts = predict_model(model, data_features)
    logger.info(f"predicts.shape is {predicts.shape}")

    # save
    path_to_predicts = save_predicts(
        data, predicts,
        inference_pipeline_params.inference_params.result_data_path)
    logger.info("predicted data was saved")
    return path_to_predicts, predicts
def predict_pipeline(
    dataset_path: str,
    output_path: str,
    params: TrainingPipelineParams,
):
    try:
        model = load_model(params.output_model_path)
    except Exception as err:
        raise RuntimeError(
            f"Cannot load model from {params.output_model_path}, "
            f"try to train a model with model_type = {params.train_params.model_type}"
        ) from err
    logger.info(f"loaded model from {params.output_model_path} for prediction")
    df = read_data(dataset_path)
    logger.info(f"data read from {dataset_path}")
    logger.debug(f"data size: {df.shape}")
    # note: the feature extractor is re-fit on the prediction data here rather
    # than loaded from the training run
    X = FeaturesExtractor(params.feature_params).fit_transform(df)
    logger.debug(f"features extracted; X size: {X.shape}")
    preds = predict_model(model, X)
    logger.info(f"prediction done; prediction size: {preds.shape}")
    pd.DataFrame(preds, columns=['target']).to_csv(output_path, index=False)
def test_read_data(tmp_path):
    path = tmp_path / "predicts.csv"
    pd.DataFrame([[1, 2], ["a", "b"]]).to_csv(path, index=False)
    data = read_data(path)
    assert data.shape == (2, 2)
def test_read_data(path_to_synthetic_data):
    data = read_data(path_to_synthetic_data)
    assert isinstance(data, pd.DataFrame), 'data is not a pd.DataFrame'
    assert data.shape[0] > 0 and data.shape[1] > 0, 'dataframe is empty'
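# path_to_synthetic_data is a pytest fixture the test above expects from a
# conftest.py; a minimal sketch of what it might look like, assuming the
# synthetic dataset is a small CSV with a couple of feature columns and a
# target column. Column names and sizes are assumptions for illustration.
import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def path_to_synthetic_data(tmp_path):
    rng = np.random.default_rng(42)
    df = pd.DataFrame({
        "feature_1": rng.normal(size=20),
        "feature_2": rng.integers(0, 5, size=20),
        "target": rng.integers(0, 2, size=20),
    })
    path = tmp_path / "synthetic.csv"
    df.to_csv(path, index=False)
    return str(path)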