def making_data(data_path, target):
    data = read_data(data_path)
    X, y = extract_target(data, target)
    cat_feats, num_feats = get_features_labels(X)
    # add small Gaussian noise to the numeric features to build a synthetic dataset
    mu, sigma = 0, 0.1
    noise = np.random.normal(mu, sigma, X[num_feats].shape)
    num_feats_new = X[num_feats] + noise
    data_fake = pd.concat([num_feats_new, X[cat_feats], y], axis=1)
    return data_fake
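# A minimal usage sketch (assumption, not part of the repo): making_data can be used
# to materialize a synthetic copy of the source dataset on disk so that the tests
# below can point at it. The helper name and output path here are hypothetical.
def generate_fake_dataset(data_path: str, target: str, out_path: str) -> str:
    """Write a noised copy of the source dataset and return its path."""
    data_fake = making_data(data_path, target)
    data_fake.to_csv(out_path, index=False)
    return out_path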
def test_train_model(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train, training_pipeline_params.train_params)
    assert isinstance(model, LogisticRegression)
def test_split_train_val_data(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X, y, training_pipeline_params.splitting_params)
    assert len(X_train) > 0
    assert len(X_test) > 0
    assert len(y_train) > 0
    assert len(y_test) > 0
def test_predict_model(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train, training_pipeline_params.train_params)
    pred_labels, pred_proba = predict_model(model, X_test)
    assert len(set(pred_labels)) == 2
    assert max(pred_proba) < 1
def test_train_pipeline(dataset_path: str, target_name: str, conf_path: str):
    training_pipeline_params = read_training_pipeline_params(conf_path)
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train, training_pipeline_params.train_params)
    pred_labels, pred_proba = predict_model(model, X_test)
    res = evaluate_model(y_test, pred_labels, pred_proba)
    assert res['accuracy'] > 0
    assert res['roc_auc_score'] > 0.5
def predict_pipeline_run(predict_pipeline_params):
    logger.info("Start predict pipeline")
    data = read_data(predict_pipeline_params.input_data_path)
    # the input data still contains the target column; drop it so prediction
    # runs on features only
    data, y = extract_target(data, 'target')
    data_transformed = full_transform(data)
    logger.info(f"Transformed data shape is {data_transformed.shape}")
    model = load_model(predict_pipeline_params.dump_model)
    pred_labels, pred_proba = predict_model(model, data_transformed)
    predictions = pd.Series(pred_labels, index=data_transformed.index, name="prediction")
    predictions.to_csv(predict_pipeline_params.result_path)
    logger.info(f"Results written to {predict_pipeline_params.result_path}")
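# A sketch of the parameter container predict_pipeline_run appears to expect,
# inferred from the attributes accessed above (input_data_path, dump_model,
# result_path). The actual class in the repo may differ; this is an assumption.
from dataclasses import dataclass

@dataclass
class PredictPipelineParams:
    input_data_path: str   # CSV with features; the target column is dropped before predicting
    dump_model: str        # path to the serialized model to load
    result_path: str       # where the prediction CSV is written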
def train_pipeline_run(training_pipeline_params):
    logger.info("Start training pipeline")
    data = read_data(training_pipeline_params.input_data_path)
    X, y = extract_target(data, training_pipeline_params.target_name)
    logger.info(f"X shape is {X.shape}, y shape is {y.shape}")
    X_transformed = full_transform(X)
    X_train, X_test, y_train, y_test = split_train_val_data(
        X_transformed, y, training_pipeline_params.splitting_params)
    model = train_model(X_train, y_train, training_pipeline_params.train_params)
    dump_model(training_pipeline_params.dump_model, model)
    logger.info("Model fitted and dumped")
    pred_labels, pred_proba = predict_model(model, X_test)
    res = evaluate_model(y_test, pred_labels, pred_proba)
    logger.info(f"Metrics are {res}")
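# A minimal entry-point sketch (assumption, not part of the repo): how
# train_pipeline_run could be invoked from the command line. The config reader
# name comes from the tests above; the argument name and flag are hypothetical.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the training pipeline")
    parser.add_argument("--config", required=True, help="path to the training pipeline config")
    args = parser.parse_args()
    params = read_training_pipeline_params(args.config)
    train_pipeline_run(params)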
def test_get_features_labels(dataset_path: str, target_name: str):
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    cat_cols, num_cols = get_features_labels(X)
    assert len(num_cols) > 0
def test_full_transform(dataset_path: str, target_name: str):
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    X_transformed = full_transform(X)
    # after the full transform every column should be roughly centered and scaled
    assert max(X_transformed.describe().T['std']) < 2
    assert max(abs(X_transformed.describe().T['mean'])) < 1
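# A sketch of what full_transform is assumed to do, consistent with the
# assertions above (numeric columns end up roughly zero-mean and unit-variance,
# and the result keeps a DataFrame index). The real implementation may differ.
from sklearn.preprocessing import StandardScaler

def full_transform_sketch(X: pd.DataFrame) -> pd.DataFrame:
    cat_feats, num_feats = get_features_labels(X)
    X_num = pd.DataFrame(StandardScaler().fit_transform(X[num_feats]),
                         columns=num_feats, index=X.index)
    X_cat = pd.get_dummies(X[cat_feats], columns=cat_feats)
    return pd.concat([X_num, X_cat], axis=1)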
def test_extract_target(dataset_path: str, target_name: str):
    data = read_data(dataset_path)
    X, y = extract_target(data, target_name)
    assert len(X) > 0
    assert len(y) > 0
def test_read_data(dataset_path: str):
    data = read_data(dataset_path)
    assert len(data) > 0
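# The tests above receive dataset_path, target_name and conf_path as pytest
# fixtures. A minimal conftest.py sketch follows; the returned values are
# placeholders, the real paths and names live in the repo's own conftest.
import pytest

@pytest.fixture()
def dataset_path() -> str:
    return "tests/test_data.csv"        # hypothetical location of the test dataset

@pytest.fixture()
def target_name() -> str:
    return "target"                     # matches the column name used in predict_pipeline_run

@pytest.fixture()
def conf_path() -> str:
    return "configs/train_config.yaml"  # hypothetical training pipeline config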