Example #1
def train_pipeline(params: PipelineParams):
    logger.info(f"Start train with params {params}.")
    data = read_data(params.train_data_path)
    logger.info(f"Data shape is {data.shape}")
    data_train, data_val = split_train_val_data(data, params.split_params)
    logger.info(f"Train data shape is {data_train.shape}")
    logger.info(f"Validation data shape is {data_val.shape}")
    target_train = extract_target(data_train, params.features_params)
    data_train = data_train.drop(columns=['target'])
    transformer = build_transformer(params.features_params)
    transformer.fit(data_train)
    features_train = make_features(transformer, data_train)
    logger.info(f"Train features shape is {features_train.shape}")
    target_val = extract_target(data_val, params.features_params)
    data_val = data_val.drop(columns=['target'])
    features_val = make_features(transformer, data_val)
    logger.info(f"Validation features shape is {features_val.shape}")

    model = train_model(features_train, target_train, params.train_params)
    predicts = predict_model(model, features_val)
    metrics = evaluate_model(predicts, target_val)
    with open(params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"Metrics are: {metrics}")
    path_to_model = dump_model(model, params.model_path)
    logger.info(f"Model saved at {params.model_path}")
    with open(params.transformer_path, "wb") as tr:
        pickle.dump(transformer, tr)
    logger.info(f"Feature transformer saved at {params.transformer_path}")
    logger.info("Finished.")
    return path_to_model, metrics
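For reference, the PipelineParams container this example (and Example #7 below) reads from is not shown. A hedged sketch, assuming a plain dataclass whose field names mirror the attributes accessed in the snippets (the types are guesses):

from dataclasses import dataclass

# Hedged sketch of the assumed parameter container; field names are taken
# from the attribute accesses in Examples #1 and #7, types are assumptions.
@dataclass
class PipelineParams:
    train_data_path: str
    data_for_pred_path: str   # used by the predict pipeline in Example #7
    split_params: dict        # train/val split settings (type assumed)
    features_params: dict     # feature/target column settings (type assumed)
    train_params: dict        # model hyperparameters (type assumed)
    metric_path: str
    model_path: str
    transformer_path: str
    predictions_path: str     # used by the predict pipeline in Example #7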
Example #2
def train_pipeline(training_pipeline_params: TrainingPipelineParams, model: SklearnClassifierModel):
    logger.info(f"start train pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")
    train_df, val_df = split_train_val_data(
        data, training_pipeline_params.splitting_params
    )
    logger.info(f"train_df.shape is {train_df.shape}")
    logger.info(f"val_df.shape is {val_df.shape}")

    if train_df.shape[0] < NOT_ENOUGH_DATA_THRESHOLD:
        msg = "No enough data to build good model"
        logger.warning(msg)
        warning_logger.warning(msg)

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(train_df)
    train_features = make_features(transformer, train_df)
    train_target = extract_target(train_df, training_pipeline_params.feature_params)

    logger.info(f"train_features.shape is {train_features.shape}")

    model = train_model(
        train_features, train_target, model
    )

    val_features = make_features(transformer, val_df)
    val_target = extract_target(val_df, training_pipeline_params.feature_params)

    logger.info(f"val_features.shape is {val_features.shape}")
    predicts = predict_model(
        model,
        val_features,
        training_pipeline_params.feature_params.use_log_trick,
    )

    metrics = evaluate_model(
        predicts,
        val_target,
        use_log_trick=training_pipeline_params.feature_params.use_log_trick,
    )

    with open(training_pipeline_params.metric_path, "w") as metric_file:
        json.dump(metrics, metric_file)
    logger.info(f"metrics is {metrics}")

    path_to_model = serialize_model(model, training_pipeline_params.output_model_path)

    return path_to_model, metrics
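Module-level names such as logger, warning_logger, and NOT_ENOUGH_DATA_THRESHOLD are referenced but not defined in this snippet. A minimal sketch of what that setup might look like (the threshold value and handler wiring are assumptions):

import logging

# Assumed module-level setup; the identifiers match the snippet above,
# the concrete values are hypothetical.
NOT_ENOUGH_DATA_THRESHOLD = 100  # hypothetical minimum number of training rows

logger = logging.getLogger(__name__)
warning_logger = logging.getLogger("warnings")  # separate channel for data-quality warnings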
Example #3
def predict_pipeline(training_pipeline_params):
    with open(training_pipeline_params.pretrained_model_path, 'rb') as f:
        model = pickle.load(f)
    logger.info(f"pretrained model {model} loaded")

    logger.info(
        f"start predict pipeline with params {training_pipeline_params}")
    data = read_data(training_pipeline_params.input_data_path)
    logger.info(f"data.shape is {data.shape}")
    data = drop_columns(data, training_pipeline_params.feature_params)
    logger.info(f"data.shape after dropping some columns is {data.shape}")

    transformer = build_transformer(training_pipeline_params.feature_params)
    transformer.fit(data)
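    # NB: the transformer here is rebuilt and refitted on the prediction data
    # rather than loaded from training, so feature encodings may not match
    # what the model saw when it was fitted (compare Examples #4 and #7).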
    pred_features = make_features(transformer, data)

    predicts = predict_model(
        model,
        pred_features,
        training_pipeline_params.feature_params.use_log_trick,
    )
    predictions_path = training_pipeline_params.predictions_path
    pd.DataFrame(predicts, columns=['predictions']).to_csv(
        predictions_path, index=False, mode='w'
    )
    logger.info(f"predictions are written to {predictions_path}")

    return predicts
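A safer variant, and the approach Examples #4 and #7 below take, is to unpickle the transformer that was fitted at training time instead of refitting it on the prediction data. A hedged sketch (the transformer_path attribute is an assumption for this params object):

import pickle

# Load the transformer persisted by the training pipeline; the attribute
# name transformer_path is assumed, matching Example #1's PipelineParams.
with open(training_pipeline_params.transformer_path, "rb") as f:
    transformer = pickle.load(f)
pred_features = make_features(transformer, data)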
Example #4
def predict_pipeline(params: PredictionPipelineParams) -> pd.DataFrame:
    logger.info(f"start predict pipeline")

    logger.info(f"open data")
    df = read_data(params.input_data_predict_path)
    logger.debug(f"data shape: {df.shape}")

    logger.info(f"load model")
    with open(params.output_model_path, "rb") as f:
        model = pickle.load(f)

    logger.info(f"load transformer")
    with open(params.output_transformer_path, "rb") as f:
        transformer = pickle.load(f)

    logger.info(f"create features")
    transformed_df = make_features(transformer, df.drop(columns=['target']))

    logger.info(f"prediction")
    predicts = model.predict(transformed_df)

    logger.info(f"save predictions")
    pd.DataFrame(predicts,
                 columns=["target"]).to_csv(params.output_data_predict_path,
                                            index=False)

    logger.info(f"predict pipeline is finished")
    return pd.DataFrame(predicts, columns=["target"])
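A hypothetical invocation of this pipeline; how PredictionPipelineParams is constructed (dataclass, YAML config, etc.) is not shown in the source, so the paths below are made up:

# Hypothetical usage; field names match the attributes the function accesses.
params = PredictionPipelineParams(
    input_data_predict_path="data/raw/predict.csv",
    output_model_path="models/model.pkl",
    output_transformer_path="models/transformer.pkl",
    output_data_predict_path="data/predictions.csv",
)
predictions = predict_pipeline(params)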
Example #5
def train_pipeline(params: TrainingPipelineParams) -> float:
    logger.info(f"start train pipeline")

    df = read_data(params.input_data_path)
    logger.info(f"load data, shape: {df.shape}")

    logger.info(f"train/test spit")
    train_df, test_df = split_train_val_data(df, params.split_params)
    logger.debug(f"train shape: {train_df.shape}")
    logger.debug(f"test shape: {test_df.shape}")

    logger.info(f"feature engineering")
    transformer = build_transformer(params.feature_params)
    transformer.fit(train_df.drop(columns=['target']))

    logger.info(f"create train features and target")
    train_features = make_features(transformer,
                                   train_df.drop(columns=['target']))
    train_target = extract_target(train_df, params.feature_params)

    logger.info(f"fit model")
    model = Classifier(params.model_params)
    model.fit(train_features, train_target)
    logger.info(f"model is fitted")

    logger.info(f"create test features and target")
    test_features = make_features(transformer,
                                  test_df.drop(columns=['target']))
    test_target = extract_target(test_df, params.feature_params)

    logger.info(f"made predictions")
    pred = model.predict(test_features)

    score = get_score(test_target, pred)
    logger.debug(f"ROC-AUC: {score}")

    logger.info(f"save model")
    model.dump(params.output_model_path)

    logger.info(f"save transformer")
    with open(params.output_transformer_path, "wb") as f:
        pickle.dump(transformer, f)

    logger.info(f"train pipeline is finished")
    return score
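The Classifier wrapper used above is not shown. A minimal sketch consistent with the fit/predict/dump calls in the snippet, assuming a scikit-learn estimator underneath (the choice of LogisticRegression and the shape of model_params are assumptions):

import pickle
from sklearn.linear_model import LogisticRegression

class Classifier:
    # Hedged sketch: only the methods the snippet calls are implemented.
    def __init__(self, model_params: dict):
        self.model = LogisticRegression(**model_params)

    def fit(self, features, target):
        self.model.fit(features, target)
        return self

    def predict(self, features):
        return self.model.predict(features)

    def dump(self, path: str) -> None:
        # Persist the fitted estimator with pickle, mirroring how the
        # snippets above save their transformers.
        with open(path, "wb") as f:
            pickle.dump(self.model, f)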
Example #6
    def predict_pipeline(self, data: pd.DataFrame) -> np.ndarray:
        logger.info(f"Start prediction.")

        train_features = make_features(self.pipeline, data)
        logger.info(f"Test features shape: {train_features.shape}")

        predictions = predict_model(train_features, self.model)
        logger.info(f"Prediction done")
        return predictions
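This predict_pipeline is a method, and its enclosing class is not shown. A sketch of the minimum it must hold, with the class name and constructor entirely hypothetical:

class InferencePipeline:  # hypothetical name; the real class is not shown
    def __init__(self, pipeline, model):
        self.pipeline = pipeline  # fitted feature transformer passed to make_features
        self.model = model        # trained model consumed by predict_model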
Example #7
def predict_pipeline(params: PipelineParams):
    logger.info(f"Start predict pipeline with params {params}")
    data = pd.read_csv(params.data_for_pred_path)
    logger.info(f"Data shape is {data.shape}")
    with open(params.model_path, 'rb') as m:
        model = pickle.load(m)
    logger.info(f"Model {model} loaded.")
    with open(params.transformer_path, 'rb') as t:
        transformer = pickle.load(t)
    logger.info("Transformer loaded.")
    features = make_features(transformer, data)
    logger.info(f"Features shape is {features.shape}")
    predictions = predict_model(model, features)
    logger.info(f"Predictions shape is {predictions.shape}")
    data["pred_target"] = predictions
    logger.info(f"Predictions saved to {params.predictions_path}")
    data.to_csv(params.predictions_path)
    logger.info("Finished.")