Example 1
def test_stratified_splitter(test_specs, spark_dataset):
    splits = spark_stratified_split(
        spark_dataset, ratio=test_specs["ratio"], filter_by="user", min_rating=10
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    # Both splits should contain the same set of users, since the split is stratified by user.
    users_train = (
        splits[0].select(DEFAULT_USER_COL).distinct().rdd.map(lambda r: r[0]).collect()
    )
    users_test = (
        splits[1].select(DEFAULT_USER_COL).distinct().rdd.map(lambda r: r[0]).collect()
    )

    assert set(users_train) == set(users_test)

    splits = spark_stratified_split(spark_dataset, ratio=test_specs["ratios"])

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )
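The test above relies on a test_specs fixture (not shown in the excerpt) that supplies the split ratios, the size of the dataset, and a tolerance for pytest.approx. A minimal sketch of such a fixture, with illustrative values only, might look like this:

import pytest

@pytest.fixture(scope="module")
def test_specs():
    # Illustrative values; the real fixture in the source test suite may differ.
    return {
        "number_of_rows": 1000,       # rows in the spark_dataset fixture
        "ratio": 0.6,                 # two-way train/test split ratio
        "ratios": [0.2, 0.3, 0.5],    # three-way split ratios, should sum to 1
        "tolerance": 0.01,            # relative tolerance for pytest.approx
    }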
Example 2
def test_stratified_splitter(spark_dataset):
    splits = spark_stratified_split(spark_dataset,
                                    ratio=RATIOS[0],
                                    filter_by="user",
                                    min_rating=10)

    assert splits[0].count() / NUM_ROWS == pytest.approx(RATIOS[0], TOL)
    assert splits[1].count() / NUM_ROWS == pytest.approx(1 - RATIOS[0], TOL)

    # Both splits should contain the same set of users, since the split is stratified by user.
    users_train = (splits[0].select(DEFAULT_USER_COL).distinct().rdd.map(
        lambda r: r[0]).collect())
    users_test = (splits[1].select(DEFAULT_USER_COL).distinct().rdd.map(
        lambda r: r[0]).collect())

    assert set(users_train) == set(users_test)

    splits = spark_stratified_split(spark_dataset, ratio=RATIOS)

    assert splits[0].count() / NUM_ROWS == pytest.approx(RATIOS[0], TOL)
    assert splits[1].count() / NUM_ROWS == pytest.approx(RATIOS[1], TOL)
    assert splits[2].count() / NUM_ROWS == pytest.approx(RATIOS[2], TOL)
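This variant reads its parameters from module-level constants instead of a fixture. Their definitions are not part of the excerpt; values consistent with how the names are used above could be, for example:

# Assumed definitions, matching how the names are used in the test above.
DEFAULT_USER_COL = "userID"   # user column name; usually imported from the package's constants module
NUM_ROWS = 1000               # rows in the spark_dataset fixture
RATIOS = [0.2, 0.3, 0.5]      # three-way split ratios, should sum to 1
TOL = 0.01                    # relative tolerance for pytest.approx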
Example 3
def test_timestamp_splitter(test_specs, spark_dataset):
    """Test timestamp splitter for Spark dataframes"""
    from pyspark.sql.functions import col

    dfs_rating = spark_dataset
    dfs_rating = dfs_rating.withColumn(DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float"))

    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"]
    )

    # Test the three-way (multi) split with the timestamp splitter
    splits = spark_timestamp_split(dfs_rating, ratio=test_specs["ratios"])

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"]
    )
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"]
    )
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"]
    )

    dfs_train = splits[0]
    dfs_valid = splits[1]
    dfs_test = splits[2]

    # All timestamps in the validation split should be later than those in the training split.
    all_later_1 = _if_later(dfs_train, dfs_valid, col_timestamp=DEFAULT_TIMESTAMP_COL)
    assert all_later_1

    # All timestamps in the test split should be later than those in the validation split.
    all_later_2 = _if_later(dfs_valid, dfs_test, col_timestamp=DEFAULT_TIMESTAMP_COL)
    assert all_later_2
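The _if_later helper used in the last two assertions is not shown in this excerpt. A minimal sketch of the check it performs, comparing the latest timestamp of the first dataframe with the earliest timestamp of the second (the original helper may instead do this comparison per user), could be:

from pyspark.sql import functions as F

def _if_later(df_earlier, df_later, col_timestamp):
    # True if every timestamp in df_earlier is no later than every timestamp in df_later.
    max_earlier = df_earlier.agg(F.max(col_timestamp)).collect()[0][0]
    min_later = df_later.agg(F.min(col_timestamp)).collect()[0][0]
    return max_earlier <= min_later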
Example 4
from reco_utils.dataset.spark_splitters import (
    spark_random_split, 
    spark_chrono_split, 
    spark_stratified_split,
    spark_timestamp_split
)
import pyspark.sql.functions as sql_func
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.spark

training, test = spark_stratified_split(
    ratings, ratio=0.65, filter_by="user",
    col_user='******', col_item='Varenr', seed=42
)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(alpha=10, rank=35, maxIter=15, regParam=0.01, 
          userCol="Medlemsnr_index", itemCol="Varenr", ratingCol="Rating",
          coldStartStrategy="drop",
          implicitPrefs=True, seed=42)
model = als.fit(training)

# Log the trained model with MLflow and also save a copy to DBFS
with mlflow.start_run():
    mlflow.spark.log_model(model, "MyALSModel")
    modelpath = "/dbfs/ml/SparkModel/"
    mlflow.spark.save_model(model, modelpath)
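RegressionEvaluator is imported above but never used in the excerpt. Continuing the example, a sketch of how the trained model might be scored on the held-out split is shown below; the choice of RMSE is an assumption, and since the model is trained with implicitPrefs=True a ranking metric is often more appropriate:

# Score the held-out split; coldStartStrategy="drop" removes rows ALS cannot predict.
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Test RMSE: {:.4f}".format(rmse))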