def spark_dataset(python_data):
    """Get Python labels"""
    rating = python_data
    spark = start_or_get_spark("SplitterTesting")
    df_rating = spark.createDataFrame(rating)

    return df_rating
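This fixture receives a `python_data` fixture defined elsewhere in the suite; a minimal sketch of what it might provide, assuming simple (user, item, rating) tuples (names and values here are illustrative, not taken from the source):

import pytest


@pytest.fixture(scope="module")
def python_data():
    # Hypothetical rating triples (user, item, rating); the real
    # fixture's shape and values are not shown in the source.
    return [
        (1, 101, 5.0),
        (1, 102, 3.5),
        (2, 101, 4.0),
        (2, 103, 2.0),
    ]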
Example #2
def spark(tmp_path_factory, app_name="Sample", url="local[*]"):
    """Start Spark if not started.

    Other Spark settings which you might find useful:
        .config("spark.executor.cores", "4")
        .config("spark.executor.memory", "2g")
        .config("spark.memory.fraction", "0.9")
        .config("spark.memory.stageFraction", "0.3")
        .config("spark.executor.instances", 1)
        .config("spark.executor.heartbeatInterval", "36000s")
        .config("spark.network.timeout", "10000000s")

    Args:
        app_name (str): sets name of the application
        url (str): url for spark master

    Returns:
        SparkSession: new Spark session
    """

    with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td:
        config = {"spark.local.dir": td, "spark.sql.shuffle.partitions": 1}
        spark = start_or_get_spark(app_name=app_name, url=url, config=config)
        yield spark
        spark.stop()
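A test consuming this fixture might look like the following sketch; the test body is illustrative and not taken from the source:

def test_spark_fixture_smoke(spark):
    # The fixture yields a live SparkSession configured with a
    # temporary spark.local.dir and a single shuffle partition.
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
    assert df.count() == 2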
Example #3
def prepare_metrics_als(train, test):
    """Prepare an ALS test split as a Spark DataFrame with an explicit schema."""
    schema = StructType([
        StructField(DEFAULT_USER_COL, IntegerType()),
        StructField(DEFAULT_ITEM_COL, IntegerType()),
        StructField(DEFAULT_RATING_COL, FloatType()),
        StructField(DEFAULT_TIMESTAMP_COL, LongType()),
    ])
    spark = start_or_get_spark()
    return prepare_training_als(train), spark.createDataFrame(test, schema)
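`prepare_metrics_als` relies on schema types and default column-name constants imported from elsewhere in the repository; a plausible set of definitions (hedged: the actual constant strings may differ) would be:

from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    FloatType,
    LongType,
)

# Assumed default column names; in the source these live in a shared
# constants module, and the actual strings may differ.
DEFAULT_USER_COL = "userID"
DEFAULT_ITEM_COL = "itemID"
DEFAULT_RATING_COL = "rating"
DEFAULT_TIMESTAMP_COL = "timestamp"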
Example #4
def test_als_pyspark_integration(notebooks):
    """Integration test: run the ALS PySpark notebook on MovieLens 1M and check the reported metrics."""
    notebook_path = notebooks["als_pyspark"]
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="1m"),
    )
    nb = pm.read_notebook(OUTPUT_NOTEBOOK)
    results = nb.dataframe.set_index("name")["value"]
    start_or_get_spark("ALS PySpark").stop()

    assert results["map"] == pytest.approx(0.00201, rel=TOL, abs=ABS_TOL)
    assert results["ndcg"] == pytest.approx(0.02516, rel=TOL, abs=ABS_TOL)
    assert results["precision"] == pytest.approx(0.03172, rel=TOL, abs=ABS_TOL)
    assert results["recall"] == pytest.approx(0.009302, rel=TOL, abs=ABS_TOL)
    assert results["rmse"] == pytest.approx(0.8621, rel=TOL, abs=ABS_TOL)
    assert results["mae"] == pytest.approx(0.68023, rel=TOL, abs=ABS_TOL)
    assert results["exp_var"] == pytest.approx(0.4094, rel=TOL, abs=ABS_TOL)
    assert results["rsquared"] == pytest.approx(0.4038, rel=TOL, abs=ABS_TOL)
Example #5
def test_als_pyspark_smoke(notebooks):
    """Smoke test: run the ALS PySpark notebook on MovieLens 100k and check the reported metrics."""
    notebook_path = notebooks["als_pyspark"]
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
        parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE="100k"),
    )
    nb = pm.read_notebook(OUTPUT_NOTEBOOK)
    results = nb.dataframe.set_index("name")["value"]
    start_or_get_spark("ALS PySpark").stop()

    assert results["map"] == pytest.approx(0.0052, rel=TOL, abs=ABS_TOL)
    assert results["ndcg"] == pytest.approx(0.0463, rel=TOL, abs=ABS_TOL)
    assert results["precision"] == pytest.approx(0.0487, rel=TOL, abs=ABS_TOL)
    assert results["recall"] == pytest.approx(0.0177, rel=TOL, abs=ABS_TOL)
    assert results["rmse"] == pytest.approx(0.9636, rel=TOL, abs=ABS_TOL)
    assert results["mae"] == pytest.approx(0.7508, rel=TOL, abs=ABS_TOL)
    assert results["exp_var"] == pytest.approx(0.2672, rel=TOL, abs=ABS_TOL)
    assert results["rsquared"] == pytest.approx(0.2611, rel=TOL, abs=ABS_TOL)
Example #8
def spark_dataset(python_data):
    """Create a Spark DataFrame from the Python data fixture."""
    spark = start_or_get_spark("SplitterTesting")
    return spark.createDataFrame(python_data)
Example #9
def test_load_spark_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
    tmp,
):
    """Test MovieLens dataset load into pySpark.DataFrame
    """
    spark = start_or_get_spark("MovieLensLoaderTesting")

    # Test that the correct data are loaded
    header = ["1", "2", "3"]
    schema = StructType(
        [
            StructField("u", IntegerType()),
            StructField("m", IntegerType()),
        ]
    )
    with pytest.warns(Warning):
        df = load_spark_df(
            spark, size=size, local_cache_path=tmp, header=header, schema=schema
        )
        assert df.count() == num_samples
        # Test if schema is used when both schema and header are provided
        assert len(df.columns) == len(schema)
        # Test if raw-zip file, rating file, and item file are cached
        assert len(os.listdir(tmp)) == 3

    # Test title, genres, and release year load
    header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        df = load_spark_df(
            spark,
            size=size,
            local_cache_path=tmp,
            header=header,
            title_col="Title",
            genres_col="Genres",
            year_col="Year",
        )
        assert df.count() == num_samples
        assert (
            len(df.columns) == 7
        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
        assert "e" not in df.columns  # only the first 4 header columns are used
        # Get two records of the same items and check if the item-features are the same.
        head = df.filter(col("b") == movie_example).limit(2)
        title = head.select("Title").collect()
        assert title[0][0] == title[1][0]
        assert title[0][0] == title_example
        genres = head.select("Genres").collect()
        assert genres[0][0] == genres[1][0]
        assert genres[0][0] == genres_example
        year = head.select("Year").collect()
        assert year[0][0] == year[1][0]
        assert year[0][0] == year_example

    # Test default arguments
    df = load_spark_df(spark, size)
    assert df.count() == num_samples
    # user, item, rating and timestamp
    assert len(df.columns) == 4
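Example #9 also depends on a `tmp` fixture supplying a scratch cache directory; a minimal sketch, assuming a plain temporary directory (the real fixture may differ):

import pytest
from tempfile import TemporaryDirectory


@pytest.fixture
def tmp():
    # Throwaway directory for caching the downloaded MovieLens files;
    # removed automatically when the test finishes.
    with TemporaryDirectory() as td:
        yield td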
Example #10
def test_load_spark_df(size, num_samples, num_movies, title_example,
                       genres_example):
    """Test MovieLens dataset load into pySpark.DataFrame
    """
    spark = start_or_get_spark("MovieLensLoaderTesting")

    # Check that the function loads the correct dataset
    df = movielens.load_spark_df(spark, size=size)
    assert df.count() == num_samples
    assert len(df.columns) == 4

    # Test that headers of different lengths are handled
    header = ["a"]
    df = movielens.load_spark_df(spark, header=header)
    assert len(df.columns) == len(header)

    header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header)
        assert len(df.columns) == 4

    # Test title load
    df = movielens.load_spark_df(spark, size=size, title_col="Title")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    title = df.filter(
        col(DEFAULT_ITEM_COL) == 1).select("Title").limit(2).collect()
    assert title[0][0] == title[1][0]
    assert title[0][0] == title_example

    # Test genres load
    df = movielens.load_spark_df(spark, size=size, genres_col="Genres")
    assert len(df.columns) == 5
    # Movie 1 is Toy Story
    genres = df.filter(
        col(DEFAULT_ITEM_COL) == 1).select("Genres").limit(2).collect()
    assert genres[0][0] == genres[1][0]
    assert genres[0][0] == genres_example

    # Test movie data load (not rating data)
    df = movielens.load_spark_df(spark,
                                 size=size,
                                 header=None,
                                 title_col="Title",
                                 genres_col="Genres")
    assert df.count() == num_movies
    assert len(df.columns) == 3

    # Test that a wrong size argument raises an error
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, size='10k')
    # Test that a wrong cache path argument raises an error
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, local_cache_path='.')

    # Test that the schema takes precedence when both schema and header are provided
    header = ["1", "2"]
    schema = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        df = movielens.load_spark_df(spark, header=header, schema=schema)
        assert len(df.columns) == len(schema)
Example #11
def test_load_spark_df():
    """Test MovieLens dataset load into pySpark.DataFrame
    """
    spark = start_or_get_spark("MovieLensLoaderTesting")

    # Check that the function loads the correct dataset
    size_100k = movielens.load_spark_df(spark, size="100k")
    assert size_100k.count() == 100000
    assert len(size_100k.columns) == 4
    size_1m = movielens.load_spark_df(spark, size="1m")
    assert size_1m.count() == 1000209
    assert len(size_1m.columns) == 4
    size_10m = movielens.load_spark_df(spark, size="10m")
    assert size_10m.count() == 10000054
    assert len(size_10m.columns) == 4
    size_20m = movielens.load_spark_df(spark, size="20m")
    assert size_20m.count() == 20000263
    assert len(size_20m.columns) == 4

    # Test that a wrong size argument raises an error
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, size='10k')
    # Test that a wrong cache path argument raises an error
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, local_cache_path='.')

    # Test that headers of different lengths are handled
    header = ["a", "b", "c"]
    with_header = movielens.load_spark_df(spark, header=header)
    assert with_header.count() == 100000
    assert len(with_header.columns) == len(header)

    header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        with_header = movielens.load_spark_df(spark, header=header)
        assert with_header.count() == 100000
        assert len(with_header.columns) == 4

    # Test that wrong column types raise an exception
    schema = StructType([StructField("u", StringType())])
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, schema=schema)
    schema = StructType(
        [StructField("u", IntegerType()),
         StructField("i", StringType())])
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, schema=schema)
    schema = StructType([
        StructField("u", IntegerType()),
        StructField("i", IntegerType()),
        StructField("r", IntegerType()),
    ])
    with pytest.raises(ValueError):
        movielens.load_spark_df(spark, schema=schema)

    # Test that schemas with different numbers of fields are handled
    schema = StructType([
        StructField("u", IntegerType()),
        StructField("i", IntegerType()),
        StructField("r", FloatType()),
    ])
    with_schema = movielens.load_spark_df(spark, schema=schema)
    assert with_schema.count() == 100000
    assert len(with_schema.columns) == len(schema)
    schema = StructType([
        StructField("u", IntegerType()),
        StructField("i", IntegerType()),
        StructField("r", DoubleType()),
        StructField("a", IntegerType()),
        StructField("b", IntegerType()),
    ])
    with pytest.warns(Warning):
        with_schema = movielens.load_spark_df(spark, schema=schema)
        assert with_schema.count() == 100000
        assert len(with_schema.columns) == 4

    # Test that the schema takes precedence when both schema and header are provided
    schema = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        with_schema = movielens.load_spark_df(spark,
                                              header=header,
                                              schema=schema)
        assert with_schema.count() == 100000
        assert len(with_schema.columns) == len(schema)
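Outside the test suite, the same loader would typically be driven like this (a hedged sketch; the module paths are assumed and may differ from the actual package layout):

from reco_utils.dataset import movielens  # assumed module path
from reco_utils.common.spark_utils import start_or_get_spark  # assumed module path

spark = start_or_get_spark("MovieLensDemo")
# Load the 100k ratings split with the default four columns:
# user, item, rating, and timestamp.
ratings = movielens.load_spark_df(spark, size="100k")
ratings.show(5)
spark.stop()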