# Example #1
# 0
def test_load_spark_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
    tmp,
    spark,
):
    """Verify that ``load_spark_df`` loads MovieLens data into a Spark DataFrame.

    Three scenarios are exercised in order:

    1. Both ``header`` and ``schema`` are supplied (a warning is expected and
       the schema determines the columns).
    2. A five-name header plus title/genres/year columns (a warning is
       expected, and only the first four header names are kept).
    3. Default arguments.

    All parameters are supplied by the test harness. ``num_movies`` is
    accepted but not used in this test body.
    """
    # --- Scenario 1: header and schema given together ---
    two_field_schema = StructType(
        [StructField("u", IntegerType()), StructField("m", IntegerType())]
    )
    with pytest.warns(Warning):
        ratings = load_spark_df(
            spark,
            size=size,
            local_cache_path=tmp,
            header=["1", "2", "3"],
            schema=two_field_schema,
        )
        assert ratings.count() == num_samples
        # The schema, not the header, decides how many columns come back.
        assert len(ratings.columns) == len(two_field_schema)
        # The cache directory should hold the raw zip, the rating file and
        # the item file.
        cached_entries = os.listdir(tmp)
        assert len(cached_entries) == 3

    # --- Scenario 2: item features (title, genres, release year) ---
    with pytest.warns(Warning):
        enriched = load_spark_df(
            spark,
            size=size,
            local_cache_path=tmp,
            header=["a", "b", "c", "d", "e"],
            title_col="Title",
            genres_col="Genres",
            year_col="Year",
        )
        assert enriched.count() == num_samples
        # 4 header columns (user, item, rating, timestamp) + 3 feature columns.
        assert len(enriched.columns) == 7
        # Only the first 4 header names are used, so "e" must be absent.
        assert "e" not in enriched.columns

        # Take two records of the same item; their item features must agree
        # with each other and with the expected example values.
        same_item = enriched.filter(col("b") == movie_example).limit(2)
        expectations = (
            ("Title", title_example),
            ("Genres", genres_example),
            ("Year", year_example),
        )
        for feature_name, expected_value in expectations:
            rows = same_item.select(feature_name).collect()
            assert rows[0][0] == rows[1][0]
            assert rows[0][0] == expected_value

    # --- Scenario 3: default arguments ---
    default_df = load_spark_df(spark, size)
    assert default_df.count() == num_samples
    # user, item, rating and timestamp
    assert len(default_df.columns) == 4
def test_load_spark_df(size, num_samples, num_movies, title_example,
                       genres_example):
    """Verify ``movielens.load_spark_df`` against a Spark session.

    Covers the default load, short and over-long headers, title/genres
    feature columns, item-only loading (``header=None``), invalid ``size``
    and ``local_cache_path`` arguments, and the header-plus-schema case.
    """
    session = start_or_get_spark("MovieLensLoaderTesting")

    # Default header: the expected row count and four columns.
    ratings = movielens.load_spark_df(session, size=size)
    assert ratings.count() == num_samples
    assert len(ratings.columns) == 4

    # A header shorter than the data yields exactly that many columns.
    short_header = ["a"]
    ratings = movielens.load_spark_df(session, header=short_header)
    assert len(ratings.columns) == len(short_header)

    # A header with five names triggers a warning and yields four columns.
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        ratings = movielens.load_spark_df(session, header=long_header)
        assert len(ratings.columns) == 4

    # Title and genres loading: each adds one column, and two rows for
    # item id 1 must carry the same, expected feature value.
    feature_cases = (
        ("title_col", "Title", title_example),
        ("genres_col", "Genres", genres_example),
    )
    for keyword, column_name, expected_value in feature_cases:
        featured = movielens.load_spark_df(
            session, size=size, **{keyword: column_name}
        )
        assert len(featured.columns) == 5
        item_rows = featured.filter(col(DEFAULT_ITEM_COL) == 1)
        values = item_rows.select(column_name).limit(2).collect()
        assert values[0][0] == values[1][0]
        assert values[0][0] == expected_value

    # With header=None the movie data (not the rating data) is loaded.
    movies = movielens.load_spark_df(
        session,
        size=size,
        header=None,
        title_col="Title",
        genres_col="Genres",
    )
    assert movies.count() == num_movies
    assert len(movies.columns) == 3

    # An unsupported size must be rejected.
    with pytest.raises(ValueError):
        movielens.load_spark_df(session, size='10k')
    # So must an invalid cache path.
    with pytest.raises(ValueError):
        movielens.load_spark_df(session, local_cache_path='.')

    # When both schema and header are passed, a warning is raised and the
    # schema decides the column count.
    single_field_schema = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        ratings = movielens.load_spark_df(
            session, header=["1", "2"], schema=single_field_schema
        )
        assert len(ratings.columns) == len(single_field_schema)
def test_load_spark_df(
    size,
    num_samples,
    num_movies,
    movie_example,
    title_example,
    genres_example,
    year_example,
    tmp,
):
    """Verify that ``load_spark_df`` loads MovieLens data into a Spark DataFrame.

    A Spark session is obtained via ``start_or_get_spark``; the remaining
    inputs are supplied by the test harness. ``num_movies`` is accepted but
    not used in this test body.
    """
    session = start_or_get_spark("MovieLensLoaderTesting")
    # Arguments shared by the two cached loads below.
    cached_load_args = {"size": size, "local_cache_path": tmp}

    # Header and schema supplied together: a warning is expected and the
    # schema wins.
    uid_mid_schema = StructType(
        [StructField("u", IntegerType()), StructField("m", IntegerType())]
    )
    with pytest.warns(Warning):
        loaded = load_spark_df(
            session,
            header=["1", "2", "3"],
            schema=uid_mid_schema,
            **cached_load_args,
        )
        assert loaded.count() == num_samples
        assert len(loaded.columns) == len(uid_mid_schema)
        # Raw zip file, rating file and item file should all be cached.
        assert len(os.listdir(tmp)) == 3

    # Title, genres and release-year columns requested with a 5-name header.
    expected_features = {
        "Title": title_example,
        "Genres": genres_example,
        "Year": year_example,
    }
    with pytest.warns(Warning):
        loaded = load_spark_df(
            session,
            header=["a", "b", "c", "d", "e"],
            title_col="Title",
            genres_col="Genres",
            year_col="Year",
            **cached_load_args,
        )
        assert loaded.count() == num_samples
        # 4 header columns (user, item, rating, timestamp) + 3 feature columns.
        assert len(loaded.columns) == 7
        # Only the first 4 header names are used.
        assert "e" not in loaded.columns

        # Two records for the same item must share identical item features.
        pair = loaded.filter(col("b") == movie_example).limit(2)
        for column_name, expected in expected_features.items():
            first, second = pair.select(column_name).collect()[:2]
            assert first[0] == second[0]
            assert first[0] == expected

    # Default arguments: only size is passed positionally.
    loaded = load_spark_df(session, size)
    assert loaded.count() == num_samples
    # user, item, rating and timestamp
    assert len(loaded.columns) == 4
# Example #4
# 0
def test_load_spark_df():
    """Verify ``movielens.load_spark_df`` for every dataset size and argument mix.

    Checks row and column counts for each size, rejection of invalid
    ``size`` / ``local_cache_path`` / schema arguments, and the handling of
    headers and schemas that are shorter or longer than four columns.
    """
    session = start_or_get_spark("MovieLensLoaderTesting")

    # Each dataset size must produce its known row count and four columns.
    rows_per_size = (
        ("100k", 100000),
        ("1m", 1000209),
        ("10m", 10000054),
        ("20m", 20000263),
    )
    for dataset_size, expected_rows in rows_per_size:
        loaded = movielens.load_spark_df(session, size=dataset_size)
        assert loaded.count() == expected_rows
        assert len(loaded.columns) == 4

    # Invalid size, then invalid cache path: both must raise ValueError.
    for bad_kwargs in ({"size": '10k'}, {"local_cache_path": '.'}):
        with pytest.raises(ValueError):
            movielens.load_spark_df(session, **bad_kwargs)

    # A three-name header yields three columns.
    short_header = ["a", "b", "c"]
    by_header = movielens.load_spark_df(session, header=short_header)
    assert by_header.count() == 100000
    assert len(by_header.columns) == len(short_header)

    # A five-name header triggers a warning and yields four columns.
    long_header = ["a", "b", "c", "d", "e"]
    with pytest.warns(Warning):
        by_header = movielens.load_spark_df(session, header=long_header)
        assert by_header.count() == 100000
        assert len(by_header.columns) == 4

    # Schemas with field types the loader is expected to reject.
    rejected_schemas = (
        StructType([StructField("u", StringType())]),
        StructType([
            StructField("u", IntegerType()),
            StructField("i", StringType()),
        ]),
        StructType([
            StructField("u", IntegerType()),
            StructField("i", IntegerType()),
            StructField("r", IntegerType()),
        ]),
    )
    for bad_schema in rejected_schemas:
        with pytest.raises(ValueError):
            movielens.load_spark_df(session, schema=bad_schema)

    # A valid three-field schema yields three columns.
    three_fields = StructType([
        StructField("u", IntegerType()),
        StructField("i", IntegerType()),
        StructField("r", FloatType()),
    ])
    by_schema = movielens.load_spark_df(session, schema=three_fields)
    assert by_schema.count() == 100000
    assert len(by_schema.columns) == len(three_fields)

    # A five-field schema triggers a warning and yields four columns.
    five_fields = StructType([
        StructField("u", IntegerType()),
        StructField("i", IntegerType()),
        StructField("r", DoubleType()),
        StructField("a", IntegerType()),
        StructField("b", IntegerType()),
    ])
    with pytest.warns(Warning):
        by_schema = movielens.load_spark_df(session, schema=five_fields)
        assert by_schema.count() == 100000
        assert len(by_schema.columns) == 4

    # When both header and schema are passed, a warning is raised and the
    # schema decides the column count.
    one_field = StructType([StructField("u", IntegerType())])
    with pytest.warns(Warning):
        by_schema = movielens.load_spark_df(session,
                                            header=long_header,
                                            schema=one_field)
        assert by_schema.count() == 100000
        assert len(by_schema.columns) == len(one_field)