Exemple #1
0
def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as pySpark.DataFrame.

    The dataset consists of a portion of Criteo’s traffic over a period
    of 24 days. Each row corresponds to a display ad served by Criteo and the first
    column indicates whether this ad has been clicked or not.

    There are 13 features taking integer values (mostly count features) and 26
    categorical features. The values of the categorical features have been hashed
    onto 32 bits for anonymization purposes.

    The schema is:
    <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        header (list): Dataset header names.
        local_cache_path (str): Path where to cache the tar.gz file locally.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object. Required when
            running on Databricks.

    Returns:
        pySpark.DataFrame: Criteo DAC training dataset.

    Raises:
        ValueError: If running on Databricks and ``dbutils`` is None.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    "To use on a Databricks notebook, dbutils object should be passed as an argument"
                )
            # Driver node's file path
            node_path = "file:" + filepath
            # The data needs to be on DBFS to load. Copy failures are left to
            # propagate so that the real cause is not masked.
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        else:
            path = filepath

        schema = _get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        # Trigger execution to overcome spark's lazy evaluation
        df.cache().count()
    return df
Exemple #2
0
def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as pySpark.DataFrame.

    The dataset consists of a portion of Criteo’s traffic over a period
    of 24 days. Each row corresponds to a display ad served by Criteo and the first
    column indicates whether this ad has been clicked or not.

    There are 13 features taking integer values (mostly count features) and 26
    categorical features. The values of the categorical features have been hashed
    onto 32 bits for anonymization purposes.

    The schema is:
    <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        header (list): Dataset header names.
        local_cache_path (str): Path where to cache the tar.gz file locally.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object. Required when
            running on Databricks.

    Returns:
        pySpark.DataFrame: Criteo DAC training dataset.

    Raises:
        ValueError: If running on Databricks and ``dbutils`` is None.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    "To use on a Databricks notebook, dbutils object should be passed as an argument"
                )
            # Driver node's file path
            node_path = "file:" + filepath
            # The data needs to be on DBFS to load. Copy failures are left to
            # propagate so that the real cause is not masked.
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        else:
            path = filepath

        schema = get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        # Trigger execution to overcome spark's lazy evaluation
        df.cache().count()
    return df
Exemple #3
0
def test_is_jupyter():
    """Check is_jupyter / is_databricks on the terminal and inside an executed notebook."""
    # On the terminal neither environment should be detected.
    assert is_jupyter() is False
    assert is_databricks() is False

    # Execute the helper notebook and inspect the dataframe read back from its output.
    notebook_path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
    pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
    recorded = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe

    jupyter_rows = recorded["name"] == "is_jupyter"
    assert recorded.loc[jupyter_rows, "value"].values[0] is True

    databricks_rows = recorded["name"] == "is_databricks"
    assert recorded.loc[databricks_rows, "value"].values[0] is False
def test_is_jupyter():
    """Verify environment detection both outside and inside a Jupyter notebook run."""
    # Terminal run: both detectors must report False.
    assert is_jupyter() is False
    assert is_databricks() is False

    # Notebook run: execute the notebook, then look up the values in its output.
    pm.execute_notebook(
        os.path.join("tests", "unit", "test_notebook_utils.ipynb"),
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
    )
    results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe

    def lookup(key):
        # First "value" entry among the rows whose "name" equals `key`.
        return results.loc[results["name"] == key, "value"].values[0]

    assert lookup("is_jupyter") is True
    assert lookup("is_databricks") is False
Exemple #5
0
def test_is_jupyter(output_notebook, kernel_name):
    """Check is_jupyter / is_databricks on the terminal and inside an executed notebook.

    Args:
        output_notebook: Destination passed to the notebook execution and read back afterwards.
        kernel_name: Kernel name used to execute the notebook.
    """
    # On the terminal neither environment should be detected.
    assert is_jupyter() is False
    assert is_databricks() is False

    # The notebook lives next to this test file.
    notebook = Path(__file__).absolute().parent / "test_notebook_utils.ipynb"
    pm.execute_notebook(notebook, output_notebook, kernel_name=kernel_name)

    scraps = sb.read_notebook(output_notebook).scraps.dataframe
    names = scraps["name"]

    jupyter_flag = scraps.loc[names == "is_jupyter", "data"].values[0]
    # Compared with ==, not identity: the original notes `is True` is not allowed here.
    assert jupyter_flag == True

    databricks_flag = scraps.loc[names == "is_databricks", "data"].values[0]
    assert databricks_flag == False
Exemple #6
0
def test_is_jupyter():
    """Test if the module is running on Jupyter, from the terminal and from a notebook."""
    # Terminal run: both detectors must report False.
    assert is_jupyter() is False
    assert is_databricks() is False

    # Notebook run: execute the notebook and read its output dataframe.
    pm.execute_notebook("test_notebook_utils.ipynb", OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
    output_df = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe

    def first_value(name):
        # First "value" entry among the rows whose "name" equals `name`.
        matches = output_df["name"] == name
        return output_df.loc[matches, "value"].values[0]

    assert first_value("is_jupyter") is True
    assert first_value("is_databricks") is False
Exemple #7
0
def load_spark_df(
    spark, size="100k", header=None, schema=None, local_cache_path="ml.zip", dbutils=None
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load

    Args:
        spark (pySpark.SparkSession)
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m")
        header (list): Dataset header. If both schema and header is None,
            use ["UserId", "MovieId", "Rating", "Timestamp"] by default. If both header and schema are provided,
            the header argument will be ignored. Only the first four names are used.
        schema (pySpark.StructType): Dataset schema. If None, use
            StructType(
                [
                    StructField("UserId", IntegerType()),
                    StructField("MovieId", IntegerType()),
                    StructField("Rating", FloatType()),
                    StructField("Timestamp", LongType()),
                ]
            )
            Only the first four fields are used.
        local_cache_path (str): Path where to cache the zip file locally
        dbutils (Databricks.dbutils): Databricks utility object. Required when
            running on Databricks.

    Returns:
        pySpark.DataFrame: Dataset

    Raises:
        ValueError: If a provided schema has a non-integer user or movie ID field,
            or a rating field that is neither float nor double; or if running on
            Databricks and ``dbutils`` is None.
    """
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            header = ["UserId", "MovieId", "Rating", "Timestamp"]
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]

        # zip() stops at the shorter input, so a header with fewer than four
        # names produces a schema containing only the leading columns.
        column_types = (IntegerType(), IntegerType(), FloatType(), LongType())
        schema = StructType()
        for column_name, column_type in zip(header, column_types):
            schema.add(StructField(column_name, column_type))
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)

        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
        try:
            # User and movie IDs should be int type
            if not isinstance(schema[0].dataType, IntegerType):
                raise ValueError(ERROR_USER_ID_TYPE)
            if not isinstance(schema[1].dataType, IntegerType):
                raise ValueError(ERROR_MOVIE_ID_TYPE)
            # Ratings should be float type
            if not isinstance(schema[2].dataType, (FloatType, DoubleType)):
                raise ValueError(ERROR_RATING_TYPE)
        except IndexError:
            # The schema has fewer than three fields; checks for the missing
            # fields simply do not apply.
            pass

    datapath = "file:" + _load_datafile(size, local_cache_path)
    if is_databricks():
        _, dataname = os.path.split(_data_format[size].path)
        dbfs_datapath = "dbfs:/tmp/" + dataname
        if dbutils is None:
            raise ValueError("To use on a Databricks notebook, dbutils object should be passed as an argument")
        # Failures of the move itself are left to propagate so that the real
        # cause is not masked by the "missing dbutils" message.
        dbutils.fs.mv(datapath, dbfs_datapath)
        datapath = dbfs_datapath

    # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
    separator = _data_format[size].separator
    if len(separator) > 1:
        raw_data = spark.sparkContext.textFile(datapath)
        data_rdd = raw_data.map(lambda line: line.split(separator)).map(
            lambda cols: [int(cols[0]), int(cols[1]), float(cols[2]), int(cols[3])][: len(schema)]
        )
        df = spark.createDataFrame(data_rdd, schema)
    else:
        df = spark.read.csv(
            datapath, schema=schema, sep=separator, header=_data_format[size].has_header
        )

    return df
def load_spark_df(
    spark,
    size="100k",
    header=None,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Load the MovieLens rating data as a `pyspark.DataFrame`.

    The dataset is downloaded from http://files.grouplens.org/datasets/movielens,
    unzipped, and read into a `pyspark.DataFrame`.

    To load movie information only, you can use `load_item_df` function.

    Args:
        spark (pyspark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header.
            If schema is provided, this argument is ignored.
        schema (pyspark.StructType): Dataset schema.
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pyspark.DataFrame: Movie rating dataset.

    **Examples**

    .. code-block:: python

        # To load just user-id, item-id, and ratings from MovieLens-1M dataset:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        # The schema can be defined as well:
        schema = StructType([
            StructField(DEFAULT_USER_COL, IntegerType()),
            StructField(DEFAULT_ITEM_COL, IntegerType()),
            StructField(DEFAULT_RATING_COL, FloatType()),
            StructField(DEFAULT_TIMESTAMP_COL, LongType()),
            ])
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'), schema=schema)

        # To load rating's timestamp together:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))

        # To load movie's title, genres, and released year info along with the ratings data:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
            title_col='Title',
            genres_col='Genres',
            year_col='Year'
        )

        # On DataBricks, pass the dbutils argument as follows:
        spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if len(schema) < 2:
        raise ValueError(ERROR_HEADER)

    # The second schema field names the movie column used for the join below.
    movie_col = schema[1].name

    with download_path(local_cache_path) as cache_dir:
        zip_path = os.path.join(cache_dir, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, zip_path)
        # "file:///" is the short form of "file://localhost/".
        spark_datapath = "file:///" + datapath

        # Movie features (title, genres, release year) are small, so they are
        # loaded on the driver first and only then converted to a Spark DataFrame.
        item_pd_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )
        if item_pd_df is None:
            item_df = None
        else:
            item_df = spark.createDataFrame(item_pd_df)

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    """
                    To use on a Databricks, dbutils object should be passed as an argument.
                    E.g. load_spark_df(spark, dbutils=dbutils)
                """
                )

            # The rating file has to be on DBFS to be loaded into a Spark DataFrame.
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # pyspark's csv reader doesn't support multi-character delimiters, so
        # such files are split and typed by hand.
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            lines = spark.sparkContext.textFile(spark_datapath)
            fields = lines.map(lambda line: line.split(separator))
            data_rdd = fields.map(
                lambda cols: [int(cols[0]), int(cols[1]), float(cols[2]), int(cols[3])][:len(schema)]
            )
            df = spark.createDataFrame(data_rdd, schema)
        else:
            has_header = DATA_FORMAT[size].has_header
            df = spark.read.csv(
                spark_datapath, schema=schema, sep=separator, header=has_header
            )

        # Attach the movie features to the ratings when any were requested.
        if item_df is not None:
            df = df.join(item_df, on=movie_col, how="left")

        # Cache and force an action now, since the data file might be removed.
        df.cache().count()

    return df
Exemple #9
0
def load_spark_df(
    spark,
    size="100k",
    header=(DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL,
            DEFAULT_TIMESTAMP_COL),
    schema=None,
    local_cache_path="ml.zip",
    dbutils=None,
    title_col=None,
    genres_col=None,
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load

    Args:
        spark (pySpark.SparkSession)
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m")
        header (list or tuple): Rating dataset header. If None, ratings are not loaded.
            If schema is provided, this argument is ignored.
        schema (pySpark.StructType): Dataset schema. By default,
            StructType(
                [
                    StructField(DEFAULT_USER_COL, IntegerType()),
                    StructField(DEFAULT_ITEM_COL, IntegerType()),
                    StructField(DEFAULT_RATING_COL, FloatType()),
                    StructField(DEFAULT_TIMESTAMP_COL, LongType()),
                ]
            )
        local_cache_path (str): Path where to cache the zip file locally
        dbutils (Databricks.dbutils): Databricks utility object. Required when
            running on Databricks.
        title_col (str): Title column name. If None, title is not loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, genres are not loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset.
            If header is None but either title_col or genres_col is not None,
            returns movie titles and/or genres.

    Raises:
        ValueError: If running on Databricks and ``dbutils`` is None.
    """

    # fix capitalization
    size = size.lower()

    file_datapath, file_item_datapath = _load_datafile(size, local_cache_path)
    # Driver node's file path
    datapath = "file:///" + file_datapath
    item_datapath = "file:" + file_item_datapath
    if is_databricks():
        # Move rating file to DBFS (we load items as pandas, so no need to move to DBFS)
        dbfs_datapath = "dbfs:/tmp/" + file_datapath
        if dbutils is None:
            raise ValueError(
                "To use on a Databricks notebook, dbutils object should be passed as an argument"
            )
        # Failures of the move itself are left to propagate so that the real
        # cause is not masked by the "missing dbutils" message.
        dbutils.fs.mv(datapath, dbfs_datapath)
        datapath = dbfs_datapath

    schema = _get_schema(header, schema)

    # Load title and genres
    if schema is None or len(schema) < 2:
        movie_col = DEFAULT_ITEM_COL
    else:
        movie_col = schema[1].name
    item_df = _load_item_df(size, movie_col, title_col, genres_col,
                            item_datapath)
    if item_df is not None:
        # Convert to spark DataFrame
        item_df = spark.createDataFrame(item_df)

    # Without a rating schema only the item information (possibly None) is returned.
    if schema is None:
        return item_df

    # Load rating data
    if len(schema) == 1 and item_df is not None:
        # MovieID should be loaded to merge rating df w/ item_df
        schema.add(StructField(movie_col, IntegerType()))

    # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
    separator = DATA_FORMAT[size].separator
    if len(separator) > 1:
        raw_data = spark.sparkContext.textFile(datapath)
        data_rdd = raw_data.map(lambda line: line.split(separator)).map(
            lambda cols:
            [int(cols[0]), int(cols[1]),
             float(cols[2]), int(cols[3])][:len(schema)])
        df = spark.createDataFrame(data_rdd, schema)
    else:
        df = spark.read.csv(datapath,
                            schema=schema,
                            sep=separator,
                            header=DATA_FORMAT[size].has_header)

    # Merge rating df w/ item_df
    if item_df is not None:
        df = df.join(item_df, movie_col, 'left')

    return df
Exemple #10
0
def load_spark_df(
    spark,
    size="100k",
    header=DEFAULT_HEADER,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load

    Args:
        spark (pySpark.SparkSession)
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header.
            If schema is provided, this argument is ignored.
        schema (pySpark.StructType): Dataset schema. By default,
            StructType(
                [
                    StructField(DEFAULT_USER_COL, IntegerType()),
                    StructField(DEFAULT_ITEM_COL, IntegerType()),
                    StructField(DEFAULT_RATING_COL, FloatType()),
                    StructField(DEFAULT_TIMESTAMP_COL, LongType()),
                ]
            )
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset.

    Raises:
        ValueError: If ``size`` is not a key of DATA_FORMAT, if no schema could be
            obtained from ``header``/``schema``, or if running on Databricks and
            ``dbutils`` is None.
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if schema is None:
        raise ValueError(ERROR_NO_HEADER)

    movie_col = DEFAULT_ITEM_COL if len(schema) < 2 else schema[1].name

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, filepath)
        spark_datapath = "file://" + datapath

        # Load movie features such as title, genres, and release year.
        # Since the file size is small, we directly load as pd.DataFrame from the driver node
        # and then convert into spark.DataFrame
        item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col,
                                   genres_col, year_col)
        # The `item_df is not None` checks below expect that there may be no item
        # data (presumably when no item column is requested), so only convert
        # when something was actually loaded.
        item_df = (spark.createDataFrame(item_pd_df)
                   if item_pd_df is not None else None)

        if is_databricks():
            if dbutils is None:
                raise ValueError("""
                    To use on a Databricks, dbutils object should be passed as an argument.
                    E.g. load_spark_df(spark, dbutils=dbutils)
                """)

            # Move rating file to DBFS in order to load into spark.DataFrame
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # Load rating data
        if len(schema) == 1 and item_df is not None:
            # MovieID should be loaded to merge rating df w/ item_df
            schema.add(StructField(movie_col, IntegerType()))

        # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            raw_data = spark.sparkContext.textFile(spark_datapath)
            data_rdd = raw_data.map(lambda line: line.split(separator)).map(
                lambda cols:
                [int(cols[0]), int(cols[1]),
                 float(cols[2]), int(cols[3])][:len(schema)])
            df = spark.createDataFrame(data_rdd, schema)
        else:
            df = spark.read.csv(
                spark_datapath,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and force trigger action since data-file might be removed.
        df.cache()
        df.count()

    return df
Exemple #11
0
def load_spark_df(
    spark,
    size="100k",
    header=DEFAULT_HEADER,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Load the MovieLens rating data as a pySpark.DataFrame.

    The dataset is downloaded from http://files.grouplens.org/datasets/movielens,
    unzipped, and loaded.
    To load movie information only, you can use load_item_df function.

    Args:
        spark (pySpark.SparkSession)
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header.
            If schema is provided, this argument is ignored.
        schema (pySpark.StructType): Dataset schema. By default,
            StructType(
                [
                    StructField(DEFAULT_USER_COL, IntegerType()),
                    StructField(DEFAULT_ITEM_COL, IntegerType()),
                    StructField(DEFAULT_RATING_COL, FloatType()),
                    StructField(DEFAULT_TIMESTAMP_COL, LongType()),
                ]
            )
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset.

    Examples:
        To load just user-id, item-id, and ratings from MovieLens-1M dataset,
        >>> spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        To load rating's timestamp together,
        >>> spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))

        To load movie's title, genres, and released year info along with the ratings data,
        >>> spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
        ...     title_col='Title',
        ...     genres_col='Genres',
        ...     year_col='Year'
        ... )

        On DataBricks, pass the dbutils argument as follows:
        >>> spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if schema is None or len(schema) < 2:
        raise ValueError(ERROR_NO_HEADER)

    # The second schema field names the movie column used for the join below.
    movie_col = schema[1].name

    with download_path(local_cache_path) as cache_dir:
        zip_path = os.path.join(cache_dir, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, zip_path)
        # "file:///" is the short form of "file://localhost/".
        spark_datapath = "file:///" + datapath

        # Movie features (title, genres, release year) are small, so they are
        # loaded on the driver first and only then converted to a Spark DataFrame.
        item_pd_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )
        if item_pd_df is None:
            item_df = None
        else:
            item_df = spark.createDataFrame(item_pd_df)

        if is_databricks():
            if dbutils is None:
                raise ValueError("""
                    To use on a Databricks, dbutils object should be passed as an argument.
                    E.g. load_spark_df(spark, dbutils=dbutils)
                """)

            # The rating file has to be on DBFS to be loaded into a Spark DataFrame.
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # pySpark's csv reader doesn't support multi-character delimiters, so
        # such files are split and typed by hand.
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            lines = spark.sparkContext.textFile(spark_datapath)
            fields = lines.map(lambda line: line.split(separator))
            data_rdd = fields.map(
                lambda cols: [int(cols[0]), int(cols[1]), float(cols[2]), int(cols[3])][: len(schema)]
            )
            df = spark.createDataFrame(data_rdd, schema)
        else:
            has_header = DATA_FORMAT[size].has_header
            df = spark.read.csv(
                spark_datapath, schema=schema, sep=separator, header=has_header
            )

        # Attach the movie features to the ratings when any were requested.
        if item_df is not None:
            df = df.join(item_df, on=movie_col, how="left")

        # Cache and force an action now, since the data file might be removed.
        df.cache().count()

    return df