def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as pySpark.DataFrame.

    The dataset consists of a portion of Criteo’s traffic over a period of 24 days.
    Each row corresponds to a display ad served by Criteo and the first column
    indicates whether this ad has been clicked or not. There are 13 features
    taking integer values (mostly count features) and 26 categorical features.
    The values of the categorical features have been hashed onto 32 bits for
    anonymization purposes.

    The schema is:

    <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        header (list): Dataset header names.
        local_cache_path (str): Path where to cache the tar.gz file locally.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object.

    Returns:
        pySpark.DataFrame: Criteo DAC training dataset.

    Raises:
        ValueError: On Databricks, when no dbutils object was passed.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            # Check the actual precondition explicitly. The previous bare
            # `except:` turned *any* failure of the copy (permissions, bad
            # destination, ...) into a misleading "pass dbutils" error.
            if dbutils is None:
                raise ValueError(
                    "To use on a Databricks notebook, dbutils object should be passed as an argument"
                )
            # Driver node's local file path; the data needs to be on DBFS so
            # Spark executors can load it.
            node_path = "file:" + filepath
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        else:
            path = filepath

        schema = _get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        df.cache().count()  # trigger execution to overcome spark's lazy evaluation
    return df
def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as pySpark.DataFrame.

    The dataset consists of a portion of Criteo’s traffic over a period of 24 days.
    Each row corresponds to a display ad served by Criteo and the first column
    indicates whether this ad has been clicked or not. There are 13 features
    taking integer values (mostly count features) and 26 categorical features.
    The values of the categorical features have been hashed onto 32 bits for
    anonymization purposes.

    The schema is:

    <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        header (list): Dataset header names.
        local_cache_path (str): Path where to cache the tar.gz file locally.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object.

    Returns:
        pySpark.DataFrame: Criteo DAC training dataset.

    Raises:
        ValueError: On Databricks, when no dbutils object was passed.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            # Validate the precondition directly instead of wrapping the copy
            # in a bare `except:` — that swallowed unrelated failures and
            # re-raised them as a misleading "pass dbutils" message.
            if dbutils is None:
                raise ValueError(
                    "To use on a Databricks notebook, dbutils object should be passed as an argument"
                )
            # Driver node's file path; needs to be on DBFS for Spark to load.
            node_path = "file:" + filepath
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        else:
            path = filepath

        schema = get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        df.cache().count()  # trigger execution to overcome spark's lazy evaluation
    return df
def test_is_jupyter():
    """Check is_jupyter/is_databricks both on the terminal and inside a notebook."""
    # Test on the terminal
    assert is_jupyter() is False
    assert is_databricks() is False

    # Test on Jupyter notebook
    path = os.path.join("tests", "unit", "test_notebook_utils.ipynb")
    pm.execute_notebook(
        path,
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
    )
    nb = pm.read_notebook(OUTPUT_NOTEBOOK)
    df = nb.dataframe
    result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0]
    # Values read back from the notebook dataframe may be numpy.bool_, for
    # which identity comparison (`is True`) fails even when the value is
    # truthy; compare by equality instead.
    assert result_is_jupyter == True  # noqa: E712
    result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0]
    assert result_is_databricks == False  # noqa: E712
def test_is_jupyter(output_notebook, kernel_name):
    """Check is_jupyter/is_databricks both on the terminal and inside a notebook."""
    # Test on the terminal
    assert is_jupyter() is False
    assert is_databricks() is False

    # Test on Jupyter notebook
    path = Path(__file__).absolute().parent.joinpath("test_notebook_utils.ipynb")
    pm.execute_notebook(
        path,
        output_notebook,
        kernel_name=kernel_name,
    )
    nb = sb.read_notebook(output_notebook)
    df = nb.scraps.dataframe
    result_is_jupyter = df.loc[df["name"] == "is_jupyter", "data"].values[0]
    # Scrapbook stores numpy.bool_ values, so `is True` would fail; a plain
    # truthiness assert is the idiomatic (and lint-clean) replacement for
    # the previous `== True` / `== False` comparisons.
    assert result_is_jupyter
    result_is_databricks = df.loc[df["name"] == "is_databricks", "data"].values[0]
    assert not result_is_databricks
def test_is_jupyter():
    """Test if the module is running on Jupyter."""
    # Test on the terminal
    assert is_jupyter() is False
    assert is_databricks() is False

    # Test on Jupyter notebook
    pm.execute_notebook(
        'test_notebook_utils.ipynb',
        OUTPUT_NOTEBOOK,
        kernel_name=KERNEL_NAME,
    )
    nb = pm.read_notebook(OUTPUT_NOTEBOOK)
    df = nb.dataframe
    result_is_jupyter = df.loc[df["name"] == "is_jupyter", "value"].values[0]
    # The recorded values may be numpy.bool_, for which `is True` fails even
    # when the value is truthy; compare by equality instead.
    assert result_is_jupyter == True  # noqa: E712
    result_is_databricks = df.loc[df["name"] == "is_databricks", "value"].values[0]
    assert result_is_databricks == False  # noqa: E712
def load_spark_df(
    spark, size="100k", header=None, schema=None, local_cache_path="ml.zip", dbutils=None
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens,
    unzip, and load.

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list): Dataset header. If both schema and header are None,
            use ["UserId", "MovieId", "Rating", "Timestamp"] by default.
            If both header and schema are provided, the header argument
            will be ignored.
        schema (pySpark.StructType): Dataset schema. If None, a default
            UserId/MovieId/Rating/Timestamp schema (Integer/Integer/Float/Long)
            is built from the header.
        local_cache_path (str): Path where to cache the zip file locally.
        dbutils (Databricks.dbutils): Databricks utility object.

    Returns:
        pySpark.DataFrame: Dataset.

    Raises:
        ValueError: When the schema column types are invalid, or when running
            on Databricks without a dbutils object.
    """
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            header = ["UserId", "MovieId", "Rating", "Timestamp"]
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]
        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            # Fewer than 4 header names: keep only the columns provided.
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
        try:
            # User and movie IDs should be int type
            if not isinstance(schema[0].dataType, IntegerType):
                raise ValueError(ERROR_USER_ID_TYPE)
            if not isinstance(schema[1].dataType, IntegerType):
                raise ValueError(ERROR_MOVIE_ID_TYPE)
            # Ratings should be float type (tuple form of isinstance)
            if not isinstance(schema[2].dataType, (FloatType, DoubleType)):
                raise ValueError(ERROR_RATING_TYPE)
        except IndexError:
            # Short schemas skip the checks for the missing columns.
            pass

    datapath = "file:" + _load_datafile(size, local_cache_path)
    if is_databricks():
        _, dataname = os.path.split(_data_format[size].path)
        dbfs_datapath = "dbfs:/tmp/" + dataname
        # Check the precondition explicitly. The previous bare `except:`
        # converted every failure of dbutils.fs.mv into a misleading
        # "pass dbutils" error.
        if dbutils is None:
            raise ValueError(
                "To use on a Databricks notebook, dbutils object should be passed as an argument"
            )
        dbutils.fs.mv(datapath, dbfs_datapath)
        datapath = dbfs_datapath

    # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
    separator = _data_format[size].separator
    if len(separator) > 1:
        raw_data = spark.sparkContext.textFile(datapath)
        data_rdd = raw_data.map(lambda l: l.split(separator)).map(
            lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
        )
        df = spark.createDataFrame(data_rdd, schema)
    else:
        df = spark.read.csv(
            datapath, schema=schema, sep=separator, header=_data_format[size].has_header
        )
    return df
def load_spark_df(
    spark,
    size="100k",
    header=None,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Load the MovieLens rating data into a `pyspark.DataFrame`.

    The zip archive is fetched from http://files.grouplens.org/datasets/movielens,
    extracted, and the rating file is read with Spark. Use the `load_item_df`
    function when only the movie information is needed.

    Args:
        spark (pyspark.SparkSession): Spark session.
        size (str): Which dataset to load: one of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header. If schema is provided,
            this argument is ignored.
        schema (pyspark.StructType): Dataset schema.
        local_cache_path (str): Path (directory or a zip file) to cache the
            downloaded zip file. If None, all the intermediate files will be
            stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column
            will not be loaded.

    Returns:
        pyspark.DataFrame: Movie rating dataset.

    **Examples**

    .. code-block:: python

        # Load just user-id, item-id, and ratings from MovieLens-1M:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        # Load title, genres, and released year along with the ratings:
        spark_df = load_spark_df(spark, '1m',
            ('UserId', 'ItemId', 'Rating', 'Timestamp'),
            title_col='Title', genres_col='Genres', year_col='Year')

        # On DataBricks, pass the dbutils argument:
        spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if len(schema) < 2:
        raise ValueError(ERROR_HEADER)
    movie_col = schema[1].name

    with download_path(local_cache_path) as cache_dir:
        zip_path = os.path.join(cache_dir, "ml-{}.zip".format(size))
        rating_path, item_path = _maybe_download_and_extract(size, zip_path)
        rating_uri = "file:///" + rating_path  # shorten form of file://localhost/

        # The movie-feature file (title/genres/year) is small, so it is parsed
        # into a pandas frame on the driver and converted afterwards.
        item_pd_df = _load_item_df(size, item_path, movie_col, title_col, genres_col, year_col)
        item_df = None if item_pd_df is None else spark.createDataFrame(item_pd_df)

        if is_databricks():
            if dbutils is None:
                raise ValueError("""
    To use on a Databricks, dbutils object should be passed as an argument.
    E.g. load_spark_df(spark, dbutils=dbutils)
""")
            # Move rating file to DBFS in order to load into spark.DataFrame
            dbfs_uri = "dbfs:/tmp/" + rating_path
            dbutils.fs.mv(rating_uri, dbfs_uri)
            rating_uri = dbfs_uri

        # Spark's csv reader only takes single-character delimiters; split
        # multi-character separators (e.g. "::") manually via an RDD.
        separator = DATA_FORMAT[size].separator
        if len(separator) == 1:
            df = spark.read.csv(
                rating_uri,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )
        else:
            lines = spark.sparkContext.textFile(rating_uri)
            rows = lines.map(lambda l: l.split(separator)).map(
                lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
            )
            df = spark.createDataFrame(rows, schema)

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and force trigger action since data-file might be removed.
        df.cache()
        df.count()
    return df
def load_spark_df(
    spark,
    size="100k",
    header=(DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL),
    schema=None,
    local_cache_path="ml.zip",
    dbutils=None,
    title_col=None,
    genres_col=None,
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens,
    unzip, and load.

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header. If None, ratings are
            not loaded. If schema is provided, this argument is ignored.
        schema (pySpark.StructType): Dataset schema. By default, a
            user/item/rating/timestamp schema (Integer/Integer/Float/Long).
        local_cache_path (str): Path where to cache the zip file locally.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, title is not loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, genres are not loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset. If header is None but either
            title_col or genres_col is not None, returns movie titles and/or genres.

    Raises:
        ValueError: On Databricks, when no dbutils object was passed.
    """
    # fix capitalization
    size = size.lower()

    file_datapath, file_item_datapath = _load_datafile(size, local_cache_path)
    # Driver node's file path
    datapath = "file:///" + file_datapath
    item_datapath = "file:" + file_item_datapath

    if is_databricks():
        # Move rating file to DBFS (we load items as pandas, so no need to move to DBFS)
        dbfs_datapath = "dbfs:/tmp/" + file_datapath
        # Check the precondition explicitly. The previous bare `except:`
        # turned every failure of dbutils.fs.mv into a misleading
        # "pass dbutils" error.
        if dbutils is None:
            raise ValueError(
                "To use on a Databricks notebook, dbutils object should be passed as an argument"
            )
        dbutils.fs.mv(datapath, dbfs_datapath)
        datapath = dbfs_datapath

    schema = _get_schema(header, schema)

    # Load title and genres
    movie_col = DEFAULT_ITEM_COL if schema is None or len(schema) < 2 else schema[1].name
    item_df = _load_item_df(size, movie_col, title_col, genres_col, item_datapath)
    if item_df is not None:
        # Convert to spark DataFrame
        item_df = spark.createDataFrame(item_df)

    # Load rating data
    if schema is None:
        return item_df

    if len(schema) == 1 and item_df is not None:
        # MovieID should be loaded to merge rating df w/ item_df
        schema.add(StructField(movie_col, IntegerType()))

    # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
    separator = DATA_FORMAT[size].separator
    if len(separator) > 1:
        raw_data = spark.sparkContext.textFile(datapath)
        data_rdd = raw_data.map(lambda l: l.split(separator)).map(
            lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
        )
        df = spark.createDataFrame(data_rdd, schema)
    else:
        df = spark.read.csv(
            datapath, schema=schema, sep=separator, header=DATA_FORMAT[size].has_header
        )

    # Merge rating df w/ item_df
    if item_df is not None:
        df = df.join(item_df, movie_col, 'left')
    return df
def load_spark_df(
    spark,
    size="100k",
    header=DEFAULT_HEADER,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads the MovieLens dataset as pySpark.DataFrame.

    Download the dataset from http://files.grouplens.org/datasets/movielens,
    unzip, and load.

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header. If schema is provided,
            this argument is ignored.
        schema (pySpark.StructType): Dataset schema. By default, a
            user/item/rating/timestamp schema (Integer/Integer/Float/Long).
        local_cache_path (str): Path (directory or a zip file) to cache the
            downloaded zip file. If None, all the intermediate files will be
            stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column
            will not be loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset.

    Raises:
        ValueError: If size is invalid, the schema is empty, or dbutils is
            missing on Databricks.
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if schema is None:
        raise ValueError(ERROR_NO_HEADER)

    movie_col = DEFAULT_ITEM_COL if len(schema) < 2 else schema[1].name

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, filepath)
        spark_datapath = "file://" + datapath

        # Load movie features such as title, genres, and release year.
        # Since the file size is small, we directly load as pd.DataFrame from
        # the driver node and then convert into spark.DataFrame.
        item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col)
        # BUG FIX: _load_item_df returns None when no feature column
        # (title/genres/year) is requested; calling spark.createDataFrame(None)
        # would raise, so only convert when a frame was actually produced.
        item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None

        if is_databricks():
            if dbutils is None:
                raise ValueError("""
    To use on a Databricks, dbutils object should be passed as an argument.
    E.g. load_spark_df(spark, dbutils=dbutils)
""")
            # Move rating file to DBFS in order to load into spark.DataFrame
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # Load rating data
        if len(schema) == 1 and item_df is not None:
            # MovieID should be loaded to merge rating df w/ item_df
            schema.add(StructField(movie_col, IntegerType()))

        # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            raw_data = spark.sparkContext.textFile(spark_datapath)
            data_rdd = raw_data.map(lambda l: l.split(separator)).map(
                lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
            )
            df = spark.createDataFrame(data_rdd, schema)
        else:
            df = spark.read.csv(
                spark_datapath,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and force trigger action since data-file might be removed.
        df.cache()
        df.count()
    return df
def load_spark_df(
    spark,
    size="100k",
    header=DEFAULT_HEADER,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Load the MovieLens rating data as a pySpark.DataFrame.

    The requested archive is downloaded from
    http://files.grouplens.org/datasets/movielens, unzipped, and the rating
    file is read with Spark. To load movie information only, you can use the
    load_item_df function.

    Args:
        spark (pySpark.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header. If schema is provided,
            this argument is ignored.
        schema (pySpark.StructType): Dataset schema; by default a
            user/item/rating/timestamp schema (Integer/Integer/Float/Long).
        local_cache_path (str): Path (directory or a zip file) to cache the
            downloaded zip file. If None, all the intermediate files will be
            stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are '|' separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column
            will not be loaded.

    Returns:
        pySpark.DataFrame: Movie rating dataset.

    Examples:
        To load just user-id, item-id, and ratings from MovieLens-1M dataset,

        >>> spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        To load movie's title, genres, and released year info along with the ratings data,

        >>> spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
        ...     title_col='Title', genres_col='Genres', year_col='Year')

        On DataBricks, pass the dbutils argument as follows:

        >>> spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    schema = _get_schema(header, schema)
    if schema is None or len(schema) < 2:
        raise ValueError(ERROR_NO_HEADER)
    movie_col = schema[1].name

    with download_path(local_cache_path) as cache_dir:
        zip_path = os.path.join(cache_dir, "ml-{}.zip".format(size))
        rating_path, item_path = _maybe_download_and_extract(size, zip_path)
        rating_uri = "file:///" + rating_path  # shorten form of file://localhost/

        # Movie features (title/genres/year) come from a small file, so they
        # are parsed into a pandas frame on the driver first and converted to
        # a Spark frame only when present.
        item_pd_df = _load_item_df(size, item_path, movie_col, title_col, genres_col, year_col)
        item_df = None if item_pd_df is None else spark.createDataFrame(item_pd_df)

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    """
    To use on a Databricks, dbutils object should be passed as an argument.
    E.g. load_spark_df(spark, dbutils=dbutils)
"""
                )
            # Move rating file to DBFS in order to load into spark.DataFrame
            dbfs_uri = "dbfs:/tmp/" + rating_path
            dbutils.fs.mv(rating_uri, dbfs_uri)
            rating_uri = dbfs_uri

        # Spark's csv reader only supports single-character delimiters; handle
        # multi-character separators (e.g. "::") manually through an RDD.
        separator = DATA_FORMAT[size].separator
        if len(separator) == 1:
            df = spark.read.csv(
                rating_uri,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )
        else:
            lines = spark.sparkContext.textFile(rating_uri)
            rows = lines.map(lambda l: l.split(separator)).map(
                lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
            )
            df = spark.createDataFrame(rows, schema)

        # Merge rating df w/ item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and force trigger action since data-file might be removed.
        df.cache()
        df.count()
    return df