from pathlib import Path

import papermill as pm
import scrapbook as sb

# is_jupyter and is_databricks are provided by the repository's notebook utilities


def test_is_jupyter(output_notebook, kernel_name):
    # Test on the terminal
    assert is_jupyter() is False
    assert is_databricks() is False

    # Test on Jupyter notebook
    path = Path(__file__).absolute().parent.joinpath("test_notebook_utils.ipynb")
    pm.execute_notebook(
        path,
        output_notebook,
        kernel_name=kernel_name,
    )
    nb = sb.read_notebook(output_notebook)
    df = nb.scraps.dataframe
    result_is_jupyter = df.loc[df["name"] == "is_jupyter", "data"].values[0]
    assert result_is_jupyter == True  # compare with == since the scrap value may not be a Python bool
    result_is_databricks = df.loc[df["name"] == "is_databricks", "data"].values[0]
    assert result_is_databricks == False
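
# For context, the notebook executed above is expected to record its results as
# scrapbook "scraps" that the test then reads back. A minimal sketch of such a
# notebook cell (hypothetical; the real test_notebook_utils.ipynb may differ):
import scrapbook as sb

# is_jupyter / is_databricks imported from the repository's notebook utilities
sb.glue("is_jupyter", is_jupyter())  # True inside a Jupyter kernel
sb.glue("is_databricks", is_databricks())  # True only on a Databricks cluster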
Example #2
def load_spark_df(
    spark,
    size="100k",
    header=None,
    schema=None,
    local_cache_path=None,
    dbutils=None,
    title_col=None,
    genres_col=None,
    year_col=None,
):
    """Loads the MovieLens dataset as `pyspark.sql.DataFrame`.

    Download the dataset from http://files.grouplens.org/datasets/movielens, unzip, and load as `pyspark.sql.DataFrame`.

    To load movie information only, you can use the `load_item_df` function.

    Args:
        spark (pyspark.sql.SparkSession): Spark session.
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        header (list or tuple): Rating dataset header.
            If schema is provided, this argument is ignored.
        schema (pyspark.StructType): Dataset schema.
        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
            If None, all the intermediate files will be stored in a temporary directory and removed after use.
        dbutils (Databricks.dbutils): Databricks utility object.
        title_col (str): Title column name. If None, the column will not be loaded.
        genres_col (str): Genres column name. Genres are a '|'-separated string.
            If None, the column will not be loaded.
        year_col (str): Movie release year column name. If None, the column will not be loaded.

    Returns:
        pyspark.sql.DataFrame: Movie rating dataset.

    **Examples**

    .. code-block:: python

        # To load just user-id, item-id, and ratings from the MovieLens-1M dataset:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'))

        # The schema can be defined as well:
        schema = StructType([
            StructField(DEFAULT_USER_COL, IntegerType()),
            StructField(DEFAULT_ITEM_COL, IntegerType()),
            StructField(DEFAULT_RATING_COL, FloatType()),
            StructField(DEFAULT_TIMESTAMP_COL, LongType()),
            ])
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating'), schema=schema)

        # To load the ratings' timestamps as well:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'))

        # To load the movie title, genres, and release year along with the ratings data:
        spark_df = load_spark_df(spark, '1m', ('UserId', 'ItemId', 'Rating', 'Timestamp'),
            title_col='Title',
            genres_col='Genres',
            year_col='Year'
        )

        # On Databricks, pass the dbutils argument as follows:
        spark_df = load_spark_df(spark, dbutils=dbutils)
    """
    size = size.lower()
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)

    # An explicit `schema` takes precedence; otherwise one is built from `header`
    schema = _get_schema(header, schema)
    if len(schema) < 2:
        raise ValueError(ERROR_HEADER)

    movie_col = schema[1].name

    with download_path(local_cache_path) as path:
        filepath = os.path.join(path, "ml-{}.zip".format(size))
        datapath, item_datapath = _maybe_download_and_extract(size, filepath)
        spark_datapath = "file:///" + datapath  # shortened form of file://localhost/

        # Load movie features such as title, genres, and release year.
        # Since the file is small, load it directly as a pd.DataFrame on the
        # driver node and then convert it into a pyspark.sql.DataFrame.
        item_pd_df = _load_item_df(
            size, item_datapath, movie_col, title_col, genres_col, year_col
        )
        item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    "To use this function on Databricks, the dbutils object must be "
                    "passed as an argument, e.g. load_spark_df(spark, dbutils=dbutils)"
                )

            # Move the ratings file to DBFS so it can be loaded into a pyspark.sql.DataFrame
            dbfs_datapath = "dbfs:/tmp/" + datapath
            dbutils.fs.mv(spark_datapath, dbfs_datapath)
            spark_datapath = dbfs_datapath

        # PySpark's CSV reader doesn't support multi-character delimiters, so split the lines manually
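        # e.g. the 1m and 10m MovieLens files use "::" as the separator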
        separator = DATA_FORMAT[size].separator
        if len(separator) > 1:
            raw_data = spark.sparkContext.textFile(spark_datapath)
            data_rdd = raw_data.map(lambda l: l.split(separator)).map(
                lambda c: [int(c[0]), int(c[1]), float(c[2]), int(c[3])][: len(schema)]
            )
            df = spark.createDataFrame(data_rdd, schema)
        else:
            df = spark.read.csv(
                spark_datapath,
                schema=schema,
                sep=separator,
                header=DATA_FORMAT[size].has_header,
            )

        # Merge the ratings df with item_df
        if item_df is not None:
            df = df.join(item_df, movie_col, "left")

        # Cache and trigger an action now, since the data file is removed when the context exits.
        df.cache()
        df.count()

    return df
Example #3
def load_spark_df(
    spark,
    size="sample",
    header=DEFAULT_HEADER,
    local_cache_path=None,
    dbfs_datapath="dbfs:/FileStore/dac",
    dbutils=None,
):
    """Loads the Criteo DAC dataset as `pySpark.DataFrame`.

    The dataset consists of a portion of Criteo’s traffic over a period
    of 24 days. Each row corresponds to a display ad served by Criteo and the first
    column is indicates whether this ad has been clicked or not.

    There are 13 features taking integer values (mostly count features) and 26
    categorical features. The values of the categorical features have been hashed
    onto 32 bits for anonymization purposes.

    The schema is:

    .. code-block:: python

        <label> <integer feature 1> ... <integer feature 13> <categorical feature 1> ... <categorical feature 26>

    More details (need to accept user terms to see the information):
    http://labs.criteo.com/2013/12/download-terabyte-click-logs/

    Args:
        spark (pyspark.sql.SparkSession): Spark session.
        size (str): Dataset size. It can be "sample" or "full".
        header (list): Dataset header names.
        local_cache_path (str): Path where to cache the tar.gz file locally.
        dbfs_datapath (str): Where to store the extracted files on Databricks.
        dbutils (Databricks.dbutils): Databricks utility object.

    Returns:
        pyspark.sql.DataFrame: Criteo DAC training dataset.
    """
    with download_path(local_cache_path) as path:
        filepath = download_criteo(size, path)
        filepath = extract_criteo(size, filepath)

        if is_databricks():
            if dbutils is None:
                raise ValueError(
                    "To use this function on a Databricks notebook, the dbutils "
                    "object must be passed as an argument, "
                    "e.g. load_spark_df(spark, dbutils=dbutils)"
                )
            # The file sits on the driver node; copy it to DBFS so Spark can load it
            node_path = "file:" + filepath
            dbutils.fs.cp(node_path, dbfs_datapath, recurse=True)
            path = dbfs_datapath
        else:
            path = filepath

        schema = get_spark_schema(header)
        df = spark.read.csv(path, schema=schema, sep="\t", header=False)
        # Trigger execution now to overcome Spark's lazy evaluation, before the
        # temporary files are removed
        df.cache().count()
    return df
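
# A minimal usage sketch, assuming an active SparkSession bound to `spark`
# (on Databricks, also pass dbutils=dbutils as noted in the docstring):
df = load_spark_df(spark, size="sample")
df.show(5)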