Example #1
def run(root_mod, recipe, *, dataframe_source=None, replacements=[]):
    """Run a recipe stored in .py file with exec(). The path is relative
    to the path of the mod.

    Args:
        root_mod (module): The root module on which to base the path.
        recipe (str): Relative path to the recipe file from the module dir.
        dataframe_source (DataframeSourceBase subclass): Dataframe source,
            e.g. LocalSource or Dataiku.
        replacements (list): List of text replacements to enable recipe
            debugging. Example of how to limit the amount of data:

        [
            {
                "old": "dataframe.get(spark_session, ds_foo.name)",
                "new": "dataframe.get(spark_session, ds_foo.name).limit(10)"
            }
        ]

    Returns:
       Output of the Python exec() function.
    """
    rpath = recipe_path(root_mod, recipe)
    with open(rpath) as f:
        code = prepare_code(f.read(), recipe, replacements)
    if not dataframe_source:
        dataframe_source = contextsource.get()
    globals_dict = {
        'BIRGITTA_DATAFRAMESOURCE': dataframe_source
    }
    return exec_code(code, globals_dict)
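A minimal usage sketch of run(): the module my_project, the recipe path
"recipes/compute_foo.py" and the dataset ds_foo referenced in the replacement
are illustrative assumptions, not taken from the source.

# Hypothetical usage: run a recipe while limiting input rows for debugging.
import my_project

run(my_project,
    "recipes/compute_foo.py",
    replacements=[
        {
            "old": "dataframe.get(spark_session, ds_foo.name)",
            "new": "dataframe.get(spark_session, ds_foo.name).limit(10)"
        }
    ])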
Example #2
def test_has_dataiku():
    reset_context()
    sys.modules['dataikuapi'] = mock.Mock()
    sys.modules['dataiku'] = mock.Mock()
    # To enable dataiku source loading, both these spark lines are needed
    sys.modules['dataiku'].spark = 'NonMock'
    sys.modules['dataiku.spark'] = 'NonMock'
    source = contextsource.get()
    assert type(source).__name__ == 'DataikuSource'
    # Reset dataiku mocks to none, to clean up for other tests
    reset_context()
Example #3
def get(spark_session,
        dataset_name=None,
        *,
        prefix=None,
        dataset=None,
        schema=None,
        cast_binary_to_str=False,
        dataframe_source=None):
    """Obtain a dataframe. It will adjust to whatever
    storage the environment has set. Currently storage is supported in
    file, memory or dataiku (HDFS).

    Args:
        spark_session (SparkSession): Spark session used to load data frames.
        dataset_name (str): The data set to load.

    Kwargs:
        prefix (str): Prefix path or dataiku project_key for loading
            the data set.
        cast_binary_to_str (bool): Convert binary to str.
        schema (Schema): Birgitta schema to verify after read.
        dataset (Dataset): Birgitta dataset to use for name and schema
        dataframe_source (DataframeSourceBase): Option to override
            the data frame source defined in the context.
    Returns:
       Spark DataFrame.
    """
    if not dataframe_source:
        dataframe_source = contextsource.get()
    if dataset:
        if schema is None:
            schema = dataset.schema
        if dataset_name is None:
            dataset_name = dataset.name
    ret = dataframe_source.load(spark_session,
                                dataset_name,
                                prefix,
                                schema=schema)
    if cast_binary_to_str:
        ret = cast_binary_cols_to_string(ret)
    return ret
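A minimal usage sketch of get(): spark_session, the dataset name
"customer_orders" and customer_orders_schema are illustrative assumptions,
not taken from the source.

# Hypothetical usage: load a dataset, verify it against an assumed Birgitta
# schema and convert any binary columns to strings.
df = get(spark_session,
         "customer_orders",
         schema=customer_orders_schema,
         cast_binary_to_str=True)
df.show(5)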
Example #4
def write(df,
          dataset_name,
          *,
          prefix=None,
          schema=None,
          dataframe_source=None,
          skip_cast=False,
          **kwargs):
    """Write a dataframe to storage. It will adjust to whatever
    storage the environment has set. Currently storage is supported in
    file or dataiku (HDFS).

    Args:
        df (DataFrame): spark data frame to write.
        dataset_name (str): The data set to write to.

    Kwargs:
        prefix (str): Prefix path or dataiku project_key for writing
            the data set.
        schema (Schema): Birgitta schema to apply on write.
        dataframe_source (DataframeSourceBase): Option to override
            the data frame source defined in the context.
        skip_cast (bool): If True, don't cast the dataframe to the schema.

    Returns:
       None.
    """
    if schema and not skip_cast:
        df = cast_schema(dataset_name, df, schema)
    if not dataframe_source:
        dataframe_source = contextsource.get()
    return dataframe_source.write(df,
                                  dataset_name,
                                  prefix,
                                  schema=schema,
                                  **kwargs)
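A minimal usage sketch of write(): the dataset name "customer_orders_clean"
and customer_orders_schema are illustrative assumptions, not taken from the
source.

# Hypothetical usage: cast the dataframe to an assumed schema and write it to
# whatever storage the context has configured (file or dataiku/HDFS).
write(df,
      "customer_orders_clean",
      schema=customer_orders_schema)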
Example #5
def test_no_dataiku():
    reset_context()
    source = contextsource.get()
    assert not source