Example #1
from dagster import DagsterInvariantViolationError

# dict_without_keys is a small local helper; see the minimal equivalent below.
def dataframe_loader(_context, config):
    # config comes from a Selector, so exactly one (file_type, options) pair is set.
    spark_read = _context.resources.pyspark.spark_session.read
    file_type, file_options = list(config.items())[0]
    path = file_options.get("path")

    if file_type == "csv":
        return spark_read.csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return spark_read.parquet(path, **dict_without_keys(file_options, "path"))
    elif file_type == "json":
        return spark_read.json(path, **dict_without_keys(file_options, "path"))
    elif file_type == "jdbc":
        return spark_read.jdbc(**file_options)
    elif file_type == "orc":
        return spark_read.orc(path, **dict_without_keys(file_options, "path"))
    elif file_type == "table":
        return spark_read.table(**file_options)
    elif file_type == "text":
        return spark_read.text(path, **dict_without_keys(file_options, "path"))
    elif file_type == "other":
        return spark_read.load(**file_options)
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type)
        )
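Every example below relies on dict_without_keys to strip the path entry before forwarding the remaining options as keyword arguments to the reader or writer. A minimal equivalent of that helper, shown here for reference:

def dict_without_keys(ddict, *keys):
    """Return a shallow copy of ddict with the given keys removed."""
    return {key: value for key, value in ddict.items() if key not in set(keys)}

So dict_without_keys(file_options, "path") passes everything except the path straight through, e.g. sep or header for a CSV read.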
Example #2
import shutil

from dagster import OutputDefinition, execute_solid, solid
from dagster_pyspark import DataFrame as DagsterPySparkDataFrame


# create_pyspark_df and get_temp_dir are helpers defined elsewhere in the suite.
def test_dataframe_outputs(file_type, read, other):
    df = create_pyspark_df()

    @solid(output_defs=[
        OutputDefinition(dagster_type=DagsterPySparkDataFrame, name="df")
    ])
    def return_df(_):
        return df

    with get_temp_dir() as temp_path:
        # Spark refuses to write into an existing directory, so remove the
        # temp dir and let the writer recreate it.
        shutil.rmtree(temp_path)

        options = {"path": temp_path}
        if other:
            options["format"] = file_type
            file_type = "other"

        # Materialize the output via run config, then read the file back with
        # the matching reader and compare contents.
        result = execute_solid(
            return_df,
            run_config={
                "solids": {"return_df": {"outputs": [{"df": {file_type: options}}]}}
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())

        result = execute_solid(
            return_df,
            run_config={
                "solids": {
                    "return_df": {
                        "outputs": [
                            {"df": {file_type: dict(options, mode="overwrite", compression="gzip")}}
                        ]
                    }
                }
            },
        )
        assert result.success
        actual = read(options["path"], **dict_without_keys(options, "path"))
        assert sorted(df.collect()) == sorted(actual.collect())
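The file_type, read, and other arguments arrive via pytest parametrization. A minimal sketch of what that table might look like; the SparkSession setup and the specific cases are assumptions, not the suite's actual fixtures:

import pytest
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

@pytest.mark.parametrize(
    "file_type, read, other",
    [
        ("csv", spark.read.csv, False),
        ("json", spark.read.json, False),
        ("parquet", spark.read.parquet, False),
        # other=True exercises the generic "other"/load branch, so the reader
        # must accept the format as a keyword argument.
        ("csv", spark.read.load, True),
    ],
)
def test_dataframe_outputs(file_type, read, other):
    ...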
Example #3
import pandas as pd

from dagster import DagsterInvariantViolationError


def dataframe_loader(_context, config):
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        return pd.read_csv(path, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        return pd.read_parquet(file_options["path"])
    elif file_type == "table":
        # "table" is read as a tab-separated file.
        return pd.read_csv(file_options["path"], sep="\t")
    else:
        raise DagsterInvariantViolationError(
            "Unsupported file_type {file_type}".format(file_type=file_type))
Example #4
import pandas as pd

from dagster import AssetMaterialization, check


def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))
    elif file_type == "parquet":
        pandas_df.to_parquet(file_options["path"])
    elif file_type == "table":
        pandas_df.to_csv(file_options["path"], sep="\t", index=False)
    elif file_type == "pickle":
        pandas_df.to_pickle(file_options["path"])
    else:
        check.failed("Unsupported file_type {file_type}".format(file_type=file_type))

    # Record where the file landed as an AssetMaterialization event.
    return AssetMaterialization.file(file_options["path"])
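The write-side hook is registered analogously with @dagster_type_materializer, and both hooks get attached when the DagsterType is constructed. A hedged sketch; the type name and type-check function are placeholders, not Dagster's actual pandas DataFrame type:

import pandas as pd
from dagster import DagsterType, Permissive, Selector, dagster_type_materializer

@dagster_type_materializer(
    Selector({"csv": Permissive({"path": str}), "parquet": Permissive({"path": str})})
)
def dataframe_materializer(_context, config, pandas_df):
    ...  # body as in the example above

DataFrame = DagsterType(
    name="PandasDataFrame",
    type_check_fn=lambda _, value: isinstance(value, pd.DataFrame),
    # plus loader=dataframe_loader once Example #3's function is decorated
    materializer=dataframe_materializer,
)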
Example #5
from dagster import InputDefinition, ModeDefinition, execute_solid, file_relative_path, solid
from dagster_pyspark import DataFrame as DagsterPySparkDataFrame, pyspark_resource


def test_dataframe_inputs(file_type, read, other):
    @solid(input_defs=[InputDefinition(dagster_type=DagsterPySparkDataFrame, name="input_df")])
    def return_df(_, input_df):
        return input_df

    # Fixture files (num.csv, num.parquet, ...) are expected next to this test module.
    options = {"path": file_relative_path(__file__, "num.{file_type}".format(file_type=file_type))}
    if other:
        # Exercise the generic "other" branch, passing the format explicitly.
        options["format"] = file_type
        file_type = "other"

    result = execute_solid(
        return_df,
        mode_def=ModeDefinition(resource_defs={"pyspark": pyspark_resource}),
        run_config={"solids": {"return_df": {"inputs": {"input_df": {file_type: options}}}}},
    )
    assert result.success
    actual = read(options["path"], **dict_without_keys(options, "path"))
    assert sorted(result.output_value().collect()) == sorted(actual.collect())
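The pyspark resource on the ModeDefinition is what provides the SparkSession that Example #1's loader pulls off context.resources.pyspark. Spark settings can ride along in the same run_config; a sketch assuming pyspark_resource accepts a spark_conf config dict (the particular key is illustrative):

result = execute_solid(
    return_df,
    mode_def=ModeDefinition(resource_defs={"pyspark": pyspark_resource}),
    run_config={
        "resources": {"pyspark": {"config": {"spark_conf": {"spark.executor.memory": "2g"}}}},
        "solids": {"return_df": {"inputs": {"input_df": {"csv": {"path": "num.csv"}}}}},
    },
)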