Example #1
import great_expectations as ge
import pandas as pd


def test_from_pandas():
    pd_df = pd.DataFrame({
        "x": [1, 3, 5, 7, 9],
        "y": [2, 4, 6, 8, 10],
        "z": [None, "a", "b", "c", "abc"]
    })

    ge_df = ge.from_pandas(pd_df)
    assert isinstance(ge_df, ge.data_asset.DataAsset)
    assert list(ge_df.columns) == ["x", "y", "z"]
    assert list(ge_df["x"]) == list(pd_df["x"])
    assert list(ge_df["y"]) == list(pd_df["y"])
    assert list(ge_df["z"]) == list(pd_df["z"])

    # make an empty subclass to test dataset_class argument
    class CustomPandasDataset(ge.dataset.PandasDataset):
        pass

    ge_df_custom = ge.from_pandas(pd_df, dataset_class=CustomPandasDataset)

    assert not isinstance(ge_df, CustomPandasDataset)
    assert isinstance(ge_df_custom, CustomPandasDataset)
    assert list(ge_df_custom.columns) == ["x", "y", "z"]
    assert list(ge_df_custom["x"]) == list(pd_df["x"])
    assert list(ge_df_custom["y"]) == list(pd_df["y"])
    assert list(ge_df_custom["z"]) == list(pd_df["z"])
Example #2
def test_from_pandas():
    pd_df = pd.DataFrame({
        'x': [1, 3, 5, 7, 9],
        'y': [2, 4, 6, 8, 10],
        'z': [None, 'a', 'b', 'c', 'abc']
    })

    ge_df = ge.from_pandas(pd_df)
    assert isinstance(ge_df, ge.dataset.Dataset)
    assert list(ge_df.columns) == ['x', 'y', 'z']
    assert list(ge_df['x']) == list(pd_df['x'])
    assert list(ge_df['y']) == list(pd_df['y'])
    assert list(ge_df['z']) == list(pd_df['z'])

    # make an empty subclass to test dataset_class argument
    class CustomPandasDataset(ge.dataset.PandasDataset):
        pass

    ge_df_custom = ge.from_pandas(pd_df, dataset_class=CustomPandasDataset)

    assert not isinstance(ge_df, CustomPandasDataset)
    assert isinstance(ge_df_custom, CustomPandasDataset)
    assert list(ge_df_custom.columns) == ['x', 'y', 'z']
    assert list(ge_df_custom['x']) == list(pd_df['x'])
    assert list(ge_df_custom['y']) == list(pd_df['y'])
    assert list(ge_df_custom['z']) == list(pd_df['z'])
Example #3
    def _get_expectations(
        self, field_mapping: FieldMapping, field_name: str
    ) -> ge.dataset.Dataset:
        field_mapping_ge = ge.from_pandas(field_mapping.get_field_mapping_df())
        field_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

        # Shape
        shape_expectation = field_mapping_ge.expect_table_columns_to_match_ordered_list(
            common.COLUMN_NAMES
        )

        # If the shape isn't correct then the following expectations will raise exceptions
        if not shape_expectation["success"]:
            return field_mapping_ge

        field = self.table_schema.get_field(field_name)
        # Check that there aren't multiple mappings for the same input value
        field_mapping_ge.expect_column_values_to_be_unique(common.INPUT_COLUMN_NAME)

        # Check that the mapped values are all part of the available options (blank value is also valid)
        valid_mapped_values = self._get_valid_mappings_for_field(field) + [""]
        field_mapping_ge.expect_column_values_to_be_in_set(
            common.OUTPUT_COLUMN_NAME, valid_mapped_values
        )

        # Check that the approved column values are all "Yes", "No", or blank (Assumed to be no)
        field_mapping_ge.expect_column_values_to_be_in_set(
            common.APPROVED_COLUMN_NAME, common.VALID_APPROVED_VALUES + ["None"]
        )

        return field_mapping_ge
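Several of the examples here set result_format to "COMPLETE" before attaching expectations. A minimal sketch of what that buys, assuming the classic dict-style result API (exact keys vary by great_expectations version):

import great_expectations as ge
import pandas as pd

df_ge = ge.from_pandas(pd.DataFrame({"col": [1, 2, 2, 3]}))
df_ge.set_default_expectation_argument("result_format", "COMPLETE")

result = df_ge.expect_column_values_to_be_unique("col")
print(result["success"])                    # False: 2 appears twice
print(result["result"]["unexpected_list"])  # full list of failing values, e.g. [2, 2]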
Example #4
def _do_expectation(_info, df):
    ge_df = ge.from_pandas(df)
    ge_result = ge_callback(ge_df)
    check.invariant('success' in ge_result)
    return ExpectationResult(
        success=ge_result['success'],
        metadata_entries=[
            EventMetadataEntry.json(label='result', data=ge_result)
        ],
    )
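check.invariant and ExpectationResult here appear to come from dagster, and ge_callback is supplied by the enclosing factory. A hypothetical callback, for context:

def ge_callback(ge_df):
    # Any single expectation works; the wrapper above only needs the
    # dict-style result with its 'success' key.
    return ge_df.expect_column_to_exist("x")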
Example #5
    def _get_expectations(self, field_mapping: FieldMapping) -> ge.dataset.Dataset:
        field_mapping_ge = ge.from_pandas(field_mapping.get_field_mapping_df())
        field_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

        # Check that the approved column values are all "Yes"
        field_mapping_ge.expect_column_values_to_be_in_set(
            common.APPROVED_COLUMN_NAME, [common.APPROVED]
        )

        return field_mapping_ge
Example #6
import great_expectations as ge
from pandas import DataFrame


def check_iris_data(df: DataFrame):
    gdf = ge.from_pandas(df)
    result = gdf.expect_column_values_to_be_in_set(
        "species", ["setosa", "virginica", "versicolor"], mostly=0.99
    )

    if not result.success:
        raise Exception("iris data is no good")

    return df
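The mostly=0.99 argument makes this a tolerance check: the expectation passes as long as at least 99% of the non-null values fall in the set. A small self-contained sketch of that behavior (dict-style result API assumed):

import great_expectations as ge
import pandas as pd

gdf = ge.from_pandas(pd.DataFrame({"species": ["setosa"] * 9 + ["weed"]}))

# 90% of values are in the set, so this passes at mostly=0.8 ...
assert gdf.expect_column_values_to_be_in_set(
    "species", ["setosa"], mostly=0.8
)["success"]

# ... and fails at mostly=0.95.
assert not gdf.expect_column_values_to_be_in_set(
    "species", ["setosa"], mostly=0.95
)["success"]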
Example #7
    def validate_with_great_expectations(
        self,
        dataframe: pd.DataFrame,
        expectation_suite: TypeVar("ge.core.ExpectationSuite"),
        ge_validate_kwargs: Optional[Dict[Any, Any]] = None,
    ):
        # A mutable {} default is a classic pitfall; default to None and
        # substitute an empty dict at call time.
        report = ge.from_pandas(
            dataframe, expectation_suite=expectation_suite
        ).validate(**(ge_validate_kwargs or {}))
        return report
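For context, a sketch of where such an expectation_suite might come from: run expectations against a reference frame, capture them, then re-apply them to fresh data the way the method above does. Method names follow the classic pre-0.13 API and are an assumption, not taken from the source:

import great_expectations as ge
import pandas as pd

reference = ge.from_pandas(pd.DataFrame({"x": [1, 2, 3]}))
reference.expect_column_values_to_be_between("x", min_value=0, max_value=10)
suite = reference.get_expectation_suite()

report = ge.from_pandas(
    pd.DataFrame({"x": [4, 5, 6]}), expectation_suite=suite
).validate(result_format="SUMMARY")
print(report["success"])  # True: 4..6 all lie within [0, 10]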
Example #8
    def test_from_pandas(self):
        pd_df = pd.DataFrame({
            'x': [1, 3, 5, 7, 9],
            'y': [2, 4, 6, 8, 10],
            'z': [None, 'a', 'b', 'c', 'abc']
        })

        ge_df = ge.from_pandas(pd_df)
        self.assertIsInstance(ge_df, ge.dataset.Dataset)
        self.assertEqual(list(ge_df.columns), ['x', 'y', 'z'])
        self.assertEqual(list(ge_df['x']), list(pd_df['x']))
        self.assertEqual(list(ge_df['y']), list(pd_df['y']))
        self.assertEqual(list(ge_df['z']), list(pd_df['z']))
Example #9
import great_expectations as ge
import pyarrow.parquet as pq
import s3fs


def sanity_check(date, bucket):
    s3 = s3fs.S3FileSystem()

    output = ge.from_pandas(
        pq.ParquetDataset("s3://" + bucket + "/usp/output/date=" + date,
                          filesystem=s3).read_pandas().to_pandas())

    output.expect_column_values_to_be_in_set('accommodation_ns', [100])

    status = output.validate()
    print(status)

    if not status['success']:
        raise ValueError("sanity check failed")
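The bare raise ValueError above discards detail that is already present in status. A small helper, assuming the classic dict-shaped validation result, that could feed a more informative message:

def failed_expectation_types(status):
    # `status` is the dict returned by output.validate() above.
    return [
        r["expectation_config"]["expectation_type"]
        for r in status["results"]
        if not r["success"]
    ]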
Example #10
def _file_passes(_info, df):
    with open(file_path) as ff:
        expt_config = json.load(ff)
        # The deepcopy is necessary because ge coerces column types in place
        # on the dataframe instance it is given, while a ge dataset itself
        # can't be copied: doing so raises
        #   AttributeError: 'PandasDataset' object has no attribute 'discard_subset_failing_expectations'
        # See https://github.com/great-expectations/great_expectations/issues/342
        df_copy = copy.deepcopy(df)
        ge_df = ge.from_pandas(df_copy, expt_config)
        validate_result = ge_df.validate()
        check.invariant('success' in validate_result)
        check.invariant('results' in validate_result)
        return ExpectationResult(success=validate_result['success'],
                                 result_context=validate_result)
Example #11
import great_expectations as ge
import pandas as pd


def data_qual(df: pd.DataFrame):
    """
    Function for testing data in the pipeline.

    :param df:
    :return:
    """
    df = ge.from_pandas(df)

    # create the checks
    result = df.expect_column_values_to_be_in_set('Subscriber_Type',
                                                  list(df['Subscriber_Type'].unique()),
                                                  mostly=.95)

    if not result['success']:
        err = result["exception_info"]
        raise Exception(f"Unexpected data in the Subscriber_Type column\n{err}")
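Note that building the allowed set from df['Subscriber_Type'].unique() on the same frame can never fail, since every value is trivially a member of its own set of uniques; the check only catches drift once the set is pinned up front. A variant with a hypothetical fixed set of valid values:

ALLOWED_SUBSCRIBER_TYPES = ['Subscriber', 'Customer']  # hypothetical domain values

def data_qual_pinned(df: pd.DataFrame):
    gdf = ge.from_pandas(df)
    result = gdf.expect_column_values_to_be_in_set('Subscriber_Type',
                                                   ALLOWED_SUBSCRIBER_TYPES,
                                                   mostly=.95)
    if not result['success']:
        raise Exception("Unexpected data in the Subscriber_Type column")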
Example #12
import great_expectations as ge


def test_dataframe(df):
    """
    Test the resulting dataframe to ensure data integrity. Can be expanded
    if further tests are needed.
    """
    print("Performing final validation test on dataframe")
    ge_df = ge.from_pandas(df)

    # Check nulls
    columns_not_null = df.columns

    for column in columns_not_null:
        result = ge_df.expect_column_values_to_not_be_null(column)
        assert result["success"], "Columns contain null values. Please check data."

    # Ensure result is only between 0 and 1
    result = ge_df.expect_column_values_to_be_between("result",
                                                      min_value=0,
                                                      max_value=1)
    assert result["success"], "Results not between 0 and 1, please check data"

    # Make sure the gold delta does not have outliers
    gold_delta = [
        "golddiffat10",
        "golddiffat15",
        "teamgolddiffat10",
        "teamgolddiffat15",
    ]

    for column in gold_delta:
        result = ge_df.expect_column_values_to_be_between(column,
                                                          min_value=-20000,
                                                          max_value=20000)
        assert result["success"], "Gold diff values too high, please check data"

    print("Data validation passed.")
    return df
Example #13
    def test_failure_map_values_unique(self):
        dataset_ge = ge.from_pandas(TEST_DATA)
        dataset_ge.set_default_expectation_argument("result_format",
                                                    "COMPLETE")
        dataset_ge.expect_column_values_to_be_unique("column_one")

        failure_map = common.ge_results_to_failure_map(
            {"dataset": dataset_ge.validate()})

        self.assertEqual(
            {
                "dataset": {
                    common.EXPECT_VALUES_UNIQUE_KEY: {
                        common.COLUMN_NAME_KEY: "column_one",
                        common.FAILED_VALUES_KEY: ["value1"],
                    }
                }
            },
            failure_map,
        )
Example #14
def test_from_pandas_expectations_config():
    # Logic mostly copied from TestValidation.test_validate
    def load_ge_config(file):
        with open(file) as f:
            return json.load(f)

    my_expectations_config = load_ge_config(
        "./tests/test_sets/titanic_expectations.json")

    pd_df = pd.read_csv("./tests/test_sets/Titanic.csv")
    my_df = ge.from_pandas(pd_df, expectations_config=my_expectations_config)

    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    results = my_df.validate(catch_exceptions=False)

    expected_results = load_ge_config(
        "./tests/test_sets/expected_results_20180303.json")

    assertDeepAlmostEqual(results, expected_results)
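For context, a sketch of how a config file like titanic_expectations.json could be produced in the first place. The method name follows the classic API (very old releases spelled it save_expectations_config rather than save_expectation_suite), so treat this as an assumption:

explored = ge.from_pandas(pd.read_csv("./tests/test_sets/Titanic.csv"))
explored.expect_column_values_to_not_be_null("Name")
explored.save_expectation_suite("titanic_expectations.json")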
Example #15
    def _get_shape_expectations(self,
                                dataset: pd.DataFrame) -> ge.dataset.Dataset:
        """
        Validates dataset shape.

        Validations:
        - Dataset column names are a subset of mapped column names and table_schema field names
        """
        dataset_ge = ge.from_pandas(dataset, dataset_class=ShapePandasDataset)
        dataset_ge.set_default_expectation_argument("result_format",
                                                    "COMPLETE")

        valid_field_names: List[str] = table_schema.get_valid_field_names(
            self.table_schema, self.row_format)
        valid_cols: List[str] = list(
            self.column_mapping.keys()) + valid_field_names

        dataset_ge.expect_table_columns_to_be_in_set(valid_cols)
        dataset_ge.expect_named_cols()

        return dataset_ge
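expect_table_columns_to_be_in_set and expect_named_cols are not built-in expectations; they presumably live on the ShapePandasDataset passed via dataset_class. A minimal sketch of how such a table-level expectation can be declared on a PandasDataset subclass with the classic decorator API; the body is a guess, not the real ShapePandasDataset:

from great_expectations.data_asset import DataAsset
from great_expectations.dataset import PandasDataset

class ShapePandasDatasetSketch(PandasDataset):
    @DataAsset.expectation(["column_set"])
    def expect_table_columns_to_be_in_set(self, column_set):
        # Succeed when every actual column is in the allowed set.
        unexpected = sorted(set(self.columns) - set(column_set))
        return {
            "success": len(unexpected) == 0,
            "result": {"unexpected_list": unexpected},
        }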
Example #16
    def test_failure_map_values_in_set(self):
        dataset_ge = ge.from_pandas(TEST_DATA)
        dataset_ge.set_default_expectation_argument("result_format",
                                                    "COMPLETE")
        dataset_ge.expect_column_values_to_be_in_set(
            "column_two", set(["invalid_value", "row1_col2"]))

        failure_map = common.ge_results_to_failure_map(
            {"dataset": dataset_ge.validate()})

        self.assertEqual(
            {
                "dataset": {
                    common.EXPECT_VALUES_IN_SET_KEY: {
                        common.COLUMN_NAME_KEY: "column_two",
                        common.FAILED_VALUES_KEY: ["row2_col2"],
                    }
                }
            },
            failure_map,
        )
Example #17
    def test_failure_map_values(self):
        dataset_ge = ge.from_pandas(TEST_DATA,
                                    dataset_class=ShapePandasDataset)
        dataset_ge.set_default_expectation_argument("result_format",
                                                    "COMPLETE")
        dataset_ge.expect_table_columns_to_be_in_set(
            set(["column_one", "other_column"]))

        failure_map = common.ge_results_to_failure_map(
            {"dataset": dataset_ge.validate()})

        self.assertEqual(
            {
                "dataset": {
                    common.EXPECT_COLUMNS_IN_SET_KEY: {
                        common.FAILED_VALUES_KEY: ["column_two"]
                    }
                }
            },
            failure_map,
        )
Example #18
import great_expectations as ge
import numpy as np
import pandas as pd


def duplicate_and_obfuscuate(df):
    df_a = df.copy()
    df_b = df.copy()

    df_a["group"] = "a"
    df_b["group"] = "b"

    for column in df_b.columns:
        if column == "group":
            continue

        if df_b[column].dtype in ["int", "float"]:
            df_b[column] += np.max(df_b[column])
            continue

        if df_b[column].dtype == "object" and all(
            isinstance(elem, str) or elem is None for elem in df_b[column]
        ):
            df_b[column] = df_b[column].astype(str) + "__obfuscate"
            continue

    return ge.from_pandas(pd.concat([df_a, df_b], ignore_index=True, sort=False))
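Hypothetical usage of the doubled frame: once the 'group' column is added, expectations can be scoped per group via row conditions. This assumes a great_expectations version with pandas row_condition support:

ge_df = duplicate_and_obfuscuate(pd.DataFrame({"val": [1, 2, 3]}))
result = ge_df.expect_column_values_to_be_between(
    "val",
    min_value=0,
    max_value=3,
    row_condition='group=="a"',
    condition_parser="pandas",
)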
Example #19
    def __init__(self, df, meta_data):
        """
        Takes a pandas dataframe and a table metadata object and checks
        the values in the dataframe against the metadata.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("df must be a pandas dataframe object")

        self.meta_data = meta_data
        self.validate_meta_data()

        self.meta_cols = meta_data["columns"]
        # Placeholder for a proper schema check
        if not isinstance(self.meta_cols, list):
            raise TypeError("meta_cols must be a list of objects")

        # This never fails, but the resultant types are not guaranteed to be correct
        df = impose_metadata_types_on_pd_df(df, meta_data)

        self.df_ge = ge.from_pandas(df)

        self.vlog = ValidationLog(self)
Example #20
    def _get_expectations(self, column_mapping: pd.DataFrame) -> ge.dataset.Dataset:
        """
        Returns great_expectations object for a pd.DataFrame with expectations attached.

        If not all expectations have been satisfied, this function may fail early for
        readability of the failed expectation(s).

        Expectations:
        - Has columns matching exactly to COLUMN_NAMES
        - Does not have any repeated local column names
        - All supposed GII columns are valid ones
        """
        column_mapping_ge = ge.from_pandas(column_mapping)
        column_mapping_ge.set_default_expectation_argument("result_format", "COMPLETE")

        # Shape
        shape_expectation = column_mapping_ge.expect_table_columns_to_match_ordered_list(
            COLUMN_NAMES
        )

        # If the shape isn't correct then the following expectations will raise exceptions
        if not shape_expectation["success"]:
            return column_mapping_ge

        # Check that there are no repeats of an internal column name
        column_mapping_ge.expect_column_values_to_be_unique(
            INTERNAL_COLUMN_NAME_COLUMN_NAME
        )

        # Check that all Mission Impact field names are valid
        valid_field_names = table_schema.get_valid_field_names(
            self.table_schema, self.row_format
        )
        column_mapping_ge.expect_column_values_to_be_in_set(
            MI_FIELD_NAME_COLUMN_NAME, valid_field_names
        )

        return column_mapping_ge
Example #21
    def test_failure_map_column_ordered_list(self):
        dataset_ge = ge.from_pandas(TEST_DATA)
        dataset_ge.set_default_expectation_argument("result_format",
                                                    "COMPLETE")
        dataset_ge.expect_table_columns_to_match_ordered_list(
            ["column_one", "other_column"])

        failure_map = common.ge_results_to_failure_map(
            {"dataset": dataset_ge.validate()})

        self.assertEqual(
            {
                "dataset": {
                    common.EXPECT_COLUMNS_MATCH_KEY: {
                        common.EXPECTED_ORDERED_LIST_KEY: [
                            "column_one",
                            "other_column",
                        ],
                        common.FAILED_VALUES_KEY: ["column_two"],
                    }
                }
            },
            failure_map,
        )
Example #22
def _do_expectation(_info, df):
    ge_df = ge.from_pandas(df)
    ge_result = ge_callback(ge_df)
    check.invariant('success' in ge_result)
    return ExpectationResult(success=ge_result['success'],
                             result_context=ge_result)
Example #23
RVMS_Current_Budgeted_CPU['ModelName'] = RVMS_Current_Budgeted_CPU.apply(
    getModelName, axis='columns')
RVMS_Current_Budgeted_CPU['DestCode'] = RVMS_Current_Budgeted_CPU.apply(
    getDestCode, axis='columns')

# %%
RVMS_Current_Budgeted_CPU.head()

# %% [markdown]
# ### Perform data validation checks using Great Expectations library

# %% [markdown]
# #### Create Great Expectations dataframe from pandas dataframe:

# %%
ge_df = ge.from_pandas(RVMS_Current_Budgeted_CPU)

# %% [markdown]
# #### Check Model Years are between 1994 and 2099

# %%
if ge_df.expect_column_values_to_be_between(column="ModelYear",
                                            min_value='1994',
                                            max_value='2099')['success']:
    print('Passed Model Year Check')
else:
    print('FAILED Model Year Check')
    toaster = ToastNotifier()
    toaster.show_toast("### Check Status ###",
                       "FAILED Model Year Check",
                       icon_path="images/honda_logo.ico")
Example #24
def test_errors_warnings_validation_operator_run_slack_query(
        basic_data_context_config_for_validation_operator, tmp_path_factory,
        filesystem_csv_4):
    #####
    #####
    #
    # WARNING: PY2 SUPPORT IS UNTESTED BECAUSE OF DICTIONARY ORDER ISSUES NOT YET RESOLVED
    #
    #####
    #####
    if PY2:
        pytest.skip(
            "skipping test_errors_warnings_validation_operator_run_slack_query in py2"
        )

    project_path = str(tmp_path_factory.mktemp('great_expectations'))

    # NOTE: This setup is almost identical to test_DefaultDataContextAwareValidationOperator.
    # Consider converting to a single fixture.

    data_context = ConfigOnlyDataContext(
        basic_data_context_config_for_validation_operator,
        project_path,
    )

    data_context.add_datasource("my_datasource",
                                class_name="PandasDatasource",
                                base_directory=str(filesystem_csv_4))

    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")
    df = data_context.get_batch("my_datasource/default/f1",
                                "failure",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    failure_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="failure")

    data_context.create_expectation_suite(
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")
    df = data_context.get_batch("my_datasource/default/f1",
                                "warning",
                                batch_kwargs=data_context.yield_batch_kwargs(
                                    "my_datasource/default/f1"))
    df.expect_column_values_to_be_between(column="x", min_value=1, max_value=9)
    df.expect_column_values_to_not_be_null(column="y")
    warning_expectations = df.get_expectation_suite(
        discard_failed_expectations=False)
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f1",
        expectation_suite_name="warning")

    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        failure_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="failure")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f2",
        expectation_suite_name="warning")
    data_context.save_expectation_suite(
        warning_expectations,
        data_asset_name="my_datasource/default/f3",
        expectation_suite_name="warning")

    vo = WarningAndFailureExpectationSuitesValidationOperator(
        data_context=data_context,
        action_list=[],
        slack_webhook="https://hooks.slack.com/services/test/slack/webhook")

    my_df_1 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, None]})
    my_ge_df_1 = ge.from_pandas(my_df_1)
    my_ge_df_1._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f1")

    my_df_2 = pd.DataFrame({"x": [1, 2, 3, 4, 99], "y": [1, 2, 3, 4, 5]})
    my_ge_df_2 = ge.from_pandas(my_df_2)
    my_ge_df_2._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f2")

    my_df_3 = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 4, 5]})
    my_ge_df_3 = ge.from_pandas(my_df_3)
    my_ge_df_3._expectation_suite["data_asset_name"] = DataAssetIdentifier(
        "my_datasource", "default", "f3")

    return_obj = vo.run(
        assets_to_validate=[my_ge_df_1, my_ge_df_2, my_ge_df_3],
        run_id="test_100")
    slack_query = vo._build_slack_query(return_obj)
    expected_slack_query = {
        'blocks': [{
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*FailureVsWarning Validation Operator Completed.*'
            }
        }, {
            'type': 'divider'
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Status*: Failed :x:'
            }
        }, {
            'type': 'section',
            'text': {
                'type':
                'mrkdwn',
                'text':
                '*Data Asset List:* [my_datasource/default/f1, my_datasource/default/f2, my_datasource/default/f3]'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Failed Data Assets:* [my_datasource/default/f2]'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Run ID:* test_100'
            }
        }, {
            'type': 'section',
            'text': {
                'type': 'mrkdwn',
                'text': '*Timestamp:* 09/26/2019 13:42:41'
            }
        }, {
            'type': 'divider'
        }, {
            'type':
            'context',
            'elements': [{
                'type':
                'mrkdwn',
                'text':
                'Learn about FailureVsWarning Validation Operators at https://docs.greatexpectations.io/en/latest/reference/validation_operators/warning_and_failure_expectation_suites_validation_operator.html'
            }]
        }]
    }

    # We're okay with system variation in locales (OS X likes 24 hour, but not Travis)
    slack_query['blocks'][7]['text']['text'] = \
        slack_query['blocks'][7]['text']['text'].replace('09/26/2019 13:42:41', 'LOCALEDATE')
    slack_query['blocks'][7]['text']['text'] = \
        slack_query['blocks'][7]['text']['text'].replace('09/26/2019 01:42:41 PM', 'LOCALEDATE')
    expected_slack_query['blocks'][7]['text']['text'] = \
        expected_slack_query['blocks'][7]['text']['text'].replace('09/26/2019 13:42:41', 'LOCALEDATE')
    expected_slack_query['blocks'][7]['text']['text'] = \
        expected_slack_query['blocks'][7]['text']['text'].replace('09/26/2019 01:42:41 PM', 'LOCALEDATE')

    import json
    print(json.dumps(slack_query, indent=2))
    print(json.dumps(expected_slack_query, indent=2))
    assert slack_query == expected_slack_query
Example #25
import mlflow
import pandas
from pandas_profiling import ProfileReport
import great_expectations as ge
from great_expectations.profile.basic_dataset_profiler import BasicDatasetProfiler

if __name__ == "__main__":
    with mlflow.start_run(run_name="check_verify_data") as run:

        mlflow.set_tag("mlflow.runName", "check_verify_data")

        df = pandas.read_csv("./data/raw/data.csv")

        describe_to_dict = df.describe().to_dict()
        mlflow.log_dict(describe_to_dict, "describe_data.json")

        pd_df_ge = ge.from_pandas(df)

        assert pd_df_ge.expect_column_values_to_match_strftime_format(
            "Date", "%Y-%m-%d").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "High", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Low", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Open", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Close", "float").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Volume", "long").success
        assert pd_df_ge.expect_column_values_to_be_of_type(
            "Adj Close", "float").success
Example #26
def _get_batch_kwargs(df: pd.DataFrame) -> dict:
    dataset = ge.from_pandas(df)
    return {"dataset": dataset, "datasource": "pandas"}
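A hypothetical call site for the helper above; `context` is assumed to be an existing classic DataContext configured with a datasource named "pandas":

batch_kwargs = _get_batch_kwargs(pd.DataFrame({"x": [1, 2, 3]}))
batch = context.get_batch(batch_kwargs, expectation_suite_name="my_suite")
results = context.run_validation_operator(
    "action_list_operator", assets_to_validate=[batch]
)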