# Imports assumed by these snippets; module paths follow the 0.13-era
# Great Expectations test suite layout, so adjust them to your installed
# version if needed.
import os
import random
from typing import List

import boto3
import pandas as pd
import pytest
from moto import mock_s3  # moto < 5; newer moto exposes mock_aws instead

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch_spec import RuntimeDataBatchSpec, S3BatchSpec
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.execution_engine.execution_engine import MetricDomainTypes
from great_expectations.validator.validator import Validator


def test_get_batch_with_split_on_divided_integer_and_sample_on_list(test_df):
    split_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(
            batch_data=test_df,
            splitter_method="_split_on_divided_integer",
            splitter_kwargs={
                "column_name": "id",
                "divisor": 10,
                "batch_identifiers": {
                    "id": 5
                },
            },
            sampling_method="_sample_using_mod",
            sampling_kwargs={
                "column_name": "id",
                "mod": 5,
                "value": 4,
            },
        ))
    assert split_df.dataframe.shape == (2, 10)
    assert split_df.dataframe.id.min() == 54
    assert split_df.dataframe.id.max() == 59
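
# The `test_df` fixture used by several of these tests is defined elsewhere
# in the suite's conftest. A minimal, hypothetical stand-in that is merely
# consistent with the assertions here (120 rows, 10 columns, an integer "id"
# column covering 0-119, so the divisor-10 split yields ids 50-59 and the
# mod-5 sample keeps 54 and 59) could look like this; every column name
# other than "id" is an assumption.
@pytest.fixture
def test_df():
    data = {"id": list(range(120))}
    for i in range(9):
        data[f"col_{i}"] = [row * (i + 1) for row in range(120)]
    return pd.DataFrame(data)
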
def test_get_compute_domain_with_no_domain_kwargs():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type="identity")
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"

    # Trying the same test with the TABLE domain type enum - with empty
    # domain kwargs the result should be the same
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={}, domain_type=MetricDomainTypes.TABLE)
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
def test_reader_fn():
    engine = PandasExecutionEngine()

    # Testing that the engine can recognize a basic Excel file
    fn = engine._get_reader_fn(path="myfile.xlsx")
    assert "<function read_excel" in str(fn)

    # Testing that the engine can recognize a basic sas7bdat file
    fn_read_sas7bdat = engine._get_reader_fn(path="myfile.sas7bdat")
    assert "<function read_sas" in str(fn_read_sas7bdat)

    # Testing that the engine can recognize a basic SAS xpt file
    fn_read_xpt = engine._get_reader_fn(path="myfile.xpt")
    assert "<function read_sas" in str(fn_read_xpt)

    # Ensuring that passing reader_method directly works as well -
    # reader_method always takes precedence over path-based inference
    fn_new = engine._get_reader_fn(reader_method="read_csv")
    assert "<function" in str(fn_new)
def test_get_compute_domain_with_column_pair_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4],
        "b": [2, 3, 4, 5],
        "c": [1, 2, 3, 4]
    })
    expected_identity = df.drop(columns=["c"])

    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="column_pair")
    assert data.equals(df), "Data does not match after getting compute domain"
    assert compute_kwargs == {}, "Compute domain kwargs should be empty"
    assert accessor_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Accessor kwargs have been modified"

    # With the identity domain type the column pair is pushed into the
    # compute kwargs and the data is restricted to those two columns
    data, compute_kwargs, accessor_kwargs = engine.get_compute_domain(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b"
        },
        domain_type="identity")

    assert data.equals(
        expected_identity), "Data does not match after getting compute domain"
    assert compute_kwargs == {
        "column_A": "a",
        "column_B": "b",
    }, "Compute domain kwargs should contain the column pair"
    assert accessor_kwargs == {}, "Accessor kwargs have been modified"
# This bucket/key setup assumes S3 is mocked (here via moto's mock_s3
# decorator); without a mock the boto3 calls would hit real AWS.
@mock_s3
def test_get_batch_with_split_on_whole_table_s3():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket,
                          Body=test_df.to_csv(index=False).encode("utf-8"),
                          Key=key)

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"
    test_df = PandasExecutionEngine().get_batch_data(batch_spec=S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    ))
    assert test_df.dataframe.shape == (2, 2)

    # If S3 is not configured, get_batch_data should raise an ExecutionEngineError
    execution_engine_no_s3 = PandasExecutionEngine()
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        ))
def ge_validator_pandas() -> Validator:
    validator = Validator(execution_engine=PandasExecutionEngine())
    return validator
def test_sample_using_random(test_df):
    random.seed(1)
    sampled_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(batch_data=test_df,
                             sampling_method="_sample_using_random"))
    assert sampled_df.dataframe.shape == (13, 10)
def test_get_batch_with_split_on_whole_table(test_df):
    split_df = PandasExecutionEngine().get_batch_data(
        RuntimeDataBatchSpec(batch_data=test_df,
                             splitter_method="_split_on_whole_table"))
    assert split_df.dataframe.shape == (120, 10)
def test_get_batch_with_split_on_whole_table_s3(
        batch_with_split_on_whole_table_s3, test_df_small):
    df = PandasExecutionEngine().get_batch_data(
        batch_spec=batch_with_split_on_whole_table_s3)
    assert df.dataframe.shape == test_df_small.shape
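
# The `batch_with_split_on_whole_table_s3` and `test_df_small` fixtures are
# defined elsewhere in the suite; a hypothetical sketch that mirrors the
# mocked-S3 setup shown earlier (bucket name, key, and frame contents are
# assumptions, not the real fixtures):
@pytest.fixture
def test_df_small():
    return pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})


@pytest.fixture
def batch_with_split_on_whole_table_s3(test_df_small):
    with mock_s3():
        conn = boto3.resource("s3", region_name="us-east-1")
        conn.create_bucket(Bucket="test_bucket")
        client = boto3.client("s3", region_name="us-east-1")
        client.put_object(
            Bucket="test_bucket",
            Body=test_df_small.to_csv(index=False).encode("utf-8"),
            Key="path/A-100.csv",
        )
        # Keep the mock active for the duration of the test by yielding
        # inside the context manager.
        yield S3BatchSpec(
            path="s3a://test_bucket/path/A-100.csv",
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
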
def test_get_domain_records_with_multicolumn_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["a", "c"],
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "all_values_are_missing",
        })
    data = data.astype(int)

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [2, 3, 4, 5],
            "b": [3, 4, 5, 7],
            "c": [2, 3, 4, 6]
        },
        index=[1, 2, 3, 5])

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_list": ["b", "c"],
            "row_condition": "a<5",
            "condition_parser": "pandas",
            "ignore_row_if": "any_value_is_missing",
        })
    data = data.astype(int)

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [2, 3, 4, 5],
            "c": [1, 2, 3, 4]
        },
        index=[0, 1, 2, 3])

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"

    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, None, 5],
        "b": [2, 3, 4, 5, 6, 7],
        "c": [1, 2, 3, 4, None, 6],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(domain_kwargs={
        "column_list": ["b", "c"],
        "ignore_row_if": "never",
    })

    expected_multicolumn_df = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, None, 5],
            "b": [2, 3, 4, 5, 6, 7],
            "c": [1, 2, 3, 4, None, 6],
        },
        index=[0, 1, 2, 3, 4, 5],
    )

    assert data.equals(
        expected_multicolumn_df
    ), "Data does not match after getting full access compute domain"
def test_get_domain_records_with_column_pair_domain():
    engine = PandasExecutionEngine()
    df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5, 6],
        "b": [2, 3, 4, 5, None, 6],
        "c": [1, 2, 3, 4, 5, None],
    })
    # Loading batch data
    engine.load_batch_data(batch_data=df, batch_id="1234")

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "a",
            "column_B": "b",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "both_values_are_missing",
        })

    expected_column_pair_df = pd.DataFrame(
        {
            "a": [2, 3, 4, 6],
            "b": [3.0, 4.0, 5.0, 6.0],
            "c": [2.0, 3.0, 4.0, None],
        },
        index=[1, 2, 3, 5],
    )
    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "b>2",
            "condition_parser": "pandas",
            "ignore_row_if": "either_value_is_missing",
        })
    data = data.astype(int)

    expected_column_pair_df = pd.DataFrame(
        {
            "a": [2, 3, 4],
            "b": [3, 4, 5],
            "c": [2, 3, 4]
        }, index=[1, 2, 3])

    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"

    data = engine.get_domain_records(
        domain_kwargs={
            "column_A": "b",
            "column_B": "c",
            "row_condition": "a<6",
            "condition_parser": "pandas",
            "ignore_row_if": "neither",
        })

    expected_column_pair_df = pd.DataFrame({
        "a": [1, 2, 3, 4, 5],
        "b": [2.0, 3.0, 4.0, 5.0, None],
        "c": [1.0, 2.0, 3.0, 4.0, 5.0],
    })

    assert data.equals(
        expected_column_pair_df
    ), "Data does not match after getting full access compute domain"
def test_get_batch_with_gcs_misconfigured(gcs_batch_spec):
    # gcs_batch_spec points to data that the ExecutionEngine does not have access to
    execution_engine_no_gcs = PandasExecutionEngine()
    # Raises error if batch_spec causes ExecutionEngine error
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_gcs.get_batch_data(batch_spec=gcs_batch_spec)
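
# The `gcs_batch_spec` fixture is likewise defined elsewhere; a hypothetical
# sketch, assuming the installed Great Expectations version exposes
# GCSBatchSpec in great_expectations.core.batch_spec (the bucket and object
# path below are made up):
@pytest.fixture
def gcs_batch_spec():
    from great_expectations.core.batch_spec import GCSBatchSpec

    return GCSBatchSpec(
        path="gs://some-inaccessible-bucket/data/file.csv",
        reader_method="read_csv",
    )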