Example #1
def test_ge_generally(csvpath: Path):
    # a PandasDataset is a pd.DataFrame, unless GE ever stops subclassing it
    assert issubclass(PandasDataset, pd.DataFrame)

    df_ge: PandasDataset = ge.read_csv(str(csvpath))

    # check that a column is there
    res: ExpectationValidationResult = df_ge.expect_column_to_exist("opening_eco")
    assert res.success

    # check that a column value is in the set
    assert df_ge.expect_column_values_to_be_in_set("opening_eco", ["A22"]).success

    # check that things are of expected types
    expected_types = {
        "opening_eco": "object",
        "black_rating": "int64",
        "rated": "bool",
        "created_at": "float64",
    }

    for col, et in expected_types.items():
        assert df_ge.expect_column_values_to_be_of_type(col, et).success

    # there are a bunch of patzers in this dataset ;-) verify that:
    # harrygz in the test data fixture has a rating of 2100 in order to simulate a failure
    assert not df_ge.expect_column_values_to_be_between(
        column="white_rating", min_value=500, max_value=2000
    ).success

    # black's ratings are out of range too, so this expectation also fails ;-) (chess joke)
    assert not df_ge.expect_column_values_to_be_between(
        column="black_rating", min_value=500, max_value=2000
    ).success

    # Great Expectations lets you "save" the suite of expectations you called on a df,
    # and then reuse that suite against other data. let's try it.
    expectation_suite: ExpectationSuite = df_ge.get_expectation_suite()
    # get_expectation_suite() discards failed expectations by default, so the two
    # failed range checks above are dropped: 8 expectations called, 6 kept
    assert len(expectation_suite.expectations) == 6
    assert expectation_suite.expectations[0].expectation_type == "expect_column_to_exist"

    # you can save it to a JSON file on disk
    suite_save_file = str(csvpath.parent / "suite.json")
    df_ge.save_expectation_suite(filepath=suite_save_file, discard_failed_expectations=False)

    # now let's load it back in
    fresh_df: PandasDataset = ge.read_csv(str(csvpath))
    validation_report: ExpectationSuiteValidationResult = fresh_df.validate(
        expectation_suite=suite_save_file
    )
    assert not validation_report.success
    # saved with discard_failed_expectations=False, so all 8 expectations are present
    assert len(validation_report.results) == 8
    failed_result: ExpectationValidationResult = validation_report.results[7]
    assert not failed_result.success
    assert failed_result.result["partial_unexpected_list"] == [2100, 2001]
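Once the suite JSON from the round trip above is on disk, validating any freshly loaded
dataframe against it is a one-liner. A minimal sketch, assuming the same suite.json and a
CSV with the same columns (the file paths here are placeholders):

import great_expectations as ge

df = ge.read_csv("games.csv")
report = df.validate(expectation_suite="suite.json")
if not report.success:
    for r in report.results:
        if not r.success:
            print(r.expectation_config.expectation_type,
                  r.result.get("partial_unexpected_list"))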
Example #2
    def _open_file(self, file):
        if os.path.isfile(file):
            extension = self._get_file_extension(file)
            if extension == ".gz":
                return ge.read_csv(filename=file,
                                   compression="gzip",
                                   names=self.columns)
            else:
                return ge.read_csv(filename=file, names=self.columns)
        else:
            raise RuntimeError("File {0} doesn't exist".format(file))
def execute_great_expectations_test_case(title, input_file_path, expectations_config_path):
    """
    This method executes one
    ...
    :return: None. asserts correctness of results.
    """


    # Run the pipeline under test and get the run's output

    out_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'test_out.csv')
    SuperPipeline().process_file(input_file_path, out_file_path)

    # Load the output dataset into Great Expectations, along with the expectations for this output dataset
    # (must be read from the file - path passed as an argument to this method),
    # and validate the dataset.

    with open(expectations_config_path) as f:
        expectations_config = json.load(f)
    output_df = ge.read_csv(out_file_path, expectations_config=expectations_config)
    output_validation_results = output_df.validate(result_format="SUMMARY", catch_exceptions=True)['results']

    # Pass the validation results to a method that asserts the output dataset has met all
    # expectations and lists all the unmet expectations otherwise.

    process_validation_results(output_validation_results)
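process_validation_results itself is not shown in this example. A plausible sketch of such
a helper, assuming the dict-style results returned by validate() above (the body below is
hypothetical, not the project's actual code):

def process_validation_results(results):
    # collect the failed expectations and fail loudly with all of them at once
    failures = [r for r in results if not r["success"]]
    messages = [
        "{0} on column {1}".format(
            r["expectation_config"]["expectation_type"],
            r["expectation_config"]["kwargs"].get("column"),
        )
        for r in failures
    ]
    assert not failures, "Unmet expectations:\n" + "\n".join(messages)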
Example #4
def test_validate_with_invalid_result(validate_result_dict):
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    results = my_df.validate()  # catch_exceptions=True is default

    with open(
        file_relative_path(
            __file__,
            "./test_sets/titanic_expected_data_asset_validate_results_with_exceptions.json",
        )
    ) as f:
        expected_results = expectationSuiteValidationResultSchema.loads(f.read())

    del results.meta["great_expectations_version"]
    del results.meta["expectation_suite_meta"]["great_expectations_version"]

    for result in results.results:
        result.exception_info.pop("exception_traceback")

    assert results.to_json_dict() == expected_results.to_json_dict()
    def __init__(self, *args, **kwargs):
        super(TestUtilMethods, self).__init__(*args, **kwargs)
        self.D = ge.read_csv(
            './tests/test_sets/distributional_expectations_data_base.csv')

        with open('./tests/test_sets/test_partitions.json', 'r') as file:
            self.test_partitions = json.loads(file.read())
Example #6
def test_execute_expectation_suite_failure():
    df = ge.read_csv(file_relative_path(__file__, './num_bad_data.csv'))
    validation = df.validate(
        expectation_suite=file_relative_path(__file__, 'num_expectations.json')
    )

    ers = expectation_result_list_from_validation(validation)

    assert ers[0].success is True
    assert ers[1].success is True
    assert ers[2].success is False

    assert ers[2].metadata_entries[0].entry_data.data == {
        'success': False,
        'result': {
            'observed_value': -2.0,
            'element_count': 2,
            'missing_count': 0,
            'missing_percent': 0.0,
        },
        'exception_info': {
            'raised_exception': False,
            'exception_message': None,
            'exception_traceback': None,
        },
        'expectation_config': {
            'expectation_type': 'expect_column_mean_to_be_between',
            'kwargs': {'column': 'num1', 'min_value': 0, 'max_value': 10},
        },
    }
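expectation_result_list_from_validation is the project's own adapter rather than a Great
Expectations API. A rough sketch of what it might look like on top of dagster's legacy
ExpectationResult / EventMetadataEntry types (an assumption about the versions in use):

from dagster import EventMetadataEntry, ExpectationResult

def expectation_result_list_from_validation(validation):
    # hypothetical adapter: wrap each GE expectation result as a dagster
    # ExpectationResult, attaching the raw EVR dict as JSON metadata
    return [
        ExpectationResult(
            success=result.success,
            label=result.expectation_config.expectation_type,
            metadata_entries=[
                EventMetadataEntry.json(data=result.to_json_dict(), label="evr")
            ],
        )
        for result in validation.results
    ]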
Example #7
    def test_infer_distribution_parameters(self):
        D = ge.read_csv(
            './tests/test_sets/fixed_distributional_test_dataset.csv')

        with self.assertRaises(TypeError):
            ge.dataset.util.infer_distribution_parameters(
                data=D.norm,
                distribution='norm',
                params=['wrong_param_format'])
        t = ge.dataset.util.infer_distribution_parameters(data=D.norm_std,
                                                          distribution='norm',
                                                          params=None)
        self.assertEqual(t['mean'], D.norm_std.mean())
        self.assertEqual(t['std_dev'], D.norm_std.std())
        self.assertEqual(t['loc'], 0)
        self.assertEqual(t['scale'], 1)

        # beta
        t = ge.dataset.util.infer_distribution_parameters(data=D.beta,
                                                          distribution='beta')
        self.assertEqual(t['alpha'], (t['mean']**2) *
                         (((1 - t['mean']) / t['std_dev']**2) -
                          (1 / t['mean'])), "beta dist, alpha infer")
        self.assertEqual(t['beta'], t['alpha'] * ((1 / t['mean']) - 1),
                         "beta dist, beta infer")

        # gamma
        t = ge.dataset.util.infer_distribution_parameters(data=D.gamma,
                                                          distribution='gamma')
        self.assertEqual(t['alpha'], D.gamma.mean())

        # uniform distributions
        t = ge.dataset.util.infer_distribution_parameters(
            data=D.uniform, distribution='uniform')
        self.assertEqual(t['min'], min(D.uniform), "uniform, min infer")
        self.assertEqual(t['max'],
                         max(D.uniform) - min(D.uniform), "uniform, max infer")

        uni_loc = 5
        uni_scale = 10
        t = ge.dataset.util.infer_distribution_parameters(
            data=D.uniform,
            distribution='uniform',
            params={
                'loc': uni_loc,
                'scale': uni_scale
            })
        self.assertEqual(t['min'], uni_loc, "uniform, min infer")
        self.assertEqual(t['max'], uni_scale, "uniform, max infer")

        # unsupported distribution names raise
        with self.assertRaises(AttributeError):
            ge.dataset.util.infer_distribution_parameters(
                data=D.norm, distribution='fakedistribution')

        # chi2
        t = ge.dataset.util.infer_distribution_parameters(data=D.chi2,
                                                          distribution='chi2')
        self.assertEqual(t['df'], D.chi2.mean())
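The beta assertions above are method-of-moments estimates written inline: alpha =
mean^2 * ((1 - mean) / std^2 - 1 / mean) and beta = alpha * (1 / mean - 1). A quick
standalone sanity check of those formulas (numpy only, not part of the test suite),
assuming samples drawn from a true Beta(2, 5):

import numpy as np

rng = np.random.default_rng(0)
sample = rng.beta(2.0, 5.0, size=100_000)

mean, std = sample.mean(), sample.std()
alpha = mean ** 2 * ((1 - mean) / std ** 2 - 1 / mean)
beta = alpha * (1 / mean - 1)
print(alpha, beta)  # should land close to 2 and 5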
Example #8
def titanic_validator(titanic_data_context_modular_api):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))

    return get_pandas_runtime_validator(titanic_data_context_modular_api, df)
Example #9
    def test_validate(self):

        with open("./tests/test_sets/titanic_expectations.json") as f:
            my_expectations_config = json.load(f)

        my_df = ge.read_csv(
            "./tests/test_sets/Titanic.csv",
            expectations_config=my_expectations_config
        )
        my_df.set_default_expectation_argument("result_format", "COMPLETE")

        results = my_df.validate(catch_exceptions=False)
        # print json.dumps(results, indent=2)

        with open('./tests/test_sets/expected_results_20180303.json') as f:
            expected_results = json.load(f)
            #print json.dumps(expected_results, indent=2)

        self.maxDiff = None
        assertDeepAlmostEqual(self,
                              results,
                              expected_results
                              )

        # Now, change the results and ensure they are no longer equal
        results[0] = {}
        self.assertNotEqual(results,
                            expected_results
                            )

        # Finally, confirm that only_return_failures works
        # and does not affect the "statistics" field.
        validation_results = my_df.validate(only_return_failures=True)
        #print json.dumps(validation_results)
        assertDeepAlmostEqual(
            self,
            validation_results,
            {"results": [
                {"expectation_config": {
                     "expectation_type": "expect_column_values_to_be_in_set",
                     "kwargs": {"column": "PClass", "values_set": ["1st", "2nd", "3rd"], "result_format": "COMPLETE"}
                 },
                 "success": False,
                 "exception_info": {"exception_message": None,
                                    "exception_traceback": None,
                                    "raised_exception": False},
                 "result": {"partial_unexpected_index_list": [456], "unexpected_count": 1, "unexpected_list": ["*"],
                                "unexpected_percent": 0.0007616146230007616, "element_count": 1313,
                                "missing_percent": 0.0, "partial_unexpected_counts": [{"count": 1, "value": "*"}],
                                "partial_unexpected_list": ["*"],
                                "unexpected_percent_nonmissing": 0.0007616146230007616, "missing_count": 0,
                                "unexpected_index_list": [456]}}
            ],
            "success": expected_results["success"],  # unaffected
            "statistics": expected_results["statistics"],  # unaffected
            }
        )
Example #10
def test_snapshot_BasicDatasetProfiler_on_titanic():
    """
    A snapshot regression test for BasicDatasetProfiler.
    We are running the profiler on the Titanic dataset
    and comparing the EVRs to ones retrieved from a
    previously stored file.
    """
    df = ge.read_csv(file_relative_path(__file__, "../test_sets/Titanic.csv"))
    suite, evrs = df.profile(BasicDatasetProfiler)

    # Check to make sure BasicDatasetProfiler is adding meta.columns with a single "description" field for each column
    assert "columns" in suite.meta
    for k, v in suite.meta["columns"].items():
        assert v == {"description": ""}

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")

    # THIS IS NOT DEAD CODE. UNCOMMENT TO SAVE A SNAPSHOT WHEN UPDATING THIS TEST
    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     json.dump(expectationSuiteValidationResultSchema.dump(evrs).data, file, indent=2)
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     json.dump(expectationSuiteValidationResultSchema.dump(evrs).data, file, indent=2)

    with open(
            file_relative_path(
                __file__,
                "../test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json"
            ),
            "r",
    ) as file:
        expected_evrs = expectationSuiteValidationResultSchema.load(
            json.load(file, object_pairs_hook=OrderedDict)).data

    # We know that python 2 does not guarantee the order of value_counts, which causes a different
    # order for items in the partial_unexpected_value_counts list
    # Remove those before assertions.
    for result in evrs.results:
        if "partial_unexpected_counts" in result.result:
            result.result.pop("partial_unexpected_counts")

    for result in expected_evrs.results:
        if "partial_unexpected_counts" in result.result:
            result.result.pop("partial_unexpected_counts")

    # Version and RUN-ID will be different
    del expected_evrs.meta["great_expectations.__version__"]
    del evrs.meta["great_expectations.__version__"]
    del expected_evrs.meta["run_id"]
    del evrs.meta["run_id"]
    del evrs.meta["batch_kwargs"]["ge_batch_id"]

    # DISABLE TEST IN PY2 BECAUSE OF ORDER ISSUE AND NEAR-EOL
    if not PY2:
        assert expected_evrs == evrs
def taxi_validator_sqlalchemy(titanic_data_context_modular_api):
    """
    What does this test do and why?
    Ensures that all available expectation types work as expected
    """
    df = ge.read_csv(
        file_relative_path(__file__, "../test_sets/yellow_tripdata_sample_2019-01.csv"),
        parse_dates=["pickup_datetime", "dropoff_datetime"],
    )
    return get_sqlalchemy_runtime_validator_postgresql(df)
Example #12
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.D = ge.read_csv(
            file_relative_path(
                __file__,
                "../test_sets/distributional_expectations_data_base.csv"))

        with open(
                file_relative_path(
                    __file__, "../test_sets/test_partitions.json")) as file:
            self.test_partitions = json.loads(file.read())
    def test_custom_class(self):
        script_path = os.path.dirname(os.path.realpath(__file__))
        df = ge.read_csv(
            script_path+'/test_sets/Titanic.csv',
            dataset_class=CustomPandasDataset
        )
        df.set_default_expectation_argument("result_format", "COMPLETE")
        self.assertEqual(
            df.expect_column_values_to_be_prime(
                'Age')['result']['unexpected_list'],
            [30.0, 25.0, 0.92000000000000004, 63.0, 39.0, 58.0, 50.0, 24.0, 36.0, 26.0, 25.0, 25.0, 28.0, 45.0, 39.0,
             30.0, 58.0, 45.0, 22.0, 48.0, 44.0, 60.0, 45.0, 58.0, 36.0, 33.0, 36.0, 36.0, 14.0, 49.0, 36.0, 46.0, 27.0,
             27.0, 26.0, 64.0, 39.0, 55.0, 70.0, 69.0, 36.0, 39.0, 38.0, 27.0, 27.0, 4.0, 27.0, 50.0, 48.0, 49.0, 48.0,
             39.0, 36.0, 30.0, 24.0, 28.0, 64.0, 60.0, 49.0, 44.0, 22.0, 60.0, 48.0, 35.0, 22.0, 45.0, 49.0, 54.0, 38.0,
             58.0, 45.0, 46.0, 25.0, 21.0, 48.0, 49.0, 45.0, 36.0, 55.0, 52.0, 24.0, 16.0, 44.0, 51.0, 42.0, 35.0, 35.0,
             38.0, 35.0, 50.0, 49.0, 46.0, 58.0, 42.0, 40.0, 42.0, 55.0, 50.0, 16.0, 21.0, 30.0, 15.0, 30.0, 46.0, 54.0,
             36.0, 28.0, 65.0, 33.0, 44.0, 55.0, 36.0, 58.0, 64.0, 64.0, 22.0, 28.0, 22.0, 18.0, 52.0, 46.0, 56.0, 33.0,
             27.0, 55.0, 54.0, 48.0, 18.0, 21.0, 34.0, 40.0, 36.0, 50.0, 39.0, 56.0, 28.0, 56.0, 56.0, 24.0, 18.0, 24.0,
             45.0, 40.0, 6.0, 57.0, 32.0, 62.0, 54.0, 52.0, 62.0, 63.0, 46.0, 52.0, 39.0, 18.0, 48.0, 49.0, 39.0, 46.0,
             64.0, 60.0, 60.0, 55.0, 54.0, 21.0, 57.0, 45.0, 50.0, 50.0, 27.0, 20.0, 51.0, 21.0, 36.0, 40.0, 32.0, 33.0,
             30.0, 28.0, 18.0, 34.0, 32.0, 57.0, 18.0, 36.0, 28.0, 51.0, 32.0, 28.0, 36.0, 4.0, 1.0, 12.0, 34.0, 26.0,
             27.0, 15.0, 45.0, 40.0, 20.0, 25.0, 36.0, 25.0, 42.0, 26.0, 26.0, 0.82999999999999996, 54.0, 44.0, 52.0,
             30.0, 30.0, 27.0, 24.0, 35.0, 8.0, 22.0, 30.0, 20.0, 21.0, 49.0, 8.0, 28.0, 18.0, 28.0, 22.0, 25.0, 18.0,
             32.0, 18.0, 42.0, 34.0, 8.0, 21.0, 38.0, 38.0, 35.0, 35.0, 38.0, 24.0, 16.0, 26.0, 45.0, 24.0, 21.0, 22.0,
             34.0, 30.0, 50.0, 30.0, 1.0, 44.0, 28.0, 6.0, 30.0, 45.0, 24.0, 24.0, 49.0, 48.0, 34.0, 32.0, 21.0, 18.0,
             21.0, 52.0, 42.0, 36.0, 21.0, 33.0, 34.0, 22.0, 45.0, 30.0, 26.0, 34.0, 26.0, 22.0, 1.0, 25.0, 48.0, 57.0,
             27.0, 30.0, 20.0, 45.0, 46.0, 30.0, 48.0, 54.0, 64.0, 32.0, 18.0, 32.0, 26.0, 20.0, 39.0, 22.0, 24.0, 28.0,
             50.0, 20.0, 40.0, 42.0, 21.0, 32.0, 34.0, 33.0, 8.0, 36.0, 34.0, 30.0, 28.0, 0.80000000000000004, 25.0,
             50.0, 21.0, 25.0, 18.0, 20.0, 30.0, 30.0, 35.0, 22.0, 25.0, 25.0, 14.0, 50.0, 22.0, 27.0, 27.0, 30.0, 22.0,
             35.0, 30.0, 28.0, 12.0, 40.0, 36.0, 28.0, 32.0, 4.0, 36.0, 33.0, 32.0, 26.0, 30.0, 24.0, 18.0, 42.0, 16.0,
             35.0, 16.0, 25.0, 18.0, 20.0, 30.0, 26.0, 40.0, 24.0, 18.0, 0.82999999999999996, 20.0, 25.0, 35.0, 32.0,
             20.0, 39.0, 39.0, 6.0, 38.0, 9.0, 26.0, 4.0, 20.0, 26.0, 25.0, 18.0, 24.0, 35.0, 40.0, 38.0, 9.0, 45.0,
             27.0, 20.0, 32.0, 33.0, 18.0, 40.0, 26.0, 15.0, 45.0, 18.0, 27.0, 22.0, 26.0, 22.0, 20.0, 32.0, 21.0, 18.0,
             26.0, 6.0, 9.0, 40.0, 32.0, 26.0, 18.0, 20.0, 22.0, 22.0, 35.0, 21.0, 20.0, 18.0, 18.0, 38.0, 30.0, 21.0,
             21.0, 21.0, 24.0, 33.0, 33.0, 28.0, 16.0, 28.0, 24.0, 21.0, 32.0, 26.0, 18.0, 20.0, 24.0, 24.0, 36.0, 30.0,
             22.0, 35.0, 27.0, 30.0, 36.0, 9.0, 44.0, 45.0, 22.0, 30.0, 34.0, 28.0, 0.33000000000000002, 27.0, 25.0,
             24.0, 22.0, 21.0, 26.0, 33.0, 1.0, 0.17000000000000001, 25.0, 36.0, 36.0, 30.0, 26.0, 65.0, 42.0, 32.0,
             30.0, 24.0, 24.0, 24.0, 22.0, 18.0, 16.0, 45.0, 21.0, 18.0, 9.0, 48.0, 16.0, 25.0, 38.0, 22.0, 16.0, 33.0,
             9.0, 38.0, 40.0, 14.0, 16.0, 9.0, 10.0, 6.0, 40.0, 32.0, 20.0, 28.0, 24.0, 28.0, 24.0, 20.0, 45.0, 26.0,
             21.0, 27.0, 18.0, 26.0, 22.0, 28.0, 22.0, 27.0, 42.0, 27.0, 25.0, 27.0, 20.0, 48.0, 34.0, 22.0, 33.0, 32.0,
             26.0, 49.0, 1.0, 33.0, 4.0, 24.0, 32.0, 27.0, 21.0, 32.0, 20.0, 21.0, 30.0, 21.0, 22.0, 4.0, 39.0, 20.0,
             21.0, 44.0, 42.0, 21.0, 24.0, 25.0, 22.0, 22.0, 39.0, 26.0, 4.0, 22.0, 26.0, 1.5, 36.0, 18.0, 25.0, 22.0,
             20.0, 26.0, 22.0, 32.0, 21.0, 21.0, 36.0, 39.0, 25.0, 45.0, 36.0, 30.0, 20.0, 21.0, 1.5, 25.0, 18.0, 63.0,
             18.0, 15.0, 28.0, 36.0, 28.0, 10.0, 36.0, 30.0, 22.0, 14.0, 22.0, 51.0, 18.0, 45.0, 28.0, 21.0, 27.0, 36.0,
             27.0, 15.0, 27.0, 26.0, 22.0, 24.0]
        )

        primes = [3, 5, 7, 11, 13, 17, 23, 31]
        df["primes"] = df.Age.map(lambda x: random.choice(primes))
        self.assertEqual(
            df.expect_column_values_to_be_prime(
                "primes")['result']['unexpected_list'],
            []
        )
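The CustomPandasDataset class with expect_column_values_to_be_prime is not shown here. In
legacy Great Expectations the usual pattern is to subclass PandasDataset and register a
column-map expectation via the MetaPandasDataset decorator; a hedged reconstruction (the
primality helper is an assumption, not the original code):

from great_expectations.dataset import MetaPandasDataset, PandasDataset

class CustomPandasDataset(PandasDataset):
    @MetaPandasDataset.column_map_expectation
    def expect_column_values_to_be_prime(self, column):
        def is_prime(n):
            # non-integers and anything below 2 cannot be prime
            if n != int(n) or n < 2:
                return False
            return all(n % d for d in range(2, int(n ** 0.5) + 1))

        return column.map(is_prime)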
Example #14
    def test_validate(self):

        with open("./tests/test_sets/titanic_expectations.json") as f:
            my_expectation_suite = json.load(f)

        my_df = ge.read_csv("./tests/test_sets/Titanic.csv",
                            profiler=ge.profile.ColumnsExistProfiler)

        self.assertEqual(len(my_df.get_expectation_suite()['expectations']), 7)

        # For column_expectations, _append_expectation should only replace expectations where the expectation_type AND the column match
        my_df.expect_column_to_exist("PClass")
        self.assertEqual(len(my_df.get_expectation_suite()['expectations']), 7)
Example #15
def test_full_oobe_flow():
    df = ge.read_csv("examples/data/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate()  # ["results"]

    rendered_json = ProfilingResultsPageRenderer().render(evrs)
    rendered_page = DefaultJinjaPageView().render(rendered_json)

    with open('./tests/render/output/test_full_oobe_flow.html', 'wb') as f:
        f.write(rendered_page.encode("utf-8"))

    assert rendered_page[:15] == "<!DOCTYPE html>"
    assert rendered_page[-7:] == "</html>"
Example #16
    def test_validate(self):

        with open("./tests/test_sets/titanic_expectations.json") as f:
            my_expectations_config = json.load(f)

        my_df = ge.read_csv("./tests/test_sets/Titanic.csv",
                            expectations_config=my_expectations_config)
        my_df.set_default_expectation_argument("output_format", "COMPLETE")

        results = my_df.validate(catch_exceptions=False)
        # print json.dumps(results, indent=2)

        with open('./tests/test_sets/expected_results_20170721.json') as f:
            expected_results = json.load(f)
            # print json.dumps(expected_results, indent=2)

        self.maxDiff = None
        #!!! This needs to be converted to unicode, I think

        # print json.dumps(results, indent=2)
        # print '-'*80
        # print json.dumps(expected_results, indent=2)
        # self.assertEqual(
        #     json.loads(json.dumps(results)),
        #     json.loads(json.dumps(expected_results))
        # )
        assertDeepAlmostEqual(self, results, expected_results)

        #Now, change the results and ensure they are no longer equal
        results[0] = {}
        self.assertNotEqual(results, expected_results)

        validation_results = my_df.validate(only_return_failures=True)
        # print json.dumps(validation_results, indent=2)
        assertDeepAlmostEqual(
            self, validation_results, {
                "results": [{
                    "exception_traceback": None,
                    "expectation_type": "expect_column_values_to_be_in_set",
                    "success": False,
                    "exception_list": ["*"],
                    "raised_exception": False,
                    "kwargs": {
                        "column": "PClass",
                        "output_format": "COMPLETE",
                        "values_set": ["1st", "2nd", "3rd"]
                    },
                    "exception_index_list": [456]
                }]
            })
Example #17
def test_display_column_evrs_as_section():
    #TODO: We should add a fixture that contains EVRs
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    html_to_display = jux.display_column_evrs_as_section(
        evrs, "Name", include_styling=False, return_without_displaying=True)
    print(html_to_display)

    #FIXME: This isn't a full snapshot test.
    assert '<div id="section-1" class="ge-section container-fluid">' in html_to_display
    assert '<span class="badge badge-info" >Carlsson, Mr Frans Olof</span>' in html_to_display
    assert '<li class="list-group-item d-flex justify-content-between align-items-center" >expect_column_values_to_be_in_type_list <span class="badge badge-secondary badge-pill" >True</span></li>' in html_to_display
Example #18
    def get_ge_df(self, dataset_name, bucket_name=None, **kwargs):
        if not self.check_for_key(dataset_name, bucket_name):
            raise AirflowException("The source key {0} does not exist in bucket {1}".format(dataset_name, bucket_name))

        s3_key_object = self.get_key(dataset_name, bucket_name)
        with NamedTemporaryFile("w") as temp_file:
            self.log.info("Temp dumping S3 file {0} contents to local {1} file".format(dataset_name, temp_file.name))
            s3_key_object.download_file(temp_file.name)
            temp_file.flush()

            return ge.read_csv(temp_file.name, **kwargs)
Example #19
def test_full_oobe_flow():
    df = ge.read_csv(file_relative_path(__file__, "../../examples/data/Titanic.csv"))
    df.data_asset_name = "my_datasource/my_generator/my_asset"
    df.profile(BasicDatasetProfiler)
    evrs = df.validate()  # results

    rendered_content = ProfilingResultsPageRenderer().render(evrs)
    rendered_page = DefaultJinjaPageView().render(rendered_content)

    with open(file_relative_path(__file__, './output/test_full_oobe_flow.html'), 'wb') as f:
        f.write(rendered_page.encode("utf-8"))

    assert rendered_page[:15] == "<!DOCTYPE html>"
    assert rendered_page[-7:] == "</html>"
Example #20
def test_validate_with_invalid_result_catch_exceptions_false(validate_result_dict):

    with open(file_relative_path(__file__, "./test_sets/titanic_expectations.json")) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with pytest.raises(InvalidCacheValueError):
        my_df.validate(catch_exceptions=False)
Example #21
def test_BasicDatasetProfiler_on_titanic():
    """
    A snapshot test for BasicDatasetProfiler.
    We are running the profiler on the Titanic dataset
    and comparing the EVRs to ones retrieved from a
    previously stored file.
    """
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    suite, evrs = df.profile(BasicDatasetProfiler)

    # Check to make sure BasicDatasetProfiler is adding meta.columns with a single "description" field for each column
    print(json.dumps(suite["meta"], indent=2))
    assert "columns" in suite["meta"]
    for k, v in suite["meta"]["columns"].items():
        assert v == {"description": ""}

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     file.write(json.dumps(evrs))
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     file.write(json.dumps(evrs))

    with open(
            'tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json',
            'r') as file:
        expected_evrs = json.load(file, object_pairs_hook=OrderedDict)

    expected_evrs.pop("meta")
    evrs.pop("meta")

    # We know that python 2 does not guarantee the order of value_counts, which causes a different
    # order for items in the partial_unexpected_value_counts list
    # Remove those before test.
    for result in evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    for result in expected_evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    # DISABLE TEST IN PY2 BECAUSE OF ORDER ISSUE AND NEAR-EOL
    if not PY2:
        assertDeepAlmostEqual(expected_evrs, evrs)
    def _get_dataframe(self):
        """
        Load dataframe based on specified connection
        :return:
        """
        if self.source_conn is None:  # Use local file
            return ge.read_csv(self.dataset_name, **self.dataset_params)

        if isinstance(self.source_conn, S3Hook):
            hook = ExpectationS3CsvHook(aws_conn_id=self.source_conn_id)

            return hook.get_ge_df(self.dataset_name, self.source_bucket_name, **self.dataset_params)

        if isinstance(self.source_conn, DbApiHook):
            hook = ExpectationMySQLHook(mysql_conn_id=self.source_conn_id)

            return hook.get_ge_df(self.dataset_name, **self.dataset_params)
Example #23
def validate(parsed_args):
    """
    Read a dataset file and validate it using a config saved in another file. Uses parameters defined in the dispatch
    method.

    :param parsed_args: A Namespace object containing parsed arguments from the dispatch method.
    :return: The number of unsuccessful expectations
    """
    parsed_args = vars(parsed_args)
    data_set = parsed_args['dataset']
    expectations_config_file = parsed_args['expectations_config_file']

    with open(expectations_config_file) as f:
        expectations_config = json.load(f)

    if parsed_args["evaluation_parameters"] is not None:
        with open(parsed_args["evaluation_parameters"]) as f:
            evaluation_parameters = json.load(f)
    else:
        evaluation_parameters = None

    if parsed_args["custom_dataset_module"]:
        sys.path.insert(0,
                        os.path.dirname(parsed_args["custom_dataset_module"]))
        module_name = os.path.basename(
            parsed_args["custom_dataset_module"]).split('.')[0]
        custom_module = __import__(module_name)
        dataset_class = getattr(custom_module,
                                parsed_args["custom_dataset_class"])

    else:
        dataset_class = PandasDataset

    df = read_csv(data_set,
                  expectations_config=expectations_config,
                  dataset_class=dataset_class)

    result = df.validate(
        evaluation_parameters=evaluation_parameters,
        result_format=parsed_args["result_format"],
        catch_exceptions=parsed_args["catch_exceptions"],
        only_return_failures=parsed_args["only_return_failures"],
    )

    print(json.dumps(result, indent=2))
    return result['statistics']['unsuccessful_expectations']
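The dispatch method referenced in the docstring is not shown. A plausible argparse setup
matching the keys that validate() reads (flag names and defaults here are assumptions):

import argparse

def dispatch(argv=None):
    parser = argparse.ArgumentParser(
        description="Validate a dataset against a saved expectations config")
    parser.add_argument("dataset")
    parser.add_argument("expectations_config_file")
    parser.add_argument("--evaluation_parameters", default=None)
    parser.add_argument("--result_format", default="BASIC")
    parser.add_argument("--catch_exceptions", action="store_true")
    parser.add_argument("--only_return_failures", action="store_true")
    parser.add_argument("--custom_dataset_module", default=None)
    parser.add_argument("--custom_dataset_class", default=None)
    return validate(parser.parse_args(argv))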
    def test_validate(self):

        with open("./tests/test_sets/titanic_expectations.json") as f:
            my_expectations_config = json.load(f)

        my_df = ge.read_csv("./tests/test_sets/Titanic.csv",
                            autoinspect_func=columns_exist)

        self.assertEqual(
            len(my_df.get_expectations_config()['expectations']),
            7
        )

        # For column_expectations, _append_expectation should only replace expectations where the expectation_type AND the column match
        my_df.expect_column_to_exist("PClass")
        self.assertEqual(
            len(my_df.get_expectations_config()['expectations']),
            7
        )
Example #25
def test_dummy_ge():
    df = ge.read_csv(file_relative_path(__file__, './num.csv'))
    ge_evr = df.expect_column_mean_to_be_between('num1', 0, 10)

    er = create_expectation_result('expect_column_mean_to_be_between', ge_evr)

    assert er.label == 'expect_column_mean_to_be_between'
    assert er.success
    assert len(er.metadata_entries) == 1
    assert er.metadata_entries[0].label == 'evr'
    assert er.metadata_entries[0].entry_data.data == {
        'success': True,
        'result': {
            'observed_value': 2.0,
            'element_count': 2,
            'missing_count': 0,
            'missing_percent': 0.0,
        },
    }
Example #26
def test_display_column_evrs_as_section(empty_data_context):
    #TODO: We should add a fixture that contains EVRs
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    df.profile(BasicDatasetProfiler)
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    html_to_display = jux.display_column_evrs_as_section(
        evrs, "Name", include_styling=False, return_without_displaying=True)
    print(html_to_display)

    #FIXME: This isn't a full snapshot test.
    assert '<div id="section-1" class="ge-section container-fluid">' in html_to_display
    assert '<span class="badge badge-info" style="word-break:break-all;" >Carlsson, Mr Frans Olof</span>' in html_to_display
    assert """\
    <span class="cooltip" >
                    Type: None
                    <span class=top>
                        expect_column_values_to_be_of_type <br>expect_column_values_to_be_in_type_list
                    </span>
                </span>""" in html_to_display
def test_validate_with_invalid_result_catch_exceptions_false(empty_data_context):
    context: DataContext = empty_data_context
    with open(
        file_relative_path(__file__, "./test_sets/titanic_expectations.json")
    ) as f:
        my_expectation_suite_dict: dict = expectationSuiteSchema.loads(f.read())
        my_expectation_suite: ExpectationSuite = ExpectationSuite(
            **my_expectation_suite_dict, data_context=context
        )

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with pytest.raises(InvalidCacheValueError):
        with pytest.warns(Warning, match=r"No great_expectations version found"):
            my_df.validate(catch_exceptions=False)
def test_BasicDatasetProfiler_on_titanic():
    """
    A snapshot test for BasicDatasetProfiler.
    We are running the profiler on the Titanic dataset
    and comparing the EVRs to ones retrieved from a
    previously stored file.
    """
    df = ge.read_csv("./tests/test_sets/Titanic.csv")
    suite, evrs = df.profile(BasicDatasetProfiler)

    # Note: the above already produces an EVR; rerunning isn't strictly necessary just for EVRs
    evrs = df.validate(result_format="SUMMARY")  # ["results"]

    # with open('tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json', 'w+') as file:
    #     file.write(json.dumps(evrs))
    #
    # with open('tests/render/fixtures/BasicDatasetProfiler_evrs.json', 'w+') as file:
    #     file.write(json.dumps(evrs))

    with open(
            'tests/test_sets/expected_evrs_BasicDatasetProfiler_on_titanic.json',
            'r') as file:
        expected_evrs = json.load(file, object_pairs_hook=OrderedDict)

    expected_evrs.pop("meta")
    evrs.pop("meta")

    # We know that python 2 does not guarantee the order of value_counts, which causes a different
    # order for items in the partial_unexpected_value_counts list
    # Remove those before test.
    for result in evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    for result in expected_evrs["results"]:
        if "partial_unexpected_counts" in result["result"]:
            result["result"].pop("partial_unexpected_counts")

    assertDeepAlmostEqual(expected_evrs, evrs)
Example #29
def validate_csv_using_greatexpectations(
        csv_path: InputPath(),
        expectation_suite_path: InputPath(),
        data_doc_path: OutputPath(),
):
    """Validate a CSV dataset against a Great Expectations suite and create
    Data Doc (a validation report). This component fails if validation is not
    successful.

    Annotations:
        authors: Yaroslav Beshta <*****@*****.**>, Anton Kiselev <*****@*****.**>

    Args:
        csv_path: Path to the CSV file with the dataset.
        expectation_suite_path: Path to the Great Expectations expectation suite (in JSON format).
        data_doc_path: Output path for the rendered Data Doc (HTML validation report).
    """
    import json
    import os
    import sys

    import great_expectations as ge
    from great_expectations.render import DefaultJinjaPageView
    from great_expectations.render.renderer import ValidationResultsPageRenderer

    with open(expectation_suite_path, 'r') as json_file:
        expectation_suite = json.load(json_file)
    df = ge.read_csv(csv_path, expectation_suite=expectation_suite)
    result = df.validate()

    document_model = ValidationResultsPageRenderer().render(result)
    os.makedirs(os.path.dirname(data_doc_path), exist_ok=True)
    with open(data_doc_path, 'w') as writer:
        writer.write(DefaultJinjaPageView().render(document_model))

    print(f'Saved: {data_doc_path}')

    if not result.success:
        sys.exit(1)
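The InputPath/OutputPath annotations come from the Kubeflow Pipelines SDK. To use the
function as a pipeline component you would typically wrap it with
create_component_from_func; a minimal sketch (the image and package pins are assumptions):

from kfp.components import create_component_from_func

validate_csv_op = create_component_from_func(
    validate_csv_using_greatexpectations,
    base_image="python:3.8",
    packages_to_install=["great_expectations"],
)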
Example #30
def validate(parsed_args):
    parsed_args = vars(parsed_args)
    data_set = parsed_args['dataset']
    expectations_config_file = parsed_args['expectations_config_file']

    with open(expectations_config_file) as f:
        expectations_config = json.load(f)

    if parsed_args["evaluation_parameters"] is not None:
        with open(parsed_args["evaluation_parameters"]) as f:
            evaluation_parameters = json.load(f)
    else:
        evaluation_parameters = None

    if parsed_args["custom_dataset_module"]:
        sys.path.insert(0,
                        os.path.dirname(parsed_args["custom_dataset_module"]))
        module_name = os.path.basename(
            parsed_args["custom_dataset_module"]).split('.')[0]
        custom_module = __import__(module_name)
        dataset_class = getattr(custom_module,
                                parsed_args["custom_dataset_class"])

    else:
        dataset_class = PandasDataset

    df = read_csv(data_set,
                  expectations_config=expectations_config,
                  dataset_class=dataset_class)

    result = df.validate(
        evaluation_parameters=evaluation_parameters,
        result_format=parsed_args["result_format"],
        catch_exceptions=parsed_args["catch_exceptions"],
        only_return_failures=parsed_args["only_return_failures"],
    )

    print(json.dumps(result, indent=2))