def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })

    print('running dp1')
    profile1 = dp.Profiler(data, profiler_options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })

    print('running dp2')
    profile2 = dp.Profiler(data, profiler_options=profile_options)
    profile1.update_profile(data)
def test_warning_tf(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "datetime.is_enabled": False
    })

    profile = dp.StructuredProfiler(data, options=profile_options)
    results = profile.report()

    columns = []
    predictions = []
    for i in range(len(results['data_stats'])):
        columns.append(i)
        predictions.append(results['data_stats'][i]['data_label'])
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, "data")
    path = os.path.join(test_dir, "csv/diamonds.csv")

    for i in range(3):
        print("running dp =============================", i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "datetime.is_enabled": False,
            }
        )

        profile = dp.StructuredProfiler(data, options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for j in range(len(results["data_stats"])):
            columns.append(j)
            predictions.append(results["data_stats"][j]["data_label"])
def _pandas(
    cls,
    execution_engine,
    metric_domain_kwargs,
    metric_value_kwargs,
    metrics,
    runtime_configuration,
):
    df, _, _ = execution_engine.get_compute_domain(
        metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)

    first_profile = None
    try:
        first_profile_path = metric_value_kwargs["profile_path"]
        first_profile = dp.Profiler.load(first_profile_path)
    except FileNotFoundError:
        raise ValueError(
            "'profile_path' does not point to a valid DataProfiler stored profile."
        )

    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False
    new_profile = dp.Profiler(df, options=profiler_opts)

    # Results in diff of new_prof - first_prof;
    # values in this report indicate +/- change from the old profile.
    report_diff = new_profile.diff(first_profile)

    return report_diff
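# A minimal sketch of how the "profile_path" consumed by the metric above might
# be produced, using the same DataProfiler save/load round trip shown elsewhere
# in this section; the DataFrame contents and file name here are illustrative
# assumptions, not part of the metric implementation.
import pandas as pd
import dataprofiler as dp

sample_df = pd.DataFrame({"col_a": [1, 2, 3], "col_b": [0.5, 1.5, 2.5]})
opts = dp.ProfilerOptions()
opts.structured_options.multiprocess.is_enabled = False  # match the metric's setting
saved_profile = dp.Profiler(sample_df, options=opts)
saved_profile.save(filepath="my_profile.pkl")  # later read via dp.Profiler.load("my_profile.pkl")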
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    for i in range(3):
        print('running dp =============================', i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })

        profile = dp.Profiler(data, profiler_options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for col in results['data_stats']:
            columns.append(col)
            predictions.append(results['data_stats'][col]['data_label'])
def setUpClass(cls):
    cls.data = pd.DataFrame(
        [[1, 'a', 1.0, '1/2/2021'],
         [None, 'b', None, '1/2/2020'],
         [3, 'c', 3.5, '1/2/2022'],
         [4, 'd', 4.5, '1/2/2023'],
         [5, 'e', 6.0, '5/2/2020'],
         [None, 'f', None, '1/5/2020'],
         [1, 'g', 1.0, '2/5/2020'],
         [None, 1, 10.0, '3/5/2020']],
        columns=['int', 'str', 'float', 'datetime'])
    cls.options = dp.ProfilerOptions()
    cls.options.set({"data_labeler.is_enabled": False})
    cls.options.set({"multiprocess.is_enabled": False})
    cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)
def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, "data")
    path = os.path.join(test_dir, "csv/diamonds.csv")

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set(
        {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False,
        }
    )

    print("running dp1")
    profile1 = dp.StructuredProfiler(data, options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set(
        {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False,
        }
    )

    print("running dp2")
    profile2 = dp.StructuredProfiler(data, options=profile_options)
    profile1.update_profile(data)
def test_integrated_merge_diff_options(self):
    options = dp.ProfilerOptions()
    options.set({'data_labeler.is_enabled': False})

    data = pd.DataFrame([1, 2, 3, 4])
    profile1 = dp.Profiler(data, profiler_options=options)
    profile2 = dp.Profiler(data)
    with self.assertRaisesRegex(
            ValueError,
            'Structured profilers were not setup with '
            'the same options, hence they do not '
            'calculate the same profiles and cannot be '
            'added together.'):
        profile1 + profile2
def test_warning_tf_run_dp_merge(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False
    })

    print('running dp1')
    profile1 = dp.StructuredProfiler(data, options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False
    })

    print('running dp2')
    profile2 = dp.StructuredProfiler(data, options=profile_options)

    profile = profile1 + profile2
def setUpClass(cls):
    cls.data = pd.DataFrame(
        [
            [1, "a", 1.0, "1/2/2021"],
            [None, "b", None, "1/2/2020"],
            [3, "c", 3.5, "1/2/2022"],
            [4, "d", 4.5, "1/2/2023"],
            [5, "e", 6.0, "5/2/2020"],
            [None, "f", None, "1/5/2020"],
            [1, "g", 1.0, "2/5/2020"],
            [None, 1, 10.0, "3/5/2020"],
        ],
        columns=["int", "str", "float", "datetime"],
    )
    cls.options = dp.ProfilerOptions()
    cls.options.set({"data_labeler.is_enabled": False})
    cls.options.set({"multiprocess.is_enabled": False})
    cls.profiler = dp.StructuredProfiler(cls.data, options=cls.options)
def test_marginal_dist_detection():
    iris = datasets.load_iris()
    data = pd.DataFrame(
        data=np.c_[iris["data"], iris["target"]],
        columns=iris["feature_names"] + ["target"],
    )
    data.target = data.target.astype(int)

    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "data_labeler.is_enabled": False,
        "correlation.is_enabled": True,
        "structured_options.multiprocess.is_enabled": False,
    })
    profile = dp.Profiler(data, options=profile_options)
    report = profile.report()

    marginal_dist_list = detect_dist(report)

    assert len(marginal_dist_list) == len(
        report["data_stats"]
    ), "Length of distributions list must be equal to number of columns"

    for col_num, col in enumerate(report["data_stats"]):
        dist_name = marginal_dist_list[col_num]["dist"]
        assert hasattr(
            stats, dist_name
        ), "The detected distribution must be defined in scipy.stats"

        dist_method = getattr(stats, dist_name)
        if col["data_type"] == "float":
            assert isinstance(
                dist_method, stats.rv_continuous
            ), "Detected distribution must be continuous for columns with continuous random variables"
        else:
            assert isinstance(
                dist_method, stats.rv_discrete
            ), "Detected distribution must be discrete for columns with discrete random variables"
def setUpClass(cls):
    cls.options = dp.ProfilerOptions()
    cls.options.set({"data_labeler.is_enabled": False})
    cls.options.set({"multiprocess.is_enabled": False})
    cls.options.set({"correlation.is_enabled": False})
    cls.options.set({"chi2_homogeneity.is_enabled": False})
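# A hedged sketch of a test method that could consume the options configured in
# the setUpClass above; the DataFrame and assertion are illustrative assumptions,
# mirroring the StructuredProfiler usage in the other fixtures in this section.
def test_profile_with_shared_options(self):
    data = pd.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
    profiler = dp.StructuredProfiler(data, options=self.options)
    report = profiler.report()
    self.assertIn("data_stats", report)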
class ExpectColumnValuesToBeEqualToOrGreaterThanProfileMin(ColumnMapExpectation):
    """
    This expectation builds upon the custom column map expectations of Great
    Expectations. It asks a yes/no question of each row in the user-specified
    column; namely, is the value greater than or equal to the minimum value of
    the respective column within the provided profile report generated by the
    DataProfiler?

    Args:
        column (str): The column to check.
        profile (dict(str, Any)): The report, which is assumed to contain a
            column of the same name, previously generated using the DataProfiler.

    df.expect_column_values_to_be_equal_to_or_greater_than_profile_min(
        column,
        profile
    )
    """

    # These examples will be shown in the public gallery.
    # They will also be executed as unit tests for your Expectation.
    data = [
        [-36, -25, -44],
        [18, 45, 46],
        [-16, -29, -49],
        [21, 4, 35],
        [-18, -7, -40],
        [22, -4, -37],
        [-17, -21, 11],
        [48, -32, -48],
        [0, -44, 20],
    ]
    cols = ["col_a", "col_b", "col_c"]
    df = pd.DataFrame(data, columns=cols)

    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False

    profileObj = dp.Profiler(df, options=profiler_opts)
    profileReport = profileObj.report(report_options={"output_format": "serializable"})
    profileReport["global_stats"]["profile_schema"] = dict(
        profileReport["global_stats"]["profile_schema"]
    )

    examples = [
        {
            "data": {
                "col_a": [-3, 21, 20, 5],
                "col_b": [-7, 41, -47, 12],
                "col_c": [54, -10, 19, 19],
            },
            "tests": [
                {
                    "title": "column_lower_bounded_by_min",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "col_a",
                        "profile": profileReport,
                    },
                    "out": {"success": True},
                },
                {
                    "title": "column_has_value_less_than_min",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column": "col_b",
                        "profile": profileReport,
                    },
                    "out": {"success": False},
                },
            ],
        }
    ]

    # This is the id string of the Metric used by this Expectation.
    # For most Expectations, it will be the same as the `condition_metric_name`
    # defined in your Metric class above.
    map_metric = "column_values.greater_than_or_equal_to_profile_min"

    # This is a list of parameter names that can affect whether the Expectation
    # evaluates to True or False.
    success_keys = (
        "profile",
        "mostly",
    )

    # This dictionary contains default values for any parameters that should
    # have default values.
    default_kwarg_values = {
        "profile": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
    }

    # This object contains metadata for display in the public Gallery.
    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": ["dataprofiler"],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }
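# A minimal usage sketch for the expectation above, assuming the legacy
# great_expectations pandas entry point (ge.from_pandas) and that importing this
# module has registered the expectation; the DataFrame is illustrative, and
# profileReport is the report built in the class body above.
import great_expectations as ge

ge_df = ge.from_pandas(pd.DataFrame({"col_a": [-3, 21, 20, 5]}))
result = ge_df.expect_column_values_to_be_equal_to_or_greater_than_profile_min(
    column="col_a",
    profile=ExpectColumnValuesToBeEqualToOrGreaterThanProfileMin.profileReport,
)
print(result.success)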
import tensorflow as tf  # needed below for the verbosity setting

try:
    import sys

    sys.path.insert(0, "../../..")
    import dataprofiler as dp
except ImportError:
    import dataprofiler as dp

# suppress TF warnings
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

################################################################################
######################## set any optional changes here #########################
################################################################################
options = dp.ProfilerOptions()

# these two options default to True if commented out
options.structured_options.multiprocess.is_enabled = False
# options.structured_options.data_labeler.is_enabled = False

# parameter alteration
ALLOW_SUBSAMPLING = True  # profiler to subsample the dataset if large
PERCENT_TO_NAN = 0.0  # Value must be between 0 and 100
sample_sizes = [100, 1000, 5000, 7500, int(1e5)]
################################################################################

if __name__ == "__main__":
    # set seed
class ExpectProfileNumericColumnsDiffBetweenInclusiveThresholdRange(TableExpectation):
    """
    This expectation takes the difference report between the data it is called on
    and a DataProfiler profile of the same schema loaded from a provided path. It
    builds upon the custom table expectations of Great Expectations. Each numerical
    column will be checked against a user-provided dictionary of columns paired with
    dictionaries of statistics containing lower and upper bounds. A statistic's
    value for a given column is expected to be within the specified threshold,
    inclusive.

    Args:
        profile_path (str): A path to a saved DataProfiler profile object on the
            local filesystem.
        limit_check_report_keys (dict): A dict containing column names as keys and
            dicts as values that contain statistics as keys and dicts as values
            containing two keys: "lower" denoting the lower bound for the threshold
            range, and "upper" denoting the upper bound for the threshold range.
        mostly (float - optional): a value indicating the lower bound percentage of
            successful values that must be present to evaluate to success=True.

    validator.expect_profile_numeric_columns_diff_between_inclusive_threshold_range(
        profile_path="C:/path_to/my_profile.pkl",
        limit_check_report_keys={
            "column_one": {
                "min": {"lower": 2.0, "upper": 10.0},
            },
            "*": {
                "*": {"lower": 0, "upper": 100},
            },
        },
    )

    Note: In limit_check_report_keys, "*" in place of a column denotes a general
    operator in which the value it stores will be applied to every column in the
    data that has no explicit key. "*" in place of a statistic denotes a general
    operator in which the bounds it stores will be applied to every statistic for
    the given column that has no explicit key.
    """

    example_profile_data = [
        [2, 5, "10", "ten", 25],
        [4, 10, "20", "twenty", 50],
        [6, 15, "30", "thirty", 75],
        [8, 20, "40", "forty", 100],
        [10, 25, "50", "fifty", 125],
    ]
    example_profile_columns = [
        "by_2",
        "by_5",
        "str_by_10",
        "words_by_10",
        "by_25",
    ]

    df = pd.DataFrame(example_profile_data, columns=example_profile_columns)
    profiler_opts = dp.ProfilerOptions()
    profiler_opts.structured_options.multiprocess.is_enabled = False

    example_profile = dp.Profiler(df, options=profiler_opts)

    profile_path = (
        "/example_profiles/expect_profile_diff_less_than_threshold_profile.pkl"
    )

    dir_path = os.path.dirname(os.path.abspath(__file__))
    profile_path = dir_path + profile_path

    example_profile.save(filepath=profile_path)

    examples = [
        {
            "data": {
                "by_2": [4, 6, 8, 10, 12],
                "by_5": [10, 15, 20, 25, 30],
                "str_by_10": ["20", "30", "40", "50", "60"],
                "words_by_10": ["twenty", "thirty", "forty", "fifty", "sixty"],
                "by_25": [50, 75, 100, 125, 150],
            },
            "tests": [
                {
                    "title": "profile_min_delta_within_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {
                                "min": {"lower": 0, "upper": 50},
                            },
                        },
                    },
                    "out": {"success": True},
                },
                {
                    "title": "profile_all_stats_beyond_delta_threshold",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": 0, "upper": 0}},
                            "by_2": {
                                "min": {"lower": -1, "upper": 1},
                            },
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "checking_single_failure_in_one_column",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                    },
                    "out": {"success": False},
                },
                {
                    "title": "single_failure_still_mostly_successful",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "profile_path": profile_path,
                        "limit_check_report_keys": {
                            "*": {"*": {"lower": -25, "upper": 50}},
                            "by_2": {"min": {"lower": 0, "upper": 0}},
                        },
                        "mostly": 0.75,
                    },
                    "out": {"success": True},
                },
            ],
        },
    ]

    metric_dependencies = (
        "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range",
    )

    success_keys = (
        "profile_path",
        "limit_check_report_keys",
        "numerical_diff_statistics",
        "mostly",
    )

    default_limit_check_report_keys = {
        "*": {
            "min": {"lower": 0, "upper": 0},
            "max": {"lower": 0, "upper": 0},
            "sum": {"lower": 0, "upper": 0},
            "mean": {"lower": 0, "upper": 0},
            "median": {"lower": 0, "upper": 0},
            "median_absolute_deviation": {"lower": 0, "upper": 0},
            "variance": {"lower": 0, "upper": 0},
            "stddev": {"lower": 0, "upper": 0},
            "unique_count": {"lower": 0, "upper": 0},
            "unique_ratio": {"lower": 0, "upper": 0},
            "gini_impurity": {"lower": 0, "upper": 0},
            "unalikeability": {"lower": 0, "upper": 0},
            "sample_size": {"lower": 0, "upper": 0},
            "null_count": {"lower": 0, "upper": 0},
        }
    }

    numerical_diff_statistics = list(default_limit_check_report_keys["*"].keys())

    default_kwarg_values = {
        "limit_check_report_keys": default_limit_check_report_keys,
        "numerical_diff_statistics": numerical_diff_statistics,
        "mostly": 1.0,
    }

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        delta_between_thresholds = metrics.get(
            "data_profiler.profile_numeric_columns_diff_between_inclusive_threshold_range"
        )
        mostly = self.get_success_kwargs().get(
            "mostly", self.default_kwarg_values.get("mostly")
        )

        unexpected_values = {}
        total_stats = 0.0
        failed_stats = 0.0
        for column, value in delta_between_thresholds.items():
            column_unexpected_values = {}
            for stat, val in value.items():
                if val is not True:
                    column_unexpected_values[stat] = val
                    failed_stats += 1.0
                total_stats += 1.0
            if column_unexpected_values != {}:
                unexpected_values[column] = column_unexpected_values

        successful_stats = total_stats - failed_stats
        percent_successful = successful_stats / total_stats

        success = percent_successful >= mostly

        results = {
            "success": success,
            "expectation_config": configuration,
            "result": {
                "unexpected_values": unexpected_values,
            },
        }
        return results

    library_metadata = {
        "requirements": ["dataprofiler", "tensorflow", "scikit-learn", "numpy"],
        "maturity": "experimental",  # "concept_only", "experimental", "beta", or "production"
        "tags": [
            "dataprofiler",
            "dataassistance",
        ],  # Tags for this Expectation in the Gallery
        "contributors": [  # Github handles for all contributors to this Expectation.
            "@stevensecreti",  # Don't forget to add your github handle here!
        ],
    }