def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str, read_from_catalog: bool):
    """Run every applicable expectation suite against each dataset in ``data``.

    For each dataset the candidate suite names are resolved via
    ``get_suite_names``; suites that are unknown to the GE context or that
    already ran during this hook invocation are skipped.  A failing suite
    either raises ``SuiteValidationFailure`` (fail-fast mode) or is recorded
    in ``self._failed_suites`` for later reporting.

    Args:
        catalog: Kedro catalog used to resolve dataset objects/paths.
        data: Mapping of dataset name to its in-memory value.
        run_id: Identifier forwarded to ``self._run_suite``.
        read_from_catalog: When True, reload file-backed datasets from the
            catalog instead of validating the in-memory value.
    """
    if self.expectation_context is None:
        # No Great Expectations context configured — validation is disabled.
        return
    for dataset_name, dataset_value in data.items():
        suite_names = get_suite_names(self.expectations_map, dataset_name, self.suite_types)
        dataset = catalog._get_dataset(dataset_name)
        # Not every dataset type exposes a file path; None is acceptable here.
        dataset_path = getattr(dataset, "_filepath", None)
        # MemoryDataSet has no backing storage, so its in-memory value is used
        # even when a catalog reload was requested.
        if read_from_catalog and not isinstance(dataset, MemoryDataSet):
            frame = dataset.load()
        else:
            frame = dataset_value
        executed_any = False
        try:
            for suite_name in suite_names:
                unknown = suite_name not in self.expectation_suite_names
                already_ran = suite_name in self._finished_suites
                if unknown or already_ran:
                    continue
                result = self._run_suite(dataset_name, dataset_path, frame, suite_name, run_id)
                if not result.success:
                    if self._fail_fast:
                        # Propagates past the except below (different type).
                        raise SuiteValidationFailure(
                            f"Suite {suite_name} for DataSet {dataset_name} failed!"
                        )
                    self._failed_suites.append(FailedSuite(suite_name, dataset_name))
                self._finished_suites.add(suite_name)
                executed_any = True
            if not executed_any:
                self.logger.warning(
                    f"Missing Expectation Suite for DataSet: {dataset_name}"
                )
        except UnsupportedDataSet:
            self.logger.warning(
                f"Unsupported DataSet Type: {dataset_name}({type(dataset)})"
            )
def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str):
    """Validate each mapped dataset with Great Expectations.

    Datasets without an entry in ``DATASET_EXPECTATION_MAPPING`` are skipped;
    for the rest, a batch is built from the dataset's file path and run
    through the ``action_list_operator`` validation operator.

    Args:
        catalog: Kedro catalog used to resolve each dataset's file path.
        data: Mapping of dataset name to its value (values are unused; only
            the names drive suite selection).
        run_id: Identifier passed to the GE validation operator.
    """
    # Fix: the original constructed ge.data_context.DataContext() inside the
    # loop, re-reading the GE project config once per dataset.  Build it
    # lazily, once, and only if at least one dataset is actually mapped.
    expectation_context = None
    for dataset_name in data:  # values unused — iterate keys only
        # Single lookup replaces the original `in` check + indexing.
        expectation_suite = self.DATASET_EXPECTATION_MAPPING.get(dataset_name)
        if expectation_suite is None:
            continue
        if expectation_context is None:
            expectation_context = ge.data_context.DataContext()
        dataset = catalog._get_dataset(dataset_name)
        dataset_path = str(dataset._filepath)
        batch = expectation_context.get_batch(
            {
                'path': dataset_path,
                'datasource': 'files_datasource'
            }, expectation_suite)
        expectation_context.run_validation_operator(
            "action_list_operator", assets_to_validate=[batch], run_id=run_id)
def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any], run_id: str):
    """Run one expectation suite per (dataset, suite_type) combination.

    The target suite name is the dataset's mapped name from
    ``self.expectations_map`` (falling back to the dataset name itself),
    optionally suffixed with ``.<suite_type>``.  Missing suites and dataset
    types Great Expectations cannot handle are logged and skipped.

    Args:
        catalog: Kedro catalog used to resolve dataset objects.
        data: Mapping of dataset name to its value (only names are used).
        run_id: Identifier forwarded to ``self._run_suite``.
    """
    for dataset_name in data:
        # The mapped base name does not depend on suite_type — compute once.
        base_name = self.expectations_map.get(dataset_name, dataset_name)
        for suite_type in self.suite_types:
            suffix = '' if suite_type is None else f'.{suite_type}'
            target_expectation_suite_name = f'{base_name}{suffix}'
            if target_expectation_suite_name not in self.expectation_suite_names:
                self.logger.warning(
                    f"Missing Expectation Suite: {target_expectation_suite_name}"
                )
                continue
            dataset = catalog._get_dataset(dataset_name)
            if self._get_ge_class_name(dataset) is None:
                self.logger.warning(
                    f"Unsupported DataSet Type: {dataset_name}({type(dataset)})"
                )
                continue
            self._run_suite(dataset, target_expectation_suite_name, run_id)
def _log_artifact(self, artifact_name: str, catalog: DataCatalog):
    """Attach the file backing ``artifact_name`` to the MLflow run ``self.run_id``.

    Args:
        artifact_name: Catalog name of the dataset whose file is uploaded.
        catalog: Kedro catalog used to resolve the dataset's file path.
    """
    # Re-open the existing run so the artifact lands on the pipeline's run.
    with mlflow.start_run(run_id=self.run_id):
        logger.info("Logging artifact %s", artifact_name)
        backing_dataset = catalog._get_dataset(artifact_name)
        mlflow.log_artifact(str(backing_dataset._filepath))
def _get_dataset_data_params(namespace: str, catalog: DataCatalog):
    """Return the dataset registered under ``namespace``, or None if absent.

    Args:
        namespace: Catalog name to look up.
        catalog: Kedro catalog to query.

    Returns:
        The ``AbstractDataSet`` instance, or ``None`` when the catalog has no
        dataset by that name.
    """
    try:
        return catalog._get_dataset(namespace)
    except DataSetNotFoundError:
        # Missing datasets are an expected case for the caller — signal with None.
        return None