Example 1
    def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any],
                        run_id: str, read_from_catalog: bool):
        """Validate each dataset in *data* against its expectation suites.

        Does nothing when no expectation context is configured.  A failed
        suite either raises immediately (fail-fast mode) or is recorded in
        ``self._failed_suites``; suites already executed in this run are
        skipped via ``self._finished_suites``.
        """
        if self.expectation_context is None:
            return

        for dataset_name, dataset_value in data.items():
            suite_names = get_suite_names(self.expectations_map,
                                          dataset_name,
                                          self.suite_types)

            dataset = catalog._get_dataset(dataset_name)
            dataset_path = getattr(dataset, "_filepath", None)

            # Reload from the catalog unless the dataset lives in memory
            # or the caller asked for the in-flight value directly.
            if read_from_catalog and not isinstance(dataset, MemoryDataSet):
                frame = dataset.load()
            else:
                frame = dataset_value

            any_suite_ran = False
            try:
                for suite_name in suite_names:
                    unknown = suite_name not in self.expectation_suite_names
                    already_done = suite_name in self._finished_suites
                    if unknown or already_done:
                        continue

                    result = self._run_suite(dataset_name, dataset_path,
                                             frame, suite_name, run_id)

                    if not result.success:
                        if self._fail_fast:
                            raise SuiteValidationFailure(
                                f"Suite {suite_name} for DataSet {dataset_name} failed!"
                            )
                        self._failed_suites.append(
                            FailedSuite(suite_name, dataset_name))

                    self._finished_suites.add(suite_name)
                    any_suite_ran = True

                if not any_suite_ran:
                    self.logger.warning(
                        f"Missing Expectation Suite for DataSet: {dataset_name}"
                    )
            except UnsupportedDataSet:
                self.logger.warning(
                    f"Unsupported DataSet Type: {dataset_name}({type(dataset)})"
                )
Example 2
    def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any],
                        run_id: str):
        """Run Great Expectations validation for every mapped dataset.

        Only datasets listed in ``self.DATASET_EXPECTATION_MAPPING`` are
        validated; each one is loaded as a batch from its catalog file
        path and passed through the ``action_list_operator``.
        """
        # The data context is loop-invariant and expensive to build, so
        # construct it lazily and at most once (the previous version
        # rebuilt it for every dataset).
        expectation_context = None

        # Only the dataset names are needed; the in-flight values are
        # never read here.
        for dataset_name in data:
            if dataset_name not in self.DATASET_EXPECTATION_MAPPING:
                continue

            dataset = catalog._get_dataset(dataset_name)
            dataset_path = str(dataset._filepath)
            expectation_suite = self.DATASET_EXPECTATION_MAPPING[dataset_name]

            if expectation_context is None:
                expectation_context = ge.data_context.DataContext()

            batch = expectation_context.get_batch(
                {
                    'path': dataset_path,
                    'datasource': 'files_datasource'
                }, expectation_suite)
            expectation_context.run_validation_operator(
                "action_list_operator",
                assets_to_validate=[batch],
                run_id=run_id)
Example 3
    def _run_validation(self, catalog: DataCatalog, data: Dict[str, Any],
                        run_id: str):
        """Validate every dataset in *data* once per configured suite type.

        Warns (and skips) when a suite is missing or when the dataset
        type has no Great Expectations equivalent.
        """
        for dataset_name in data:
            # Suite names derive from the expectations map, falling back
            # to the dataset's own name; the lookup is pure, so hoist it
            # out of the suite-type loop.
            base_name = self.expectations_map.get(dataset_name, dataset_name)

            for suite_type in self.suite_types:
                suite_name = (f'{base_name}' if suite_type is None
                              else f'{base_name}.{suite_type}')

                if suite_name not in self.expectation_suite_names:
                    self.logger.warning(
                        f"Missing Expectation Suite: {suite_name}"
                    )
                    continue

                dataset = catalog._get_dataset(dataset_name)
                if self._get_ge_class_name(dataset) is None:
                    self.logger.warning(
                        f"Unsupported DataSet Type: {dataset_name}({type(dataset)})"
                    )
                    continue

                self._run_suite(dataset, suite_name, run_id)
Example 4
 def _log_artifact(self, artifact_name: str, catalog: DataCatalog):
     """Log the catalog dataset *artifact_name* as an MLflow artifact."""
     with mlflow.start_run(run_id=self.run_id):
         logger.info("Logging artifact %s", artifact_name)
         # Resolve the dataset's on-disk location and hand it to MLflow.
         artifact_path = str(catalog._get_dataset(artifact_name)._filepath)
         mlflow.log_artifact(artifact_path)
Example 5
def _get_dataset_data_params(namespace: str, catalog: DataCatalog):
    try:
        node_data: Optional[AbstractDataSet] = catalog._get_dataset(namespace)
    except DataSetNotFoundError:
        node_data = None
    return node_data