    def get_batch(self, batch_kwargs, batch_parameters=None):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build batch_markers to be used with the DataFrame
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "path" in batch_kwargs or "s3" in batch_kwargs:
            # If both are present, let s3 override
            path = batch_kwargs.get("path")
            path = batch_kwargs.get("s3", path)
            reader_method = batch_kwargs.get("reader_method")
            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)
            reader_fn = self._get_reader_fn(reader, reader_method, path)
            df = reader_fn(path)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {
                k: batch_kwargs[k]
                for k in batch_kwargs if k != "dataset"
            }
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            # Record this in the kwargs *and* the id
            batch_kwargs["SparkDFRef"] = True
            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

        else:
            raise BatchKwargsError(
                "Unrecognized batch_kwargs for spark_source", batch_kwargs)

        if "limit" in batch_kwargs:
            df = df.limit(batch_kwargs["limit"])

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=df,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
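A minimal usage sketch for the Spark variant above; the datasource instance, path, and options are illustrative assumptions, not part of the original example:

# Hypothetical caller: assumes a configured SparkDFDatasource bound to `spark_datasource`.
batch_kwargs = {
    "path": "/data/events.csv",            # any path readable by the Spark session
    "reader_method": "csv",                # resolved to a spark.read method by _get_reader_fn
    "reader_options": {"header": "true"},  # each pair is applied with reader.option(key, value)
    "limit": 1000,                         # applied afterwards with df.limit(1000)
}
batch = spark_datasource.get_batch(batch_kwargs)
spark_df = batch.data                      # the underlying Spark DataFrame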
Example #2
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # pandas cannot take unicode as a delimiter, which can happen in py2. Handle this case explicitly.
        # We handle it here so that the updated value will be in the batch_kwargs for transparency to the user.
        if PY2 and "reader_options" in batch_kwargs and "sep" in batch_kwargs['reader_options'] and \
                batch_kwargs['reader_options']['sep'] is not None:
            batch_kwargs['reader_options']['sep'] = str(batch_kwargs['reader_options']['sep'])
        # We will use and manipulate reader_options along the way
        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build batch_markers to be used with the dataframe
        batch_markers = BatchMarkers({
            "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "path" in batch_kwargs:
            path = batch_kwargs['path']
            reader_method = batch_kwargs.get("reader_method")
            reader_fn = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        elif "s3" in batch_kwargs:
            try:
                import boto3
                s3 = boto3.client("s3", **self._boto3_options)
            except ImportError:
                raise BatchKwargsError("Unable to load boto3 client to read s3 asset.", batch_kwargs)
            raw_url = batch_kwargs["s3"]
            reader_method = batch_kwargs.get("reader_method")
            url = S3Url(raw_url)
            logger.debug("Fetching s3 object. Bucket: %s Key: %s" % (url.bucket, url.key))
            s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
            reader_fn = self._get_reader_fn(reader_method, url.key)
            df = reader_fn(
                StringIO(s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))),
                **reader_options
            )

        elif "dataset" in batch_kwargs and isinstance(batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != 'dataset'}
            batch_kwargs["PandasInMemoryDF"] = True
            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

        else:
            raise BatchKwargsError("Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource",
                                   batch_kwargs)

        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hashlib.md5(pd.util.hash_pandas_object(
                df, index=True).values).hexdigest()

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=df,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context
        )
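A comparable sketch for the pandas variant, again with made-up names (a configured PandasDatasource and a local CSV path):

# Hypothetical caller: pandas_datasource is an already-configured PandasDatasource.
batch_kwargs = {
    "path": "data/orders.csv",                             # reader_method is inferred from the extension here
    "reader_options": {"sep": ",", "encoding": "utf-8"},   # forwarded to the pandas reader
}
batch = pandas_datasource.get_batch(batch_kwargs)
batch.data.head()                                          # batch.data is the loaded pandas DataFrame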
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We need to build batch_markers to attach load-time metadata to the batch
        batch_markers = BatchMarkers({
            "ge_load_time": datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "bigquery_temp_table" in batch_kwargs:
            query_support_table_name = batch_kwargs.get("bigquery_temp_table")
        elif "snowflake_transient_table" in batch_kwargs:
            # Snowflake uses a transient table, so we expect a table_name to be provided
            query_support_table_name = batch_kwargs.get("snowflake_transient_table")
        else:
            query_support_table_name = None

        if "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
                logger.warning("Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                               "adding limit and offset directly to the generated query.")
            if "query_parameters" in batch_kwargs:
                query = Template(batch_kwargs["query"]).safe_substitute(batch_kwargs["query_parameters"])
            else:
                query = batch_kwargs["query"]
            batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query, table_name=query_support_table_name,
                                                       schema=batch_kwargs.get("schema"))
        elif "table" in batch_kwargs:
            limit = batch_kwargs.get('limit')
            offset = batch_kwargs.get('offset')
            if limit is not None or offset is not None:
                logger.info("Generating query from table batch_kwargs based on limit and offset")
                raw_query = sqlalchemy.select([sqlalchemy.text("*")])\
                    .select_from(sqlalchemy.schema.Table(batch_kwargs['table'], sqlalchemy.MetaData(),
                                                         schema=batch_kwargs.get("schema")))\
                    .offset(offset)\
                    .limit(limit)
                query = str(raw_query.compile(self.engine, compile_kwargs={"literal_binds": True}))
                batch_reference = SqlAlchemyBatchReference(engine=self.engine, query=query, table_name=query_support_table_name,
                                                           schema=batch_kwargs.get("schema"))
            else:
                batch_reference = SqlAlchemyBatchReference(engine=self.engine, table_name=batch_kwargs["table"],
                                                           schema=batch_kwargs.get("schema"))
        else:
            raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=batch_reference,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context
        )
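For the SQL variant, a hedged sketch of query-based batch_kwargs; the datasource, table, and parameter values are assumptions for illustration:

# Hypothetical caller: sql_datasource is an already-configured SqlAlchemyDatasource.
batch_kwargs = {
    "query": "SELECT * FROM events WHERE event_date >= '$start_date'",
    "query_parameters": {"start_date": "2020-01-01"},  # substituted via string.Template above
    "schema": "analytics",
}
batch = sql_datasource.get_batch(batch_kwargs)
# batch.data is a SqlAlchemyBatchReference; the query is only executed when a
# dataset/validator is later built from this batch.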
Example #4
    def _run_suite(
        self,
        dataset_name: str,
        dataset_path: Optional[str],
        df: Any,
        target_expectation_suite_name: str,
        run_id: str,
    ):
        target_suite = self.expectation_context.get_expectation_suite(
            target_expectation_suite_name)
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_kwargs = {"datasource": generate_datasource_name(dataset_name)}

        if dataset_path:
            dataasset_name, _ = os.path.splitext(
                os.path.basename(dataset_path))
            batch_kwargs["path"] = str(dataset_path)
            batch_kwargs["data_asset_name"] = dataasset_name

        batch = Batch(
            "kedro",
            batch_kwargs=BatchKwargs(batch_kwargs),
            data=df,
            batch_parameters=None,
            batch_markers=batch_markers,
            data_context=self.expectation_context,
        )

        try:
            v = Validator(
                batch=batch,
                expectation_suite=target_suite,
            )
        except ValueError:
            raise UnsupportedDataSet

        validator_dataset_batch = v.get_dataset()
        return self.expectation_context.run_validation_operator(
            "action_list_operator", [validator_dataset_batch], run_id=run_id)
Example #5
    def _run_suite(self, dataset, target_expectation_suite_name, run_id):
        class_name = self._get_ge_class_name(dataset)
        target_suite = self.expectation_context.get_expectation_suite(
            target_expectation_suite_name)
        df = dataset.load()
        # Wrap the loaded dataframe in a Batch with static "kedro" batch_kwargs
        batch = Batch(
            'kedro', BatchKwargs({
                'path': 'kedro',
                'datasource': 'kedro'
            }), df, None,
            BatchMarkers({
                "ge_load_time":
                datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
            }), self.expectation_context)
        # Validate using the dataset class name inferred from the Kedro dataset type
        v = Validator(batch, target_suite, {
            'module_name': 'great_expectations.dataset',
            'class_name': class_name
        })
        vgdf = v.get_dataset()
        self.expectation_context.run_validation_operator(
            'action_list_operator', [vgdf], run_id=run_id)
Example #6
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We will use and manipulate reader_options along the way
        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build batch_markers to be used with the dataframe
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "path" in batch_kwargs:
            path = batch_kwargs["path"]
            reader_method = batch_kwargs.get("reader_method")
            reader_fn = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        elif "s3" in batch_kwargs:
            try:
                import boto3

                s3 = boto3.client("s3", **self._boto3_options)
            except ImportError:
                raise BatchKwargsError(
                    "Unable to load boto3 client to read s3 asset.",
                    batch_kwargs)
            raw_url = batch_kwargs["s3"]
            reader_method = batch_kwargs.get("reader_method")
            url = S3Url(raw_url)
            logger.debug("Fetching s3 object. Bucket: {} Key: {}".format(
                url.bucket, url.key))
            s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
            reader_fn = self._get_reader_fn(reader_method, url.key)
            default_reader_options = self._infer_default_options(
                reader_fn, reader_options)
            if not reader_options.get(
                    "encoding") and default_reader_options.get("encoding"):
                reader_options["encoding"] = s3_object.get(
                    "ContentEncoding", default_reader_options.get("encoding"))
            df = reader_fn(BytesIO(s3_object["Body"].read()), **reader_options)

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {
                k: batch_kwargs[k]
                for k in batch_kwargs if k != "dataset"
            }
            batch_kwargs["PandasInMemoryDF"] = True
            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

        else:
            raise BatchKwargsError(
                "Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource",
                batch_kwargs,
            )

        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=df,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
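For the s3 branch of this newer pandas variant, a rough sketch (the bucket, key, and credential configuration are assumptions):

# Hypothetical caller: pandas_datasource was configured with the needed boto3_options/credentials.
batch_kwargs = {
    "s3": "s3://my-bucket/raw/orders.csv",   # parsed by S3Url into bucket and key
    "reader_options": {"sep": ","},          # encoding may fall back to the object's ContentEncoding
}
batch = pandas_datasource.get_batch(batch_kwargs)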
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We need to build batch_markers to attach load-time metadata to the batch
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "bigquery_temp_table" in batch_kwargs:
            query_support_table_name = batch_kwargs.get("bigquery_temp_table")
        elif "snowflake_transient_table" in batch_kwargs:
            # Snowflake uses a transient table, so we expect a table_name to be provided
            query_support_table_name = batch_kwargs.get(
                "snowflake_transient_table")
        else:
            query_support_table_name = None

        if "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
                logger.warning(
                    "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                    "adding limit and offset directly to the generated query.")
            if "query_parameters" in batch_kwargs:
                query = Template(batch_kwargs["query"]).safe_substitute(
                    batch_kwargs["query_parameters"])
            else:
                query = batch_kwargs["query"]
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"),
            )
        elif "table" in batch_kwargs:
            table = batch_kwargs["table"]
            limit = batch_kwargs.get("limit")
            offset = batch_kwargs.get("offset")
            if limit is not None or offset is not None:
                # AWS Athena does not support offset
                if (offset is not None
                        and self.engine.dialect.name.lower() == "awsathena"):
                    raise NotImplementedError(
                        "AWS Athena does not support OFFSET.")
                logger.info(
                    "Generating query from table batch_kwargs based on limit and offset"
                )
                # In BigQuery the table name is already qualified with its schema name
                if self.engine.dialect.name.lower() == "bigquery":
                    schema = None
                else:
                    schema = batch_kwargs.get("schema")
                raw_query = (sqlalchemy.select(
                    [sqlalchemy.text("*")]).select_from(
                        sqlalchemy.schema.Table(
                            table, sqlalchemy.MetaData(),
                            schema=schema)).offset(offset).limit(limit))
                query = str(
                    raw_query.compile(self.engine,
                                      compile_kwargs={"literal_binds": True}))
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    query=query,
                    table_name=query_support_table_name,
                    schema=batch_kwargs.get("schema"),
                )
            else:
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    table_name=table,
                    schema=batch_kwargs.get("schema"),
                )
        else:
            raise ValueError(
                "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
            )

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=batch_reference,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
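To make the limit/offset branch concrete, here is a small self-contained sketch of the same SQLAlchemy pattern: compiling a SELECT with literal binds so the result can be reused as plain query text. The engine and table name are made up, and the select([...]) list form matches the 1.x-style call used above.

import sqlalchemy

engine = sqlalchemy.create_engine("sqlite://")
raw_query = (
    sqlalchemy.select([sqlalchemy.text("*")])
    .select_from(sqlalchemy.schema.Table("events", sqlalchemy.MetaData()))
    .offset(10)
    .limit(5)
)
# literal_binds inlines the limit/offset values into the generated SQL text
query = str(raw_query.compile(engine, compile_kwargs={"literal_binds": True}))
print(query)  # e.g. "SELECT * FROM events LIMIT 5 OFFSET 10" (whitespace varies by dialect)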