Example 1
    def build_configuration(cls,
                            data_asset_type=None,
                            generators=None,
                            **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """
        if generators is None:
            generators = {"default": {"class_name": "TableGenerator"}}

        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="SqlAlchemyDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
            "generators": generators,
        })
        return configuration
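
A minimal usage sketch follows. It assumes the legacy Great Expectations 0.x API, where this classmethod lives on SqlAlchemyDatasource and any extra kwargs (here, connection_string) are folded into the returned dictionary as-is.

    # Usage sketch; assumes legacy Great Expectations 0.x is installed and that
    # this classmethod belongs to SqlAlchemyDatasource.
    from great_expectations.datasource import SqlAlchemyDatasource

    config = SqlAlchemyDatasource.build_configuration(
        connection_string="sqlite:///:memory:")  # extra kwargs pass through
    assert config["generators"] == {"default": {"class_name": "TableGenerator"}}
    assert config["connection_string"] == "sqlite:///:memory:"
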
Example 2
    def __init__(self,
                 name="default",
                 data_context=None,
                 data_asset_type=None,
                 profile=None,
                 generators=None,
                 **kwargs):
        if not sqlalchemy:
            raise DatasourceInitializationError(
                name, "ModuleNotFoundError: No module named 'sqlalchemy'")

        if generators is None:
            generators = {"default": {"type": "queries"}}

        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="SqlAlchemyDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        super(SqlAlchemyDatasource,
              self).__init__(name,
                             type_="sqlalchemy",
                             data_context=data_context,
                             data_asset_type=data_asset_type,
                             generators=generators)
        if profile is not None:
            self._datasource_config.update({"profile": profile})

        try:
            # if an engine was provided, use that
            if "engine" in kwargs:
                self.engine = kwargs.pop("engine")

            # if a connection string or url was provided, use that
            elif "connection_string" in kwargs:
                connection_string = kwargs.pop("connection_string")
                self.engine = create_engine(connection_string, **kwargs)
                self.engine.connect()
            elif "url" in kwargs:
                url = kwargs.pop("url")
                self.engine = create_engine(url, **kwargs)
                self.engine.connect()

            # Otherwise, connect using remaining kwargs
            else:
                self.engine = create_engine(
                    self._get_sqlalchemy_connection_options(**kwargs))
                self.engine.connect()

        except sqlalchemy.exc.OperationalError as sqlalchemy_error:
            raise DatasourceInitializationError(self._name,
                                                str(sqlalchemy_error))

        self._build_generators()
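
The constructor accepts three ways to obtain an engine, tried in order: a ready-made engine, a connection string or url, or remaining kwargs interpreted as connection options. Only the latter two paths verify connectivity with engine.connect(). A sketch, assuming SqlAlchemyDatasource is importable and sqlalchemy is installed:

    # Connection-path sketch; assumes sqlalchemy and the legacy package.
    import sqlalchemy as sa
    from great_expectations.datasource import SqlAlchemyDatasource

    # 1) Reuse an existing engine (note: no connect() check on this path):
    ds = SqlAlchemyDatasource("db", engine=sa.create_engine("sqlite:///:memory:"))

    # 2) Build the engine from a connection string; leftover kwargs go to
    #    create_engine, and connectivity is verified immediately:
    ds = SqlAlchemyDatasource("db", connection_string="sqlite:///:memory:")
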
Example 3
    def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """
        if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory and reader_options
            base_directory = kwargs.pop("base_directory", "data")
            # By default, use CSV sniffer to infer separator, which requires the python engine
            reader_options = kwargs.pop("reader_options", {
                "sep": None,
                "engine": "python"
            })
            generators = {
                "default": {
                    "class_name": "SubdirReaderGenerator",
                    "base_directory": base_directory,
                    "reader_options": reader_options
                }
            }
        if data_asset_type is None:
            data_asset_type = ClassConfig(
                class_name="PandasDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
            "generators": generators,
        })
        if boto3_options is not None:
            if isinstance(boto3_options, dict):
                configuration.update(boto3_options)
            else:
                raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                                 "initialization.")
        return configuration
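
A sketch of the defaults this variant produces, assuming the classmethod belongs to PandasDatasource. Note that boto3_options are merged into the top level of the configuration rather than kept under their own key.

    # Usage sketch (classmethod owner assumed to be PandasDatasource).
    config = PandasDatasource.build_configuration(
        boto3_options={"region_name": "us-east-1"})
    # The default generator sniffs the CSV separator via the python engine:
    assert config["generators"]["default"]["reader_options"] == {
        "sep": None, "engine": "python"}
    assert config["region_name"] == "us-east-1"  # merged at the top level
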
Example 4
    def build_configuration(cls,
                            data_asset_type=None,
                            generators=None,
                            spark_config=None,
                            **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            spark_config: dictionary of key-value pairs to pass to the spark builder
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """
        if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory
            base_directory = kwargs.pop("base_directory", "/data")
            reader_options = kwargs.pop("reader_options", {})
            generators = {
                "default": {
                    "class_name": "SubdirReaderGenerator",
                    "base_directory": base_directory,
                    "reader_options": reader_options
                }
            }

        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="SparkDFDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        if spark_config is None:
            spark_config = {}

        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
            "generators": generators,
            "spark_config": spark_config
        })
        return configuration
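
The Spark variant behaves the same way but always carries a spark_config dictionary, even when empty. A brief sketch, with the classmethod owner assumed to be SparkDFDatasource:

    # Usage sketch of the Spark variant's defaults.
    config = SparkDFDatasource.build_configuration(
        spark_config={"spark.master": "local[*]"})
    assert config["spark_config"]["spark.master"] == "local[*]"
    assert config["generators"]["default"]["base_directory"] == "/data"
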
Example 5
    def build_configuration(cls,
                            data_asset_type=None,
                            batch_kwargs_generators=None,
                            **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            batch_kwargs_generators: Generator configuration dictionary
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        if data_asset_type is None:
            data_asset_type = {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            }
        else:
            data_asset_type = classConfigSchema.dump(
                ClassConfig(**data_asset_type))

        configuration = kwargs
        configuration["data_asset_type"] = data_asset_type
        if batch_kwargs_generators is not None:
            configuration["batch_kwargs_generators"] = batch_kwargs_generators

        return configuration
Example 6
    def build_configuration(cls, data_asset_type=None, generators=None, spark_config=None, **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            spark_config: dictionary of key-value pairs to pass to the spark builder
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        if data_asset_type is None:
            data_asset_type = {"class_name": "SparkDFDataset"}
        else:
            data_asset_type = classConfigSchema.dump(ClassConfig(**data_asset_type))

        if spark_config is None:
            spark_config = {}

        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
            "spark_config": spark_config
        })
        if generators:
            configuration["generators"] = generators

        return configuration
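
The two variants above mark a behavioral shift from the earlier ones: generator defaults are no longer injected, and data_asset_type is serialized to a plain dict via classConfigSchema.dump instead of being left as a ClassConfig instance. A sketch of the difference (classmethod owner assumed):

    # Sketch of the newer behavior: no generator defaults, dict serialization.
    config = SqlAlchemyDatasource.build_configuration()
    assert "batch_kwargs_generators" not in config  # no defaults injected
    assert config["data_asset_type"] == {
        "class_name": "SqlAlchemyDataset",
        "module_name": "great_expectations.dataset",
    }
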
Example 7
    def build_configuration(cls,
                            data_asset_type=None,
                            generators=None,
                            boto3_options=None,
                            reader_method=None,
                            reader_options=None,
                            limit=None,
                            **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
            reader_method: Optional default reader_method for generated batches
            reader_options: Optional default reader_options for generated batches
            limit: Optional default limit for generated batches
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        if data_asset_type is None:
            data_asset_type = {"class_name": "PandasDataset"}
        else:
            data_asset_type = classConfigSchema.dump(
                ClassConfig(**data_asset_type))

        configuration = kwargs
        configuration["data_asset_type"] = data_asset_type
        if generators:
            configuration["generators"] = generators

        if boto3_options is not None:
            if isinstance(boto3_options, dict):
                configuration.update(boto3_options)
            else:
                raise ValueError(
                    "boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                    "initialization.")

        if reader_options is not None:
            if isinstance(reader_options, dict):
                configuration.update(reader_options)
            else:
                raise ValueError(
                    "reader_options must be a dictionary of key-value pairs to pass to the reader upon "
                    "initialization.")

        if reader_method is not None:
            configuration["reader_method"] = reader_method

        if limit is not None:
            configuration["limit"] = limit

        return configuration
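
Note that reader_options are flattened into the top level of the configuration by configuration.update, while reader_method and limit keep their own keys. A sketch (classmethod owner assumed to be PandasDatasource):

    # Usage sketch of the reader defaults.
    config = PandasDatasource.build_configuration(
        reader_method="csv", reader_options={"sep": "|"}, limit=1000)
    assert config["reader_method"] == "csv"
    assert config["sep"] == "|"   # reader_options merged at the top level
    assert config["limit"] == 1000
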
Example 8
    def __init__(self,
                 name="default",
                 data_context=None,
                 data_asset_type=None,
                 generators=None,
                 **kwargs):
        if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory
            base_directory = kwargs.pop("base_directory", "/data")
            reader_options = kwargs.pop("reader_options", {})
            generators = {
                "default": {
                    "type": "subdir_reader",
                    "base_directory": base_directory,
                    "reader_options": reader_options
                }
            }

        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="SparkDFDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        super(SparkDFDatasource,
              self).__init__(name,
                             type_="spark",
                             data_context=data_context,
                             data_asset_type=data_asset_type,
                             generators=generators)
        try:
            self.spark = SparkSession.builder.getOrCreate()
        except Exception:
            logger.error(
                "Unable to load spark context; install optional spark dependency for support."
            )
            self.spark = None

        self._build_generators()
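
Unlike the SqlAlchemy constructor, this one degrades gracefully when Spark is missing: it logs an error and leaves self.spark as None instead of raising. A usage sketch, assuming SparkDFDatasource is importable:

    # Usage sketch; base_directory is popped into the default generator.
    ds = SparkDFDatasource(name="spark_files", base_directory="/data/landing")
    if ds.spark is None:
        print("Install the optional spark dependency to read batches.")
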
Example 9
    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
        batch_kwargs.update(kwargs)

        if "data_asset_type" in batch_kwargs:
            # SQLAlchemy does not use reader_options and does not need to strip keys from batch_kwargs,
            # since it does not pass options through to a later reader
            data_asset_type_config = batch_kwargs["data_asset_type"]
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)

        if not issubclass(data_asset_type, SqlAlchemyDataset):
            raise ValueError("SqlAlchemyDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                             "must be a subclass of SqlAlchemyDataset." % data_asset_type.__name__)

        # We need to build a batch_id to be used in the dataframe
        batch_id = BatchId({
            "timestamp": time.time()
        })

        if "schema" in batch_kwargs:
            schema = batch_kwargs["schema"]
        else:
            schema = None

        if "table" in batch_kwargs:
            return data_asset_type(
                table_name=batch_kwargs["table"],
                engine=self.engine,
                schema=schema,
                data_context=self._data_context,
                expectation_suite=expectation_suite,
                batch_kwargs=batch_kwargs,
                batch_id=batch_id
            )

        elif "query" in batch_kwargs:
            query = Template(batch_kwargs["query"]).safe_substitute(**kwargs)
            return data_asset_type(
                custom_sql=query,
                engine=self.engine,
                data_context=self._data_context,
                expectation_suite=expectation_suite,
                batch_kwargs=batch_kwargs,
                batch_id=batch_id
            )
    
        else:
            raise ValueError("Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified")
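
The batch_kwargs must contain exactly one of table or query; the query form is passed through string.Template.safe_substitute, so $-placeholders can be filled from **kwargs. A sketch of the two accepted shapes (the datasource instance is assumed to exist):

    # Sketch: query-based batch with template substitution.
    asset = datasource._get_data_asset(
        {"query": "SELECT * FROM events WHERE ds = '$ds'"},
        expectation_suite=None,
        ds="2020-01-01")  # substituted into the $ds placeholder

    # Sketch: table-based batch with an optional schema.
    asset = datasource._get_data_asset(
        {"table": "events", "schema": "analytics"},
        expectation_suite=None)
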
Example 10
    def build_configuration(cls,
                            data_asset_type=None,
                            generators=None,
                            **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        # As of 0.9.0, we do not require generators to be configured
        #     generators = {
        #         "default": {
        #             "class_name": "TableBatchKwargsGenerator"
        #         },
        #         "passthrough": {
        #             "class_name": "PassthroughGenerator",
        #         }
        #     }

        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="SqlAlchemyDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        configuration = kwargs
        configuration["data_asset_type"] = data_asset_type
        if generators is not None:
            configuration["generators"] = generators

        return configuration
Example 11
    def __init__(self,
                 name="pandas",
                 data_context=None,
                 data_asset_type=None,
                 generators=None,
                 **kwargs):
        if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory and reader_options
            base_directory = kwargs.pop("base_directory", "/data")
            # By default, use CSV sniffer to infer separator, which requires the python engine
            reader_options = kwargs.pop("reader_options", {
                "sep": None,
                "engine": "python"
            })
            generators = {
                "default": {
                    "type": "subdir_reader",
                    "base_directory": base_directory,
                    "reader_options": reader_options
                }
            }
        if data_asset_type is None:
            data_asset_type = ClassConfig(class_name="PandasDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        super(PandasDatasource, self).__init__(name,
                                               type_="pandas",
                                               data_context=data_context,
                                               data_asset_type=data_asset_type,
                                               generators=generators)
        self._build_generators()
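
A usage sketch, assuming PandasDatasource is importable; base_directory and reader_options are popped off kwargs and moved into the default generator.

    # Usage sketch; overriding the CSV-sniffing reader defaults.
    ds = PandasDatasource(
        name="local_files",
        base_directory="/data",
        reader_options={"sep": ",", "engine": "c"})
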
Example 12
    def build_configuration(
        cls,
        data_asset_type=None,
        batch_kwargs_generators=None,
        spark_config=None,
        force_reuse_spark_context=False,
        **kwargs,
    ):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            batch_kwargs_generators: Generator configuration dictionary
            spark_config: dictionary of key-value pairs to pass to the spark builder
            force_reuse_spark_context: If True, reuse an existing Spark context instead of creating a new one
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        if data_asset_type is None:
            data_asset_type = {
                "class_name": "SparkDFDataset",
                "module_name": "great_expectations.dataset",
            }
        else:
            data_asset_type = classConfigSchema.dump(
                ClassConfig(**data_asset_type))

        if spark_config is None:
            spark_config = {}

        configuration = kwargs
        configuration.update({
            "data_asset_type": data_asset_type,
            "spark_config": spark_config,
            "force_reuse_spark_context": force_reuse_spark_context,
        })

        if batch_kwargs_generators:
            configuration["batch_kwargs_generators"] = batch_kwargs_generators

        return configuration
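
A sketch of the force_reuse_spark_context flag, which is useful where a SparkSession already exists (for example, on a managed cluster). The classmethod owner is assumed to be SparkDFDatasource.

    # Usage sketch: carry the reuse flag through to the configuration.
    config = SparkDFDatasource.build_configuration(
        spark_config={"spark.app.name": "ge"},
        force_reuse_spark_context=True)
    assert config["force_reuse_spark_context"] is True
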
Example 13
    def __init__(self,
                 batch,
                 expectation_suite,
                 expectation_engine=None,
                 **kwargs):
        self.batch = batch
        self.expectation_suite = expectation_suite

        if isinstance(expectation_engine, dict):
            expectation_engine = ClassConfig(**expectation_engine)

        if isinstance(expectation_engine, ClassConfig):
            module_name = expectation_engine.module_name or "great_expectations.dataset"
            verify_dynamic_loading_support(module_name=module_name)
            expectation_engine = load_class(
                class_name=expectation_engine.class_name,
                module_name=module_name)

        self.expectation_engine = expectation_engine
        if self.expectation_engine is None:
            # Guess the engine
            try:
                import pandas as pd

                if isinstance(batch.data, pd.DataFrame):
                    self.expectation_engine = PandasDataset
            except ImportError:
                pass
        if self.expectation_engine is None:
            if isinstance(batch.data, SqlAlchemyBatchReference):
                self.expectation_engine = SqlAlchemyDataset

        if self.expectation_engine is None:
            try:
                import pyspark

                if isinstance(batch.data, pyspark.sql.DataFrame):
                    self.expectation_engine = SparkDFDataset
            except ImportError:
                pass

        if self.expectation_engine is None:
            raise ValueError(
                "Unable to identify expectation_engine. It must be a subclass of DataAsset."
            )

        self.init_kwargs = kwargs
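
The surrounding class is not shown; in Great Expectations this constructor appears on the legacy Validator. Engine resolution tries an explicit dict/ClassConfig first, then falls back to sniffing the type of batch.data. A sketch with a stand-in batch object:

    # Sketch; Validator ownership is an assumption, and the batch object is a
    # stand-in exposing only the .data attribute the constructor inspects.
    import pandas as pd
    from types import SimpleNamespace

    batch = SimpleNamespace(data=pd.DataFrame({"a": [1, 2, 3]}))
    v = Validator(batch, expectation_suite=None)  # engine guessed: PandasDataset
    v = Validator(batch, expectation_suite=None,
                  expectation_engine={"class_name": "PandasDataset"})
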
Example 14
    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
        for k, v in kwargs.items():
            if isinstance(v, dict):
                if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
                    batch_kwargs[k].update(v)
                else:
                    batch_kwargs[k] = v
            else:
                batch_kwargs[k] = v

        if "data_asset_type" in batch_kwargs:
            # SQLAlchemy does not use reader_options and does not need to strip keys from batch_kwargs,
            # since it does not pass options through to a later reader
            data_asset_type_config = batch_kwargs["data_asset_type"]
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)

        if not issubclass(data_asset_type, SqlAlchemyDataset):
            raise ValueError(
                "SqlAlchemyDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                "must be a subclass of SqlAlchemyDataset." %
                data_asset_type.__name__)

        # We need to build a batch_id to be used in the dataframe
        batch_id = BatchId({"timestamp": time.time()})

        if "schema" in batch_kwargs:
            schema = batch_kwargs["schema"]
        else:
            schema = None

        if "table" in batch_kwargs:
            limit = batch_kwargs.get('limit')
            offset = batch_kwargs.get('offset')
            if limit is not None or offset is not None:
                logger.info(
                    "Generating query from table batch_kwargs based on limit and offset"
                )
                raw_query = sqlalchemy.select([sqlalchemy.text("*")])\
                    .select_from(sqlalchemy.schema.Table(batch_kwargs['table'], sqlalchemy.MetaData(), schema=schema))\
                    .offset(offset)\
                    .limit(limit)
                query = str(
                    raw_query.compile(self.engine,
                                      compile_kwargs={"literal_binds": True}))
                return data_asset_type(custom_sql=query,
                                       engine=self.engine,
                                       data_context=self._data_context,
                                       expectation_suite=expectation_suite,
                                       batch_kwargs=batch_kwargs,
                                       batch_id=batch_id)

            else:
                return data_asset_type(table_name=batch_kwargs["table"],
                                       engine=self.engine,
                                       schema=schema,
                                       data_context=self._data_context,
                                       expectation_suite=expectation_suite,
                                       batch_kwargs=batch_kwargs,
                                       batch_id=batch_id)

        elif "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
                logger.warning(
                    "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                    "adding limit and offset directly to the generated query.")
            if "bigquery_temp_table" in batch_kwargs:
                table_name = batch_kwargs.get("bigquery_temp_table")
            else:
                table_name = None

            query = Template(batch_kwargs["query"]).safe_substitute(**kwargs)
            return data_asset_type(custom_sql=query,
                                   engine=self.engine,
                                   table_name=table_name,
                                   data_context=self._data_context,
                                   expectation_suite=expectation_suite,
                                   batch_kwargs=batch_kwargs,
                                   batch_id=batch_id)

        else:
            raise ValueError(
                "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
            )
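
This variant adds limit/offset handling for table batch_kwargs by compiling an explicit SELECT (with literal binds) before building the dataset; for query batch_kwargs the parameters are ignored with a warning. A sketch (datasource instance assumed):

    # Sketch: paging a table batch via limit/offset.
    asset = datasource._get_data_asset(
        {"table": "events", "limit": 100, "offset": 200},
        expectation_suite=None)
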
Example 15
    def build_configuration(cls, data_asset_type=None, generators=None, boto3_options=None, reader_method=None,
                            reader_options=None, limit=None, **kwargs):
        """
        Build a full configuration object for a datasource, potentially including generators with defaults.

        Args:
            data_asset_type: A ClassConfig dictionary
            generators: Generator configuration dictionary
            boto3_options: Optional dictionary with key-value pairs to pass to boto3 during instantiation.
            reader_method: Optional default reader_method for generated batches
            reader_options: Optional default reader_options for generated batches
            limit: Optional default limit for generated batches
            **kwargs: Additional kwargs to be part of the datasource constructor's initialization

        Returns:
            A complete datasource configuration.

        """

        # PENDING DELETION - JPC - 20200130
        # if generators is None:
            # Provide a gentle way to build a datasource with a sane default,
            # including ability to specify the base_directory and reader_options
            # base_directory = kwargs.pop("base_directory", "data")
            # By default, use CSV sniffer to infer separator, which requires the python engine
            # reader_options = kwargs.pop("reader_options", {
            #     "sep": None,
            #     "engine": "python"
            # })
            # generators = {
            #     # "default": {
            #     #     "class_name": "SubdirReaderBatchKwargsGenerator",
            #     #     "base_directory": base_directory,
            #     #     "reader_options": reader_options
            #     # },
            #     # "passthrough": {
            #     #     "class_name": "PassthroughGenerator",
            #     # }
            # }
        if data_asset_type is None:
            data_asset_type = ClassConfig(
                class_name="PandasDataset")
        else:
            try:
                data_asset_type = ClassConfig(**data_asset_type)
            except TypeError:
                # In this case, we allow the passed config, for now, in case they're using a legacy string-only config
                pass

        configuration = kwargs
        configuration["data_asset_type"] = data_asset_type
        if generators:
            configuration["generators"] = generators

        if boto3_options is not None:
            if isinstance(boto3_options, dict):
                configuration.update(boto3_options)
            else:
                raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                                 "initialization.")

        if reader_options is not None:
            if isinstance(reader_options, dict):
                configuration.update(reader_options)
            else:
                raise ValueError("boto3_options must be a dictionary of key-value pairs to pass to boto3 upon "
                                 "initialization.")

        if reader_method is not None:
            configuration["reader_method"] = reader_method

        if limit is not None:
            configuration["limit"] = limit

        return configuration
Example 16
    def _get_data_asset(self,
                        batch_kwargs,
                        expectation_suite,
                        caching=True,
                        **kwargs):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        batch_kwargs.update(kwargs)
        reader_options = batch_kwargs.copy()

        if "data_asset_type" in reader_options:
            data_asset_type_config = reader_options.pop(
                "data_asset_type")  # Get and remove the config
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)
        if not issubclass(data_asset_type, SparkDFDataset):
            raise ValueError(
                "SparkDFDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                "must be a subclass of SparkDFDataset." %
                data_asset_type.__name__)

        if "path" in batch_kwargs or "s3" in batch_kwargs:
            if "path" in batch_kwargs:
                path = reader_options.pop(
                    "path"
                )  # We remove this so it is not used as a reader option
            else:
                path = reader_options.pop("s3")
            reader_options.pop("timestamp",
                               "")  # ditto timestamp (but missing ok)
            reader_method = reader_options.pop("reader_method", None)
            if reader_method is None:
                reader_method = self._guess_reader_method_from_path(path)
                if reader_method is None:
                    raise BatchKwargsError(
                        "Unable to determine reader for path: %s" % path,
                        batch_kwargs)
            else:
                try:
                    reader_method = ReaderMethods[reader_method]
                except KeyError:
                    raise BatchKwargsError(
                        "Unknown reader method: %s" % reader_method,
                        batch_kwargs)

            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)

            if reader_method == ReaderMethods.CSV:
                df = reader.csv(path)
            elif reader_method == ReaderMethods.parquet:
                df = reader.parquet(path)
            elif reader_method == ReaderMethods.delta:
                df = reader.format("delta").load(path)
            else:
                raise BatchKwargsError(
                    "Unsupported reader: %s" % reader_method.name,
                    batch_kwargs)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "df" in batch_kwargs and isinstance(batch_kwargs["df"],
                                                 (DataFrame, SparkDFDataset)):
            df = batch_kwargs.pop(
                "df")  # We don't want to store the actual DataFrame in kwargs
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            batch_kwargs["SparkDFRef"] = True

        else:
            raise BatchKwargsError(
                "Unrecognized batch_kwargs for spark_source", batch_kwargs)

        return data_asset_type(df,
                               expectation_suite=expectation_suite,
                               data_context=self._data_context,
                               batch_kwargs=batch_kwargs,
                               caching=caching)
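
Every batch_kwargs entry that is not consumed along the way (path/s3, timestamp, reader_method) is forwarded as a Spark reader option. A sketch (spark datasource instance assumed):

    # Sketch: leftover kwargs such as header become reader.option() calls.
    asset = spark_datasource._get_data_asset(
        {"path": "/data/events.csv", "reader_method": "CSV", "header": "true"},
        expectation_suite=None)
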
Example 17
    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
        batch_kwargs.update(kwargs)
        # pandas cannot take unicode as a delimiter, which can happen in py2. Handle this case explicitly.
        # We handle it here so that the updated value will be in the batch_kwargs for transparency to the user.
        if PY2 and "sep" in batch_kwargs and batch_kwargs["sep"] is not None:
            batch_kwargs["sep"] = str(batch_kwargs["sep"])
        # We will use and manipulate reader_options along the way
        reader_options = batch_kwargs.copy()

        # We need to build a batch_id to be used in the dataframe
        batch_id = BatchId({"timestamp": time.time()})

        if "data_asset_type" in batch_kwargs:
            data_asset_type_config = reader_options.pop(
                "data_asset_type")  # Get and remove the config
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)

        if not issubclass(data_asset_type, PandasDataset):
            raise ValueError(
                "PandasDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                "must be a subclass of PandasDataset." %
                data_asset_type.__name__)

        if "path" in batch_kwargs:
            path = reader_options.pop(
                "path")  # We remove this so it is not used as a reader option
            reader_options.pop("timestamp",
                               "")  # ditto timestamp (but missing ok)
            reader_options.pop("partition_id", "")

            reader_method = reader_options.pop("reader_method", None)
            reader_fn, reader_fn_options = self._get_reader_fn(
                reader_method, path, reader_options)
            try:
                df = getattr(pd, reader_fn)(path, **reader_fn_options)
            except AttributeError:
                raise BatchKwargsError(
                    "Unsupported reader: %s" % reader_method.name,
                    batch_kwargs)

        elif "s3" in batch_kwargs:
            try:
                import boto3
                s3 = boto3.client("s3")
            except ImportError:
                raise BatchKwargsError(
                    "Unable to load boto3 client to read s3 asset.",
                    batch_kwargs)
            raw_url = reader_options.pop(
                "s3")  # We need to remove from the reader
            reader_options.pop("timestamp",
                               "")  # ditto timestamp (but missing ok)
            reader_method = reader_options.pop("reader_method", None)
            url = S3Url(raw_url)
            logger.debug("Fetching s3 object. Bucket: %s Key: %s" %
                         (url.bucket, url.key))
            s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
            reader_fn, reader_fn_options = self._get_reader_fn(
                reader_method, url.key, reader_options)

            try:
                df = getattr(pd, reader_fn)(StringIO(
                    s3_object["Body"].read().decode(
                        s3_object.get("ContentEncoding", "utf-8"))),
                                            **reader_fn_options)
            except AttributeError:
                raise BatchKwargsError(
                    "Unsupported reader: %s" % reader_method.name,
                    batch_kwargs)
            except IOError:
                raise

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
            df = batch_kwargs.pop(
                "dataset"
            )  # We don't want to store the actual dataframe in kwargs
            # Record this in the kwargs *and* the id
            batch_kwargs["PandasInMemoryDF"] = True
            batch_id["PandasInMemoryDF"] = True

        else:
            raise BatchKwargsError(
                "Invalid batch_kwargs: path, s3, or df is required for a PandasDatasource",
                batch_kwargs)

        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_id["fingerprint"] = hashlib.md5(
                pd.util.hash_pandas_object(df, index=True).values).hexdigest()
        return data_asset_type(df,
                               expectation_suite=expectation_suite,
                               data_context=self._data_context,
                               batch_kwargs=batch_kwargs,
                               batch_id=batch_id)
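
For in-memory data the frame is popped out of batch_kwargs so it is never stored there; only the PandasInMemoryDF marker remains, recorded in both the kwargs and the batch_id. A sketch (pandas datasource instance assumed):

    # Sketch: in-memory batch via the "dataset" key.
    import pandas as pd

    asset = pandas_datasource._get_data_asset(
        {"dataset": pd.DataFrame({"a": [1, 2]})},
        expectation_suite=None)
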
Example 18
    def _get_data_asset(self,
                        batch_kwargs,
                        expectation_suite,
                        caching=True,
                        **kwargs):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        for k, v in kwargs.items():
            if isinstance(v, dict):
                if k in batch_kwargs and isinstance(batch_kwargs[k], dict):
                    batch_kwargs[k].update(v)
                else:
                    batch_kwargs[k] = v
            else:
                batch_kwargs[k] = v

        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build a batch_id to be used in the dataframe
        batch_id = BatchId({"timestamp": time.time()})

        if "data_asset_type" in batch_kwargs:
            data_asset_type_config = reader_options.pop(
                "data_asset_type")  # Get and remove the config
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)

        if not issubclass(data_asset_type, SparkDFDataset):
            raise ValueError(
                "SparkDFDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                "must be a subclass of SparkDFDataset." %
                data_asset_type.__name__)

        if "path" in batch_kwargs or "s3" in batch_kwargs:
            # If both are present, let s3 override
            path = batch_kwargs.get("path")
            path = batch_kwargs.get("s3", path)
            reader_method = batch_kwargs.get("reader_method")
            if reader_method is None:
                reader_method = self._guess_reader_method_from_path(path)
                if reader_method is None:
                    raise BatchKwargsError(
                        "Unable to determine reader for path: %s" % path,
                        batch_kwargs)
            else:
                try:
                    reader_method = ReaderMethods[reader_method]
                except KeyError:
                    raise BatchKwargsError(
                        "Unknown reader method: %s" % reader_method,
                        batch_kwargs)

            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)

            if reader_method == ReaderMethods.CSV:
                df = reader.csv(path)
            elif reader_method == ReaderMethods.parquet:
                df = reader.parquet(path)
            elif reader_method == ReaderMethods.delta:
                df = reader.format("delta").load(path)
            else:
                raise BatchKwargsError(
                    "Unsupported reader: %s" % reader_method.name,
                    batch_kwargs)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {
                k: batch_kwargs[k]
                for k in batch_kwargs if k != 'dataset'
            }
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            # Record this in the kwargs *and* the id
            batch_kwargs["SparkDFRef"] = True
            batch_id["SparkDFRef"] = True

        else:
            raise BatchKwargsError(
                "Unrecognized batch_kwargs for spark_source", batch_kwargs)

        if "limit" in batch_kwargs:
            df = df.limit(batch_kwargs['limit'])

        return data_asset_type(df,
                               expectation_suite=expectation_suite,
                               data_context=self._data_context,
                               batch_kwargs=batch_kwargs,
                               caching=caching,
                               batch_id=batch_id)
Example 19
    def _get_data_asset(self, batch_kwargs, expectation_suite, **kwargs):
        batch_kwargs.update(kwargs)
        reader_options = batch_kwargs.copy()

        if "data_asset_type" in reader_options:
            data_asset_type_config = reader_options.pop(
                "data_asset_type")  # Get and remove the config
            try:
                data_asset_type_config = ClassConfig(**data_asset_type_config)
            except TypeError:
                # We tried; we'll pass the config downstream, probably as a string, and handle an error later
                pass
        else:
            data_asset_type_config = self._data_asset_type

        data_asset_type = self._get_data_asset_class(data_asset_type_config)

        if not issubclass(data_asset_type, PandasDataset):
            raise ValueError(
                "PandasDatasource cannot instantiate batch with data_asset_type: '%s'. It "
                "must be a subclass of PandasDataset." %
                data_asset_type.__name__)

        if "path" in batch_kwargs:
            path = reader_options.pop(
                "path")  # We need to remove from the reader
            reader_options.pop("timestamp",
                               "")  # ditto timestamp (but missing ok)

            reader_method = reader_options.pop("reader_method", None)
            if reader_method is None:
                reader_method = self._guess_reader_method_from_path(path)
                if reader_method is None:
                    raise BatchKwargsError(
                        "Unable to determine reader for path: %s" % path,
                        batch_kwargs)
            else:
                try:
                    reader_method = ReaderMethods[reader_method]
                except KeyError:
                    raise BatchKwargsError(
                        "Unknown reader method: %s" % reader_method,
                        batch_kwargs)

            if reader_method == ReaderMethods.CSV:
                df = pd.read_csv(path, **reader_options)
            elif reader_method == ReaderMethods.parquet:
                df = pd.read_parquet(path, **reader_options)
            elif reader_method == ReaderMethods.excel:
                df = pd.read_excel(path, **reader_options)
            elif reader_method == ReaderMethods.JSON:
                df = pd.read_json(path, **reader_options)
            else:
                raise BatchKwargsError(
                    "Unsupported reader: %s" % reader_method.name,
                    batch_kwargs)

        elif "df" in batch_kwargs and isinstance(batch_kwargs["df"],
                                                 (pd.DataFrame, pd.Series)):
            df = batch_kwargs.pop(
                "df")  # We don't want to store the actual dataframe in kwargs
            batch_kwargs["PandasInMemoryDF"] = True
        else:
            raise BatchKwargsError(
                "Invalid batch_kwargs: path or df is required for a PandasDatasource",
                batch_kwargs)

        return data_asset_type(df,
                               expectation_suite=expectation_suite,
                               data_context=self._data_context,
                               batch_kwargs=batch_kwargs)
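
This older Pandas variant keys the in-memory frame as df rather than dataset and dispatches path reads through the ReaderMethods enum instead of _get_reader_fn. A closing sketch (pandas datasource instance assumed):

    # Sketch: the legacy in-memory key.
    import pandas as pd

    asset = pandas_datasource._get_data_asset(
        {"df": pd.DataFrame({"a": [1, 2]})},
        expectation_suite=None)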