Example #1
0
    def build_batch_spec(self, batch_definition: BatchDefinition) -> BatchSpec:
        """
        Build a BatchSpec from a BatchDefinition.

        Generates the batch_spec parameters from the batch_definition, then
        layers in any batch_spec_passthrough values before constructing the
        BatchSpec.

        Args:
            batch_definition (BatchDefinition): required batch_definition parameter for retrieval
        Returns:
            BatchSpec object built from BatchDefinition
        """
        # Parameters derived from the batch_definition itself.
        spec_kwargs: dict = self._generate_batch_spec_parameters_from_batch_definition(
            batch_definition=batch_definition
        )

        # Copy the Data Connector's configured passthrough so we never mutate it.
        passthrough: dict = deepcopy(self.batch_spec_passthrough)

        # batch_spec_passthrough from the batch_definition supersedes the
        # batch_spec_passthrough from the Data Connector config.
        if isinstance(batch_definition.batch_spec_passthrough, dict):
            passthrough.update(batch_definition.batch_spec_passthrough)

        spec_kwargs.update(passthrough)
        return BatchSpec(**spec_kwargs)
Example #2
0
def batch_fixture() -> Batch:
    """
    Fixture for Batch object that contains data, BatchRequest, BatchDefinition
    as well as BatchSpec and BatchMarkers. To be used in unittesting.
    """
    # Small deterministic frame so test assertions are easy to write.
    test_df: pd.DataFrame = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]}
    )
    # Assemble the Batch inline from its component objects.
    return Batch(
        data=test_df,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({"id": "A"}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
Example #3
0
 def build_batch_spec(self, batch_definition: BatchDefinition) -> BatchSpec:
     """Build a BatchSpec from *batch_definition*, merging in any batch_spec_passthrough."""
     spec_kwargs: dict = self._generate_batch_spec_parameters_from_batch_definition(
         batch_definition=batch_definition
     )
     passthrough = batch_definition.batch_spec_passthrough
     # Only merge passthrough values when they were actually supplied as a dict.
     if isinstance(passthrough, dict):
         spec_kwargs.update(passthrough)
     return BatchSpec(**spec_kwargs)
Example #4
0
    def __init__(
        self,
        data,
        batch_request: BatchRequest = None,
        batch_definition: BatchDefinition = None,
        batch_spec: BatchSpec = None,
        batch_markers: BatchMarkers = None,
        # The remaining parameters are for backward compatibility.
        data_context=None,
        datasource_name=None,
        batch_parameters=None,
        batch_kwargs=None,
    ):
        """Initialize a Batch from its data plus request/definition/spec/markers metadata."""
        self._data = data
        # Fall back to empty containers so downstream code can rely on the types.
        self._batch_request = batch_request if batch_request is not None else dict()
        self._batch_definition = (
            batch_definition if batch_definition is not None else IDDict()
        )
        self._batch_spec = batch_spec if batch_spec is not None else BatchSpec()

        if batch_markers is None:
            # Stamp the load time (UTC) in the project's compact ISO-like format.
            load_time: str = datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
            batch_markers = BatchMarkers({"ge_load_time": load_time})
        self._batch_markers = batch_markers

        # The remaining parameters are for backward compatibility.
        self._data_context = data_context
        self._datasource_name = datasource_name
        self._batch_parameters = batch_parameters
        self._batch_kwargs = batch_kwargs or BatchKwargs()
Example #5
0
def test_sample_using_limit_builds_correct_query_where_clause_none(
    dialect_name: GESqlDialect, dialect_name_to_sql_statement, sa
):
    """What does this test and why?

    split_on_limit should build the appropriate query based on input parameters.
    This tests dialects that differ from the standard dialect, not each dialect exhaustively.
    """

    # 1. Setup
    # Minimal stand-in for SqlAlchemyExecutionEngine: exposes only the
    # dialect_name / dialect members that sample_using_limit reads.
    class MockSqlAlchemyExecutionEngine:
        def __init__(self, dialect_name: GESqlDialect):
            self._dialect_name = dialect_name
            self._connection_string = self.dialect_name_to_connection_string(
                dialect_name
            )

        # Connection-string stubs per dialect; only the scheme matters here,
        # since the engine is never actually connected.
        DIALECT_TO_CONNECTION_STRING_STUB: dict = {
            GESqlDialect.POSTGRESQL: "postgresql://",
            GESqlDialect.MYSQL: "mysql+pymysql://",
            GESqlDialect.ORACLE: "oracle+cx_oracle://",
            GESqlDialect.MSSQL: "mssql+pyodbc://",
            GESqlDialect.SQLITE: "sqlite:///",
            GESqlDialect.BIGQUERY: "bigquery://",
            GESqlDialect.SNOWFLAKE: "snowflake://",
            GESqlDialect.REDSHIFT: "redshift+psycopg2://",
            GESqlDialect.AWSATHENA: f"awsathena+rest://@athena.us-east-1.amazonaws.com/some_test_db?s3_staging_dir=s3://some-s3-path/",
            GESqlDialect.DREMIO: "dremio://",
            GESqlDialect.TERADATASQL: "teradatasql://",
            GESqlDialect.TRINO: "trino://",
            GESqlDialect.HIVE: "hive://",
        }

        @property
        def dialect_name(self) -> str:
            # Return the enum's string value to mirror the real engine's API.
            return self._dialect_name.value

        def dialect_name_to_connection_string(self, dialect_name: GESqlDialect) -> str:
            # NOTE(review): returns None for dialects missing from the stub map.
            return self.DIALECT_TO_CONNECTION_STRING_STUB.get(dialect_name)

        _BIGQUERY_MODULE_NAME = "sqlalchemy_bigquery"

        @property
        def dialect(self) -> sa.engine.Dialect:
            # TODO: AJB 20220512 move this dialect retrieval to a separate class from the SqlAlchemyExecutionEngine
            #  and then use it here.
            # Dialects whose SQLAlchemy plugin lives in a separate package are
            # imported directly; everything else goes through create_engine.
            dialect_name: GESqlDialect = self._dialect_name
            if dialect_name == GESqlDialect.ORACLE:
                # noinspection PyUnresolvedReferences
                return import_library_module(
                    module_name="sqlalchemy.dialects.oracle"
                ).dialect()
            elif dialect_name == GESqlDialect.SNOWFLAKE:
                # noinspection PyUnresolvedReferences
                return import_library_module(
                    module_name="snowflake.sqlalchemy.snowdialect"
                ).dialect()
            elif dialect_name == GESqlDialect.DREMIO:
                # WARNING: Dremio Support is experimental, functionality is not fully under test
                # noinspection PyUnresolvedReferences
                return import_library_module(
                    module_name="sqlalchemy_dremio.pyodbc"
                ).dialect()
            # NOTE: AJB 20220512 Redshift dialect is not yet fully supported.
            # The below throws an `AttributeError: type object 'RedshiftDialect_psycopg2' has no attribute 'positional'`
            # elif dialect_name == "redshift":
            #     return import_library_module(
            #         module_name="sqlalchemy_redshift.dialect"
            #     ).RedshiftDialect
            elif dialect_name == GESqlDialect.BIGQUERY:
                # noinspection PyUnresolvedReferences
                return import_library_module(
                    module_name=self._BIGQUERY_MODULE_NAME
                ).dialect()
            elif dialect_name == GESqlDialect.TERADATASQL:
                # WARNING: Teradata Support is experimental, functionality is not fully under test
                # noinspection PyUnresolvedReferences
                return import_library_module(
                    module_name="teradatasqlalchemy.dialect"
                ).dialect()
            else:
                # Default: let SQLAlchemy resolve the dialect from the URL scheme.
                return sa.create_engine(self._connection_string).dialect

    mock_execution_engine: MockSqlAlchemyExecutionEngine = (
        MockSqlAlchemyExecutionEngine(dialect_name=dialect_name)
    )

    data_sampler: SqlAlchemyDataSampler = SqlAlchemyDataSampler()

    # 2. Create query using sampler
    table_name: str = "test_table"
    batch_spec: BatchSpec = BatchSpec(
        table_name=table_name,
        schema_name="test_schema_name",
        sampling_method="sample_using_limit",
        sampling_kwargs={"n": 10},
    )
    query = data_sampler.sample_using_limit(
        execution_engine=mock_execution_engine, batch_spec=batch_spec, where_clause=None
    )

    # Some dialects return a raw SQL string, others a SQLAlchemy selectable
    # that must be compiled (with literal binds) before comparison.
    if not isinstance(query, str):
        query_str: str = clean_query_for_comparison(
            str(
                query.compile(
                    dialect=mock_execution_engine.dialect,
                    compile_kwargs={"literal_binds": True},
                )
            )
        )
    else:
        query_str: str = clean_query_for_comparison(query)

    # Both sides are normalized through clean_query_for_comparison so the
    # assertion is insensitive to whitespace/formatting differences.
    expected: str = clean_query_for_comparison(
        dialect_name_to_sql_statement(dialect_name)
    )

    assert query_str == expected