def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Create the batch data object and load-time markers for ``batch_spec``.

    Args:
        batch_spec: Batch specification. The keys read directly here are
            ``bigquery_temp_table``, ``create_temp_table``, ``table_name``
            and ``schema_name``; the whole spec is also handed to
            ``_build_selectable_from_batch_spec``.

    Returns:
        A ``(batch_data, batch_markers)`` tuple: a ``SqlAlchemyBatchData``
        built around the selectable, and a ``BatchMarkers`` holding a
        ``ge_load_time`` timestamp taken in UTC.
    """
    query_selectable = self._build_selectable_from_batch_spec(batch_spec=batch_spec)

    batch_data = SqlAlchemyBatchData(
        execution_engine=self,
        selectable=query_selectable,
        # ``.get`` yields None when no temp table name was supplied.
        temp_table_name=batch_spec.get("bigquery_temp_table"),
        # The engine-level setting is only the fallback; the spec wins.
        create_temp_table=batch_spec.get(
            "create_temp_table", self._create_temp_table
        ),
        source_table_name=batch_spec.get("table_name", None),
        source_schema_name=batch_spec.get("schema_name", None),
    )

    load_time = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y%m%dT%H%M%S.%fZ"
    )
    return batch_data, BatchMarkers({"ge_load_time": load_time})
def test_to_make_sure_splitter_and_sampler_methods_are_optional(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Batches load with a sampler only, with neither option, and with a splitter only."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    def fetched_row_count(extra_spec):
        # Build a fresh spec on every call so no dict is shared between batches.
        spec = {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
        }
        spec.update(extra_spec)
        batch_data, _markers = engine.get_batch_data_and_markers(
            batch_spec=BatchSpec(spec)
        )
        return len(batch_data.head(fetch_all=True))

    # Sampler without a splitter.
    sampler_only = {
        "sampling_method": "_sample_using_mod",
        "sampling_kwargs": {
            "column_name": "id",
            "mod": 10,
            "value": 8,
        },
    }
    assert fetched_row_count(sampler_only) == 12

    # Neither splitter nor sampler.
    assert fetched_row_count({}) == 120

    # Splitter without a sampler.
    splitter_only = {
        "splitter_method": "_split_on_whole_table",
        "splitter_kwargs": {},
    }
    assert fetched_row_count(splitter_only) == 120
def _build_selectable_from_batch_spec(
    self, batch_spec: BatchSpec
) -> Union[Selectable, str]:
    """Turn ``batch_spec`` into the query that selects the batch's rows.

    Args:
        batch_spec: Batch specification. ``table_name`` is required.
            ``splitter_method`` (with ``batch_identifiers`` and
            ``splitter_kwargs``), ``sampling_method`` and ``schema_name``
            are read when present.

    Returns:
        For the limit/random sampling methods, whatever the sampler obtained
        from ``self._data_sampler`` returns. Otherwise a ``SELECT *`` over the
        table, filtered by the split clause and, when a sampler is
        configured, by the sampler's clause as well.
    """
    if "splitter_method" not in batch_spec:
        # No splitter configured: use a constant-true filter.
        where_filter = True
    else:
        split: Callable = self._get_splitter_method(
            splitter_method_name=batch_spec["splitter_method"]
        )
        where_filter = split(
            batch_identifiers=batch_spec["batch_identifiers"],
            **batch_spec["splitter_kwargs"],
        )

    table_name: str = batch_spec["table_name"]
    sampling_method: Optional[str] = batch_spec.get("sampling_method")

    if sampling_method is None:
        source = sa.table(table_name, schema=batch_spec.get("schema_name", None))
        return sa.select("*").select_from(source).where(where_filter)

    sampler = self._data_sampler.get_sampler_method(sampling_method)

    # These samplers receive the engine and the where clause and return the
    # finished result themselves instead of a clause to AND into the query.
    builds_own_query = (
        "_sample_using_limit",
        "sample_using_limit",
        "_sample_using_random",
        "sample_using_random",
    )
    if sampling_method in builds_own_query:
        return sampler(
            execution_engine=self,
            batch_spec=batch_spec,
            where_clause=where_filter,
        )

    # Every other sampler yields a clause combined with the split clause.
    source = sa.table(table_name, schema=batch_spec.get("schema_name", None))
    combined_filter = sa.and_(where_filter, sampler(batch_spec))
    return sa.select("*").select_from(source).where(combined_filter)
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Limit sampling with n=20 yields 20 rows, not all dated 2020-01-02."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    limit_spec = BatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_limit",
            "sampling_kwargs": {"n": 20},
        }
    )
    batch_data, _markers = engine.get_batch_data_and_markers(batch_spec=limit_spec)

    # Register the batch on the engine before validating through it.
    engine.load_batch_data("__", batch_data)
    validator = Validator(engine)

    assert len(validator.head(fetch_all=True)) == 20

    in_set_result = validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    )
    # NOTE(review): `== False` is kept as written; `success` may not be the
    # built-in bool singleton, so an identity check could differ. Confirm
    # before changing.
    assert in_set_result.success == False
def test_batch__str__method():
    """The string form of a ``Batch`` matches the expected JSON-like text."""
    request = BatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
    )
    definition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
        batch_identifiers=IDDict({}),
    )
    batch = Batch(
        data=None,
        batch_request=request,
        batch_definition=definition,
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )

    # NOTE(review): the literal below is reproduced exactly as it appears in
    # the file. If its line breaks/indentation were collapsed in transit,
    # restore them to match what Batch.__str__ emits - confirm.
    expected = """{ "data": "None", "batch_request": { "datasource_name": "my_datasource", "data_connector_name": "my_data_connector", "data_asset_name": "my_data_asset_name" }, "batch_definition": { "datasource_name": "my_datasource", "data_connector_name": "my_data_connector", "data_asset_name": "my_data_asset_name", "batch_identifiers": {} }, "batch_spec": "{'path': '/some/path/some.file'}", "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}" }"""

    print(str(batch))
    assert str(batch) == expected
def build_batch_spec(self, batch_definition: BatchDefinition):
    """Assemble the ``BatchSpec`` that corresponds to ``batch_definition``.

    Args:
        batch_definition: Supplies ``data_asset_name`` (used as the table
            name and as the key into ``self.data_assets``) and
            ``partition_definition``.

    Returns:
        A ``BatchSpec`` containing ``table_name``, ``partition_definition``
        and every entry of ``self.data_assets[data_asset_name]``.
    """
    asset_name = batch_definition.data_asset_name
    spec_fields = {
        "table_name": asset_name,
        "partition_definition": batch_definition.partition_definition,
    }
    # Asset-level settings are merged last, so on a key collision they
    # override the two entries above.
    spec_fields.update(self.data_assets[asset_name])
    return BatchSpec(spec_fields)
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Load the batch described by ``batch_spec`` and wrap it for this engine.

    Args:
        batch_spec: A ``RuntimeDataBatchSpec`` (data is taken from its
            ``batch_data`` attribute) or a ``PathBatchSpec`` / ``S3BatchSpec``
            (data is read from its ``path`` or ``s3`` entry using
            ``reader_method`` and ``reader_options``).

    Returns:
        A ``(batch_data, batch_markers)`` tuple: a ``SparkDFBatchData``
        wrapping the data after splitting and sampling were applied, and a
        ``BatchMarkers`` holding a ``ge_load_time`` timestamp taken in UTC.

    Raises:
        ExecutionEngineError: If an ``AttributeError`` occurs while the reader
            is set up or the path is read; the original error is attached as
            the cause.
        BatchSpecError: If ``batch_spec`` is none of the supported types.
    """
    batch_data: DataFrame

    # The markers are built first and returned alongside the data.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        # The spec keeps only a placeholder string from here on
        # (presumably so it no longer holds the frame itself - confirm).
        batch_spec.batch_data = "SparkDataFrame"
    elif isinstance(batch_spec, (PathBatchSpec, S3BatchSpec)):
        reader_method: str = batch_spec.get("reader_method")
        reader_options: dict = batch_spec.get("reader_options") or {}
        path: str = batch_spec.get("path") or batch_spec.get("s3")
        try:
            # Kept under its own name instead of rebinding ``reader_options``,
            # which is annotated (and used above) as a plain dict.
            configured_reader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=configured_reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError as err:
            # Chain explicitly so the underlying AttributeError stays visible
            # as the direct cause of the engine error.
            raise ExecutionEngineError(
                """ Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine. """
            ) from err
    else:
        raise BatchSpecError(
            """ Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate. """
        )

    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    typed_batch_data = SparkDFBatchData(batch_data)

    return typed_batch_data, batch_markers
def test_instantiation_via_connection_string(sa, test_db_connection_string):
    """An engine built from a connection string keeps it and leaves the other sources unset."""
    my_execution_engine = SqlAlchemyExecutionEngine(
        connection_string=test_db_connection_string
    )

    assert my_execution_engine.connection_string == test_db_connection_string
    # Identity comparison: these must be exactly None, not merely equal to it.
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url is None

    # Smoke check only: the call must complete; its result is not inspected.
    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="main.table_1",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        )
    )
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Create the batch data object and load-time markers for ``batch_spec``.

    Args:
        batch_spec: A ``RuntimeQueryBatchSpec`` (its ``query`` attribute is
            used) or a ``SqlAlchemyDatasourceBatchSpec`` (a selectable is
            built via ``_build_selectable_from_batch_spec``). Optional keys
            read here: ``schema_name``, ``table_name``,
            ``temp_table_schema_name``, ``bigquery_temp_table`` and
            ``create_temp_table``.

    Returns:
        A ``(batch_data, batch_markers)`` tuple: a ``SqlAlchemyBatchData``
        and a ``BatchMarkers`` holding a ``ge_load_time`` timestamp taken in
        UTC.

    Raises:
        InvalidBatchSpecError: If ``batch_spec`` is neither of the two
            supported types.
    """
    if not isinstance(
        batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
    ):
        raise InvalidBatchSpecError(
            f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received). """
        )

    batch_data: Optional[SqlAlchemyBatchData] = None
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    source_schema_name: str = batch_spec.get("schema_name", None)
    source_table_name: str = batch_spec.get("table_name", None)
    temp_table_schema_name: Optional[str] = batch_spec.get("temp_table_schema_name")
    temp_table_name: Optional[str] = batch_spec.get("bigquery_temp_table")
    # The engine-level setting is only the fallback; the spec wins.
    create_temp_table: bool = batch_spec.get(
        "create_temp_table", self._create_temp_table
    )

    if isinstance(batch_spec, RuntimeQueryBatchSpec):
        # query != None is already checked when RuntimeQueryBatchSpec is instantiated
        query: str = batch_spec.query
        # The spec keeps only a placeholder string from here on.
        batch_spec.query = "SQLQuery"
        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            query=query,
            temp_table_schema_name=temp_table_schema_name,
            temp_table_name=temp_table_name,
            create_temp_table=create_temp_table,
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )
    elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
        # The builder call is the same for every dialect, so no dialect
        # branch is needed here. The result was previously annotated as a
        # str for Oracle and as a Selectable otherwise.
        selectable = self._build_selectable_from_batch_spec(batch_spec=batch_spec)
        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            selectable=selectable,
            temp_table_name=temp_table_name,
            create_temp_table=create_temp_table,
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )

    return batch_data, batch_markers
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Limit sampling with n=20 returns exactly 20 rows."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    limit_spec = BatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_limit",
            "sampling_kwargs": {"n": 20},
        }
    )
    batch_data, _markers = engine.get_batch_data_and_markers(batch_spec=limit_spec)

    fetched_rows = batch_data.head(fetch_all=True)
    assert len(fetched_rows) == 20
def test_instantiation_via_url(sa):
    """An engine built from a URL keeps it and leaves the other sources unset."""
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)

    # Identity comparison: these must be exactly None, not merely equal to it.
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    # Check the suffix directly rather than slicing by a hard-coded length.
    assert my_execution_engine.url.endswith("test_cases_for_sql_data_connector.db")

    # Smoke check only: the call must complete; its result is not inspected.
    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        )
    )
def test_sampling_method__random(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Random sampling with p=1.0 builds a batch without raising."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    random_spec = BatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_random",
            "sampling_kwargs": {"p": 1.0},
        }
    )
    batch_data, batch_markers = engine.get_batch_data_and_markers(
        batch_spec=random_spec
    )

    # random.seed() is no good here: the random number generator is in the database, not python
    # assert len(batch_data.head(fetch_all=True)) == 63
def test_sampling_method__a_list(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """List sampling on ``id`` with four values yields four rows."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    list_spec = BatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_a_list",
            "sampling_kwargs": {
                "column_name": "id",
                "value_list": [10, 20, 30, 40],
            },
        }
    )
    batch_data, _markers = engine.get_batch_data_and_markers(batch_spec=list_spec)

    # Register the batch on the engine before validating through it.
    engine.load_batch_data("__", batch_data)
    validator = Validator(engine)

    fetched_rows = validator.head(fetch_all=True)
    assert len(fetched_rows) == 4
def build_batch_spec(self, batch_definition: BatchDefinition):
    """
    Build the BatchSpec for a batch_definition out of three parts:
        1. the table name, taken from the definition's data_asset_name
        2. the definition's partition_definition
        3. the data_asset entry this data connector holds under that name

    Args:
        batch_definition (BatchDefinition): definition the spec is derived from

    Returns:
        BatchSpec combining the three parts above
    """
    asset_name = batch_definition.data_asset_name
    from_definition = {
        "table_name": asset_name,
        "partition_definition": batch_definition.partition_definition,
    }
    asset_settings = self.data_assets[asset_name]
    # Asset settings are unpacked last, so they take precedence on key clashes.
    return BatchSpec({**from_definition, **asset_settings})
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Create the batch data object and load-time markers for ``batch_spec``.

    Args:
        batch_spec: A ``RuntimeQueryBatchSpec`` (its ``query`` attribute is
            used) or a ``SqlAlchemyDatasourceBatchSpec`` (a selectable is
            built via ``_build_selectable_from_batch_spec``). Optional keys
            read here: ``schema_name``, ``table_name``,
            ``temp_table_schema_name`` and ``create_temp_table``. A truthy
            ``bigquery_temp_table`` entry only triggers a
            ``DeprecationWarning``; its value is not used.

    Returns:
        A ``(batch_data, batch_markers)`` tuple: a ``SqlAlchemyBatchData``
        and a ``BatchMarkers`` holding a ``ge_load_time`` timestamp taken in
        UTC.

    Raises:
        InvalidBatchSpecError: If ``batch_spec`` is neither of the two
            supported types.
    """
    if not isinstance(
        batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
    ):
        raise InvalidBatchSpecError(
            f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received). """
        )

    batch_data: Optional[SqlAlchemyBatchData] = None
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    source_schema_name: str = batch_spec.get("schema_name", None)
    source_table_name: str = batch_spec.get("table_name", None)
    temp_table_schema_name: Optional[str] = batch_spec.get("temp_table_schema_name")

    if batch_spec.get("bigquery_temp_table"):
        # deprecated-v0.15.3
        # Adjacent literals are joined with no separator, so every fragment
        # that continues a sentence must end with its own trailing space.
        warnings.warn(
            "BigQuery tables that are created as the result of a query are no longer created as "
            "permanent tables. Thus, a named permanent table through the `bigquery_temp_table` "
            "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of "
            "v0.15.3 and will be removed in v0.18.",
            DeprecationWarning,
        )

    # The engine-level setting is only the fallback; the spec wins.
    create_temp_table: bool = batch_spec.get(
        "create_temp_table", self._create_temp_table
    )

    if isinstance(batch_spec, RuntimeQueryBatchSpec):
        # query != None is already checked when RuntimeQueryBatchSpec is instantiated
        query: str = batch_spec.query
        # The spec keeps only a placeholder string from here on.
        batch_spec.query = "SQLQuery"
        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            query=query,
            temp_table_schema_name=temp_table_schema_name,
            create_temp_table=create_temp_table,
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )
    elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
        # The builder call is the same for every dialect, so no dialect
        # branch is needed here. The result was previously annotated as a
        # str for Oracle and as a Selectable otherwise.
        selectable = self._build_selectable_from_batch_spec(batch_spec=batch_spec)
        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            selectable=selectable,
            create_temp_table=create_temp_table,
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )

    return batch_data, batch_markers
def test_instantiation_with_and_without_temp_table(sqlite_view_engine, sa):
    """Temp tables appear only for selectable-based batch data with create_temp_table enabled."""

    def temp_table_count():
        return len(get_sqlite_temp_table_names(sqlite_view_engine))

    # Starting point: only the pre-existing temp view is present.
    print(get_sqlite_temp_table_names(sqlite_view_engine))
    assert temp_table_count() == 1
    assert get_sqlite_temp_table_names(sqlite_view_engine) == {"test_temp_view"}

    engine = SqlAlchemyExecutionEngine(engine=sqlite_view_engine)

    # When the SqlAlchemyBatchData object is based on a table, a new temp table is NOT created, even if create_temp_table=True
    SqlAlchemyBatchData(
        execution_engine=engine,
        table_name="test_table",
        create_temp_table=True,
    )
    assert temp_table_count() == 1

    text_selectable = sa.select("*").select_from(sa.text("main.test_table"))

    # If create_temp_table=False, a new temp table should NOT be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=text_selectable,
        create_temp_table=False,
    )
    assert temp_table_count() == 1

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=text_selectable,
        create_temp_table=True,
    )
    assert temp_table_count() == 2

    # create_temp_table defaults to True, so omitting it also creates a new temp table
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=text_selectable,
    )
    assert temp_table_count() == 3

    # testing whether schema is supported
    schema_selectable = sa.select("*").select_from(
        sa.table(name="test_table", schema="main")
    )
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=schema_selectable,
        # create_temp_table defaults to True
    )
    assert temp_table_count() == 4

    # test schema with execution engine
    # TODO : Will20210222 Add tests for specifying schema with non-sqlite backend that actually supports new schema creation
    schema_batch_spec = BatchSpec(
        table_name="test_table",
        partition_definition={},
        schema_name="main",
    )
    data_and_markers = engine.get_batch_data_and_markers(batch_spec=schema_batch_spec)
    assert len(data_and_markers) == 2
def _build_selectable_from_batch_spec(
    self, batch_spec: BatchSpec
) -> Union[Selectable, str]:
    """Turn ``batch_spec`` into the query that selects the batch's rows.

    Args:
        batch_spec: Batch specification. ``table_name`` is required.
            ``splitter_method`` (with ``batch_identifiers`` and
            ``splitter_kwargs``), ``sampling_method`` (with
            ``sampling_kwargs``) and ``schema_name`` are read when present.
            Splitter and sampler names are looked up as attributes of this
            object.

    Returns:
        A ``SELECT *`` over the table restricted by the split clause and the
        chosen sampling. For ``_sample_using_limit`` on an Oracle engine the
        result is a compiled SQL string with a ``ROWNUM`` condition appended;
        in every other case it is a selectable.
    """
    table_name: str = batch_spec["table_name"]

    if "splitter_method" not in batch_spec:
        # No splitter configured: use a constant-true filter.
        where_filter = True
    else:
        split = getattr(self, batch_spec["splitter_method"])
        where_filter = split(
            table_name=table_name,
            batch_identifiers=batch_spec["batch_identifiers"],
            **batch_spec["splitter_kwargs"],
        )

    source = sa.table(table_name, schema=batch_spec.get("schema_name", None))

    if "sampling_method" not in batch_spec:
        return sa.select("*").select_from(source).where(where_filter)

    method_name = batch_spec["sampling_method"]

    if method_name == "_sample_using_limit":
        # SQLalchemy's semantics for LIMIT are different than normal WHERE clauses,
        # so the business logic for building the query needs to be different.
        unlimited = sa.select("*").select_from(source).where(where_filter)
        if self.engine.dialect.name.lower() != "oracle":
            return unlimited.limit(batch_spec["sampling_kwargs"]["n"])

        # limit doesn't compile properly for oracle, so the row cap is
        # appended to the compiled query text as a ROWNUM condition.
        compiled = unlimited.compile(
            self.engine, compile_kwargs={"literal_binds": True}
        )
        return str(compiled) + "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]

    if method_name == "_sample_using_random":
        # Count the rows matching the split first, then keep a randomly
        # ordered prefix whose size is the requested fraction of that count.
        count_query = (
            sa.select([sa.func.count()]).select_from(source).where(where_filter)
        )
        num_rows: int = self.engine.execute(count_query).scalar()
        # NOTE(review): `or 1.0` also replaces an explicit p of 0 with 1.0.
        p: float = batch_spec["sampling_kwargs"]["p"] or 1.0
        sample_size: int = round(p * num_rows)
        return (
            sa.select("*")
            .select_from(source)
            .where(where_filter)
            .order_by(sa.func.random())
            .limit(sample_size)
        )

    # Any other sampler yields a clause combined with the split clause.
    sampler = getattr(self, method_name)
    sample_filter = sampler(**batch_spec["sampling_kwargs"])
    return (
        sa.select("*")
        .select_from(source)
        .where(sa.and_(where_filter, sample_filter))
    )