def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Build SqlAlchemy batch data plus load-time markers for ``batch_spec``.

    Builds the selectable from the spec, wraps it in a
    ``SqlAlchemyBatchData``, and returns it together with a
    ``BatchMarkers`` holding the UTC load timestamp.
    """
    # Build the selectable first, mirroring the original evaluation order.
    built_selectable = self._build_selectable_from_batch_spec(batch_spec=batch_spec)

    # ``.get`` yields None when the key is absent, which is exactly what the
    # explicit membership check it replaces produced.
    bigquery_temp_table = batch_spec.get("bigquery_temp_table")

    sql_batch_data = SqlAlchemyBatchData(
        execution_engine=self,
        selectable=built_selectable,
        temp_table_name=bigquery_temp_table,
        create_temp_table=batch_spec.get("create_temp_table", self._create_temp_table),
        source_table_name=batch_spec.get("table_name", None),
        source_schema_name=batch_spec.get("schema_name", None),
    )

    # Marker recording when this batch was loaded (UTC, compact timestamp).
    load_time = datetime.datetime.now(datetime.timezone.utc).strftime(
        "%Y%m%dT%H%M%S.%fZ"
    )
    markers = BatchMarkers({"ge_load_time": load_time})

    return sql_batch_data, markers
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Create SqlAlchemy batch data and load-time markers for ``batch_spec``.

    Accepts only ``SqlAlchemyDatasourceBatchSpec`` or
    ``RuntimeQueryBatchSpec``; any other spec type raises
    ``InvalidBatchSpecError``.
    """
    if not isinstance(
        batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
    ):
        raise InvalidBatchSpecError(
            f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
            """
        )

    # Marker recording when this batch was loaded (UTC, compact timestamp).
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    # Keyword arguments shared by both construction paths below.
    common_kwargs = {
        "execution_engine": self,
        "temp_table_name": batch_spec.get("bigquery_temp_table"),
        "create_temp_table": batch_spec.get(
            "create_temp_table", self._create_temp_table
        ),
        "source_table_name": batch_spec.get("table_name", None),
        "source_schema_name": batch_spec.get("schema_name", None),
    }

    if isinstance(batch_spec, RuntimeQueryBatchSpec):
        # query != None is already checked when RuntimeQueryBatchSpec is instantiated
        query: str = batch_spec.query
        # Replace the raw query on the spec with a short placeholder.
        batch_spec.query = "SQLQuery"
        batch_data = SqlAlchemyBatchData(
            query=query,
            temp_table_schema_name=batch_spec.get("temp_table_schema_name"),
            **common_kwargs,
        )
    else:
        # Both dialect branches of the original made the identical call; only
        # the annotation differed — Oracle's build path returns a raw SQL
        # string, other dialects a Selectable.
        selectable: Union[Selectable, str] = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec
        )
        batch_data = SqlAlchemyBatchData(
            selectable=selectable,
            **common_kwargs,
        )

    return batch_data, batch_markers
def _build_selectable_from_batch_spec(
    self, batch_spec: BatchSpec
) -> Union[Selectable, str]:
    """Build the SELECT for ``batch_spec``, applying splitting and sampling.

    A configured splitter contributes the WHERE clause; a configured sampler
    either builds the whole query itself (limit/random samplers) or adds an
    extra condition to the shared ``SELECT * FROM <table>`` shape.
    """
    if "splitter_method" in batch_spec:
        splitter_fn: Callable = self._get_splitter_method(
            splitter_method_name=batch_spec["splitter_method"]
        )
        where_clause = splitter_fn(
            batch_identifiers=batch_spec["batch_identifiers"],
            **batch_spec["splitter_kwargs"],
        )
    else:
        # No splitter configured: a trivially-true condition.
        where_clause = True

    table_name: str = batch_spec["table_name"]
    sampling_method: Optional[str] = batch_spec.get("sampling_method")

    def _base_select(condition):
        # All non-delegating paths share the same SELECT * FROM <table> shape.
        return (
            sa.select("*")
            .select_from(
                sa.table(table_name, schema=batch_spec.get("schema_name", None))
            )
            .where(condition)
        )

    if sampling_method is None:
        return _base_select(where_clause)

    sampler_fn = self._data_sampler.get_sampler_method(sampling_method)
    if sampling_method in (
        "_sample_using_limit",
        "sample_using_limit",
        "_sample_using_random",
        "sample_using_random",
    ):
        # Limit/random samplers build the full (dialect-aware) query themselves.
        return sampler_fn(
            execution_engine=self,
            batch_spec=batch_spec,
            where_clause=where_clause,
        )

    # Other samplers just contribute an additional WHERE condition.
    return _base_select(sa.and_(where_clause, sampler_fn(batch_spec)))
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Obtain a Spark DataFrame for ``batch_spec`` plus load-time markers.

    Supports RuntimeDataBatchSpec (DataFrame passed in directly) and
    Path/S3 specs (read via a Spark reader); splitting/sampling is applied
    before wrapping the result in ``SparkDFBatchData``.

    Raises:
        ExecutionEngineError: if pyspark is unavailable when a reader is needed.
        BatchSpecError: for any unsupported batch_spec type.
    """
    # batch_data
    batch_data: DataFrame

    # We need to build a batch_markers to be used in the dataframe
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        # Replace the raw DataFrame on the spec with a short placeholder.
        batch_spec.batch_data = "SparkDataFrame"
    elif isinstance(batch_spec, (PathBatchSpec, S3BatchSpec)):
        reader_method: str = batch_spec.get("reader_method")
        reader_options: dict = batch_spec.get("reader_options") or {}
        path: str = batch_spec.get("path") or batch_spec.get("s3")
        try:
            # BUGFIX: keep the options dict and the configured Spark reader in
            # separate variables. The original rebound ``reader_options`` (a
            # dict) to the DataFrameReader, shadowing its own annotation.
            reader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
    else:
        raise BatchSpecError(
            """
            Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
            """
        )

    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    typed_batch_data = SparkDFBatchData(batch_data)

    return typed_batch_data, batch_markers
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    """Create SqlAlchemy batch data and load-time markers for ``batch_spec``.

    Accepts only ``SqlAlchemyDatasourceBatchSpec`` or
    ``RuntimeQueryBatchSpec``; any other spec type raises
    ``InvalidBatchSpecError``. Passing ``bigquery_temp_table`` emits a
    DeprecationWarning (deprecated-v0.15.3) and the value is otherwise ignored.
    """
    if not isinstance(
        batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
    ):
        raise InvalidBatchSpecError(
            f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
            """
        )

    # Marker recording when this batch was loaded (UTC, compact timestamp).
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    if batch_spec.get("bigquery_temp_table"):
        # deprecated-v0.15.3
        warnings.warn(
            "BigQuery tables that are created as the result of a query are no longer created as "
            "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
            "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
            "v0.15.3 and will be removed in v0.18.",
            DeprecationWarning,
        )

    # Keyword arguments shared by both construction paths below.
    common_kwargs = {
        "execution_engine": self,
        "create_temp_table": batch_spec.get(
            "create_temp_table", self._create_temp_table
        ),
        "source_table_name": batch_spec.get("table_name", None),
        "source_schema_name": batch_spec.get("schema_name", None),
    }

    if isinstance(batch_spec, RuntimeQueryBatchSpec):
        # query != None is already checked when RuntimeQueryBatchSpec is instantiated
        query: str = batch_spec.query
        # Replace the raw query on the spec with a short placeholder.
        batch_spec.query = "SQLQuery"
        batch_data = SqlAlchemyBatchData(
            query=query,
            temp_table_schema_name=batch_spec.get("temp_table_schema_name"),
            **common_kwargs,
        )
    else:
        # Both dialect branches of the original made the identical call; only
        # the annotation differed — Oracle's build path returns a raw SQL
        # string, other dialects a Selectable.
        selectable: Union[Selectable, str] = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec
        )
        batch_data = SqlAlchemyBatchData(
            selectable=selectable,
            **common_kwargs,
        )

    return batch_data, batch_markers
def _build_selectable_from_batch_spec(
    self, batch_spec: BatchSpec
) -> Union[Selectable, str]:
    """Build the SELECT for ``batch_spec``, applying splitting and sampling.

    Returns a SQLAlchemy ``Selectable`` for most paths; for Oracle with
    limit-sampling, the fully-compiled SQL is returned as a ``str`` because
    the ROWNUM restriction is appended textually.
    """
    table_name: str = batch_spec["table_name"]
    if "splitter_method" in batch_spec:
        # Splitter methods are looked up by name on self and produce the
        # WHERE condition that selects this batch's rows.
        splitter_fn: Callable = getattr(self, batch_spec["splitter_method"])
        split_clause = splitter_fn(
            table_name=table_name,
            batch_identifiers=batch_spec["batch_identifiers"],
            **batch_spec["splitter_kwargs"],
        )
    else:
        # No splitter configured: a trivially-true condition.
        split_clause = True

    if "sampling_method" in batch_spec:
        if batch_spec["sampling_method"] == "_sample_using_limit":
            # SQLalchemy's semantics for LIMIT are different than normal WHERE clauses,
            # so the business logic for building the query needs to be different.
            if self.engine.dialect.name.lower() == "oracle":
                # limit doesn't compile properly for oracle so we will append rownum to query string later
                raw_query = (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(split_clause)
                )
                # Compile with literal_binds so the result is standalone SQL text.
                query: str = str(
                    raw_query.compile(
                        self.engine, compile_kwargs={"literal_binds": True}
                    )
                )
                # NOTE(review): appending "AND ROWNUM" assumes the compiled
                # query already contains a WHERE clause — confirm for the
                # no-splitter (split_clause == True) case.
                query += "\nAND ROWNUM <= %d" % batch_spec["sampling_kwargs"]["n"]
                return query
            else:
                return (
                    sa.select("*")
                    .select_from(
                        sa.table(table_name, schema=batch_spec.get("schema_name", None))
                    )
                    .where(split_clause)
                    .limit(batch_spec["sampling_kwargs"]["n"])
                )
        elif batch_spec["sampling_method"] == "_sample_using_random":
            # Count the rows in the split so the probability can be turned into
            # an absolute LIMIT for the randomly-ordered query below.
            num_rows: int = self.engine.execute(
                sa.select([sa.func.count()])
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(split_clause)
            ).scalar()
            # NOTE(review): `or 1.0` also maps p == 0 (not just None/missing)
            # to full sampling — confirm that falsy-zero behavior is intended.
            p: float = batch_spec["sampling_kwargs"]["p"] or 1.0
            sample_size: int = round(p * num_rows)
            return (
                sa.select("*")
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(split_clause)
                .order_by(sa.func.random())
                .limit(sample_size)
            )
        else:
            # Any other sampler is a method on self that contributes an extra
            # WHERE condition alongside the split clause.
            sampler_fn: Callable = getattr(self, batch_spec["sampling_method"])
            return (
                sa.select("*")
                .select_from(
                    sa.table(table_name, schema=batch_spec.get("schema_name", None))
                )
                .where(
                    sa.and_(
                        split_clause,
                        sampler_fn(**batch_spec["sampling_kwargs"]),
                    )
                )
            )

    # No sampling: the split clause alone selects the batch.
    return (
        sa.select("*")
        .select_from(sa.table(table_name, schema=batch_spec.get("schema_name", None)))
        .where(split_clause)
    )