def extract(self) -> [StepMetadata]:
    """Build the step metadata for the extract phase.

    Emits a single StepMetadata with one fixed input dataset and one
    fixed output dataset derived from this extractor's source.
    """
    step = StepMetadata(
        name=get_job_name(task=self.operator),
        inputs=[Dataset.from_table(self.source, "extract_input1")],
        outputs=[Dataset.from_table(self.source, "extract_output1")],
        context={"extract": "extract"},
    )
    return [step]
def _get_input_from_bq(self, properties, client):
    """Resolve the input datasets referenced by a BigQuery query job.

    Returns None when the job properties list no referenced tables.
    Otherwise returns schema-aware datasets, falling back to plain
    table datasets if fetching schemas from BigQuery fails.
    """
    referenced = get_from_nullable_chain(properties, [
        'statistics', 'query', 'referencedTables'
    ])
    if not referenced:
        return None

    # Build the table name and the source for each referenced table,
    # keeping the two lists aligned by index.
    table_names = []
    sources = []
    for table_ref in referenced:
        table_names.append(self._bq_table_name(table_ref))
        sources.append(self._source(table_ref))

    try:
        schemas = self._get_table_schemas(table_names, client)
        return [
            Dataset.from_table_schema(source=src, table_schema=schema)
            for schema, src in zip(schemas, sources)
        ]
    except Exception as e:
        # Best effort: emit datasets without column-level schema.
        log.warning(f'Could not extract schema from bigquery. {e}')
        return [
            Dataset.from_table(src, name)
            for name, src in zip(table_names, sources)
        ]
def extract_on_complete(self, task_instance) -> StepMetadata:
    """Build step metadata after task completion.

    The single input dataset carries an explicit two-column table
    schema; the output is a plain table dataset.
    """
    input_schema = DbTableSchema(
        schema_name='schema',
        table_name=DbTableName('extract_on_complete_input1'),
        columns=[
            DbColumn(
                name='field1',
                type='text',
                description='',
                ordinal_position=1,
            ),
            DbColumn(
                name='field2',
                type='text',
                description='',
                ordinal_position=2,
            ),
        ],
    )
    return StepMetadata(
        name=get_job_name(task=self.operator),
        inputs=[Dataset.from_table_schema(self.source, input_schema)],
        outputs=[
            Dataset.from_table(self.source, "extract_on_complete_output1")
        ],
        context={"extract_on_complete": "extract_on_complete"},
    )
def _get_input_from_bq(self, job, context, source, client):
    """Resolve the input datasets referenced by a BigQuery job.

    Returns None when the job lists no referenced tables. On schema
    lookup failure, records the error (with traceback) in `context`
    under 'bigquery.extractor.bq_schema_error' and falls back to
    datasets without column-level schema.
    """
    # Null-safe traversal of the nested job properties; replaces the
    # previous triple-repeated .get() chain and matches the
    # properties-based variant of this method.
    bq_input_tables = get_from_nullable_chain(job._properties, [
        'statistics', 'query', 'referencedTables'
    ])
    if not bq_input_tables:
        return None
    input_table_names = [
        self._bq_table_name(bq_t) for bq_t in bq_input_tables
    ]
    try:
        return [
            Dataset.from_table_schema(
                source=source,
                table_schema=table_schema
            )
            for table_schema in self._get_table_schemas(
                input_table_names, client
            )
        ]
    except Exception as e:
        # Logger.warn is a deprecated alias; use warning() like the
        # rest of this file.
        log.warning(f'Could not extract schema from bigquery. {e}')
        context['bigquery.extractor.bq_schema_error'] = \
            f'{e}: {traceback.format_exc()}'
        return [
            Dataset.from_table(source, table)
            for table in input_table_names
        ]
def extract(self) -> [StepMetadata]:
    """Extract lineage metadata from the operator's SQL statement.

    Parses the SQL for input/output tables, attributes them all to the
    operator's Postgres connection, and returns one StepMetadata.
    """
    # (1) Parse sql statement to obtain input / output tables.
    sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

    # (2) Default all inputs / outputs to current connection.
    # NOTE: We'll want to look into adding support for the `database`
    # property that is used to override the one defined in the connection.
    conn_id = self.operator.postgres_conn_id
    source = Source(
        type='POSTGRESQL',
        name=conn_id,
        connection_url=get_connection_uri(conn_id),
    )

    # (3) Map input / output tables to dataset objects with source set
    # as the current connection. We need to also fetch the schema for the
    # input tables to format the dataset name as:
    # {schema_name}.{table_name}
    in_schemas = self._get_table_schemas(sql_meta.in_tables)
    out_schemas = self._get_table_schemas(sql_meta.out_tables)

    inputs = [
        Dataset.from_table(
            source=source,
            table_name=schema.table_name.name,
            schema_name=schema.schema_name,
        )
        for schema in in_schemas
    ]
    outputs = [
        Dataset.from_table_schema(source=source, table_schema=schema)
        for schema in out_schemas
    ]

    step = StepMetadata(
        name=f"{self.operator.dag_id}.{self.operator.task_id}",
        inputs=inputs,
        outputs=outputs,
        context={'sql': self.operator.sql},
    )
    return [step]
def _get_output_from_bq(self, job, source, client):
    """Resolve the output dataset written by a BigQuery job.

    Returns a single-element list; the dataset carries the fetched
    table schema when available, otherwise only the table name.
    """
    # NOTE(review): assumes configuration.query.destinationTable is
    # always present on the job; any missing level raises
    # AttributeError here — confirm against callers.
    bq_output_table = job._properties.get('configuration') \
        .get('query') \
        .get('destinationTable')
    output_table_name = self._bq_table_name(bq_output_table)
    table_schema = self._get_table_safely(output_table_name, client)
    if table_schema:
        return [
            Dataset.from_table_schema(
                source=source,
                table_schema=table_schema
            )
        ]
    else:
        # Logger.warn is a deprecated alias; use warning() like the
        # rest of this file.
        log.warning("Could not resolve output table from bq")
        return [Dataset.from_table(source, output_table_name)]
def _get_output_from_bq(self, properties, client) -> Optional[Dataset]:
    """Resolve the output dataset of a BigQuery query from its properties.

    Returns None when no destination table is recorded; otherwise a
    schema-aware dataset, or a plain table dataset if the schema
    cannot be fetched.
    """
    destination = get_from_nullable_chain(properties, [
        'configuration', 'query', 'destinationTable'
    ])
    if not destination:
        return None

    table_name = self._bq_table_name(destination)
    dataset_source = self._source(destination)
    schema = self._get_table_safely(table_name, client)

    # Guard clause: no (truthy) schema means we fall back to a dataset
    # identified by table name only.
    if not schema:
        log.warning("Could not resolve output table from bq")
        return Dataset.from_table(dataset_source, table_name)
    return Dataset.from_table_schema(
        source=dataset_source,
        table_schema=schema
    )