def extract(self) -> [StepMetadata]:
    # (1) Parse sql statement to obtain input / output tables.
    sql_meta: SqlMeta = SqlParser.parse(self.operator.sql)

    # (2) Default all inputs / outputs to current connection.
    # NOTE: We'll want to look into adding support for the `database`
    # property that is used to override the one defined in the connection.
    conn_id = self.operator.postgres_conn_id
    source = Source(
        type='POSTGRESQL',
        name=conn_id,
        connection_url=get_connection_uri(conn_id))

    # (3) Map input / output tables to dataset objects with source set
    # as the current connection. We need to also fetch the schema for the
    # input tables to format the dataset name as:
    # {schema_name}.{table_name}
    inputs = [
        Dataset.from_table(
            source=source,
            table_name=in_table_schema.table_name.name,
            schema_name=in_table_schema.schema_name)
        for in_table_schema in self._get_table_schemas(sql_meta.in_tables)
    ]
    outputs = [
        Dataset.from_table_schema(
            source=source,
            table_schema=out_table_schema)
        for out_table_schema in self._get_table_schemas(sql_meta.out_tables)
    ]

    return [
        StepMetadata(
            name=f"{self.operator.dag_id}.{self.operator.task_id}",
            inputs=inputs,
            outputs=outputs,
            context={'sql': self.operator.sql})
    ]

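# For context only: a minimal, hypothetical operator the extractor above could
# run against. The extractor reads `sql`, `postgres_conn_id`, `dag_id`, and
# `task_id` from the operator; the task id, connection id, SQL, and DAG used
# here are made-up illustrations, not part of the integration itself.
from airflow.operators.postgres_operator import PostgresOperator

insert_popular_orders = PostgresOperator(
    task_id='insert_popular_orders',
    postgres_conn_id='food_delivery_db',
    sql='INSERT INTO popular_orders (order_id) SELECT order_id FROM orders;',
    dag=dag  # assumes an existing DAG object named `dag`
)
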
def test_parse_simple_select_with_table_schema_prefix_and_extra_whitespace():
    sql_meta = SqlParser.parse(
        '''
        SELECT *
          FROM schema0.table0 ;
        '''
    )

    assert sql_meta.in_tables == [DbTableName('schema0.table0')]
    assert sql_meta.out_tables == []

def test_parse_simple_insert_into():
    sql_meta = SqlParser.parse(
        '''
        INSERT INTO table0 (col0, col1, col2)
        VALUES (val0, val1, val2);
        '''
    )

    assert sql_meta.in_tables == []
    assert sql_meta.out_tables == [DbTableName('table0')]

def test_parse_simple_select_into():
    sql_meta = SqlParser.parse(
        '''
        SELECT *
          INTO table0
          FROM table1;
        '''
    )

    assert sql_meta.in_tables == [DbTableName('table1')]
    assert sql_meta.out_tables == [DbTableName('table0')]

def test_parse_simple_select():
    sql_meta = SqlParser.parse(
        '''
        SELECT *
          FROM table0;
        '''
    )
    log.debug("sqlparser.parse() successful.")

    assert sql_meta.in_tables == [DbTableName('table0')]
    assert sql_meta.out_tables == []

def test_parse_simple_insert_into_select():
    sql_meta = SqlParser.parse(
        '''
        INSERT INTO table1 (col0, col1, col2)
        SELECT col0, col1, col2
          FROM table0;
        '''
    )

    assert sql_meta.in_tables == [DbTableName('table0')]
    assert sql_meta.out_tables == [DbTableName('table1')]

def test_parse_simple_left_join():
    sql_meta = SqlParser.parse(
        '''
        SELECT col0, col1, col2
          FROM table0
          LEFT JOIN table1
            ON t1.col0 = t2.col0
        '''
    )

    assert sql_meta.in_tables == [DbTableName('table0'), DbTableName('table1')]
    assert sql_meta.out_tables == []

def test_parse_simple_right_outer_join():
    sql_meta = SqlParser.parse(
        '''
        SELECT col0, col1, col2
          FROM table0
          RIGHT OUTER JOIN table1
            ON t1.col0 = t2.col0;
        '''
    )

    assert sql_meta.in_tables == [DbTableName('table0'), DbTableName('table1')]
    assert sql_meta.out_tables == []

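# A deliberately simplified sketch of the kind of token walk the tests above
# exercise, built on the `sqlparse` library. This is NOT the integration's
# SqlParser: it only collects the table referenced after each FROM / JOIN
# keyword of a single simple statement, and it ignores output tables
# (INSERT INTO, SELECT INTO), which need extra handling.
import sqlparse
from sqlparse.sql import Identifier
from sqlparse.tokens import Keyword, Name


def extract_input_tables(sql: str) -> list:
    tables = []
    expecting_table = False
    for token in sqlparse.parse(sql)[0].tokens:
        if token.is_whitespace:
            continue
        if token.ttype is Keyword and (
                token.value.upper() == 'FROM' or
                token.value.upper().endswith('JOIN')):
            # The next meaningful token names an input table.
            expecting_table = True
        elif expecting_table:
            if isinstance(token, Identifier) or token.ttype is Name:
                # `token.value` keeps any schema prefix, e.g. 'schema0.table0'.
                tables.append(token.value)
            expecting_table = False
    return tables


# For the simple queries used in the tests, every FROM / JOIN clause
# contributes one input table, e.g.:
# extract_input_tables('SELECT * FROM table0 LEFT JOIN table1 ON t1.c = t2.c')
# returns ['table0', 'table1'].
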
def parse_sql_context(self):
    context = {
        'sql': self.operator.sql,
    }
    try:
        sql_meta = SqlParser.parse(self.operator.sql)
        log.debug(f"bigquery sql parsed and obtained meta: {sql_meta}")
        context['bigquery.sql.parsed.inputs'] = json.dumps(
            [in_table.name for in_table in sql_meta.in_tables])
        context['bigquery.sql.parsed.outputs'] = json.dumps(
            [out_table.name for out_table in sql_meta.out_tables])
    except Exception as e:
        log.error(f"Cannot parse sql query. {e}", exc_info=True)
        context['bigquery.extractor.sql_parser_error'] = \
            f'{e}: {traceback.format_exc()}'
    self.log.info(context)
    return context
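
# Illustrative only: for a hypothetical operator whose sql is
# 'INSERT INTO table1 (col0) SELECT col0 FROM table0;', a successful parse
# would produce a context shaped roughly like this (the keys come from
# parse_sql_context above; the values shown are illustrative):
#
# {
#     'sql': 'INSERT INTO table1 (col0) SELECT col0 FROM table0;',
#     'bigquery.sql.parsed.inputs': '["table0"]',
#     'bigquery.sql.parsed.outputs': '["table1"]'
# }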