def execute(self, context):
    aws_hook = AwsBaseHook(self.aws_credentials_id, client_type='s3')
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info('Date: ' + self.execution_date)
    date = parser.parse(self.execution_date)
    self.log.info("Backfill_data: {}".format(self.backfill_data))

    s3_bucket_key = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
    if self.backfill_data:
        s3_path = s3_bucket_key + '/' + str(date.year) + '/' + str(date.month)
    else:
        s3_path = s3_bucket_key
    self.log.info("S3 path: {}".format(s3_path))

    self.log.info("Deleting data from table {}.".format(self.table))
    try:
        redshift.run("DELETE FROM {}".format(self.table))
    except Exception:
        self.log.info("Table {} does not exist".format(self.table))

    copy_sql = self.COPY_SQL.format(self.table, s3_path,
                                    credentials.access_key,
                                    credentials.secret_key,
                                    self.region, self.json_path)
    self.log.info("SQL Statement Executing on Redshift: {}".format(copy_sql))
    redshift.run(copy_sql)
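# The COPY_SQL template referenced above is not included in the snippet. A minimal
# sketch of what such a class attribute could look like, assuming positional
# placeholders in the order execute() passes them (table, S3 path, access key,
# secret key, region, JSON path) -- hypothetical, not the author's actual template:
COPY_SQL = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    REGION '{}'
    FORMAT AS JSON '{}'
"""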
def execute(self, context): aws_hook = AwsHook(self.aws_credentials_id, client_type="redshift") credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) s3_path = "s3://{}/{}/{}".format(self.s3_bucket, self.s3_directory, self.table["s3"]["key"]) files_format = self.table["s3"]["format"] delimiter = self.table["s3"]["delimiter"] ignoreheader = self.table["s3"]["ignoreheader"] delimiter_text = '' if (delimiter): delimiter_text = f"delimiter '{delimiter}'" self.log.info( f"Start Copying data from {s3_path} to Table { self.table['name']}" ) redshift.run(f""" COPY { self.table["name"]} FROM '{s3_path}' ACCESS_KEY_ID '{credentials.access_key}' SECRET_ACCESS_KEY '{credentials.secret_key}' {files_format} {delimiter_text} {ignoreheader} """)
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_query = """
        COPY {schema}.{table}
        FROM 's3://{s3_bucket}/{s3_key}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(
        schema=self.schema,
        table=self.table,
        s3_bucket=self.s3_bucket,
        s3_key=self.s3_key,
        access_key=credentials.access_key,
        secret_key=credentials.secret_key,
        copy_options=copy_options,
    )

    self.log.info('Executing COPY command...')
    postgres_hook.run(copy_query, self.autocommit)
    self.log.info("COPY command complete...")
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_statement = f"""
        COPY {self.schema}.{self.table}
        FROM 's3://{self.s3_bucket}/{self.s3_key}'
        with credentials
        'aws_access_key_id={credentials.access_key};aws_secret_access_key={credentials.secret_key}'
        {copy_options};
    """

    if self.truncate_table:
        truncate_statement = f'TRUNCATE TABLE {self.schema}.{self.table};'
        sql = f"""
            BEGIN;
            {truncate_statement}
            {copy_statement}
            COMMIT
        """
    else:
        sql = copy_statement

    self.log.info('Executing COPY command...')
    postgres_hook.run(sql, self.autocommit)
    self.log.info("COPY command complete...")
def execute(self, context): """ Description: This execution function loads data from a csv-file and writes it to postgres. Arguments: self: Instance of the class context: Context dictionary Returns: None """ postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id) # Truncate table self.log.info('Clearing data from Postgres staging table {}'.format( self.table)) trunc_formatted_sql = CSVToPostgresOperator.truncate_sql.format( self.table) postgres.run(trunc_formatted_sql) # Copying data from CSV to Postgres self.log.info('Copying data from CSV to Postgres - {}'.format( self.table)) formatted_sql = CSVToPostgresOperator.copy_sql.format( self.table, self.path_to_csv, self.delimiter, self.additional_params) postgres.run(formatted_sql) self.log.info('CSVToPostgresOperator for {} completed'.format( self.table))
def init_db():
    try:
        hook = PostgresHook()
        hook.run(CREATE_QUERY)
        hook.run(LOAD_QUERY)
    except ProgrammingError:
        pass
def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    unload_options = '\n\t\t\t'.join(self.unload_options)

    s3_key = '{}/{}_'.format(self.s3_key, self.table) if self.table_as_file_name else self.s3_key
    select_query = "SELECT * FROM {schema}.{table}".format(schema=self.schema, table=self.table)
    unload_query = """
        UNLOAD ('{select_query}')
        TO 's3://{s3_bucket}/{s3_key}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {unload_options};
    """.format(select_query=select_query,
               s3_bucket=self.s3_bucket,
               s3_key=s3_key,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               unload_options=unload_options)

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    if self.delete_all_rows:
        self.log.info("Deleting all rows from table {}".format(self.table))
        delete_stmt = self.DELETE_SQL.format(self.table)
        self.log.info(delete_stmt)
        redshift.run(delete_stmt)

    insert_stmt = self.INSERT_SQL.format(self.table, self.sql)
    self.log.info("Insert statement for fact {}".format(insert_stmt))
    redshift.run(insert_stmt)
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>`
        reference to a specific postgres database.
    :type postgres_conn_id: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: dict or iterable
    :param database: name of database which overwrites the one defined in the connection
    :type database: str
    """

    template_fields = ('sql',)
    template_fields_renderers = {'sql': 'sql'}
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
        self,
        *,
        sql: str,
        postgres_conn_id: str = 'postgres_default',
        autocommit: bool = False,
        parameters: Optional[Union[Mapping, Iterable]] = None,
        database: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.hook = None

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
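# A minimal usage sketch for the operator above inside a DAG. The dag_id,
# task_id, connection id and SQL below are illustrative assumptions, not taken
# from the source:
from datetime import datetime
from airflow import DAG

with DAG('example_postgres', start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    create_table = PostgresOperator(
        task_id='create_pet_table',
        postgres_conn_id='postgres_default',
        sql="CREATE TABLE IF NOT EXISTS pet (pet_id SERIAL PRIMARY KEY, name VARCHAR NOT NULL);",
    )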
def execute(self, context) -> None: postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) conn = S3Hook.get_connection(conn_id=self.aws_conn_id) credentials_block = None if conn.extra_dejson.get('role_arn', False): credentials_block = f"aws_iam_role={conn.extra_dejson['role_arn']}" else: s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) credentials = s3_hook.get_credentials() credentials_block = build_credentials_block(credentials) copy_options = '\n\t\t\t'.join(self.copy_options) destination = f'{self.schema}.{self.table}' copy_destination = f'#{self.table}' if self.method == 'UPSERT' else destination copy_statement = self._build_copy_query(copy_destination, credentials_block, copy_options) if self.method == 'REPLACE': sql = f""" BEGIN; DELETE FROM {destination}; {copy_statement} COMMIT """ elif self.method == 'UPSERT': keys = self.upsert_keys or postgres_hook.get_table_primary_key( self.table, self.schema) if not keys: raise AirflowException( f"No primary key on {self.schema}.{self.table}. Please provide keys on 'upsert_keys'" ) where_statement = ' AND '.join( [f'{self.table}.{k} = {copy_destination}.{k}' for k in keys]) sql = f""" CREATE TABLE {copy_destination} (LIKE {destination}); {copy_statement} BEGIN; DELETE FROM {destination} USING {copy_destination} WHERE {where_statement}; INSERT INTO {destination} SELECT * FROM {copy_destination}; COMMIT """ else: sql = copy_statement self.log.info('Executing COPY command...') postgres_hook.run(sql, self.autocommit) self.log.info("COPY command complete...")
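# A usage sketch for the UPSERT branch above, assuming this execute() belongs to
# an S3ToRedshiftOperator-style class exposing method and upsert_keys parameters.
# The connection ids, bucket, key and column names are illustrative assumptions:
upsert_users = S3ToRedshiftOperator(
    task_id='upsert_users',
    schema='public',
    table='users',
    s3_bucket='my-bucket',
    s3_key='staging/users.csv',
    redshift_conn_id='redshift_default',
    aws_conn_id='aws_default',
    method='UPSERT',
    upsert_keys=['user_id'],
    copy_options=['csv', 'IGNOREHEADER 1'],
)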
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)
    unload_options = '\n\t\t\t'.join(self.unload_options)

    unload_query = self._build_unload_query(
        credentials_block, self._select_query, self.s3_key, unload_options)

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def execute(self, context):
    self.log.info('Connecting to redshift!')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    if self.truncate_table:
        self.log.info(f"Truncating table: {self.table}")
        redshift.run(f"""
            TRUNCATE TABLE {self.table};
        """)

    self.log.info('Loading dimension table into redshift')
    redshift.run(f"""
        INSERT INTO {self.table}
        {self.select_sql}
    """)
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>`
        reference to a specific postgres database.
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param parameters: (optional) the parameters to render the SQL query with.
    :param database: name of database which overwrites the one defined in the connection
    """

    template_fields: Sequence[str] = ('sql',)
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql'
    }
    template_ext: Sequence[str] = ('.sql',)
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: Union[str, List[str]],
        postgres_conn_id: str = 'postgres_default',
        autocommit: bool = False,
        parameters: Optional[Union[Mapping, Iterable]] = None,
        database: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.hook: Optional[PostgresHook] = None

    def execute(self, context: 'Context'):
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)
    unload_options = '\n\t\t\t'.join(self.unload_options)

    s3_key = f"{self.s3_key}/{self.table}_" if self.table_as_file_name else self.s3_key
    select_query = f"SELECT * FROM {self.schema}.{self.table}"
    unload_query = self._build_unload_query(
        credentials_block, select_query, s3_key, unload_options)

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def execute(self, context): aws_hook = AwsBaseHook(self.aws_credentials_id, client_type="s3") credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run("DELETE FROM {}".format(self.table)) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.json_format, ) redshift.run(formatted_sql)
def execute(self, context):
    aws_hook = AwsBaseHook(self.aws_credentials_id)
    aws_credentials = aws_hook.get_credentials()
    redshift_conn = PostgresHook(
        postgres_conn_id=self.redshift_conn_id,
        connect_args={
            'keepalives': 1,
            'keepalives_idle': 60,
            'keepalives_interval': 60
        })

    self.log.debug(f"Truncate Table: {self.table}")
    redshift_conn.run(f"TRUNCATE TABLE {self.table}")

    format = ''
    if self.data_format == 'csv' and self.ignore_header > 0:
        format += f"IGNOREHEADER {self.ignore_header}\n"
    if self.data_format == 'csv':
        format += f"DELIMITER '{self.delimiter}'\n"
    elif self.data_format == 'json':
        format += f"FORMAT AS JSON '{self.jsonpath}'\n"
    format += f"{self.copy_opts}"
    self.log.debug(f"format : {format}")

    formatted_key = self.s3_src_bucket_key.format(**context)
    self.log.info(f"Rendered S3 source file key : {formatted_key}")
    s3_url = f"s3://{self.s3_src_bucket_name}/{formatted_key}"
    self.log.debug(f"S3 URL : {s3_url}")

    formatted_sql = self._sql.format(**dict(
        table=self.table,
        source=s3_url,
        access_key=aws_credentials.access_key,
        secret_access_key=aws_credentials.secret_key,
        format=format
    ))
    self.log.debug(f"Base SQL: {self._sql}")

    self.log.info(f"Copying data from S3 to Redshift table {self.table}...")
    redshift_conn.run(formatted_sql)
    self.log.info(f"Finished copying data from S3 to Redshift table {self.table}")
def execute(self, context=None):
    """
    Format the SQL statements with the params_sql mapping and execute
    them one by one.

    Args:
        context: Airflow context dictionary

    Returns:
        None
    """
    if self.params_sql is not None:
        commands_formatted = [
            S.SQL(q).format(**self.params_sql) for q in self.commands_stripped
        ]
    else:
        commands_formatted = [S.SQL(q) for q in self.commands_stripped]

    hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    for qf in commands_formatted:
        self.log.info("Executing Query:{}".format(qf.as_string(hook.get_conn())))
        hook.run((qf, ))
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_statement = self._build_copy_query(credentials_block, copy_options)

    if self.truncate_table:
        delete_statement = f'DELETE FROM {self.schema}.{self.table};'
        sql = f"""
            BEGIN;
            {delete_statement}
            {copy_statement}
            COMMIT
        """
    else:
        sql = copy_statement

    self.log.info('Executing COPY command...')
    postgres_hook.run(sql, self.autocommit)
    self.log.info("COPY command complete...")
def execute(self, context) -> None: postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) conn = S3Hook.get_connection(conn_id=self.aws_conn_id) credentials_block = None if conn.extra_dejson.get('role_arn', False): credentials_block = f"aws_iam_role={conn.extra_dejson['role_arn']}" else: s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) credentials = s3_hook.get_credentials() credentials_block = build_credentials_block(credentials) unload_options = '\n\t\t\t'.join(self.unload_options) unload_query = self._build_unload_query(credentials_block, self.select_query, self.s3_key, unload_options) self.log.info('Executing UNLOAD command...') postgres_hook.run(unload_query, self.autocommit, parameters=self.parameters) self.log.info("UNLOAD command complete...")
def execute(self, context): """ Description: This custom function fills a given fact table with a passed SQL statement. Arguments: self: Instance of the class context: Context dictionary Returns: None """ # Build connection postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id) # Realize insert statement to fill dimension table formatted_sql = LoadFactOperator.insert_sql.format( self.table, self.insert_sql_query) postgres.run(formatted_sql) self.log.info( 'LoadFactOperator for dimension table {} completed'.format( self.table))
def execute(self, context): action = f"Redshift {len(self.query_list)} queries of {self.query_type} " self.log.info(f"Start {action}") self.log.info(self.redshift_conn_id) redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for query in self.query_list: if self.query_type == 'insert': if not self.append_data: self.log.info(f"Clearing data from destination Redshift table {query[1]}") redshift.run("DELETE FROM {}".format(query[1])) self.log.info(f"Insert data into destination Redshift table {query[1]}") redshift.run(query[0]) else: redshift.run(query) self.log.info(f"End {action}")
def drop_db():
    hook = PostgresHook()
    hook.run(DELETE_QUERY)
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>`
        reference to a specific postgres database.
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param parameters: (optional) the parameters to render the SQL query with.
    :param database: name of database which overwrites the one defined in the connection
    """

    template_fields: Sequence[str] = ('sql',)
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql'
    }
    template_ext: Sequence[str] = ('.sql',)
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: Union[str, Iterable[str]],
        postgres_conn_id: str = 'postgres_default',
        autocommit: bool = False,
        parameters: Optional[Union[Iterable, Mapping]] = None,
        database: Optional[str] = None,
        runtime_parameters: Optional[Mapping] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.runtime_parameters = runtime_parameters
        self.hook: Optional[PostgresHook] = None

    def execute(self, context: 'Context'):
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id, schema=self.database)
        if self.runtime_parameters:
            final_sql = []
            sql_param = {}
            for param in self.runtime_parameters:
                set_param_sql = f"SET {{}} TO %({param})s;"
                dynamic_sql = SQL(set_param_sql).format(Identifier(f"{param}"))
                final_sql.append(dynamic_sql)
            for param, val in self.runtime_parameters.items():
                sql_param.update({f"{param}": f"{val}"})
            if self.parameters:
                sql_param.update(self.parameters)
            if isinstance(self.sql, str):
                final_sql.append(SQL(self.sql))
            else:
                final_sql.extend(list(map(SQL, self.sql)))
            self.hook.run(final_sql, self.autocommit, parameters=sql_param)
        else:
            self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
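# A usage sketch for the runtime_parameters branch above: each entry becomes a
# "SET <param> TO %(param)s;" statement executed before the main SQL. The task_id,
# connection id, query and parameter value are illustrative assumptions:
select_with_timeout = PostgresOperator(
    task_id='select_with_timeout',
    postgres_conn_id='postgres_default',
    sql="SELECT * FROM pet;",
    runtime_parameters={'statement_timeout': '3000ms'},
)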
def execute(self, context): """ Description: This execution function gets flight data from the OpenSky REST API hour per hour and writes this data to postgres. Arguments: self: Instance of the class context: Context dictionary Returns: None """ # Build connections postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id) api_connection = BaseHook.get_connection(conn_id=self.api_conn_id) # Build correct timestamps in unix-format for passing to the api query self.log.info('api_query_dateeee: {}'.format( self.api_query_date.format(**context))) start_time = datetime.fromisoformat( self.api_query_date.format(**context)) end_time = start_time + timedelta(hours=1) start_timestamp = str(int(datetime.timestamp(start_time))) end_timestamp = str(int(datetime.timestamp(end_time))) # Build complete api path # 'https://{}:{}@opensky-network.org/api/flights/all?begin={}&end={}' complete_api_path = self.api_path.format(api_connection.login, api_connection.password, start_timestamp, end_timestamp) self.log.info('api_path: {}'.format(complete_api_path)) # If parameter truncate_table is true, then truncate given table if self.truncate_table: self.log.info('Truncate data from staging table {}'.format( self.table)) trunc_formatted_sql = APItoPostgresOperator.truncate_sql.format( self.table) postgres.run(trunc_formatted_sql) # Get data from api try: response = requests.get(complete_api_path) data = response.json() except: self.log.info('API request error - message:{}'.format( sys.exc_info()[0])) # If response is OK and length of data > 0 then write data to the database if (response.status_code == 200): if (len(data) > 0): for element in data: formatted_sql = APItoPostgresOperator.insert_sql.format( self.table, element['icao24'], element['firstSeen'], element['estDepartureAirport'], element['lastSeen'], element['estArrivalAirport'], element['callsign']) postgres.run(formatted_sql) else: self.log.info( 'API request doesnt contain data - datetime:{}'.format( start_time)) else: self.log.info( 'API request problem - datetime:{} - response_code:{}'.format( start_time, str(response))) self.log.info( 'APItoPostgresOperator for {} completed - datetime: {}'.format( self.table, start_time))
class S3ToRedshiftOperator(BaseOperator):
    """
    Executes a COPY command to load files from s3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
          (unless use_ssl is False), but SSL certificates will not be verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
          You can specify this argument if you want to use a different
          CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 schema: str,
                 table: str,
                 s3_bucket: str,
                 s3_key: str,
                 redshift_conn_id: str = 'redshift_default',
                 aws_conn_id: str = 'aws_default',
                 verify: Optional[Union[bool, str]] = None,
                 copy_options: Optional[List] = None,
                 autocommit: bool = False,
                 *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options or []
        self.autocommit = autocommit
        self._s3_hook = None
        self._postgres_hook = None

    def execute(self, context):
        self._postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self._s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self._s3_hook.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self._postgres_hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
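# A minimal usage sketch for the S3ToRedshiftOperator above; note that this
# variant appends '/{table}' to the S3 key when building the COPY path. The
# task_id, schema, table, bucket and key below are illustrative assumptions:
transfer_s3_to_redshift = S3ToRedshiftOperator(
    task_id='transfer_s3_to_redshift',
    schema='public',
    table='events',
    s3_bucket='my-bucket',
    s3_key='raw',  # COPY will read from s3://my-bucket/raw/events
    redshift_conn_id='redshift_default',
    aws_conn_id='aws_default',
    copy_options=['csv', 'IGNOREHEADER 1'],
)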