class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: string
    :param sql: the sql code to be executed
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(PostgresOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        _log.info('Executing: ' + str(self.sql))
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the sql code to be executed. (templated)
    :type sql: Can receive a str representing a sql statement,
        a list of str (sql statements), or reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: reference to a specific postgres database
    :type postgres_conn_id: str
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :type autocommit: bool
    :param parameters: (optional) the parameters to render the SQL query with.
    :type parameters: mapping or iterable
    :param database: name of database which overrides the one defined in the
        connection
    :type database: str
    """

    template_fields = ('sql',)
    template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self, sql,
            postgres_conn_id='postgres_default', autocommit=False,
            parameters=None,
            database=None,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database

    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
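# Usage sketch for the operator above. Everything here (DAG name, connection
# id, table names, and the sql/ directory) is illustrative rather than taken
# from the original code, and the import paths assume an Airflow 1.10-style
# layout.
from datetime import datetime

from airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator

with DAG('postgres_operator_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:

    # Inline statement: `sql` is templated, so Jinja macros like {{ ds }} render.
    clear_staging = PostgresOperator(
        task_id='clear_staging',
        postgres_conn_id='postgres_default',
        sql="DELETE FROM staging_events WHERE load_date = '{{ ds }}'",
    )

    # File reference: a value ending in '.sql' is read and rendered as a template.
    create_tables = PostgresOperator(
        task_id='create_tables',
        postgres_conn_id='postgres_default',
        sql='sql/create_tables.sql',
        parameters={'retention_days': 30},
    )

    create_tables >> clear_staging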
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from s3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:
        - False: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - path/to/cert/bundle.pem: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
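# A hedged usage sketch for the S3ToRedshiftTransfer defined above. The bucket,
# key, schema/table names and connection ids are assumptions, and `dag` is a
# DAG object defined elsewhere; the operator builds the COPY statement itself
# from schema/table plus 's3://{s3_bucket}/{s3_key}/{table}'.
stage_events = S3ToRedshiftTransfer(
    task_id='stage_events',
    schema='public',
    table='staging_events',
    s3_bucket='my-data-lake',
    s3_key='events',
    redshift_conn_id='redshift_default',
    aws_conn_id='aws_default',
    copy_options=['CSV', 'IGNOREHEADER 1', "REGION 'us-west-2'"],
    dag=dag,
)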
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Running sql query...")
    redshift.run(self.sql)
    self.log.info("...done.")
def execute(self, context):
    redshift_hook = PostgresHook(self.redshift_conn_id)
    redshift_hook.run(str(self.sql_query))
def execute(self, context):
    self.log.info('LoadFactOperator implemented by Mohan Hegde')
    # Use a %s placeholder so the SQL is actually interpolated into the log line
    self.log.info("LoadFact SQL: %s", self.sql)
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    postgres_hook.run(self.sql)
def copy_table(tablename, filename):
    '''Copy data from source files (.csv files) into src stage tables
    in the local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(COPY_SQL.format(tablename, filename))
def create_indexes(SQL):
    '''Create indexes on the tables to improve query performance'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
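# How copy_table and create_indexes above might be scheduled. The task ids,
# table/file names, index DDL, and the surrounding `dag` object are all
# assumptions for illustration.
from airflow.operators.python_operator import PythonOperator

copy_trips_src = PythonOperator(
    task_id='copy_trips_src',
    python_callable=copy_table,
    op_args=['src_trips', '/data/source/trips.csv'],
    dag=dag,
)

index_trips_src = PythonOperator(
    task_id='index_trips_src',
    python_callable=create_indexes,
    op_args=['CREATE INDEX IF NOT EXISTS idx_trips_start ON src_trips (start_time);'],
    dag=dag,
)

copy_trips_src >> index_trips_src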
def execute_postgres_hook(query):
    pg_hook = PostgresHook('redshift_lake')
    pg_hook.run(query)
def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    postgres_hook.run(f"DELETE FROM {self.table};")
    postgres_hook.run(self.sql)
def execute(self, context): self.log.info(f"start loading fact table {self.target_table}") redshift = PostgresHook(postgres_conn_id=self.conn_id) str_sql_insert = f"INSERT INTO {self.target_table} {self.str_sql_select}" redshift.run(str_sql_insert) self.log.info(f"finished loading fact table {self.target_table}")
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param s3_conn_id: reference to a specific S3 connection
    :type s3_conn_id: string
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            s3_conn_id='s3_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.s3_conn_id = s3_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        a_key, s_key = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        _log.info("Retrieving headers from %s.%s..." % (self.schema, self.table))

        columns_query = """SELECT column_name
                           FROM information_schema.columns
                           WHERE table_schema = '{0}'
                           AND table_name = '{1}'
                           ORDER BY ordinal_position
                        """.format(self.schema, self.table)

        cursor = self.hook.get_conn().cursor()
        cursor.execute(columns_query)
        rows = cursor.fetchall()
        # Materialize the column names as a list so they can be iterated twice;
        # a bare map() would be exhausted after the first join on Python 3.
        columns = [row[0] for row in rows]
        column_names = ', '.join("\\'{0}\\'".format(c) for c in columns)
        column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                    for c in columns)

        unload_query = """
            UNLOAD ('SELECT {0}
            UNION ALL
            SELECT {1} FROM {2}.{3}')
            TO 's3://{4}/{5}/{3}_'
            with credentials
            'aws_access_key_id={6};aws_secret_access_key={7}'
            {8};
        """.format(column_names, column_castings, self.schema, self.table,
                   self.s3_bucket, self.s3_key, a_key, s_key, unload_options)

        _log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        _log.info("UNLOAD command complete...")
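# Usage sketch for the RedshiftToS3Transfer defined above (the older variant
# that takes s3_conn_id). Bucket, key, and connection ids are assumptions and
# `dag` is defined elsewhere; PARALLEL OFF keeps the output in a single file so
# the injected header row stays on top.
unload_orders = RedshiftToS3Transfer(
    task_id='unload_orders',
    schema='public',
    table='orders',
    s3_bucket='my-export-bucket',
    s3_key='exports/orders',
    redshift_conn_id='redshift_default',
    s3_conn_id='s3_default',
    unload_options=["DELIMITER AS ','", 'ALLOWOVERWRITE', 'PARALLEL OFF'],
    dag=dag,
)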
def execute(self, context):
    # Set AWS S3 and Redshift connections
    self.log.info("Setting up Redshift connection...")
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Redshift connection created.")

    self.log.info("Clearing data from Redshift target table...")
    redshift.run("DELETE FROM {}".format(self.target_table))

    # Prepare S3 paths
    self.log.info("Preparing Copying data from S3 to Redshift...")
    exec_date_rendered = self.execution_date.format(**context)
    self.log.info("Execution_date: {}".format(exec_date_rendered))
    exec_date_obj = datetime.datetime.strptime(exec_date_rendered, '%Y-%m-%d')
    self.log.info("Execution_year: {}".format(exec_date_obj.year))
    self.log.info("Execution_month: {}".format(exec_date_obj.month))
    rendered_key_raw = self.s3_key.format(**context)
    self.log.info("Rendered_key_raw: {}".format(rendered_key_raw))

    if self.use_partitioned_data == "True":
        self.log.info("Rendered_key_raw: {}".format(rendered_key_raw))
        rendered_key = rendered_key_raw.format(exec_date_obj.year,
                                               exec_date_obj.month)
    else:
        rendered_key = rendered_key_raw

    self.log.info("Rendered_key: {}".format(rendered_key))
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    self.log.info("S3_path: {}".format(s3_path))

    if self.json_paths == "":
        s3_json_path = "'auto'"
    else:
        s3_json_path = "'s3://{}/{}'".format(self.s3_bucket, self.json_paths)

    self.log.info("S3_PATH: {}".format(s3_path))
    self.log.info("S3_JSON_PATH: {}".format(s3_json_path))

    # Copy data from S3 to Redshift
    # Select operations based on input file format (JSON or CSV)
    if self.file_format == "json":
        self.log.info("Preparing for JSON input data")
        formatted_sql = StageToRedshiftOperator.sql_template_json.format(
            self.target_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            s3_json_path
        )
    elif self.file_format == "csv":
        self.log.info("Preparing for CSV input data")
        formatted_sql = StageToRedshiftOperator.sql_template_csv.format(
            self.target_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.ignore_headers,
            self.delimiter
        )
    else:
        self.log.info('File Format defined something else than '
                      'JSON or CSV. Other formats not supported.')

    # Executing COPY operation
    self.log.info("Executing Redshift COPY operation...")
    redshift.run(formatted_sql)
    self.log.info("Redshift COPY operation DONE.")
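# The execute() above formats StageToRedshiftOperator.sql_template_json and
# sql_template_csv, which are not shown in this snippet. Judging from the
# positional arguments passed to .format() (target table, S3 path, access key,
# secret key, then the format-specific options), plausible assumed shapes are:
sql_template_json = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    FORMAT AS JSON {}
"""

sql_template_csv = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    IGNOREHEADER {}
    DELIMITER '{}'
    CSV
"""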
def push_to_master_table(*args, **kwargs):
    table = kwargs["params"]["table"]
    redshift_hook = PostgresHook("redshift")
    logging.info("Pushing {} data into master table".format(table))
    redshift_hook.run("INSERT INTO {}_master SELECT * FROM {};".format(table, table))
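# The callable above pulls the table name from kwargs["params"]. A hedged
# example of the wiring (task id and table name are assumptions, `dag` is
# defined elsewhere); provide_context=True is the Airflow 1.x way to get the
# params dict passed into **kwargs.
from airflow.operators.python_operator import PythonOperator

push_trips_to_master = PythonOperator(
    task_id='push_trips_to_master',
    python_callable=push_to_master_table,
    provide_context=True,
    params={'table': 'trips'},
    dag=dag,
)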
def execute(self, context):
    redshift_hook = PostgresHook(self.redshift_conn_id)
    if not self.append_data:
        redshift_hook.run('''DELETE FROM {}'''.format(self.table))
    redshift_hook.run(self.sql)
    self.log.info(f'Loading records into {self.table} completed')
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Loading the fact table into Redshift") formatted_sql = LoadFactOperator.insert_sql.format( self.table, self.load_sql_stmt) redshift.run(formatted_sql)
class RedshiftToDruidOperator(BaseOperator):
    """
    Unloads a Redshift table to S3 and loads it into Apache Druid

    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param sql: Custom SQL statement to execute
    :type sql: str
    :param druid_ingest_spec: druid ingestion json spec
    :type druid_ingest_spec: json
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    :param include_header: whether to include headers in the final file
    :type include_header: bool
    :param autocommit: if True perform autocommit
    :type autocommit: bool
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param druid_conn_id: reference to a specific Druid overlord connection
    :type druid_conn_id: str
    """

    template_fields = (
        "s3_bucket",
        "s3_key",
        "schema",
        "table",
        "sql",
        "druid_ingest_spec",
    )
    template_ext = (".sql",)
    ui_color = "#fff7e6"

    @apply_defaults
    def __init__(
        self,
        s3_bucket,
        s3_key,
        schema,
        table,
        sql=None,
        druid_ingest_spec=None,
        unload_options=tuple(),
        include_header=False,
        autocommit=False,
        aws_conn_id="aws_default",
        redshift_conn_id="postgres_default",
        druid_conn_id="druid_ingest_default",
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.aws_conn_id = aws_conn_id
        self.redshift_conn_id = redshift_conn_id
        self.druid_conn_id = druid_conn_id
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.schema = schema
        self.table = table
        self.sql = sql
        self.druid_ingest_spec = druid_ingest_spec
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header
        self.pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

        if self.include_header and "PARALLEL OFF" not in [
            uo.upper().strip() for uo in unload_options
        ]:
            self.unload_options = list(unload_options) + ["PARALLEL OFF"]

    def execute(self, context):
        self.unload()
        self.druid_ingest()

    def unload(self):
        credentials = self.s3_hook.get_credentials()
        unload_options = "\n\t\t\t".join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...", self.schema, self.table)

            columns_query = """
                SELECT column_name
                FROM information_schema.columns
                WHERE table_schema = '{schema}'
                AND table_name = '{table}'
                ORDER BY ordinal_position
            """.format(
                schema=self.schema, table=self.table
            )

            cursor = self.pg_hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ", ".join("{0}".format(c) for c in columns)
            column_headers = ", ".join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ", ".join(
                "CAST({0} AS text) AS {0}".format(c) for c in columns
            )

            select_query = """SELECT {column_names} FROM
                                (SELECT 2 sort_order, {column_castings}
                                 FROM {schema}.{table}
                                 UNION ALL
                                 SELECT 1 sort_order, {column_headers})
                              ORDER BY sort_order""".format(
                column_names=column_names,
                column_castings=column_castings,
                column_headers=column_headers,
                schema=self.schema,
                table=self.table,
            )
        else:
            if self.sql:
                select_query = self.sql
            else:
                select_query = "SELECT * FROM {schema}.{table}".format(
                    schema=self.schema, table=self.table
                )

        unload_query = """
            UNLOAD ('{select_query}')
            TO 's3://{s3_bucket}/{s3_key}/{table}_'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {unload_options};
        """.format(
            select_query=select_query,
            table=self.table,
            s3_bucket=self.s3_bucket,
            s3_key=self.s3_key,
            access_key=credentials.access_key,
            secret_key=credentials.secret_key,
            unload_options=unload_options,
        )

        self.log.info("Executing UNLOAD command...")
        self.pg_hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")

    def druid_ingest(self):
        druid_hook = DruidHook(druid_ingest_conn_id=self.druid_conn_id)
        self.log.info("Submitting druid task: %s", json.dumps(self.druid_ingest_spec))
        druid_hook.submit_indexing_job(self.druid_ingest_spec)
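# Usage sketch for RedshiftToDruidOperator above. The spec path, bucket, key
# and connection ids are assumptions, `dag` is defined elsewhere, and the
# ingestion spec is whatever JSON your Druid overlord expects for the files
# that the UNLOAD produces.
import json

with open('dags/specs/orders_index.json') as f:  # hypothetical spec file
    orders_ingest_spec = json.load(f)

orders_to_druid = RedshiftToDruidOperator(
    task_id='orders_to_druid',
    schema='public',
    table='orders',
    s3_bucket='my-export-bucket',
    s3_key='druid/orders',
    unload_options=["DELIMITER ','", 'ALLOWOVERWRITE'],
    include_header=True,
    druid_ingest_spec=orders_ingest_spec,
    redshift_conn_id='postgres_default',
    druid_conn_id='druid_ingest_default',
    dag=dag,
)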
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info('Cleaning staging tables')
    redshift.run(self.sql_query)
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    redshift.run(self.sql.insert_query)
    self.log.info('LoadFactOperator Successful')
def destroy_and_create_schema():
    logger = logging.getLogger(__name__)
    pg_hook = PostgresHook('redshift_lake')
    for command in table_commands:
        pg_hook.run(command)
def execute(self, context):
    redshift = PostgresHook(self.connection_id)
    self.log.info("Appending records to fact table {}".format(self.fact_table))
    redshift.run("INSERT INTO {} ({})".format(self.fact_table, self.sql_statement))
def create_table(SQL):
    '''Create tables in the local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
def load_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook('aws_credentials')
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook('redshift')
    redshift_hook.run(sql_statements.COPY_ALL_TRIPS_SQL.format(
        credentials.access_key, credentials.secret_key))
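# sql_statements.COPY_ALL_TRIPS_SQL is not shown above; given the two
# positional .format() arguments (access key, then secret key), it is
# presumably a Redshift COPY of roughly this shape, with the table, bucket
# and key here being placeholders:
COPY_ALL_TRIPS_SQL = """
    COPY trips
    FROM 's3://my-bucket/trips/trips.csv'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    IGNOREHEADER 1
    DELIMITER ','
"""

# The callable itself would typically be scheduled with a PythonOperator
# (task id and `dag` are assumptions):
from airflow.operators.python_operator import PythonOperator

load_trips = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    python_callable=load_data_to_redshift,
    dag=dag,
)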
def insert_update_table(SQL):
    '''INSERT/UPDATE data into the table in local postgres database'''
    pghook = PostgresHook('postgres_local')
    pghook.run(SQL)
def execute(self, context):
    pg = PostgresHook(postgres_conn_id=self.pg_conn_id)
    sql_statement = 'REFRESH MATERIALIZED VIEW ' + self.materialized_view
    pg.run(sql_statement)
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Truncating dim table '{}'".format(self.table)) redshift.run("TRUNCATE {}".format(self.table)) self.log.info("loading dim table '{}'".format(self.table)) redshift.run(self.sql_stmt)
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from s3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 schema: str,
                 table: str,
                 s3_bucket: str,
                 s3_key: str,
                 redshift_conn_id: str = 'redshift_default',
                 aws_conn_id: str = 'aws_default',
                 verify: Optional[Union[bool, str]] = None,
                 copy_options: Optional[List] = None,
                 autocommit: bool = False,
                 *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.copy_options = copy_options or []
        self.autocommit = autocommit

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
def execute(self, context):
    aws_hook = AwsHook(self.aws_creds)
    creds = aws_hook.get_credentials()
    rs_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Starting gathering CSV files from S3...")
    s3r = boto3.resource("s3",
                         region_name=self.s3_region,
                         aws_access_key_id=creds.access_key,
                         aws_secret_access_key=creds.secret_key)
    bucket = s3r.Bucket(self.s3_bucket)
    csv_obj = bucket.objects.filter(Prefix=self.s3_key)
    csvlist = [c.key for c in csv_obj]
    csvlist = [x for x in csvlist if x.find(".csv") > -1]

    client = boto3.client("s3",
                          region_name=self.s3_region,
                          aws_access_key_id=creds.access_key,
                          aws_secret_access_key=creds.secret_key)

    i = 1
    for c in csvlist:
        ind = str(i)
        csvo = client.get_object(Bucket=self.s3_bucket, Key=c)
        body = csvo["Body"]
        csv_string = body.read().decode("utf-8")
        df = pd.read_csv(StringIO(csv_string))
        cols = list(df.columns)
        # Quote the column names so they can be used in the COPY column list
        cols = ['"' + col + '"' for col in cols]
        cols = ', '.join(cols)

        self.log.info(f"Copying file {ind} from S3 to Redshift for {self.table}")
        s3_path = "s3://" + self.s3_bucket + "/" + c
        if self.include_cols:
            formatted_sql = S3CSVToRedshiftOperator.copy_sql.format(
                self.table,
                cols,
                s3_path,
                creds.access_key,
                creds.secret_key,
                "'" + self.s3_region + "'")
        else:
            formatted_sql = S3CSVToRedshiftOperator.copy_sql_excl.format(
                self.table,
                s3_path,
                creds.access_key,
                creds.secret_key,
                "'" + self.s3_region + "'")
        rs_hook.run(formatted_sql)
        self.log.info(f'StageToRedshiftOperator finished for file {ind}!')
        i += 1

    check_sql = f"SELECT COUNT(*) FROM {self.table}"
    # get_records returns a list of tuples; compare the scalar count itself
    checkno = rs_hook.get_records(check_sql)[0][0]
    if checkno == 0:
        raise ValueError("No data transferred to Redshift!")
    self.log.info("Data Quality check passed successfully!")
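# S3CSVToRedshiftOperator.copy_sql and copy_sql_excl are formatted above but
# not shown. Matching the positional arguments (note the region is already
# passed wrapped in single quotes), plausible assumed templates are:
copy_sql = """
    COPY {} ({})
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    REGION {}
    CSV
    IGNOREHEADER 1
"""

copy_sql_excl = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    REGION {}
    CSV
    IGNOREHEADER 1
"""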
def attach_persona_tags_to_people(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {"Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")}

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(
        postgres_conn_id=pg_connection,
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5
    )

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers
    ).json()[0]['Id']

    persona_params = {
        "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
        "$select": "Id,Name,Guid",
        "$orderby": "ModifiedDateTime desc",
    }

    if not kwargs['do_backfill']:
        # Close the datetime literal before the parenthesis
        persona_params['$filter'] += f" and (ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null or PersistedLastRefreshDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}')"

    personas = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
        params=persona_params,
        headers=headers).json()

    for persona in personas:
        fetched_all = False
        skip = 0
        top = 10000

        while not fetched_all:
            params = {
                "$top": top,
                "$skip": skip,
                "$select": "Id",
                "$orderby": "ModifiedDateTime desc",
            }

            r = requests.get(
                f"{Variable.get(kwargs['client'] + '_rock_api')}/People/DataView/{persona['Id']}",
                params=params,
                headers=headers)
            rock_objects = r.json()

            if not isinstance(rock_objects, list):
                print(rock_objects)
                print("oh uh, we might have made a bad request")
                print(f"top: {top}")
                print(f"skip: {skip}")
                skip += top
                continue

            def generate_insert_sql(tag_id, person_id):
                return f'''
                    INSERT INTO people_tag ("tagId", "personId", "createdAt", "updatedAt")
                    SELECT t.id, p.id, NOW(), NOW()
                    FROM people p, tags t
                    WHERE t."originId" = '{tag_id}' AND p."originId" = '{person_id}'
                    ON CONFLICT ("tagId", "personId") DO NOTHING
                '''

            sql_to_run = map(lambda p: generate_insert_sql(persona['Id'], p['Id']),
                             rock_objects)

            pg_hook.run(sql_to_run)

            skip += top
            fetched_all = len(rock_objects) < top
def create_table(create_table_sql):
    hook = PostgresHook()
    hook.run(create_table_sql, autocommit=True)
def fetch_and_save_persona_tags(ds, *args, **kwargs):
    if 'client' not in kwargs or kwargs['client'] is None:
        raise Exception("You must configure a client for this operator")

    headers = {"Authorization-Token": Variable.get(kwargs['client'] + "_rock_token")}

    fetched_all = False
    skip = 0
    top = 10000

    pg_connection = kwargs['client'] + '_apollos_postgres'
    pg_hook = PostgresHook(
        postgres_conn_id=pg_connection,
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5
    )

    person_entity_id = requests.get(
        f"{Variable.get(kwargs['client'] + '_rock_api')}/EntityTypes",
        params={"$filter": "Name eq 'Rock.Model.Person'"},
        headers=headers
    ).json()[0]['Id']

    while not fetched_all:
        params = {
            "$top": top,
            "$skip": skip,
            "$filter": f"EntityTypeId eq {person_entity_id} and CategoryId eq {186}",
            "$select": "Id,Name,Guid",
            "$orderby": "ModifiedDateTime desc",
        }

        if not kwargs['do_backfill']:
            # Close the datetime literal before the parenthesis
            params['$filter'] += f" and (ModifiedDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}' or ModifiedDateTime eq null or PersistedLastRefreshDateTime ge datetime'{kwargs['execution_date'].strftime('%Y-%m-%dT00:00')}')"

        print(params)

        r = requests.get(
            f"{Variable.get(kwargs['client'] + '_rock_api')}/DataViews",
            params=params,
            headers=headers)
        rock_objects = r.json()

        if not isinstance(rock_objects, list):
            print(rock_objects)
            print("oh uh, we might have made a bad request")
            print(f"top: {top}")
            print(f"skip: {skip}")
            skip += top
            continue

        skip += top
        fetched_all = len(rock_objects) < top

        # "createdAt", "updatedAt", "originId", "originType", "apollosType", "name", "data", "type"
        def update_tags(obj):
            return (
                kwargs['execution_date'],
                kwargs['execution_date'],
                obj['Id'],
                'rock',
                'Tag',
                obj['Name'],
                Json({"guid": obj["Guid"]}),
                "Persona"
            )

        def fix_casing(col):
            return "\"{}\"".format(col)

        tags_to_insert = list(map(update_tags, rock_objects))
        columns = list(map(fix_casing, ("createdAt", "updatedAt", "originId",
                                        "originType", "apollosType", "name",
                                        "data", "type")))

        pg_hook.insert_rows(
            'tags',
            tags_to_insert,
            columns,
            0,
            True,
            replace_index=('"originId"', '"originType"')
        )

        add_apollos_ids = """
            UPDATE "tags"
            SET "apollosId" = "apollosType" || ':' || id::varchar
            WHERE "originType" = 'rock' and "apollosId" IS NULL
        """

        pg_hook.run(add_apollos_ids)
def execute(self, context):
    self.log.info('LoadFactOperator implemented')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    formatted_sql = "INSERT INTO {} ".format(self.table) + self.sql_query
    redshift.run(formatted_sql)
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("loading fact table ...") redshift.run(self.sql_stmt)
class S3ToRedshiftTransfer(BaseOperator):
    """
    Executes a COPY command to load files from s3 to Redshift

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param copy_options: reference to a list of COPY options
    :type copy_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            copy_options=tuple(),
            autocommit=False,
            parameters=None,
            *args, **kwargs):
        super(S3ToRedshiftTransfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.copy_options = copy_options
        self.autocommit = autocommit
        self.parameters = parameters

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_query = """
            COPY {schema}.{table}
            FROM 's3://{s3_bucket}/{s3_key}/{table}'
            with credentials
            'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
            {copy_options};
        """.format(schema=self.schema,
                   table=self.table,
                   s3_bucket=self.s3_bucket,
                   s3_key=self.s3_key,
                   access_key=credentials.access_key,
                   secret_key=credentials.secret_key,
                   copy_options=copy_options)

        self.log.info('Executing COPY command...')
        self.hook.run(copy_query, self.autocommit)
        self.log.info("COPY command complete...")
def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    query = LoadFactOperator.sql.format(self.table, self.load_query)
    redshift.run(query)
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: str
    :param table: reference to a specific table in redshift database
    :type table: str
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: str
    :param s3_key: reference to a specific S3 key
    :type s3_key: str
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: str
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: str
    :param verify: Whether or not to verify SSL certificates for S3 connection.
        By default SSL certificates are verified.
        You can provide the following values:

        - ``False``: do not validate SSL certificates. SSL will still be used
                 (unless use_ssl is False), but SSL certificates will not be
                 verified.
        - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to use.
                 You can specify this argument if you want to use a different
                 CA cert bundle than the one used by botocore.
    :type verify: bool or str
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            verify=None,
            unload_options=tuple(),
            autocommit=False,
            include_header=False,
            *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.verify = verify
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                               FROM information_schema.columns
                               WHERE table_schema = '{schema}'
                               AND table_name = '{table}'
                               ORDER BY ordinal_position
                            """.format(schema=self.schema, table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order,
                                     {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema, table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")
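# Usage sketch for the header-aware RedshiftToS3Transfer above (names are
# assumptions, `dag` is defined elsewhere). With include_header=True the
# constructor appends PARALLEL OFF so the header row ends up in a single
# output file.
unload_users = RedshiftToS3Transfer(
    task_id='unload_users',
    schema='public',
    table='users',
    s3_bucket='my-export-bucket',
    s3_key='exports/users',
    redshift_conn_id='redshift_default',
    aws_conn_id='aws_default',
    unload_options=["DELIMITER AS ','", 'ALLOWOVERWRITE'],
    include_header=True,
    dag=dag,
)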
class RedshiftToS3Transfer(BaseOperator):
    """
    Executes an UNLOAD command to s3 as a CSV with headers

    :param schema: reference to a specific schema in redshift database
    :type schema: string
    :param table: reference to a specific table in redshift database
    :type table: string
    :param s3_bucket: reference to a specific S3 bucket
    :type s3_bucket: string
    :param s3_key: reference to a specific S3 key
    :type s3_key: string
    :param redshift_conn_id: reference to a specific redshift database
    :type redshift_conn_id: string
    :param aws_conn_id: reference to a specific S3 connection
    :type aws_conn_id: string
    :param unload_options: reference to a list of UNLOAD options
    :type unload_options: list
    """

    template_fields = ()
    template_ext = ()
    ui_color = '#ededed'

    @apply_defaults
    def __init__(
            self,
            schema,
            table,
            s3_bucket,
            s3_key,
            redshift_conn_id='redshift_default',
            aws_conn_id='aws_default',
            unload_options=tuple(),
            autocommit=False,
            parameters=None,
            include_header=False,
            *args, **kwargs):
        super(RedshiftToS3Transfer, self).__init__(*args, **kwargs)
        self.schema = schema
        self.table = table
        self.s3_bucket = s3_bucket
        self.s3_key = s3_key
        self.redshift_conn_id = redshift_conn_id
        self.aws_conn_id = aws_conn_id
        self.unload_options = unload_options
        self.autocommit = autocommit
        self.parameters = parameters
        self.include_header = include_header

        if self.include_header and \
           'PARALLEL OFF' not in [uo.upper().strip() for uo in unload_options]:
            self.unload_options = list(unload_options) + ['PARALLEL OFF', ]

    def execute(self, context):
        self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = self.s3.get_credentials()
        unload_options = '\n\t\t\t'.join(self.unload_options)

        if self.include_header:
            self.log.info("Retrieving headers from %s.%s...",
                          self.schema, self.table)

            columns_query = """SELECT column_name
                               FROM information_schema.columns
                               WHERE table_schema = '{schema}'
                               AND table_name = '{table}'
                               ORDER BY ordinal_position
                            """.format(schema=self.schema, table=self.table)

            cursor = self.hook.get_conn().cursor()
            cursor.execute(columns_query)
            rows = cursor.fetchall()
            columns = [row[0] for row in rows]
            column_names = ', '.join("{0}".format(c) for c in columns)
            column_headers = ', '.join("\\'{0}\\'".format(c) for c in columns)
            column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                        for c in columns)

            select_query = """SELECT {column_names} FROM
                                    (SELECT 2 sort_order,
                                     {column_castings}
                                     FROM {schema}.{table}
                                    UNION ALL
                                    SELECT 1 sort_order, {column_headers})
                                 ORDER BY sort_order"""\
                            .format(column_names=column_names,
                                    column_castings=column_castings,
                                    column_headers=column_headers,
                                    schema=self.schema,
                                    table=self.table)
        else:
            select_query = "SELECT * FROM {schema}.{table}"\
                .format(schema=self.schema, table=self.table)

        unload_query = """
                    UNLOAD ('{select_query}')
                    TO 's3://{s3_bucket}/{s3_key}/{table}_'
                    with credentials
                    'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
                    {unload_options};
                    """.format(select_query=select_query,
                               table=self.table,
                               s3_bucket=self.s3_bucket,
                               s3_key=self.s3_key,
                               access_key=credentials.access_key,
                               secret_key=credentials.secret_key,
                               unload_options=unload_options)

        self.log.info('Executing UNLOAD command...')
        self.hook.run(unload_query, self.autocommit)
        self.log.info("UNLOAD command complete...")