def execute(self, context):
    """
    Copy data from S3 buckets to staging tables in the Redshift cluster.

    - redshift_conn_id: Redshift cluster connection
    - aws_credentials_id: AWS connection
    - table: Redshift cluster table name
    - s3_bucket: S3 bucket where source data resides
    - s3_key: S3 prefix
    - file_format: source file format - options JSON, CSV
    """
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Deleting data from Redshift staging tables")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift staging tables")
    s3_path = "s3://{}".format(self.s3_bucket)
    if self.execution_date:
        year = self.execution_date.strftime("%Y")
        month = self.execution_date.strftime("%m")
        day = self.execution_date.strftime("%d")
        s3_path = '/'.join([s3_path, str(year), str(month), str(day)])
    s3_path = s3_path + '/' + self.s3_key

    additional = ""
    if self.file_format == 'CSV':
        additional = " DELIMITER ',' IGNOREHEADER 1 "

    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
        self.file_format,
        additional)
    redshift.run(formatted_sql)
    self.log.info(f"Success: Copied {self.table} from S3 to Redshift")

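# A minimal sketch of the copy_sql class attribute the execute() above assumes;
# the real template is not shown in this snippet, so the placeholder order
# (table, s3_path, access key, secret key, region, file format, extra options)
# is an assumption inferred from the .format() call. For JSON input the format
# clause would additionally need a jsonpaths argument such as 'auto', which the
# snippet does not reveal.
copy_sql = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    REGION '{}'
    FORMAT AS {}
    {}
"""
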
def execute(self, context):
    '''
    Performs the data load when the operator is executed.

    Arguments:
        self: reference to the object
        context: context provided when the operator is called
    '''
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Creating the fact table in Redshift before insert")
    redshift.run(self.create_table_sql)

    self.log.info('Inserting into the respective fact table')
    if self.mode == "append":
        insert_sql = f"INSERT INTO {self.target_table} {self.insert_table_sql}"
    else:
        insert_sql = f"DELETE FROM {self.target_table}; INSERT INTO {self.target_table} {self.insert_table_sql}"
    self.log.info("Command is " + insert_sql)
    redshift.run(insert_sql)

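# A hypothetical usage sketch showing how the fact-load operator above might be
# wired into a DAG. The operator class name (LoadFactOperator), its import
# path, the task id, and the SQL strings are illustrative assumptions, not
# taken from the snippet itself.
from datetime import datetime
from airflow import DAG
from operators import LoadFactOperator  # assumed import path

with DAG('sparkify_etl', start_date=datetime(2019, 1, 1), schedule_interval='@hourly') as dag:
    load_songplays_fact = LoadFactOperator(
        task_id='Load_songplays_fact_table',
        redshift_conn_id='redshift',
        aws_credentials_id='aws_credentials',
        target_table='songplays',
        create_table_sql='CREATE TABLE IF NOT EXISTS songplays (...)',   # assumed DDL
        insert_table_sql='SELECT ... FROM staging_events ...',           # assumed SELECT
        mode='append',
    )
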
def execute(self, context):
    # retrieve aws credentials
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # connect to redshift
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # clear redshift destination
    if self.clear_table:
        self.log.info(
            "Clearing data from destination Redshift table {}".format(self.table))
        redshift.run("DELETE FROM {}".format(self.table))

    # copy data from S3 to redshift
    self.log.info("Copying data from S3 to Redshift table {}".format(self.table))

    # render s3 path
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    self.log.info(s3_path)

    # render jsonpath if not "auto"
    if self.json_path != "auto":
        json = "s3://{}/{}".format(self.s3_bucket, self.json_path)
    else:
        json = self.json_path

    # format sql using copy_sql
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        json)
    redshift.run(formatted_sql)

def execute(self, context):
    # AWS Hook
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # Redshift Hook
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Get number of records in the table
    records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table_name}")

    # Fields and data
    df = redshift.get_pandas_df(self.sql)
    fields = list(df.columns.values)
    data_rows = redshift.get_records(self.sql)

    if self.load_mode == "clean":
        # Clear data
        self.log.info(f"Clearing data from {self.table_name} table")
        redshift.run("DELETE FROM {}".format(self.table_name))
        self.log.info(f"Deleted {records[0][0]} records from {self.table_name}")
    else:
        job_execution_ts = self.filter_key[0].format(**context)
        next_job_execution_ts = self.filter_key[1].format(**context)
        filtered_df = df[(df['start_time'] >= job_execution_ts) &
                         (df['start_time'] < next_job_execution_ts)]
        data_rows = [tuple(x) for x in filtered_df.values]

    # Populate table
    self.log.info("Populating data to {} table".format(self.table_name))
    redshift.insert_rows(table=self.table_name,
                         rows=data_rows,
                         target_fields=fields,
                         commit_every=1000,
                         replace=False)
    self.log.info("Inserted {} records to {}".format(len(data_rows), self.table_name))

def execute(self, context):
    """
    Loads the data from S3 to a staging table in Redshift.

    :param context: Airflow's context.
    """
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    s3_path = f"s3://{self.s3_bucket}/{self.s3_key}"
    self.log.info(f'The source path is: {s3_path}')

    if self.file_type.lower() == 'json':
        formatted_sql = StageToRedshiftOperator.copy_json.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.json_parameter)
    elif self.file_type.lower() == 'csv':
        formatted_sql = StageToRedshiftOperator.copy_csv.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.ignore_headers,
            self.delimiter)
    else:
        raise ValueError('file_type must be either json or csv')

    self.log.info(f'About to execute the command: {formatted_sql}')
    redshift.run(formatted_sql)

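# A minimal sketch of the copy_json and copy_csv class templates the branch
# above assumes; the exact SQL wording is not shown in the snippet, so the
# placeholder order is inferred from the two .format() calls and is an
# assumption.
copy_json = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    FORMAT AS JSON '{}'
"""

copy_csv = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    IGNOREHEADER {}
    DELIMITER '{}'
"""
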
def execute(self, context):
    # Get AWS credentials
    aws_hook = AwsHook(self.emr_credential_id)
    credentials = aws_hook.get_credentials()

    # Get EMR client instance
    emr = boto3.client('emr',
                       'us-west-2',
                       aws_access_key_id=credentials.access_key,
                       aws_secret_access_key=credentials.secret_key)

    # Launch the EMR cluster
    try:
        response = emr.run_job_flow(**self.JOBFLOW_OVERWRITE)
    except Exception as e:
        self.log.error(e)
        raise ValueError('EMR LAUNCHING FAILED')

    jobFlowId = response['JobFlowId']
    state = emr.describe_cluster(ClusterId=jobFlowId)['Cluster']['Status']['State']
    self.log.info(f'Cluster State: {state}')

    # Wait for EMR to launch
    while state == 'STARTING':
        time.sleep(15)
        state = emr.describe_cluster(ClusterId=jobFlowId)['Cluster']['Status']['State']
        self.log.info(f'Cluster State: {state}')

    self.log.info(f'EMR STATE: {state}')
    context['ti'].xcom_push(key='EMR_ID', value=jobFlowId)
    return jobFlowId

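# A hypothetical sketch of the JOBFLOW_OVERWRITE dictionary passed to
# run_job_flow() above; the real job-flow configuration is not shown in the
# snippet, so every value here (cluster name, release label, instance type and
# count, roles) is an illustrative assumption.
JOBFLOW_OVERWRITE = {
    'Name': 'spark-etl-cluster',
    'ReleaseLabel': 'emr-5.28.0',
    'Applications': [{'Name': 'Spark'}],
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Master node',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}
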
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table")
    redshift.run(f"DELETE FROM {self.table}")
    # self.log.info("Key = ", credentials.access_key)
    # self.log.info("Secret = ", credentials.secret_key)

    self.log.info("Copying data from S3 to Redshift")
    # self.log.info("s3_key = ", self.s3_key)
    # self.log.info("context = ", context)
    # rendered_key = self.s3_key.format(**context)
    # self.log.info("rendered_key : s3_key : ", rendered_key)
    # s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)

    formatted_sql = StageToRedshiftOperator.COPY_SQL.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.more_options)
    redshift.run(formatted_sql)

def execute(self, context, *args, **kwargs):
    logging.info("<<<<<< Starting Execution >>>>>>")
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")

    if self.filetype == 'json':
        fix = " json 'auto';"
    else:
        fix = " csv IGNOREHEADER 1;"
    logging.info("File Type Detected")
    logging.info(fix)

    logging.info("Clearing data from destination Redshift table")
    redshift_hook.run("DELETE FROM {}".format(self.table))

    logging.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        fix)
    logging.info("SQL Generated")
    logging.info(formatted_sql)
    redshift_hook.run(formatted_sql)
    logging.info("Data Pushed to Staging Table")

def execute(self, context):
    """
    Copy data from S3 buckets into staging tables in the Redshift cluster.
    """
    self.log.info('Fetching Credentials')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.json_params
    )
    redshift.run(formatted_sql)

def execute(self, context):
    '''
    Copies data from the S3 bucket into the target table in Redshift using a
    Parquet COPY. Runs automatically when the operator is called.
    '''
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info('Clearing data from destination Redshift table')
    redshift.run('DELETE FROM {}'.format(self.table))

    self.log.info('Copying data from S3 to Redshift')
    rendered_key = self.s3_key.format(**context)
    s3_path = 's3://{}/{}'.format(self.s3_bucket, rendered_key)
    copy_sql = """
        COPY {}
        FROM '{}'
        IAM_ROLE '{}'
        FORMAT AS PARQUET;
    """
    formatted_sql = copy_sql.format(self.table, s3_path, self.iam_role_arn)
    redshift.run(formatted_sql)

def execute(self, context):
    self.log.info('Starting StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Copying data from S3 to Redshift")
    copy_to_redshift = ("""
        COPY {0}
        FROM 's3://{1}/{2}'
        ACCESS_KEY_ID '{3}'
        SECRET_ACCESS_KEY '{4}'
        COMPUPDATE OFF
        REGION '{5}'
        FORMAT AS JSON '{6}';
    """.format(
        self.table,
        self.s3_bucket,
        self.s3_key,
        credentials.access_key,
        credentials.secret_key,
        self.region,
        self.json)
    )
    redshift.run(copy_to_redshift)
    self.log.info("Copy command completed")

def execute(self, context):
    """
    LoadDimFileDataOperator execute

    Loads a data file in JSON or CSV format from an S3 bucket into a Redshift
    table. This operator will only load data into a table if it is empty.

    :param context:
    """
    self.log.info("Using data file {} from S3 bucket {}".format(self.s3_key, self.s3_bucket))
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # only copy data if dimension table is empty
    records = redshift.get_records("SELECT COUNT(*) FROM {}".format(self.table))
    num_records = records.pop(0)[0]
    if num_records == 0:
        self.log.info("Inserting into destination Redshift table {}".format(self.table))
        formatted_sql = LoadDimFileDataOperator.copy_sql.format(
            self.table,
            self.s3_bucket,
            self.s3_key,
            credentials.access_key,
            credentials.secret_key,
            self.copy_format
        )
        redshift.run(formatted_sql)
    else:
        self.log.info("Found {} records in {}".format(num_records, self.table))
        self.log.info("{} table data is already set".format(self.table))
    self.log.info("Data copy complete")

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info(f"Copying {self.table} from S3 to Redshift")
    copy_query = """
        COPY {table}
        FROM '{s3_path}'
        ACCESS_KEY_ID '{access_key}'
        SECRET_ACCESS_KEY '{secret_key}'
        REGION 'us-west-2'
        FORMAT AS JSON '{json_path}'
    """.format(table=self.table,
               s3_path=self.s3_path,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               json_path=self.json_path)
    redshift_hook.run(copy_query)
    self.log.info(f"Copy {self.table} complete.")

def execute(self, context):
    self.log.info("Connecting to AWS")
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    self.log.info("Connecting to Redshift using the Postgres hook")
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data in destination table")
    redshift.run("DELETE FROM {}".format(self.table_name))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table_name,
        "s3://{}/{}".format(self.s3_bucket, rendered_key),
        credentials.access_key,
        credentials.secret_key,
        self.region,
        self.json_path
    )
    self.log.info(f'running query: {formatted_sql}')
    redshift.run(formatted_sql)
    self.log.info(f'Successful copy of {self.table_name} from S3 to Redshift')

def execute(self, context):
    self.log.info('Starting StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(self.redshift_conn_id)

    self.log.info('Dropping staging table')
    redshift.run(StageToRedshiftOperator.table_drop.format(self.table_name))

    self.log.info('Creating staging table')
    if self.table_name == 'staging_events':
        redshift.run(StageToRedshiftOperator.staging_event_table_create)
    else:
        redshift.run(StageToRedshiftOperator.staging_songs_table_create)

    self.log.info("Copying data from S3 to table")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}/".format(self.s3_bucket, rendered_key)
    redshift.run(
        StageToRedshiftOperator.copy_table.format(self.table_name,
                                                  s3_path,
                                                  credentials.access_key,
                                                  credentials.secret_key,
                                                  self.copy_format))

def execute(self, context):
    self.log.info('--------------------------------------------------------------------')
    self.log.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ' Starting loading stage tables')

    self.log.info('Trying to collect AWS Credentials')
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    self.log.info('Successfully collected AWS Credentials')

    self.log.info('Trying to connect to Redshift')
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Copying data from S3 to Redshift")
    key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, key)

    if self.file_type == "json":
        copy_json_sql = StageToRedshiftOperator.copy_json_statement.format(
            self.redshift_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.json_path
        )
        redshift.run(copy_json_sql)

    if self.file_type == "csv":
        copy_csv_sql = StageToRedshiftOperator.copy_csv_to_sql.format(
            self.redshift_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.ignore_headers,
            self.delimiter
        )
        redshift.run(copy_csv_sql)

    self.log.info(datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + ' Loading stage tables finished successfully')
    self.log.info('--------------------------------------------------------------------')

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    self.log.info("Rendered_key: {}".format(rendered_key))
    self.log.info("s3_path: {}".format(s3_path))
    self.log.info("access_key: {}".format(credentials.access_key))
    self.log.info("json: {}".format(self.json))

    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        self.json,
        credentials.access_key,
        credentials.secret_key,
    )
    redshift.run(formatted_sql)

def execute(self, context):
    self.log.info('Loading staging table')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.conn_id)

    # self.log.info('Deleting staging table')
    # redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying from S3 to Redshift")
    rend_key = self.s3_key.format(**context)
    # join bucket and key with '/'
    s3_path = "s3://{}/{}".format(self.s3_bucket, rend_key)
    final_copy_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
        self.jsonpath
    )
    redshift.run(final_copy_sql)
    self.log.info(f"Staging table {self.table} loaded successfully")

def execute(self, context):
    # AWS Hook
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # Postgres Hook
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Delete previous rows
    self.log.info("Deleting {} table's content.".format(self.table))
    redshift.run("DELETE FROM {}".format(self.table))

    s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.data_format
    )

    # Copy data from S3 to Redshift
    self.log.info("Copying data from S3 to Redshift. Table: {}.".format(self.table))
    redshift.run(formatted_sql)

def execute(self, context):
    '''
    Create a connection to AWS using the aws_credentials and create a
    PostgresHook to connect to Amazon Redshift. Get the data from the S3
    bucket and load it into the Redshift tables.
    '''
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    # execution_date = kwargs["execution_date"]

    self.log.info("Clearing data from destination Redshift table")
    redshift_hook.run("DELETE FROM {}".format(self.table))

    self.log.info(f"Copying data to {self.table}")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)

    if self.json == '':
        formatted_sql = StageToRedshiftOperator.copy_sql_csv.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.deli)
    else:
        formatted_sql = StageToRedshiftOperator.copy_sql_json.format(
            self.table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.json)
    redshift_hook.run(formatted_sql)
    self.log.info(f"Copying data to {self.table} completed")

    # Reference COPY syntax:
    # COPY tablename
    # FROM 'data_source'
    # CREDENTIALS 'credentials-args'
    # FORMAT AS { AVRO | JSON } 's3://jsonpaths_file';

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info('Checking for counts')
    for table in self.tables:
        records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")[0]
        if records[0] == 0:
            raise ValueError(
                f"Data quality check failed. {table} contains zero rows")
        else:
            self.log.info(
                f"Count checks passed for {table}, {records[0]} records present in the table"
            )

    self.log.info('Checking for null ids')
    for check in self.checks:
        records = redshift.get_records(check['check_sql'])[0]
        if records[0] != check['expected_result']:
            raise ValueError(
                f"Data quality check failed. {check['table']} contains nulls in the id column, got {records[0]} records instead"
            )
        else:
            self.log.info(f"Null id checks passed for {check['table']}")

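# A hypothetical example of the `checks` list the null-id loop above expects;
# the actual check definitions are not shown in the snippet, so the table and
# column names here are illustrative assumptions.
checks = [
    {
        'table': 'users',
        'check_sql': "SELECT COUNT(*) FROM users WHERE userid IS NULL",
        'expected_result': 0,
    },
    {
        'table': 'songs',
        'check_sql': "SELECT COUNT(*) FROM songs WHERE songid IS NULL",
        'expected_result': 0,
    },
]
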
def execute(self, context):
    # Fetch credentials from Airflow
    aws_hook = AwsHook(self.aws_credentials_id)
    redshift_hook = PostgresHook(self.redshift_conn_id)

    if aws_hook is not None and redshift_hook is not None:
        credentials = aws_hook.get_credentials()

        # Resolve the S3 key; expand to include the given year and month
        s3_key = self.s3_key.format(**context)
        s3_path = f"s3://{self.s3_bucket}/{s3_key}"

        # Clear the table if needed
        self.log.info(
            f"Clearing data from destination Redshift table ({self.table})"
        )
        redshift_hook.run(f"TRUNCATE {self.table}")

        self.log.info(f"Ingesting S3 ({s3_path}) to {self.table} [start]")
        cmd = (
            f"COPY {self.table} FROM '{s3_path}' "
            f"ACCESS_KEY_ID '{credentials.access_key}' "
            f"SECRET_ACCESS_KEY '{credentials.secret_key}' "
            f"JSON '{self.json_path}' COMPUPDATE OFF"
        )
        redshift_hook.run(cmd)
        self.log.info(f"Ingesting S3 ({s3_path}) to {self.table} [end]")

def execute(self, context):
    self.log.info('StageToRedshiftOperator starting for bucket %s with json_path %s',
                  self.s3_bucket, self.json_path)
    redshift = PostgresHook(self.redshift_conn_id)
    path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)

    if self.json_path != "auto":
        json_path = "s3://{}/{}".format(self.s3_bucket, self.json_path)
    else:
        json_path = self.json_path

    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    query = StageToRedshiftOperator.copy_sql.format(
        self.table,
        path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
        json_path,
    )
    redshift.run(query)
    self.log.info(f'Copied {self.table} to Redshift successfully...')

def execute(self, context):
    """
    This operator takes data from S3 storage and puts it into staging tables,
    from which it is parsed into the target tables.

    :param context: specified above
    :return: staging tables
    """
    self.log.info(f'Staging phase started for table: {self.table}')
    aws_hook = AwsHook(self.aws_conn_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(self.redshift_conn_id)

    redshift_hook.run(
        SqlQueries.truncate_table.format(self.table)
    )
    redshift_hook.run(
        SqlQueries.copy_tables_to_stage.format(
            self.table,
            self.s3_bucket,
            self.s3_key,
            credentials.access_key,
            credentials.secret_key,
            self.json_path
        )
    )
    self.log.info(f'Staging phase finished for table: {self.table}')

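# A minimal sketch of the two SqlQueries templates referenced above; the real
# definitions are not shown in the snippet, so the exact SQL wording and the
# placeholder order (table, bucket, key, access key, secret key, json path) are
# assumptions inferred from the .format() calls.
class SqlQueries:
    truncate_table = "TRUNCATE TABLE {}"

    copy_tables_to_stage = """
        COPY {}
        FROM 's3://{}/{}'
        ACCESS_KEY_ID '{}'
        SECRET_ACCESS_KEY '{}'
        FORMAT AS JSON '{}'
    """
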
def execute(self, context):
    # Connections
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # Clear existing raw data
    self.log.info('Clearing data from destination Redshift table')
    redshift.run("DELETE FROM {}".format(self.table))

    # Copy the new raw data
    self.log.info(f"In progress: Copying {self.table} from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        table_name=self.table,
        s3_path=s3_path,
        access_key=credentials.access_key,
        secret_key=credentials.secret_key,
        file_type=self.file_type)
    redshift.run(formatted_sql)
    self.log.info(f'Done: Copying {self.table} from S3 to Redshift')

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from Redshift destination table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    rendered_date = self.execution_date.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.ignore_headers,
        self.delimiter,
        self.json_path,
        self.data_format,
        year=rendered_date.year,
        month=rendered_date.month)
    redshift.run(formatted_sql)

def execute(self, context):
    """
    Executes the operator logic

    :param context:
    """
    self.log.info('StageCsvToRedshiftOperator execute')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    self.log.info('StageCsvToRedshiftOperator s3_path: ' + s3_path)
    formatted_sql = StageJsonToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key)
    redshift.run(formatted_sql)

def execute(self, context):
    self.log.info('Inside StageToRedshiftOperator')
    s3_path = "s3://{}/{}"
    delete_sql = "DELETE FROM {}".format(self.table_name)

    # Using AWS Hook to get aws credentials from Airflow
    aws_hook = AwsHook(self.aws_credential_id)
    aws_credentials = aws_hook.get_credentials()

    # Using PostgresHook to get Redshift connection details from Airflow
    redshift = PostgresHook(self.redshift_connection_id)

    # Replaces with the execution date and month using Jinja templates
    updated_key = self.s3_key.format(**context)
    s3_location = s3_path.format(self.s3_bucket, updated_key)

    # Formatting the sql statement with the actual table name and AWS credentials
    stage_load_sql = StageToRedshiftOperator.stage_load_sql.format(
        self.table_name,
        s3_location,
        aws_credentials.access_key,
        aws_credentials.secret_key,
        self.json_path)

    # Executing delete sql statement
    redshift.run(delete_sql)
    self.log.info("Deleted data from staging table {}".format(self.table_name))

    # Executing load sql statement
    redshift.run(stage_load_sql)
    self.log.info("Data loaded successfully to staging table {}".format(self.table_name))

def execute(self, context):
    aws_hook = AwsHook(self.aws_creds)
    creds = aws_hook.get_credentials()
    rs_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Dropping old staging tables!")
    dropst = CreateTables.dropsql(self.table)
    rs_hook.run(dropst)

    self.log.info("Creating staging table")
    rs_hook.run(self.createsql)

    self.log.info("Copying data from S3 to Redshift for " + self.table)
    s3_path = "s3://" + self.s3_bucket + "/" + self.s3_key
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        creds.access_key,
        creds.secret_key,
        self.s3_region,
        self.s3_jsondetails)
    rs_hook.run(formatted_sql)
    self.log.info('StageToRedshiftOperator finished!')

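# A hypothetical sketch of the CreateTables.dropsql helper used above; the real
# helper is not shown in the snippet, so its behavior is an assumption. It is
# only assumed to return a DROP TABLE statement for the given staging table.
class CreateTables:
    @staticmethod
    def dropsql(table):
        # Build a DROP TABLE statement for the named staging table
        return f"DROP TABLE IF EXISTS {table}"
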
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table (if it exists)")
    redshift_hook.run(f"DELETE FROM {self.destination_table}")

    self.log.info("Copying data from S3 to Redshift")
    # Example source path:
    # http://s3-us-west-2.amazonaws.com/udacity-dend/log-data/2018/11/2018-11-04-events.json
    if self.s3_key == 'song_data':
        s3_path = "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=self.s3_key)
    else:
        s3_key = '{key}/{{execution_date.year}}/{{execution_date.month}}/{{ds}}-events.json'.format(key=self.s3_key)
        s3_path = "s3://{bucket}/{key}".format(bucket=self.s3_bucket, key=s3_key)

    if self.input_file_type == 'csv':
        copy_sql = StageToRedshiftOperator.copy_sql_csv.format(
            self.destination_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.region,
            self.delimiter,
            self.ignore_headers)
    else:
        # json (default)
        if self.s3_key == 'log_data':
            jsonpath = 's3://{}/log_json_path.json'.format(self.s3_bucket)
        else:
            jsonpath = 'auto'
        copy_sql = StageToRedshiftOperator.copy_sql_json.format(
            self.destination_table,
            s3_path,
            credentials.access_key,
            credentials.secret_key,
            self.region,
            jsonpath)
    redshift_hook.run(copy_sql)