def test_expand_role(self):
    conn = boto3.client('iam', region_name='us-east-1')
    conn.create_role(RoleName='test-role', AssumeRolePolicyDocument='some policy')
    hook = AwsHook()
    arn = hook.expand_role('test-role')
    expect_arn = conn.get_role(RoleName='test-role').get('Role').get('Arn')
    self.assertEqual(arn, expect_arn)

def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    self._create_clusters()
    hook = AwsHook(aws_conn_id='aws_default')
    client_from_hook = hook.get_client_type('redshift')
    clusters = client_from_hook.describe_clusters()['Clusters']
    self.assertEqual(len(clusters), 2)

def expand_role(self):
    if 'Model' not in self.config:
        return
    hook = AwsHook(self.aws_conn_id)
    config = self.config['Model']
    if 'ExecutionRoleArn' in config:
        config['ExecutionRoleArn'] = hook.expand_role(config['ExecutionRoleArn'])

def test_get_resource_type_returns_a_boto3_resource_of_the_requested_type(self):
    hook = AwsHook(aws_conn_id='aws_default')
    resource_from_hook = hook.get_resource_type('dynamodb')

    # this table needs to be created in production
    table = resource_from_hook.create_table(
        TableName='test_airflow',
        KeySchema=[
            {
                'AttributeName': 'id',
                'KeyType': 'HASH'
            },
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'name',
                'AttributeType': 'S'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 10,
            'WriteCapacityUnits': 10
        }
    )

    table.meta.client.get_waiter('table_exists').wait(TableName='test_airflow')

    self.assertEqual(table.item_count, 0)

def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    client = boto3.client('emr', region_name='us-east-1')
    if len(client.list_clusters()['Clusters']):
        raise ValueError('AWS not properly mocked')

    hook = AwsHook(aws_conn_id='aws_default')
    client_from_hook = hook.get_client_type('emr')

    self.assertEqual(client_from_hook.list_clusters()['Clusters'], [])

def test_get_client_type_returns_a_boto3_client_of_the_requested_type(self):
    client = boto3.client("emr", region_name="us-east-1")
    if len(client.list_clusters()["Clusters"]):
        raise ValueError("AWS not properly mocked")

    hook = AwsHook(aws_conn_id="aws_default")
    client_from_hook = hook.get_client_type("emr")

    self.assertEqual(client_from_hook.list_clusters()["Clusters"], [])

def test_get_credentials_from_login(self, mock_get_connection):
    # The login/password values mirror the assertions below; the originals
    # appear to have been redacted to asterisks in this snippet.
    mock_connection = Connection(login='aws_access_key_id',
                                 password='aws_secret_access_key')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'aws_access_key_id')
    self.assertEqual(credentials_from_hook.secret_key, 'aws_secret_access_key')
    self.assertIsNone(credentials_from_hook.token)

def test_get_credentials_from_extra(self, mock_get_connection):
    mock_connection = Connection(
        extra='{"aws_access_key_id": "aws_access_key_id",'
              '"aws_secret_access_key": "aws_secret_access_key"}'
    )
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'aws_access_key_id')
    self.assertEqual(credentials_from_hook.secret_key, 'aws_secret_access_key')
    self.assertIsNone(credentials_from_hook.token)

def _inject_aws_credentials(self):
    if TRANSFER_SPEC not in self.body or AWS_S3_DATA_SOURCE not in self.body[TRANSFER_SPEC]:
        return

    aws_hook = AwsHook(self.aws_conn_id)
    aws_credentials = aws_hook.get_credentials()
    aws_access_key_id = aws_credentials.access_key
    aws_secret_access_key = aws_credentials.secret_key
    self.body[TRANSFER_SPEC][AWS_S3_DATA_SOURCE][AWS_ACCESS_KEY] = {
        ACCESS_KEY_ID: aws_access_key_id,
        SECRET_ACCESS_KEY: aws_secret_access_key,
    }

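# Illustrative sketch (not part of the snippet above): the module-level
# constants used by _inject_aws_credentials are assumed here to hold the
# Google Storage Transfer Service field names, so the method effectively adds
# the AWS keys from the Airflow connection to a body shaped like this.
TRANSFER_SPEC = 'transferSpec'
AWS_S3_DATA_SOURCE = 'awsS3DataSource'
AWS_ACCESS_KEY = 'awsAccessKey'
ACCESS_KEY_ID = 'accessKeyId'
SECRET_ACCESS_KEY = 'secretAccessKey'

body = {
    TRANSFER_SPEC: {
        AWS_S3_DATA_SOURCE: {'bucketName': 'my-source-bucket'},  # hypothetical bucket
    },
}
# After _inject_aws_credentials(), body[TRANSFER_SPEC][AWS_S3_DATA_SOURCE]
# additionally contains:
#   {AWS_ACCESS_KEY: {ACCESS_KEY_ID: '<access key>', SECRET_ACCESS_KEY: '<secret key>'}}
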
def test_get_credentials_from_role_arn(self, mock_get_connection):
    mock_connection = Connection(
        extra='{"role_arn":"arn:aws:iam::123456:role/role_arn"}')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'AKIAIOSFODNN7EXAMPLE')
    self.assertEqual(credentials_from_hook.secret_key,
                     'aJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY')
    self.assertEqual(credentials_from_hook.token,
                     'BQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh'
                     '3c/LTo6UDdyJwOOvEVPvLXCrrrUtdnniCEXAMPLE/IvU1dYUg2RVAJBanLiHb4I'
                     'gRmpRV3zrkuWJOgQs8IZZaIv2BXIa2R4OlgkBN9bkUDNCJiBeb/AXlzBBko7b15'
                     'fjrBs2+cTQtpZ3CYWFXG8C5zqx37wnOE49mRl/+OtkIKGO7fAE')

def test_get_credentials_from_extra_with_s3_config_and_profile(
        self, mock_get_connection, mock_parse_s3_config):
    mock_connection = Connection(
        extra='{"s3_config_format": "aws", '
              '"profile": "test", '
              '"s3_config_file": "aws-credentials", '
              '"region_name": "us-east-1"}')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    hook._get_credentials(region_name=None)
    mock_parse_s3_config.assert_called_with(
        'aws-credentials',
        'aws',
        'test'
    )

def execute(self, context):
    self.log.info('StageToRedshiftOperator is starting')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.copy_json_option
    )
    redshift.run(formatted_sql)

def load_data_to_redshift(target_table, s3_location, aws_credentials_id, *args, **kwargs):
    """
    A simple PythonOperator helper function to transfer files from S3 to a
    target table in Redshift/Postgres.

    Args:
        target_table : name of the target table in the Redshift/Postgres database
        s3_location : the location of a file in the S3 bucket
        aws_credentials_id : Airflow connection ID for the AWS IAM user.
                             Must have read access to the provided S3 bucket.
    """
    # Obtain AWS S3 access credentials using AwsHook
    aws_hook = AwsHook(aws_credentials_id)
    credentials = aws_hook.get_credentials()

    # Obtain Redshift credentials using PostgresHook
    redshift_hook = PostgresHook("redshift")

    copy_sql = sql_statements.COPY_SQL.format(target_table,
                                              s3_location,
                                              credentials.access_key,
                                              credentials.secret_key)

    # Run the COPY SQL statement
    redshift_hook.run(copy_sql)

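# A minimal wiring sketch for the helper above, assuming a surrounding `dag`
# object, a "redshift" Postgres connection, and an "aws_credentials" Airflow
# connection; the task_id, table name, and bucket path are hypothetical.
from airflow.operators.python_operator import PythonOperator

copy_events_task = PythonOperator(
    task_id='load_events_from_s3_to_redshift',
    python_callable=load_data_to_redshift,
    op_kwargs={
        'target_table': 'staging_events',
        's3_location': 's3://my-example-bucket/log_data',
        'aws_credentials_id': 'aws_credentials',
    },
    dag=dag,
)
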
def execute(self, context):
    # first create the staging table
    try:
        staging_table_create_sql = getattr(
            SqlQueries, '{}_table_create'.format(self.table))
    except AttributeError:
        raise Exception('create sql of the target table not defined!')
    redshift_hook = PostgresHook(self.redshift_conn_id)
    redshift_hook.run(staging_table_create_sql)

    # then copy data from s3
    logging.info('staging from s3 bucket {} to redshift table {}'.format(
        self.s3_bucket, self.table))
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(self.redshift_conn_id)
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    sql = self.copy_sql.format(self.table,
                               s3_path,
                               credentials.access_key,
                               credentials.secret_key,
                               self.region)
    redshift_hook.run(sql)

def execute(self, context):
    self.log.info('StageCSVToRedshiftOperator starting...')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(self.redshift_conn_id)

    if self.create_table_sql is not None:
        # drop and recreate the table
        redshift_hook.run(f"DROP TABLE IF EXISTS {self.table}")
        redshift_hook.run(self.create_table_sql)

    sql_stmt = StageCSVToRedshiftOperator.COPY_SQL.format(
        self.table,
        self.s3_bucket,
        credentials.access_key,
        credentials.secret_key,
    )
    redshift_hook.run(sql_stmt)
    self.log.info(f'{self.table} Loaded')

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info(f"Clearing data from destination Redshift {self.table} table")
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info(f"Copying data from s3 to Redshift {self.table} table")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = LoadS3ToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
    )
    redshift.run(formatted_sql)

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    if self.truncate_table == 1:
        self.log.info('Cleaning table {}.{}...'.format(self.schema, self.table))
        truncate_table_sql = 'TRUNCATE TABLE {}.{};'.format(self.schema, self.table)
        redshift.run(truncate_table_sql)

    copy_json = CopyJson.copy_to_redshift.format(
        self.schema,
        self.table,
        self.s3_source_path,
        credentials.access_key,
        credentials.secret_key,
        self.copy_format
    )
    self.log.info('Starting to copy {}...'.format(self.table))
    redshift.run(copy_json)

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Copying data from S3 bucket to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = f"s3://{self.s3_bucket}/{rendered_key}"

    if self.file_type == "json":
        qry = (
            f"COPY {self.table} FROM '{s3_path}' "
            f"ACCESS_KEY_ID '{credentials.access_key}' "
            f"SECRET_ACCESS_KEY '{credentials.secret_key}' "
            f"REGION '{self.region}' "
            f"JSON '{self.json_path}' COMPUPDATE OFF"
        )
        redshift.run(qry)

    if self.file_type == "csv":
        # Note: the Redshift COPY keyword is DELIMITER (the misspelled
        # DELIMETER in the original would be rejected by Redshift).
        qry = (
            f"COPY {self.table} FROM '{s3_path}' "
            f"ACCESS_KEY_ID '{credentials.access_key}' "
            f"SECRET_ACCESS_KEY '{credentials.secret_key}' "
            f"REGION '{self.region}' "
            f"IGNOREHEADER '{self.ignore_headers}' "
            f"DELIMITER '{self.delimeter}'"
        )
        redshift.run(qry)

    self.log.info('StageToRedshiftOperator copied')

def execute(self, context): """ Execute Operator. Execute function for operator, which creates the hooks to AWS and Redshift to run the COPY query to load the staging table specified. """ self.log.info("Setting Hooks for AWS and Redshift") aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run(f"DELETE FROM {self.table}") self.log.info("Copying data from S3 to Redshift") formatted_sql = self.copy_sql.format(table=self.table, s3_path=self.s3_path, jsonpath=self.jsonpath, key=credentials.access_key, secret=credentials.secret_key) redshift.run(formatted_sql)
def execute(self, context):
    # get redshift hook and aws credentials
    redshift = PostgresHook(postgres_conn_id=self.conn_id)
    aws = AwsHook(self.aws_credentials)
    credentials = aws.get_credentials()

    # drop staging table if it exists
    self.log.info(f'Deleting {self.table} if it exists.')
    redshift.run(f'DROP TABLE IF EXISTS {self.table}')

    # construct and run copy query
    query = self.copy_query.format(self.table,
                                   self.s3_path,
                                   credentials.access_key,
                                   credentials.secret_key,
                                   self.format_path)
    self.log.info(f"Copying data from {self.s3_path} to {self.table}")
    redshift.run(query)
    self.log.info('StageToRedshiftOperator finished.')

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Deleting existing data if available in Redshift tables")
    postgres_hook.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 bucket to Redshift")
    rendered_key = self.s3_key.format(**context)
    self.log.info(rendered_key)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    # s3_path = "s3://udacity-dend/song_data/A/A/A"
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.json_path)
    postgres_hook.run(formatted_sql, self.autocommit)
    self.log.info("Data Copy Complete")

def execute(self, context): """ Copies data from S3 buckets to AWS Redshift cluster into staging tables. """ self.log.info('Executing StageToRedshiftOperator!') aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run("DELETE FROM {}".format(self.table)) # s3_path = "s3://{}".format(self.s3_bucket) # if self.execution_date: # # Backfill a specific date # year = self.execution_date.strftime("%Y") # month = self.execution_date.strftime("%m") # day = self.execution_date.strftime("%d") # s3_path = '/'.join([s3_path, str(year), str(month), str(day)]) # s3_path = s3_path + '/' + self.s3_key self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) if self.file_format == 'json': formatted_sql = StageToRedshiftOperator.copy_json_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.json_path) else: formatted_sql = StageToRedshiftOperator.copy_csv_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.ignore_headers, self.delimiter) redshift.run(formatted_sql) self.log.info( f"Success@stage_redshift.py: Copied {self.table} from S3 to Redshift" )
def execute(self, context): """Logic to stage data from S3 to Redshift""" try: logging.info("START: Staging S3 to Redshift - Started Execution") aws_hook = AwsHook(self.aws_credentials) aws_credentials = aws_hook.get_credentials( ) # gets temporary aws credentials. IAM roles built using this feature. redshift_hook = PostgresHook(postgres_conn_id=self.redhift_conn_id) logging.info( f"INFO: Clearing Data from target Redshift table: {self.table_name}" ) redshift_hook.run(f"TRUNCATE TABLE {self.table_name}") logging.info( f"INFO: Copying Data from S3 to Redshift table: {self.table_name}" ) logging.info(f"INFO: JSON PATH {self.json_path}") rendered_s3_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_s3_key) if (self.table_format == 'CSV'): data_format = 'CSV' else: data_format = "JSON '{}'".format(self.json_path) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table_name, s3_path, aws_credentials.access_key, aws_credentials.secret_key, data_format) logging.info(f"INFO: Formatted COPY SQL: {formatted_sql}") redshift_hook.run(formatted_sql) logging.info( "SUCCESS: Staging S3 to Redshift - Finished Execution") except Exception as ex: logging.info(f"FAILED: Staging S3 to Redshift with error: {ex}")
def execute(self, context): logging.info("Executing {}".format(self.__class__.__name__)) aws = AwsHook(aws_conn_id=self.aws_conn_id) s3 = aws.get_client_type('s3') s3_location = '{0}/{1}'.format(self.destination_prefix, self.s3_filename) local_file = '/tmp/temp_adwords_data.csv' s3_location = '{0}/{1}'.format(self.destination_prefix, self.s3_filename) # initiliaze adwords client client = adwords.AdWordsClient.LoadFromStorage(self.yaml_file_location) report_downloader = client.GetReportDownloader(version='v201806') # build awql report report_query = adwords.ReportQueryBuilder().\ Select(*self.fields).\ From(self.report_name).\ During(start_date=self.query_start_date, end_date=self.query_end_date) for condition in self.conditions: report_query = report_query.Where(condition['name'])\ .In(*condition['values']) report_query = report_query.Build() # Download report locally (temp) filepath = self.temp_localfile with open(filepath, 'wb') as handler: report_downloader.DownloadReportWithAwql( report_query, 'CSV', output=handler, skip_report_header=True, skip_column_header=False, skip_report_summary=True, include_zero_impressions=False) #Upload to S3 s3.upload_file(filepath, self.destination_bucket, s3_location)
def execute(self, context): """ Copy JSON data from S3 bucket into staging tables on Redshift - redshift_conn_id: redshift cluster connection - aws_credentials_id: AWS connection - table: table name on redshift - s3_bucket: S3 bucket holding JSON data source - s3_key: S3 key for data - region: S3 bucket region """ aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") try: redshift.run("DELETE FROM {}".format(self.table)) except: self.log.error("Could not clear data from destination Redshift table") raise self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.region, self.json_format ) try: redshift.run(formatted_sql) self.log.info("Data copied into Redshift table successfully") except Exception as e: self.log.error("Could not copy the data from S3 into Redshift tables") self.log.error(e) raise
def create_cluster(self):
    """
    Returns a DataprocClusterCreateOperator
    """
    properties = {}

    # Google cloud storage requires object.create permission when reading from pyspark
    properties["core:fs.gs.implicit.dir.repair.enable"] = "false"

    # Set hadoop properties to access s3 from dataproc
    if self.aws_conn_id:
        # get_credentials() yields (access_key, secret_key, token) in that order
        for key, value in zip(
            ("access.key", "secret.key", "session.token"),
            AwsHook(self.aws_conn_id).get_credentials(),
        ):
            if value is not None:
                properties["core:fs.s3a." + key] = value
                # For older spark versions we need to set the properties differently
                if key == "access.key":
                    properties["core:fs.s3.awsAccessKeyId"] = value
                elif key == "secret.key":
                    properties["core:fs.s3.awsSecretAccessKey"] = value

    properties.update(self.additional_properties)

    metadata = {
        'gcs-connector-version': '1.9.16',
        'bigquery-connector-version': '0.13.6',
    }
    metadata.update(self.additional_metadata)

    return DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name=self.cluster_name,
        gcp_conn_id=self.gcp_conn_id,
        service_account=self.service_account,
        project_id=self.connection.project_id,
        storage_bucket='moz-fx-data-prod-dataproc-scratch',
        num_workers=self.num_workers,
        image_version=self.image_version,
        properties=properties,
        zone=self.zone,
        idle_delete_ttl=self.idle_delete_ttl,
        auto_delete_ttl=self.auto_delete_ttl,
        master_machine_type=self.master_machine_type,
        worker_machine_type=self.worker_machine_type,
        num_preemptible_workers=self.num_preemptible_workers,
        optional_components=self.optional_components,
        install_component_gateway=self.install_component_gateway,
        init_actions_uris=self.init_actions_uris,
        metadata=metadata,
    )

def execute(self, context): aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() self.log.info("AWS S3 connection successful") redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Redshift connection successful") self.log.info(self.s3_bucket) self.log.info(self.s3_key) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.region, self.json_format, ) redshift.run(formatted_sql) self.log.info("Data copied")
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # self.log.info("Clearing data from destination Redshift table if already exists")
    # redshift.run("DELETE FROM {} WHERE ".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)

    # Check if file already loaded
    check_file_sql = """
        SELECT EXISTS(
            SELECT 1 FROM files_loaded WHERE file_name = '{}'
        );
    """.format(s3_path)
    file_exists = redshift.get_records(check_file_sql)[0][0]
    self.log.info("File " + s3_path + " exists: " + str(file_exists))

    if not file_exists:
        try:
            formatted_sql = self.copy_sql.format(self.table,
                                                 s3_path,
                                                 credentials.access_key,
                                                 credentials.secret_key,
                                                 self.json_header)
            redshift.run(formatted_sql)
        except Exception:
            self.log.info("File " + s3_path + " could not be inserted into Redshift.")
        else:
            # Record the file only after a successful COPY
            log_file_sql = """
                INSERT INTO files_loaded(file_name) VALUES('{}');
            """.format(s3_path)
            redshift.run(log_file_sql)

def execute(self, context):
    '''
    This operator executes a SQL statement to either load (append-only) the
    four dimension tables, or delete their content and load the new content
    (delete-load), on an AWS Redshift cluster. Arguments are passed from the DAG:

        redshift_conn_id : parameters of the Redshift connection
        aws_credentials  : AWS credentials
        sql_statement    : SQL statement to run
        table            : table name
        mode             : 'append-only' or 'delete-load'
    '''
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    if self.mode == 'append-only':
        redshift.run(self.sql_statement)
    else:
        redshift.run(f'TRUNCATE TABLE {self.table}')
        redshift.run(self.sql_statement)

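# A hedged instantiation sketch for the operator above; the class name
# (LoadDimensionOperator), the SqlQueries helper, the connection IDs, and the
# surrounding `dag` object are assumptions for illustration, not taken from
# the snippet itself.
load_user_dimension_table = LoadDimensionOperator(
    task_id='load_user_dim_table',
    redshift_conn_id='redshift',
    aws_credentials='aws_credentials',
    sql_statement=SqlQueries.user_table_insert,
    table='users',
    mode='delete-load',  # truncate the table first, then run the insert
    dag=dag,
)
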
def execute(self, context):
    aws_hook = AwsHook(self._aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self._redshift_conn_id)

    rendered_key = self._s3_key.format(**context)
    s3_path = f"s3://{self._s3_bucket}/{rendered_key}"
    self.log.info(
        f"StageToRedshiftOperator: copying data from S3 {self._s3_key} to table {self._table}"
    )

    if rendered_key is None or rendered_key == '':
        raise ValueError("StageToRedshiftOperator: S3 key path is missing")
    if s3_path is None or s3_path == '':
        raise ValueError("StageToRedshiftOperator: S3 path is missing")

    cmd = f"COPY {self._table} FROM '{s3_path}' ACCESS_KEY_ID '{credentials.access_key}' SECRET_ACCESS_KEY" \
          f" '{credentials.secret_key}' JSON '{self._json_path}' COMPUPDATE OFF"
    redshift.run(cmd)

def execute(self, context):
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    # redshift.run(f'DROP TABLE IF EXISTS {self.table};')
    redshift.run(self.sql)

    if self.append_only is False:
        self.log.info("Delete {} table".format(self.table))
        redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    aws_hook = AwsHook(self.aws_conn_id)
    credentials = aws_hook.get_credentials()
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    format_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.formated)
    self.log.info("Executing copy")
    redshift.run(format_sql)
    self.log.info('StageToRedshiftOperator tables copied.')

def execute(self, context): self.log.info('StageToRedshiftOperator not implemented yet') aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run("DROP TABLE IF EXISTS {}".format(self.table)) redshift.run("{}".format(self.create_table)) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) json_path = "{}".format(self.json_path) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table, s3_path, credentials.access_key, credentials.secret_key, json_path ) redshift.run(formatted_sql)
def execute(self, context):
    self.log.info('StageToRedshiftOperator initializing...')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    if self.flush_first:
        self.log.info("Clearing data from destination Redshift table")
        redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        table=self.table,
        s3_path=s3_path,
        iam=self.aws_iam,
        region=self.region,
        file_format=self.file_format
    )
    redshift.run(formatted_sql)

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    redshift_hook = PostgresHook(self.redshift_conn_id)
    conn = redshift_hook.get_conn()
    cursor = conn.cursor()
    self.log.info("Connected to redshift ... StageToRedshiftOperator")

    cursor.execute(self.load_stagging_table)
    cursor.close()
    conn.commit()
    self.log.info("Load command completed ... StageToRedshiftOperator")
    return True

def execute(self, context):
    aws_hook = AwsHook(self.aws_conn_id)
    aws_credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clean up the staging table per each load")
    redshift_hook.run("DELETE FROM {}".format(self.table))

    self.log.info("Time to load data from S3 to RedShift")
    if self.execution_date:
        formatted_sql = StageToRedshiftOperator.copy_sql_with_date.format(
            self.table,
            self.s3_path,
            self.execution_date.strftime("%Y"),
            self.execution_date.strftime("%d"),
            aws_credentials.access_key,
            aws_credentials.secret_key,
            self.region,
            self.data_format,
            self.execution_date
        )
    else:
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table,
            self.s3_path,
            aws_credentials.access_key,
            aws_credentials.secret_key,
            self.region,
            self.data_format,
            self.execution_date
        )
    redshift_hook.run(formatted_sql)

def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id, client_type='s3')
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Dropping table from Redshift")
    redshift.run("DROP TABLE IF EXISTS {}".format(self.table))

    s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
    sql_create_format = self.sql_create.format(self.table)
    self.log.info(f"Creating Table: {self.table} in Redshift")
    redshift.run(sql_create_format)

    sql_stage_format = self.sql_stage.format(self.table,
                                             s3_path,
                                             credentials.access_key,
                                             credentials.secret_key,
                                             self.json_path)
    self.log.info(f"Copying data from S3 to Redshift's Table: {self.table}")
    redshift.run(sql_stage_format)
    self.log.info("Copied data from S3 to Redshift successfully")

def execute(self, context): self.log.info(""" =============== Staging started =============== """) self.log.info(f""" AWS CREDENTIALS : {self.aws_credentials_id}""") aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run("DELETE FROM {}".format(self.table_name)) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) self.log.info(f"""rendered_key = {rendered_key}""") s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) formatted_sql = StageToRedshiftOperator.copy_sql.format( self.table_name, s3_path, credentials.access_key, credentials.secret_key, ) self.log.info(f"""Query : {formatted_sql}""") redshift.run(formatted_sql)
def execute(self, context):
    self.log.info('Running StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credential_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info("Clearing data from staging table: {}".format(self.table))
    redshift.run("DELETE FROM {}".format(self.table))

    self.log.info("Copying data from S3 to Redshift table: {}".format(self.table))
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.ignore_headers,
        self.copy_opt
    )
    redshift.run(formatted_sql)

def execute(self, context):
    self.log.info('Testing StageToRedshiftOperator')

    # Reading aws credentials to connect to redshift database
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    # # Cleaning redshift tables
    # self.log.info('StageToRedshiftOperator deleting tables')
    # redshift.run("DELETE FROM {}".format(self.table))

    # Copying data from S3 to Redshift
    self.log.info('StageToRedshiftOperator copying data from S3 to Redshift tables')
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.json_copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.json_path)
    redshift.run(formatted_sql)

def execute(self, context): aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Clearing data from destination Redshift table") redshift.run("DELETE FROM {}".format(self.table)) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) formatted_sql = SqlQueries.copy_from_s3_to_redshift_json.format( self.table, s3_path, credentials.access_key, credentials.secret_key, self.json_format ) redshift.run(formatted_sql) self.log.info("Finished copying data from S3 to Redshift table {}".format(self.table))
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

    self.log.info('Creating Redshift table if it doesn\'t yet exist')
    redshift.run(self.create_table_sql)

    self.log.info('Clearing data from destination Redshift table')
    redshift.run(f'DELETE FROM {self.table}')

    self.log.info('Copying data from S3 to Redshift')
    rendered_key = self.s3_key.format(**context)
    s3_path = 's3://{}/{}'.format(self.s3_bucket, rendered_key)
    formatted_sql = self.copy_sql.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
    )
    redshift.run(formatted_sql)

def expand_role(self):
    if 'ExecutionRoleArn' in self.config:
        hook = AwsHook(self.aws_conn_id)
        self.config['ExecutionRoleArn'] = hook.expand_role(self.config['ExecutionRoleArn'])

def expand_role(self):
    if 'TrainingJobDefinition' in self.config:
        config = self.config['TrainingJobDefinition']
        if 'RoleArn' in config:
            hook = AwsHook(self.aws_conn_id)
            config['RoleArn'] = hook.expand_role(config['RoleArn'])

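# Rough sketch of the behaviour the expand_role callers above rely on, based
# on Airflow's AwsHook: a value that already looks like an ARN (contains '/')
# is returned unchanged, otherwise the role name is resolved to its ARN via
# IAM. Treat this as an approximation, not the exact library source.
def expand_role_sketch(hook, role):
    if '/' in role:
        return role
    return hook.get_client_type('iam').get_role(RoleName=role)['Role']['Arn']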