def test_get_credentials_from_login(self, mock_get_connection):
    # The masked '******' values are restored to match the assertions below
    mock_connection = Connection(login='aws_access_key_id',
                                 password='aws_secret_access_key')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'aws_access_key_id')
    self.assertEqual(credentials_from_hook.secret_key, 'aws_secret_access_key')
    self.assertIsNone(credentials_from_hook.token)
def test_get_credentials_from_extra(self, mock_get_connection):
    mock_connection = Connection(
        extra='{"aws_access_key_id": "aws_access_key_id",'
              '"aws_secret_access_key": "aws_secret_access_key"}'
    )
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'aws_access_key_id')
    self.assertEqual(credentials_from_hook.secret_key, 'aws_secret_access_key')
    self.assertIsNone(credentials_from_hook.token)
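# A minimal sketch of the scaffolding the test methods in this section assume:
# a unittest.TestCase with AwsHook.get_connection patched so no metastore is
# queried. The class name and patch style are assumptions, not from the
# source; the AwsHook import path matches Airflow 1.10-era code.
import unittest
from unittest import mock

from airflow.contrib.hooks.aws_hook import AwsHook
from airflow.models import Connection


class TestAwsHook(unittest.TestCase):

    @mock.patch.object(AwsHook, 'get_connection')
    def test_example(self, mock_get_connection):
        mock_get_connection.return_value = Connection(login='key', password='secret')
        self.assertEqual(AwsHook().get_credentials().access_key, 'key')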
def _inject_aws_credentials(self):
    if TRANSFER_SPEC not in self.body or AWS_S3_DATA_SOURCE not in self.body[TRANSFER_SPEC]:
        return
    aws_hook = AwsHook(self.aws_conn_id)
    aws_credentials = aws_hook.get_credentials()
    aws_access_key_id = aws_credentials.access_key
    aws_secret_access_key = aws_credentials.secret_key
    self.body[TRANSFER_SPEC][AWS_S3_DATA_SOURCE][AWS_ACCESS_KEY] = {
        ACCESS_KEY_ID: aws_access_key_id,
        SECRET_ACCESS_KEY: aws_secret_access_key,
    }
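# The dictionary keys used in _inject_aws_credentials are module-level
# constants not shown in the snippet. A plausible set of definitions,
# mirroring the camelCase field names of Google's Storage Transfer API
# (an assumption here; verify against your Airflow version):
TRANSFER_SPEC = 'transferSpec'
AWS_S3_DATA_SOURCE = 'awsS3DataSource'
AWS_ACCESS_KEY = 'awsAccessKey'
ACCESS_KEY_ID = 'accessKeyId'
SECRET_ACCESS_KEY = 'secretAccessKey'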
def test_get_credentials_from_role_arn(self, mock_get_connection):
    mock_connection = Connection(
        extra='{"role_arn":"arn:aws:iam::123456:role/role_arn"}')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'AKIAIOSFODNN7EXAMPLE')
    self.assertEqual(credentials_from_hook.secret_key,
                     'aJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY')
    self.assertEqual(
        credentials_from_hook.token,
        'BQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh'
        '3c/LTo6UDdyJwOOvEVPvLXCrrrUtdnniCEXAMPLE/IvU1dYUg2RVAJBanLiHb4I'
        'gRmpRV3zrkuWJOgQs8IZZaIv2BXIa2R4OlgkBN9bkUDNCJiBeb/AXlzBBko7b15'
        'fjrBs2+cTQtpZ3CYWFXG8C5zqx37wnOE49mRl/+OtkIKGO7fAE')
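# For context on where the session token asserted above comes from: when the
# connection extras carry a role_arn, AwsHook assumes the role via STS,
# roughly as in this sketch. The function and session names are illustrative,
# not the hook's actual internals.
import boto3


def assume_role_sketch(role_arn, external_id=None):
    sts = boto3.client('sts')
    kwargs = {'RoleArn': role_arn, 'RoleSessionName': 'Airflow_AwsHook'}
    if external_id is not None:
        kwargs['ExternalId'] = external_id
    creds = sts.assume_role(**kwargs)['Credentials']
    # Temporary credentials carry a session token, unlike static access keys
    return creds['AccessKeyId'], creds['SecretAccessKey'], creds['SessionToken']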
def insert_tweets_sentiment(redshift_conn_id: str, aws_conn_id: str, **kwargs):
    redshift_hook = PostgresHook(postgres_conn_id=redshift_conn_id)
    aws_hook = AwsHook(aws_conn_id=aws_conn_id)
    aws_credentials = aws_hook.get_credentials()
    year = kwargs['execution_date'].year
    month = kwargs['execution_date'].month
    day = kwargs['execution_date'].day
    # NOTE: Redshift COPY normally reads from S3 (s3://...); the relative path
    # below is kept from the source and would need an S3 URI on a real cluster.
    sql = f"""
        COPY tweets_sentiment
        FROM './../../../data/tweets-sentiment/{year:04d}-{month:02d}-{day:02d}.jsonl'
        ACCESS_KEY_ID '{aws_credentials.access_key}'
        SECRET_ACCESS_KEY '{aws_credentials.secret_key}'
        FORMAT AS JSON 'auto'
    """
    # autocommit belongs on run(); as a hook-constructor kwarg it is ignored
    redshift_hook.run(sql, autocommit=True)
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info(
        f"Copying data from S3 to Redshift staging {self.table} table")
    rendered_key = self.s3_key.format(**context)
    self.log.info(f"Rendered Key: {rendered_key}")
    s3_path = f"s3://{self.s3_bucket}/{rendered_key}"
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key, credentials.secret_key,
        self.region, self.extra_params)
    self.log.info(
        f"Executing query to copy data from '{s3_path}' to '{self.table}'")
    redshift.run(formatted_sql)
def execute(self, context):
    # The source logged 'not implemented yet' here, a stale template leftover
    self.log.info('Starting StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    # Clear the table before staging
    records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
    num_records = records[0][0]
    if num_records > 0:
        redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, self.s3_path, credentials.access_key,
        credentials.secret_key, self.json, self.region)
    redshift.run(formatted_sql)
def execute(self, context):
    self.log.info('Starting StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    # Render the templated S3 key against the task context
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, self.ignore_headers, self.delimiter)
    redshift.run(formatted_sql)
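# None of these snippets show the copy_sql template they format. A plausible
# template matching the positional arguments used directly above (table, S3
# path, keys, ignore-headers count, delimiter); a sketch, not the source's
# actual class attribute:
copy_sql = """
    COPY {}
    FROM '{}'
    ACCESS_KEY_ID '{}'
    SECRET_ACCESS_KEY '{}'
    IGNOREHEADER {}
    DELIMITER '{}'
"""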
def execute(self, context):
    self.log.info("Data insertion in Dimension table - {}".format(self.table))
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    if not self.append_only:
        self.log.info("Deleting existing {} Dimension table".format(self.table))
        redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Creating new table")
    redshift.run(self.creation_query)
    self.log.info("Executing data insert query")
    redshift.run(self.data_insertion_query)
def execute(self, context):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    copy_query = """
        COPY {table}
        FROM 's3://{s3_bucket}/{s3_prefix}'
        WITH CREDENTIALS 'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(table=self.table,
               s3_bucket=self.s3_bucket,
               s3_prefix=self.s3_prefix,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               copy_options=self.copy_options)
    redshift_hook.run(copy_query)
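# A hedged usage sketch for the operator above inside a DAG. The class name
# and every argument value are illustrative; only the parameter names follow
# from the attributes the execute() method reads.
stage_task = S3ToRedshiftOperator(  # hypothetical class name
    task_id='copy_events_to_redshift',
    redshift_conn_id='redshift',
    table='staging_events',
    s3_bucket='example-bucket',           # hypothetical
    s3_prefix='events/',                  # hypothetical
    copy_options="FORMAT AS JSON 'auto'",
    dag=dag,
)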
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Truncating data from destination Redshift table")
    redshift.run("TRUNCATE TABLE {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    # If a JSONPaths file is given (anything other than 'auto'), qualify it
    s3_json = self.json
    if s3_json != "auto":
        s3_json = "s3://{}/{}".format(self.s3_bucket, self.json)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, s3_json, self.timeformat)
    redshift.run(formatted_sql)
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Clearing data from staging tables")
    redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
    if self.s3_key == "log_data":
        json_path = "s3://{}/{}".format(self.s3_bucket, self.log_json_path)
    else:
        json_path = 'auto'
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, json_path)
    redshift.run(formatted_sql)
    self.log.info(f"Staging table {self.table} loaded successfully")
def execute(self, context):
    '''
    Executes a SQL statement to load the fact table on the AWS Redshift
    cluster. Arguments are passed from the DAG:
        redshift_conn_id -- Redshift connection id
        aws_credentials  -- AWS credentials connection id
        sql_statement    -- SQL statement to run
        table            -- table name
    '''
    self.log.info(f"Loading table: {self.table}")
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    redshift.run(self.sql_statement)
def execute(self, context):
    # IAM_ROLE = 'arn:aws:iam::972068528963:user/dwhadmin'
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    redshift.run(self.create_stmt)
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    if self.file_format == 'parquet':
        # Parquet COPY authenticates with an IAM role rather than keys
        formatted_sql_parquet = StageToRedshiftOperator.copy_sql_parquet.format(
            self.table, s3_path, self.IAM_ROLE)
        redshift.run(formatted_sql_parquet)
    else:
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        formatted_sql = StageToRedshiftOperator.copy_sql.format(
            self.table, s3_path, credentials.access_key,
            credentials.secret_key, self.ignore_headers)
        redshift.run(formatted_sql)
def execute(self, context):
    self.log.info('Starting StageToRedshiftOperator')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Copying data from S3 bucket to Redshift")
    # Render the templated key against the task context
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, self.json)
    redshift.run(formatted_sql)
def test_get_credentials_from_role_arn_with_external_id(
        self, mock_get_connection):
    mock_connection = Connection(
        extra='{"role_arn":"arn:aws:iam::123456:role/role_arn",'
              ' "external_id":"external_id"}')
    mock_get_connection.return_value = mock_connection
    hook = AwsHook()
    credentials_from_hook = hook.get_credentials()
    self.assertEqual(credentials_from_hook.access_key, 'AKIAIOSFODNN7EXAMPLE')
    self.assertEqual(credentials_from_hook.secret_key,
                     'aJalrXUtnFEMI/K7MDENG/bPxRfiCYzEXAMPLEKEY')
    self.assertEqual(
        credentials_from_hook.token,
        'BQoEXAMPLEH4aoAH0gNCAPyJxz4BlCFFxWNE1OPTgk5TthT+FvwqnKwRcOIfrRh'
        '3c/LTo6UDdyJwOOvEVPvLXCrrrUtdnniCEXAMPLE/IvU1dYUg2RVAJBanLiHb4I'
        'gRmpRV3zrkuWJOgQs8IZZaIv2BXIa2R4OlgkBN9bkUDNCJiBeb/AXlzBBko7b15'
        'fjrBs2+cTQtpZ3CYWFXG8C5zqx37wnOE49mRl/+OtkIKGO7fAE')
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    # Pick the insert statement from the SQL helper
    if self.query_table == "users_table":
        query_content = SqlQueries.user_table_insert
    elif self.query_table == "songs_table":
        query_content = SqlQueries.song_table_insert
    elif self.query_table == "artists_table":
        # the source used song_table_insert here, an apparent copy-paste slip
        query_content = SqlQueries.artist_table_insert
    elif self.query_table == "time_table":
        query_content = SqlQueries.time_table_insert
    formatted_sql = LoadDimensionOperator.insert_sql.format(
        self.table, self.columns, query_content)
    redshift.run(formatted_sql)
def execute(self, context):
    self._log_formatted("Setting up connections")
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self._log_formatted(f"Clearing data from staging table: {self.table}")
    redshift.run(f"DELETE FROM {self.table}")
    rendered_key = self.s3_key.format(**context)
    s3_path = f"s3://{self.s3_bucket}/{rendered_key}"
    formatted_query = StageToRedshiftOperator.copy_query.format(
        self.table, s3_path, credentials.access_key, credentials.secret_key,
        self.region, self.json_format, self.time_format)
    self._log_formatted(f"Copying data from {s3_path} to table {self.table}")
    redshift.run(formatted_query)
def execute(self, context):
    aws_hook = AwsHook(self.aws_cred)
    credentials = aws_hook.get_credentials()
    aws_access_key = credentials.access_key
    aws_secret_key = credentials.secret_key
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_id)
    # the source had "DROP TABLE {} IF EXISTS", which is invalid SQL
    postgres_hook.run("DROP TABLE IF EXISTS {}".format(self.table_name),
                      self.autocommit)
    aws_s3_path = "s3://{}/{}".format(self.aws_s3_bucket,
                                      self.aws_s3_key.format(**context))
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table_name, aws_s3_path, aws_access_key, aws_secret_key,
        self.ignore_header, self.json)
    postgres_hook.run(formatted_sql, self.autocommit)
def execute(self, context):
    self.log.info('Starting StageToRedshiftOperator..')
    self.log.info('Creating hooks for S3 and Redshift..')
    # aws_creds and redshift_conn are assumed to be instance attributes; the
    # source referenced them as bare names and mixed up the hook variables
    aws_hook = AwsHook(self.aws_creds)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn)
    self.log.info('Clearing data from Redshift tables..')
    redshift_hook.run(f'DELETE FROM {self.target_table}')
    self.log.info('Moving data to Redshift tables..')
    key = self.s3_key.format(**context)
    path = f's3://{self.s3_bucket}/{key}'  # the source had a double slash here
    query = StageToRedshiftOperator.copy_sql.format(
        self.target_table, path, credentials.access_key,
        credentials.secret_key, self.json_path)
    redshift_hook.run(query)  # the source built the query but never ran it
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table,
        self.s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.region,
        self.json_path
    )
    redshift.run(formatted_sql)
def execute(self, context):
    self.log.info("Connecting to AWS")
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    self.log.info("Connecting to Redshift")
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    self.log.info("Truncating table {}...".format(self.table))
    redshift.run("TRUNCATE {}".format(self.table))
    # the source logged "Creating table", but this step runs a COPY
    self.log.info("Copying into table {}...".format(self.table))
    redshift.run(
        StageToRedshiftOperator.copy_sql.format(self.table, s3_path,
                                                credentials.access_key,
                                                credentials.secret_key))
from pyspark.sql import SparkSession, SQLContext


def create_spark_context(aws_conn_id):
    """Creates a Spark session wired for S3 access and returns a SQLContext."""
    spark = (SparkSession.builder.config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:2.7.0").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")
    # the source called pyspark.SQLContext, which does not exist;
    # SQLContext lives in pyspark.sql
    sql_context = SQLContext(spark.sparkContext)
    aws_hook = AwsHook(aws_conn_id)
    credentials = aws_hook.get_credentials()
    hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.access.key", credentials.access_key)
    hadoop_conf.set("fs.s3a.secret.key", credentials.secret_key)
    return sql_context
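# Hedged usage sketch for create_spark_context: read a dataset over s3a with
# the returned SQLContext. The connection id and path are illustrative.
sql_context = create_spark_context('aws_default')
df = sql_context.read.json('s3a://example-bucket/path/to/data/')  # hypothetical
df.printSchema()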
def execute(self, context):
    """
    redshift_conn_id: Redshift cluster connection info.
    aws_credentials_id: info needed to make the AWS connection.
    s3_bucket: S3 bucket holding the source files we want to copy.
    """
    self.log.info('Starting StageToRedshiftOperator')
    hook = S3Hook(self.aws_credentials_id)
    bucket = self.s3bucket
    keys = hook.list_keys(bucket)
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    session = Session(aws_access_key_id=credentials.access_key,
                      aws_secret_access_key=credentials.secret_key)
    for key in keys:
        self.log.info(f"--------------- s3://{bucket}/{key} -----------")
        session.resource('s3').Bucket(bucket).download_file(
            key, '/home/workspace/uk-data/' + key)
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    aws_arn = Variable.get(self.aws_arn_id)
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info(f"Staging {self.destination_table} from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.staging_sql.format(
        credentials.access_key, credentials.secret_key, aws_arn,
        destination_table=self.destination_table, s3_path=s3_path,
        json_format=self.json_format)
    redshift_hook.run(formatted_sql)
    self.log.info(f"Staging of {self.destination_table} complete!")
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info(
        f"Clearing data from destination Redshift table {self.table}")
    redshift.run("DELETE FROM {}".format(self.table))
    rendered_key = self.s3_key.format(**context)
    self.log.info(
        f"Copying data from S3 file {rendered_key} to Redshift table {self.table}")
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, self.json_copy_mode, self.aws_region)
    redshift.run(formatted_sql)
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Copying data from S3 to Redshift")
    rendered_key = self.s3_key.format(**context)
    s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_query.format(
        self.table,
        s3_path,
        credentials.access_key,
        credentials.secret_key,
        self.json_path
    )
    redshift.run(formatted_sql)
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    logging.info(f"Copying table {self.table} to Redshift ...")
    if self.format not in ("json", "csv"):
        raise ValueError("The file format should be JSON or CSV.")
    elif self.format == 'json':
        self.file_format = "format json '{}'".format(self.extract_format)
    else:
        self.file_format = "format CSV"
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.create_sql_stmt, self.table, self.data_path, self.region,
        self.file_format, credentials.access_key, credentials.secret_key)
    redshift.run(formatted_sql)
def execute(self, context):
    self.log.info('Loading data from Amazon S3 to Redshift...')
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    if self.clear_table:
        self.log.info(f'Clearing data from {self.table}')
        redshift.run('DELETE FROM {}'.format(self.table))
    self.log.info(f'Copying data from S3 to Redshift table {self.table}')
    rendered_key = self.s3_key.format(**context)
    s3_path = 's3://{}/{}'.format(self.s3_bucket, rendered_key)
    formatted_sql = StageToRedshiftOperator.copy_sql.format(
        self.table, s3_path, credentials.access_key,
        credentials.secret_key, self.extra_info_sql)
    self.log.info(f"Executing {formatted_sql}")
    redshift.run(formatted_sql)
def load_songs_to_redshift(*args, **kwargs):
    """
    Loads songs data from S3 to Redshift using parameters supplied by the
    calling Airflow task.
    """
    aws_hook_name = kwargs["params"]["aws_hook"]
    redshift_hook_name = kwargs["params"]["redshift_hook"]
    songs_data_location = kwargs["params"]["songs_data_location"]
    aws_hook = AwsHook(aws_hook_name)
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook(redshift_hook_name)
    sql_statement = sql.stage_songs.format(songs_data_location,
                                           credentials.access_key,
                                           credentials.secret_key)
    redshift_hook.run(sql_statement)
def load_trip_data_to_redshift(*args, **kwargs):
    aws_hook = AwsHook("aws_credentials")
    credentials = aws_hook.get_credentials()
    redshift_hook = PostgresHook("redshift")
    # The execution_date is delivered in the task context kwargs
    execution_date = kwargs["execution_date"]
    sql_stmt = sql_statements.COPY_MONTHLY_TRIPS_SQL.format(
        credentials.access_key,
        credentials.secret_key,
        year=execution_date.year,
        month=execution_date.month
    )
    redshift_hook.run(sql_stmt)
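# Hedged sketch of wiring the callable above into an Airflow 1.10 DAG: the
# execution_date kwarg arrives because provide_context=True passes the task
# context into kwargs. The task id and dag object are illustrative.
copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
    dag=dag,
)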
def execute(self, context):
    redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    aws_hook = AwsHook(aws_conn_id=self.aws_conn_id)
    credentials = aws_hook.get_credentials()
    stage_sql = """
        COPY {table}
        FROM {s3_source}
        WITH CREDENTIALS 'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {json_format};
    """.format(table=self.table,
               s3_source=self.s3_source,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               json_format=self.json_format)
    self.log.info("Staging data to Redshift")
    redshift_hook.run(stage_sql)
def execute(self, context):
    # Set up AWS credentials and the Redshift connection
    aws_hook = AwsHook(self.aws_credentials)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info('Copying data from S3 to Redshift')
    # despite its name, self.s3_bucket is used here as the full s3:// path
    redshift.run("""COPY {}
                    FROM '{}'
                    ACCESS_KEY_ID '{}'
                    SECRET_ACCESS_KEY '{}'
                    REGION 'us-west-2'
                    FORMAT AS JSON '{}'""".format(self.table, self.s3_bucket,
                                                  credentials.access_key,
                                                  credentials.secret_key,
                                                  self.copy_json_option))
def execute(self, context):
    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.log.info("Clearing data from destination Redshift table")
    redshift.run("DELETE FROM {}".format(self.table))
    self.log.info("Copying data from S3 to Redshift")
    s3_path = "s3://{}/{}".format(self.s3_bucket, self.s3_key)
    sql_w_format = StageToRedshiftOperator.copy_sql.format(
        table=self.table,
        s3_path=s3_path,
        access_key=credentials.access_key,
        secret_key=credentials.secret_key,
        json_format=self.json_format)
    redshift.run(sql_w_format)
    self.log.info('StageToRedshiftOperator executed')