def test_check_for_wildcard_key(self):
    hook = S3Hook(aws_conn_id=None)
    bucket = hook.get_bucket('bucket')
    bucket.create()
    bucket.put_object(Key='abc', Body=b'a')
    bucket.put_object(Key='a/b', Body=b'a')

    self.assertTrue(hook.check_for_wildcard_key('a*', 'bucket'))
    self.assertTrue(hook.check_for_wildcard_key('s3://bucket//a*'))
    self.assertTrue(hook.check_for_wildcard_key('abc', 'bucket'))
    self.assertTrue(hook.check_for_wildcard_key('s3://bucket//abc'))

    self.assertFalse(hook.check_for_wildcard_key('a', 'bucket'))
    self.assertFalse(hook.check_for_wildcard_key('s3://bucket//a'))
    self.assertFalse(hook.check_for_wildcard_key('b', 'bucket'))
    self.assertFalse(hook.check_for_wildcard_key('s3://bucket//b'))
def test_download_file(self, mock_temp_file):
    mock_temp_file.return_value.__enter__ = Mock(return_value=mock_temp_file)
    s3_hook = S3Hook(aws_conn_id='s3_test')
    s3_hook.check_for_key = Mock(return_value=True)
    s3_obj = Mock()
    s3_obj.download_fileobj = Mock(return_value=None)
    s3_hook.get_key = Mock(return_value=s3_obj)
    key = 'test_key'
    bucket = 'test_bucket'

    s3_hook.download_file(key=key, bucket_name=bucket)

    s3_hook.check_for_key.assert_called_once_with(key, bucket)
    s3_hook.get_key.assert_called_once_with(key, bucket)
    s3_obj.download_fileobj.assert_called_once_with(mock_temp_file)
def test_list_prefixes_paged(self):
    hook = S3Hook(aws_conn_id=None)
    bucket = hook.get_bucket('bucket')
    bucket.create()

    # we don't need to test the paginator; that's covered by boto tests
    keys = ["%s/b" % i for i in range(2)]
    dirs = ["%s/" % i for i in range(2)]
    for key in keys:
        bucket.put_object(Key=key, Body=b'a')

    self.assertListEqual(
        sorted(dirs),
        sorted(hook.list_prefixes('bucket', delimiter='/', page_size=1)),
    )
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)
    unload_options = '\n\t\t\t'.join(self.unload_options)

    unload_query = self._build_unload_query(
        credentials_block, self._select_query, self.s3_key, unload_options
    )

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def test_load_file_acl(self, s3_bucket):
    hook = S3Hook()
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(b"Content")
        temp_file.seek(0)
        hook.load_file(temp_file.name, "my_key", s3_bucket, gzip=True, acl_policy='public-read')
        response = boto3.client('s3').get_object_acl(
            Bucket=s3_bucket, Key="my_key", RequestPayer='requester'
        )  # pylint: disable=no-member
        assert (response['Grants'][1]['Permission'] == 'READ') and (
            response['Grants'][0]['Permission'] == 'FULL_CONTROL'
        )
    os.unlink(temp_file.name)
def test_check_for_wildcard_key(self, s3_bucket):
    hook = S3Hook()
    bucket = hook.get_bucket(s3_bucket)
    bucket.put_object(Key='abc', Body=b'a')
    bucket.put_object(Key='a/b', Body=b'a')

    assert hook.check_for_wildcard_key('a*', s3_bucket) is True
    assert hook.check_for_wildcard_key('abc', s3_bucket) is True
    assert hook.check_for_wildcard_key('s3://{}//a*'.format(s3_bucket)) is True
    assert hook.check_for_wildcard_key('s3://{}//abc'.format(s3_bucket)) is True

    assert hook.check_for_wildcard_key('a', s3_bucket) is False
    assert hook.check_for_wildcard_key('b', s3_bucket) is False
    assert hook.check_for_wildcard_key('s3://{}//a'.format(s3_bucket)) is False
    assert hook.check_for_wildcard_key('s3://{}//b'.format(s3_bucket)) is False
def upload_to_s3(state, date):
    '''Grabs data from Covid endpoint and saves to flat file on S3
    '''
    # Connect to S3
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # Get data from API
    url = 'https://covidtracking.com/api/v1/states/'
    res = requests.get(url + '{0}/{1}.csv'.format(state, date))

    # Save data to CSV on S3
    s3_hook.load_string(res.text,
                        '{0}_{1}.csv'.format(state, date),
                        bucket_name=bucket,
                        replace=True)
def execute(self, context: 'Context'):
    """
    Executes AWS Glue Job from Airflow

    :return: the id of the current glue job.
    """
    if not self.script_location.startswith(self.s3_protocol):
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        script_name = os.path.basename(self.script_location)
        s3_hook.load_file(
            self.script_location, self.s3_artifacts_prefix + script_name, bucket_name=self.s3_bucket
        )
        s3_script_location = f"s3://{self.s3_bucket}/{self.s3_artifacts_prefix}{script_name}"
    else:
        s3_script_location = self.script_location
    glue_job = GlueJobHook(
        job_name=self.job_name,
        desc=self.job_desc,
        concurrent_run_limit=self.concurrent_run_limit,
        script_location=s3_script_location,
        retry_limit=self.retry_limit,
        num_of_dpus=self.num_of_dpus,
        aws_conn_id=self.aws_conn_id,
        region_name=self.region_name,
        s3_bucket=self.s3_bucket,
        iam_role_name=self.iam_role_name,
        create_job_kwargs=self.create_job_kwargs,
    )
    self.log.info(
        "Initializing AWS Glue Job: %s. Wait for completion: %s",
        self.job_name,
        self.wait_for_completion,
    )
    glue_job_run = glue_job.initialize_job(self.script_args, self.run_job_kwargs)
    if self.wait_for_completion:
        glue_job_run = glue_job.job_completion(self.job_name, glue_job_run['JobRunId'])
        self.log.info(
            "AWS Glue Job: %s status: %s. Run Id: %s",
            self.job_name,
            glue_job_run['JobRunState'],
            glue_job_run['JobRunId'],
        )
    else:
        self.log.info("AWS Glue Job: %s. Run Id: %s", self.job_name, glue_job_run['JobRunId'])
    return glue_job_run['JobRunId']
def execute(self, context) -> None:
    mysql_hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
    s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    data_df = mysql_hook.get_pandas_df(self.query)
    self.log.info("Data from MySQL obtained")

    self._fix_int_dtypes(data_df)
    with NamedTemporaryFile(mode='r+', suffix='.csv') as tmp_csv:
        data_df.to_csv(tmp_csv.name, **self.pd_csv_kwargs)
        s3_conn.load_file(filename=tmp_csv.name, key=self.s3_key, bucket_name=self.s3_bucket)

    if s3_conn.check_for_key(self.s3_key, bucket_name=self.s3_bucket):
        file_location = os.path.join(self.s3_bucket, self.s3_key)
        self.log.info("File saved correctly in %s", file_location)
def test_put_bucket_tagging_when_tags_exist_overwrites(self):
    hook = S3Hook()
    hook.create_bucket(bucket_name='new_bucket')
    initial_tag_set = [{'Key': 'Color', 'Value': 'Green'}]
    hook.put_bucket_tagging(bucket_name='new_bucket', tag_set=initial_tag_set)
    assert len(hook.get_bucket_tagging(bucket_name='new_bucket')) == 1
    assert hook.get_bucket_tagging(bucket_name='new_bucket') == initial_tag_set

    new_tag_set = [{'Key': 'Fruit', 'Value': 'Apple'}]
    hook.put_bucket_tagging(bucket_name='new_bucket', tag_set=new_tag_set)

    result = hook.get_bucket_tagging(bucket_name='new_bucket')
    assert len(result) == 1
    assert result == new_tag_set
def execute(self, context):
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
    self.log.info('Check files in S3 and in local folder ...')
    s_local_filespath = f"{self.files_path}/{context['ds_nodash']}*.csv"
    l_local_files = glob.glob(s_local_filespath)
    l_files_in_s3 = s3_hook.list_keys(self.dest_bucket_name)

    # Fail if any local file for this execution date is missing from the bucket
    if len(l_local_files) > 0:
        if not all(elem in l_files_in_s3 for elem in l_local_files):
            raise ValueError(f"Not all elements in the path "
                             f"{s_local_filespath} were found in S3"
                             f" {self.dest_bucket_name} bucket")
    self.log.info(f"All {len(l_local_files)} files in local folder are in "
                  f"the {self.dest_bucket_name} bucket")
def execute(self, context):
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    sftp_client = ssh_hook.get_conn().open_sftp()

    with NamedTemporaryFile("w") as f:
        sftp_client.get(self.sftp_path, f.name)

        s3_hook.load_file(
            filename=f.name,
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=True
        )
def hook(self):
    """Returns S3Hook."""
    remote_conn_id = conf.get('logging', 'REMOTE_LOG_CONN_ID')
    try:
        from airflow.providers.amazon.aws.hooks.s3 import S3Hook

        return S3Hook(remote_conn_id, transfer_config_args={"use_threads": False})
    except Exception as e:  # pylint: disable=broad-except
        self.log.exception(
            'Could not create an S3Hook with connection id "%s". '
            'Please make sure that airflow[aws] is installed and '
            'the S3 connection exists. Exception : "%s"',
            remote_conn_id,
            e,
        )
        return None
def test_generate_presigned_url(self, s3_bucket):
    hook = S3Hook()
    presigned_url = hook.generate_presigned_url(
        client_method="get_object", params={'Bucket': s3_bucket, 'Key': "my_key"}
    )

    url = presigned_url.split("?")[1]
    params = {x[0]: x[1] for x in [x.split("=") for x in url[0:].split("&")]}

    assert {"AWSAccessKeyId", "Signature", "Expires"}.issubset(set(params.keys()))
def execute(self, context) -> None:
    postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

    credentials = s3_hook.get_credentials()
    credentials_block = build_credentials_block(credentials)
    unload_options = '\n\t\t\t'.join(self.unload_options)

    s3_key = f"{self.s3_key}/{self.table}_" if self.table_as_file_name else self.s3_key
    select_query = f"SELECT * FROM {self.schema}.{self.table}"
    unload_query = self._build_unload_query(credentials_block, select_query, s3_key, unload_options)

    self.log.info('Executing UNLOAD command...')
    postgres_hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def execute(self, context: 'Context'):
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

    if isinstance(self.data, str):
        s3_hook.load_string(
            self.data,
            self.s3_key,
            self.s3_bucket,
            self.replace,
            self.encrypt,
            self.encoding,
            self.acl_policy,
            self.compression,
        )
    else:
        s3_hook.load_bytes(
            self.data, self.s3_key, self.s3_bucket, self.replace, self.encrypt, self.acl_policy
        )
def upload_data(aws_conn, local_file, file_key, bucket):
    '''
    :param aws_conn: the AWS connection ID
    :param local_file: location of the file to be uploaded on your local computer
    :param file_key: destination within the desired bucket where the file will be uploaded
    :param bucket: name of the bucket where the file will be uploaded
    :return:
    '''
    try:
        s3 = S3Hook(aws_conn)
        logging.info(f"{log_start} uploading data onto S3 bucket")
        s3.load_file(filename=local_file, key=file_key, bucket_name=bucket, replace=True)
        logging.info(f"{log_finish} uploading data onto S3 bucket")
    except Exception as e:
        logging.info(e)
        print("Unable to upload data onto S3. Please review logs")
def execute(self, context):
    hook = S3Hook(self.aws_credentials_id)
    pathlist = list(Path(self.dataset_dir).glob(self.file_glob))
    for path in pathlist:
        bucket_key = str(path)[len(self.dataset_dir) + 1:]
        if hook.check_for_key(key=bucket_key, bucket_name=self.bucket_name):
            self.log.info(
                f"File '{bucket_key}' is already present as s3://{self.bucket_name}/{bucket_key}. Skip upload."
            )
        else:
            self.log.info(
                f"Upload file '{bucket_key}' to s3://{self.bucket_name}/{bucket_key}. This might take a while."
            )
            hook.load_file(filename=str(path), key=bucket_key, bucket_name=self.bucket_name)
def download_dataset(year_month: str):
    url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_{year_month}.csv'
    response = requests.get(url, stream=True)
    response.raise_for_status()

    s3 = S3Hook('aws_connection_id')
    s3_path = f's3://nyc-yellow-taxi-raw-data/yellow_tripdata_{year_month}.csv.gz'
    bucket, key = s3.parse_s3_url(s3_path)

    # delete=False keeps the file on disk after the with-block, so it can be
    # flushed, closed, and then uploaded by name
    with NamedTemporaryFile('w', encoding='utf-8', delete=False) as f:
        for chunk in response.iter_lines():
            f.write('{}\n'.format(chunk.decode('utf-8')))

    s3.load_file(f.name, key, bucket, replace=True, gzip=True)
    return s3_path
def execute(self, context):
    s3_hook = S3Hook(self.aws_conn_id)
    ftp_hook = FTPHook(ftp_conn_id=self.ftp_conn_id)

    with NamedTemporaryFile() as local_tmp_file:
        ftp_hook.retrieve_file(
            remote_full_path=self.ftp_path, local_full_path_or_buffer=local_tmp_file.name
        )

        s3_hook.load_file(
            filename=local_tmp_file.name,
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=self.replace,
            encrypt=self.encrypt,
            gzip=self.gzip,
            acl_policy=self.acl_policy,
        )
def execute(self, context: 'Context') -> None:
    sql_hook = self._get_hook()
    s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    data_df = sql_hook.get_pandas_df(sql=self.query, parameters=self.parameters)
    self.log.info("Data from SQL obtained")

    self._fix_dtypes(data_df)
    file_options = FILE_OPTIONS_MAP[self.file_format]

    with NamedTemporaryFile(mode=file_options.mode, suffix=file_options.suffix) as tmp_file:
        self.log.info("Writing data to temp file")
        getattr(data_df, file_options.function)(tmp_file.name, **self.pd_kwargs)

        self.log.info("Uploading data to S3")
        s3_conn.load_file(
            filename=tmp_file.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=self.replace
        )
def get_list_from_s3(*args, **context):
    """
    Get list of data files from S3 bucket
    """
    updated_prefix = prefix.format(frequency="monthly", symbol=symbol, granularity=granularity)
    filepaths = []
    bucket_obj = S3Hook("aws_default").get_bucket(bucket)
    for obj in bucket_obj.objects.filter(Prefix=updated_prefix):
        path, filename = os.path.split(obj.key)
        if filename.endswith("zip"):
            filepaths.append(obj.key)
    context["ti"].xcom_push(key="file_list", value=filepaths)
    logging.info("Found {:d} files".format(len(filepaths)))
def upload_to_s3(endpoint, date):
    # Instantiate the hook
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)
    print("Created Connection")
    print(s3_hook.get_session())
    print(s3_hook)

    # Base URL
    url = 'https://covidtracking.com/api/v1/states/'
    res = requests.get(url + '{0}/{1}.csv'.format(endpoint, date))

    # Take string, upload to S3 using predefined method
    s3_hook.load_string(res.text,
                        '{0}_{1}.csv'.format(endpoint, date),
                        bucket_name=BUCKET,
                        replace=True)
def execute(self, context: 'Context') -> None:
    sql_hook = self._get_hook()
    s3_conn = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    data_df = sql_hook.get_pandas_df(sql=self.query, parameters=self.parameters)
    self.log.info("Data from SQL obtained")

    self._fix_int_dtypes(data_df)
    file_options = FILE_OPTIONS_MAP[self.file_format]

    with NamedTemporaryFile(mode=file_options.mode, suffix=file_options.suffix) as tmp_file:
        if self.file_format == FILE_FORMAT.CSV:
            data_df.to_csv(tmp_file.name, **self.pd_kwargs)
        else:
            data_df.to_parquet(tmp_file.name, **self.pd_kwargs)

        s3_conn.load_file(
            filename=tmp_file.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=self.replace
        )
def execute_json(self, context, f_get_data):
    hook = S3Hook(self.source_conn.conn_id)
    suffix = self.suffix
    if self.key_name:
        data = f_get_data(
            hook=hook,
            key=self.key_name,
            bucket=self.bucket_name,
        )
        self.upload_data(data=data)
    else:
        data = []
        objects = self._iterate_through_bucket(
            s3hook=hook,
            bucket=self.bucket_name,
            prefix=self.prefix,
        )
        for obj_iter in objects:
            obj = hook.get_key(obj_iter["Key"], self.bucket_name)
            if self.load_data_from and obj.last_modified < self.load_data_from:
                continue
            if self.load_data_until and obj.last_modified >= self.load_data_until:
                continue
            if suffix and not suffix == obj.key[-len(suffix):]:
                continue
            self.log.info("Loading data from file {0}".format(obj.key))
            self._metadata.update({
                "bucket_name": self.bucket_name,
                "file_name": obj.key,
                "file_last_modified": str(obj.last_modified),
            })
            data = f_get_data(
                hook=hook,
                key=obj.key,
                bucket=self.bucket_name,
            )
            self.upload_data(data=data)
def _upload_ratings(s3_conn_id, s3_bucket, **context):
    year = context["execution_date"].year
    month = context["execution_date"].month

    # Fetch ratings from our 'API'.
    ratings = fetch_ratings(year=year, month=month)

    # Write ratings to temp file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = path.join(tmp_dir, "ratings.csv")
        ratings.to_csv(tmp_path, index=False)

        # Upload file to S3.
        hook = S3Hook(s3_conn_id)
        hook.load_file(
            tmp_path,
            key=f"ratings/{year}/{month}.csv",
            bucket_name=s3_bucket,
            replace=True,
        )
def pivot_data(**kwargs):
    # Make connection to Snowflake
    hook = SnowflakeHook(snowflake_conn_id='snowflake')
    conn = hook.get_conn()

    # Define SQL query
    query = 'SELECT DATE, STATE, POSITIVE FROM STATE_DATA;'

    # Read data into pandas dataframe
    df = pd.read_sql(query, conn)

    # Pivot dataframe into new format
    pivot_df = df.pivot(index='DATE', columns='STATE', values='POSITIVE').reset_index()

    # Save dataframe to S3
    s3_hook = S3Hook(aws_conn_id=S3_CONN_ID)
    s3_hook.load_string(pivot_df.to_csv(index=False),
                        '{0}.csv'.format(filename),
                        bucket_name=BUCKET,
                        replace=True)
def process_data(state, date):
    '''Reads data from S3, processes, and saves to new S3 file
    '''
    # Connect to S3
    s3_hook = S3Hook(aws_conn_id=s3_conn_id)

    # Read data
    data = StringIO(
        s3_hook.read_key(key='{0}_{1}.csv'.format(state, date), bucket_name=bucket)
    )
    df = pd.read_csv(data, sep=',')

    # Process data
    processed_data = df[['date', 'state', 'positive', 'negative']]

    # Save processed data to CSV on S3
    s3_hook.load_string(processed_data.to_string(),
                        '{0}_{1}_processed.csv'.format(state, date),
                        bucket_name=bucket,
                        replace=True)
def test_get_wildcard_key(self, s3_bucket):
    hook = S3Hook()
    bucket = hook.get_bucket(s3_bucket)
    bucket.put_object(Key='abc', Body=b'a')
    bucket.put_object(Key='a/b', Body=b'a')

    # The boto3 Class API is _odd_, and we can't do an isinstance check as
    # each instance is a different class, so let's just check one property
    # on S3.Object. Not great but...
    assert hook.get_wildcard_key('a*', s3_bucket).key == 'a/b'
    assert hook.get_wildcard_key('a*', s3_bucket, delimiter='/').key == 'abc'
    assert hook.get_wildcard_key('abc', s3_bucket, delimiter='/').key == 'abc'
    assert hook.get_wildcard_key('s3://{}/a*'.format(s3_bucket)).key == 'a/b'
    assert hook.get_wildcard_key('s3://{}/a*'.format(s3_bucket), delimiter='/').key == 'abc'
    assert hook.get_wildcard_key('s3://{}/abc'.format(s3_bucket), delimiter='/').key == 'abc'

    assert hook.get_wildcard_key('a', s3_bucket) is None
    assert hook.get_wildcard_key('b', s3_bucket) is None
    assert hook.get_wildcard_key('s3://{}/a'.format(s3_bucket)) is None
    assert hook.get_wildcard_key('s3://{}/b'.format(s3_bucket)) is None
def get_files(self, s3_hook: S3Hook, delimiter: Optional[str] = '/') -> List:
    """Gets a list of files in the bucket"""
    prefix = self.bucket_key
    config = {
        'PageSize': None,
        'MaxItems': None,
    }
    if self.wildcard_match:
        prefix = re.split(r'[\[\*\?]', self.bucket_key, 1)[0]

    paginator = s3_hook.get_conn().get_paginator('list_objects_v2')
    response = paginator.paginate(
        Bucket=self.bucket_name, Prefix=prefix, Delimiter=delimiter, PaginationConfig=config
    )

    keys: List = []
    for page in response:
        if 'Contents' in page:
            _temp = [k for k in page['Contents'] if isinstance(k.get('Size', None), (int, float))]
            keys = keys + _temp

    return keys