def execute(self, context):
    # use the super to list all files in a Google Cloud Storage bucket
    files = super(GoogleCloudStorageToS3Operator, self).execute(context)
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key)
        existing_files = s3_hook.list_keys(bucket_name)
        files = set(files) - set(existing_files)

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)
            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)
            s3_hook.load_bytes(file_bytes, key=dest_key, replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def check_for_url(self, s3url):
    """
    Check if the s3url exists.

    :param s3url: S3 url
    :type s3url: str
    :return: bool
    """
    bucket, key = S3Hook.parse_s3_url(s3url)
    s3hook = S3Hook(aws_conn_id=self.aws_conn_id)
    if not s3hook.check_for_bucket(bucket_name=bucket):
        raise AirflowException(
            "The input S3 Bucket {} does not exist ".format(bucket))
    if key and not s3hook.check_for_key(key=key, bucket_name=bucket)\
            and not s3hook.check_for_prefix(
                prefix=key, bucket_name=bucket, delimiter='/'):
        # check if s3 key exists in the case user provides a single file
        # or if s3 prefix exists in the case user provides a prefix for files
        raise AirflowException("The input S3 Key "
                               "or Prefix {} does not exist in the Bucket {}"
                               .format(s3url, bucket))
    return True
def execute(self, context):
    # use the super to list all files in a Google Cloud Storage bucket
    files = super().execute(context)

    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not self.replace:
        # if we are not replacing -> list all files in the S3 bucket
        # and only keep those files which are present in
        # Google Cloud Storage and not in S3
        bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
        # look for the bucket and the prefix to avoid looking into
        # parent directories/keys
        existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
        # in case no files exist, fall back to an empty list to avoid errors
        existing_files = existing_files if existing_files is not None else []
        # remove the prefix from the existing files to allow the match
        existing_files = [file.replace(prefix, '', 1) for file in existing_files]
        files = list(set(files) - set(existing_files))

    if files:
        hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        for file in files:
            file_bytes = hook.download(self.bucket, file)
            dest_key = self.dest_s3_key + file
            self.log.info("Saving file to %s", dest_key)
            s3_hook.load_bytes(file_bytes, key=dest_key, replace=self.replace)

        self.log.info("All done, uploaded %d files to S3", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to S3")

    return files
def check_s3_url(self, s3url):
    """
    Check if an S3 URL exists.

    :param s3url: S3 url
    :type s3url: str
    :rtype: bool
    """
    bucket, key = S3Hook.parse_s3_url(s3url)
    if not self.s3_hook.check_for_bucket(bucket_name=bucket):
        raise AirflowException(
            "The input S3 Bucket {} does not exist ".format(bucket))
    if key and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)\
            and not self.s3_hook.check_for_prefix(
                prefix=key, bucket_name=bucket, delimiter='/'):
        # check if s3 key exists in the case user provides a single file
        # or if s3 prefix exists in the case user provides multiple files in
        # a prefix
        raise AirflowException("The input S3 Key "
                               "or Prefix {} does not exist in the Bucket {}"
                               .format(s3url, bucket))
    return True
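# Hedged usage sketch (not from the source): the same existence checks performed directly on
# an S3Hook, outside of an operator. The 'aws_default' connection id and the URL are
# illustrative assumptions; parse_s3_url, check_for_bucket, check_for_key and
# check_for_prefix are standard S3Hook methods.
def example_check_s3_url(s3url='s3://example-bucket/path/to/object.csv'):
    hook = S3Hook(aws_conn_id='aws_default')
    bucket, key = S3Hook.parse_s3_url(s3url)
    if not hook.check_for_bucket(bucket_name=bucket):
        return False
    # accept either an exact key or a prefix, mirroring the checks above
    return (hook.check_for_key(key=key, bucket_name=bucket)
            or hook.check_for_prefix(prefix=key, bucket_name=bucket, delimiter='/'))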
def __init__(self, aws_conn_id, s3_bucket, s3_key,
             execution_date, cass_cluster, *args, **kwargs):
    super(TargetDBWrite, self).__init__(*args, **kwargs)
    self.aws_conn_id = aws_conn_id
    self.s3_bucket = s3_bucket
    self.s3_key = s3_key
    self.execution_date = execution_date
    self.s3_hook = S3Hook(self.aws_conn_id)
    aws_session = self.s3_hook.get_credentials()
    self.spark = SparkSession.builder.appName(
        's3_to_cassandra').getOrCreate()
    self.sc = self.spark.sparkContext
    hadoop_conf = self.sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3.impl",
                    "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
    # logging.info(f'CREDENTIALS : {aws_session}')
    hadoop_conf.set("fs.s3.awsAccessKeyId", aws_session[0])
    hadoop_conf.set("fs.s3.awsSecretAccessKey", aws_session[1])
    cluster = Cluster(cass_cluster)
    self.session = cluster.connect()
def execute(self, context):
    """
    redshift_conn_id: redshift cluster connection info.
    aws_credentials_id: necessary info needed to make AWS connection
    s3_bucket: source data in S3 bucket that has the files we want to copy from.
    """
    self.log.info('StageToRedshiftOperator not implemented yet')
    hook = S3Hook(self.aws_credentials_id)
    bucket = self.s3bucket
    keys = hook.list_keys(bucket)

    aws_hook = AwsHook(self.aws_credentials_id)
    credentials = aws_hook.get_credentials()
    session = Session(aws_access_key_id=credentials.access_key,
                      aws_secret_access_key=credentials.secret_key)

    keys = os.listdir('/home/workspace/uk-traffic')
    for key in keys:
        session.resource('s3').Bucket(bucket).upload_file(
            '/home/workspace/uk-traffic/' + key, key,
            ExtraArgs={'ACL': 'public-read'})

    keys2 = os.listdir('/home/workspace/uk-accident')
    for key in keys2:
        session.resource('s3').Bucket(bucket).upload_file(
            '/home/workspace/uk-accident/' + key, key,
            ExtraArgs={'ACL': 'public-read'})
def execute(self, context):
    self.log.info('Retrieving credentials')
    s3_hook = S3Hook(self.s3_conn_id)

    # render macros to variables
    rendered_s3_bucket = self.s3_bucket.format(**context)
    rendered_s3_directory = self.s3_directory.format(**context)
    rendered_local_directory = self.local_directory.format(**context)

    # save files to S3
    self.log.info('Saving local directory to S3')
    local_file_list = os.listdir(rendered_local_directory)
    for local_file in local_file_list:
        rendered_s3_key = rendered_s3_directory + local_file
        rendered_local_file = rendered_local_directory + local_file
        self.log.info(rendered_s3_key)
        s3_hook.load_file(filename=rendered_local_file,
                          bucket_name=rendered_s3_bucket,
                          key=rendered_s3_key,
                          replace=self.replace)

    self.log.info('Saved {} local files to bucket {}'.format(
        len(local_file_list), rendered_s3_bucket))
def execute(self, context):
    """
    This function executes the transfer from the email server (via imap) into s3.

    :param context: The context while executing.
    :type context: dict
    """
    self.log.info(
        'Transferring mail attachment %s from mail server via imap to s3 key %s...',
        self.imap_attachment_name, self.s3_key)

    with ImapHook(imap_conn_id=self.imap_conn_id) as imap_hook:
        imap_mail_attachments = imap_hook.retrieve_mail_attachments(
            name=self.imap_attachment_name,
            mail_folder=self.imap_mail_folder,
            check_regex=self.imap_check_regex,
            latest_only=True)

    s3_hook = S3Hook(aws_conn_id=self.s3_conn_id)
    s3_hook.load_bytes(bytes_data=imap_mail_attachments[0][1],
                       key=self.s3_key,
                       replace=self.s3_overwrite)
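# Hedged usage sketch (not from the source): instantiating the IMAP-to-S3 transfer described
# above inside a DAG. The class name and constructor argument names are assumptions inferred
# from the attributes used in execute(); the connection ids, key and attachment name are
# illustrative.
transfer_attachment = ImapAttachmentToS3Operator(
    task_id='transfer_invoice_attachment',
    imap_attachment_name='invoice.csv',
    s3_key='s3://example-bucket/invoices/invoice.csv',  # full URL; load_bytes can parse it
    imap_check_regex=False,
    imap_mail_folder='INBOX',
    imap_conn_id='imap_default',
    s3_conn_id='aws_default',
    s3_overwrite=True,
    dag=dag,  # assumed to be defined elsewhere in the module
)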
def execute(self, context): self.log.info("Going to execute CSV to Json Operator") s3_hook = S3Hook(aws_conn_id=self.aws_conn_id) self.log.info("Downloading S3 File") with NamedTemporaryFile() as csv_file: source_obj = s3_hook.get_key(self.csv_key, self.csv_bucket) with open(csv_file.name, "wb") as opened_csv_file: source_obj.download_fileobj(opened_csv_file) with open(csv_file.name, "r") as opened_csv_file: reader = csv.DictReader(opened_csv_file) json_data = self.python_callable(reader) try: first_row = next(json_data) except StopIteration: self.log.info("Callable didn't return any rows") return False self.log.info("Uploading to S3") rows = itertools.chain([first_row], json_data) with NamedTemporaryFile() as final_file: with open(final_file.name, "w", encoding="utf-8") as opened_final_file: for row in rows: opened_final_file.write( json.dumps(row, ensure_ascii=False)) opened_final_file.write("\n") opened_final_file.flush() s3_hook.load_file( filename=final_file.name, key=self.json_key, bucket_name=self.json_bucket, replace=True, ) self.log.info("Finished executing CSV to JSON Operator") return True
def execute(self, context):
    hook = AutopilotHook(http_conn_id=self.autopilot_conn_id)
    results = []
    if self.ids:
        for id in self.ids:
            id_endpoint = "{}/{}".format(self.autopilot_resource, id)
            if self.contacts:
                results += self.get_all_contacts(hook, id_endpoint, data=self.payload)
            else:
                results += self.get(hook, id_endpoint, data=self.payload)
    elif self.contacts:
        results += self.get_all_contacts(hook, self.autopilot_resource, data=self.payload)
    else:
        results += self.get(hook,
                            self.autopilot_resource,
                            results_field=self.results_field,
                            data=self.payload)

    with NamedTemporaryFile("w") as tmp:
        for result in results:
            tmp.write(json.dumps(result) + '\n')
        tmp.flush()

        dest_s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        dest_s3.load_file(filename=tmp.name,
                          key=self.s3_key,
                          bucket_name=self.s3_bucket,
                          replace=True)
        dest_s3.connection.close()
def __init__(self,
             redshift_conn_id: str = "",
             aws_credentials_id: str = "",
             target_table: str = "",
             s3_bucket: Optional[str] = None,
             s3_key: Optional[str] = None,
             json_path: Optional[str] = None,
             ignore_headers: int = 1,
             delimiter: str = ',',
             *args, **kwargs):
    super(StageToRedshiftOperator, self).__init__(*args, **kwargs)
    self.redshift_conn_id = redshift_conn_id
    self.aws_credentials_id = aws_credentials_id
    self.target_table = target_table
    self.s3_bucket = s3_bucket
    self.s3_key = s3_key
    self.json_path = json_path
    self.ignore_headers = ignore_headers
    self.delimiter = delimiter
    self.s3_hook = S3Hook(aws_conn_id=aws_credentials_id)
def setUp(self):
    hook = SSHHook(ssh_conn_id='ssh_default')
    s3_hook = S3Hook('aws_default')
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.s3_hook = s3_hook
    self.ssh_client = self.hook.get_conn()
    self.sftp_client = self.ssh_client.open_sftp()
    self.dag = dag
    self.s3_bucket = BUCKET
    self.sftp_path = SFTP_PATH
    self.s3_key = S3_KEY
def execute(self, context):
    self.log.info('S3DataExistsOperator')
    s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
    rendered_prefix = self.prefix.format(**context)

    success = s3_hook.check_for_bucket(self.bucket)
    if success:
        self.log.info("Found the bucket: {}".format(self.bucket))
    else:
        self.log.info("Invalid bucket: {}".format(self.bucket))
        raise FileNotFoundError("No S3 bucket named {}".format(
            self.bucket))

    success = s3_hook.check_for_prefix(prefix=rendered_prefix,
                                       delimiter='/',
                                       bucket_name=self.bucket)
    if success:
        self.log.info("Found the prefix: {}".format(rendered_prefix))
    else:
        self.log.info("Invalid prefix: {}".format(rendered_prefix))
        raise FileNotFoundError("No prefix named {}/{} ".format(
            self.bucket, rendered_prefix))
def execute(self, context):
    self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
    credentials = self.s3.get_credentials()
    unload_options = '\n\t\t\t'.join(self.unload_options)

    self.log.info("Retrieving headers from %s.%s...", self.schema, self.table)

    columns_query = """SELECT column_name
                        FROM information_schema.columns
                        WHERE table_schema = '{0}'
                        AND table_name = '{1}'
                        ORDER BY ordinal_position
                    """.format(self.schema, self.table)

    cursor = self.hook.get_conn().cursor()
    cursor.execute(columns_query)
    rows = cursor.fetchall()
    columns = [row[0] for row in rows]
    column_names = ', '.join("\\'{0}\\'".format(c) for c in columns)
    column_castings = ', '.join("CAST({0} AS text) AS {0}".format(c)
                                for c in columns)

    unload_query = """
                    UNLOAD ('SELECT {0}
                    UNION ALL
                    SELECT {1} FROM {2}.{3}
                    ORDER BY 1 DESC')
                    TO 's3://{4}/{5}/{3}_'
                    with credentials
                    'aws_access_key_id={6};aws_secret_access_key={7}'
                    {8};
                    """.format(column_names, column_castings, self.schema,
                               self.table, self.s3_bucket, self.s3_key,
                               credentials.access_key, credentials.secret_key,
                               unload_options)

    self.log.info('Executing UNLOAD command...')
    self.hook.run(unload_query, self.autocommit)
    self.log.info("UNLOAD command complete...")
def copy_events_from_s3_to_redshift(*args, **kwargs):
    table = kwargs['params']['table']
    hook = S3Hook(aws_conn_id='aws_credentials')
    redshift_hook = PostgresHook('redshift')

    # get Variables
    log_data = Variable.get('LOG_DATA')
    arn_iam_role = Variable.get('iam_role')
    region = Variable.get('region')
    log_jsonpath = Variable.get('LOG_JSONPATH')

    logging.info(f"Copying from s3 {log_data} to redshift table {table}")

    # format the COPY_SQL string
    sql_stmt = create_tables.COPY_SQL.format(
        table,
        log_data,
        arn_iam_role,
        region,
        log_jsonpath
    )
    logging.info(f"COPY SQL statement is: {sql_stmt}")
    redshift_hook.run(sql_stmt)
def build_copy(self):
    a_key, s_key = S3Hook(s3_conn_id=self.s3_conn_id).get_credentials()

    snowflake_destination = ''
    if self.database:
        snowflake_destination += '{}.'.format(self.database)
    if self.schema:
        snowflake_destination += '{}.'.format(self.schema)
    snowflake_destination += self.table

    fmt_str = {
        'snowflake_destination': snowflake_destination,
        's3_bucket': self.s3_bucket,
        's3_key': self.s3_key,
        'aws_access_key_id': a_key,
        'aws_secret_access_key': s_key,
        'file_format_name': self.file_format_name
    }

    return self.copy.format(**fmt_str)
def __init__(
    self,
    s3_bucket,
    s3_key,
    schema,
    table,
    sql=None,
    druid_ingest_spec=None,
    unload_options=tuple(),
    include_header=False,
    autocommit=False,
    aws_conn_id="aws_default",
    redshift_conn_id="postgres_default",
    druid_conn_id="druid_ingest_default",
    *args,
    **kwargs,
):
    super().__init__(*args, **kwargs)
    self.aws_conn_id = aws_conn_id
    self.redshift_conn_id = redshift_conn_id
    self.druid_conn_id = druid_conn_id
    self.s3_bucket = s3_bucket
    self.s3_key = s3_key
    self.schema = schema
    self.table = table
    self.sql = sql
    self.druid_ingest_spec = druid_ingest_spec
    self.unload_options = unload_options
    self.autocommit = autocommit
    self.include_header = include_header
    self.pg_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)

    if self.include_header and "PARALLEL OFF" not in [
        uo.upper().strip() for uo in unload_options
    ]:
        self.unload_options = list(unload_options) + ["PARALLEL OFF"]
def execute(self, context):
    facebook_hook = FacebookAdsHook(
        access_token=self.access_token, facebook_ads_conn_id=self.facebook_conn_id
    )
    s3_hook = S3Hook(self.aws_conn_id)

    self.log.info("Fetch API since: %s", str(self.since))
    self.log.info("Fetch API until: %s", str(self.until))
    self.log.info("Breakdowns: %s", str(self.breakdowns))
    self.log.info("Fields: %s", str(self.insight_fields))

    time_range = {"since": self.since, "until": self.until}
    file_name = "/tmp/{key}.jsonl".format(key=uuid.uuid4().hex)

    with open(file_name, "w") as insight_file:
        for account_id in self.account_ids:
            insights = facebook_hook.insights(
                account_id,
                self.insight_fields,
                self.breakdowns,
                time_range,
                self.time_increment,
                self.level,
                self.limit,
            )
            if len(insights) > 0:
                for insight in insights:
                    insight_file.write(json.dumps(insight) + "\n")

    s3_hook.load_file(
        filename=file_name,
        key=self.s3_key,
        bucket_name=self.s3_bucket,
        replace=True,
    )
    os.remove(file_name)
def execute(self, context): """ Executes the operator logic :param context: """ self.log.info('StagepayToRedshiftOperator execute') redshift= PostgresHook(postgres_conn_id=self.redshift_conn_id) #redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.s3 = S3Hook(self.aws_credentials_id) #self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=False) credentials = self.s3.get_credentials() #aws_hook = AwsHook(self.aws_credentials_id) #credentials = aws_hook.get_credentials() #redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) #copy_options = '\n\t\t\t'.join(self.copy_options) self.log.info("Clearing data from destination Redshift table") redshift.run("DELETE FROM {}".format(self.table)) self.log.info("Copying data from S3 to Redshift") rendered_key = self.s3_key.format(**context) s3_path = "s3://{}/{}".format(self.s3_bucket, rendered_key) self.log.info('StagepayToRedshiftOperatorr s3_path: ' + s3_path) #formatted_sql = StageToRedshiftOperator.copy_sql.format( formatted_sql = StagepayToRedshiftOperator.copy_query.format( self.table, s3_path, credentials.access_key, credentials.secret_key, #self.delimiter, self.format_as_json #copy_options=copy_options ) #redshift.run(copy_query, self.autocommit) # redshift.run(formatted_sql) redshift.run(formatted_sql,self.autocommit)
def monitor_S3_key(**context):
    """
    S3 monitor will log metrics for the target key, collecting the following metrics:
        - size (MB)
        - content type (MIME type)
        - last modified timestamp
        - metadata associated with the key
        - parts count
        - storage class
    """
    s3_hook = S3Hook(aws_conn_id=AWS_CONN_ID)
    target_path = context["target_s3_path"]
    basename = context["path_basename"]
    log_metric("target file", target_path)

    boto3_key_object = s3_hook.get_key(key=target_path)
    key_metrics = {
        "{}-size(MB)".format(basename): (boto3_key_object.content_length / MB),
        "{}-content_type".format(basename): boto3_key_object.content_type,
        "{}-last_modified".format(basename): boto3_key_object.last_modified.__str__(),
        "{}-metadata".format(basename): boto3_key_object.metadata,
        "{}-parts_count".format(basename): boto3_key_object.parts_count,
    }
    key_metrics["{}-storage_class".format(basename)] = (
        boto3_key_object.storage_class
        if boto3_key_object.storage_class
        else "s3 standard")

    for metric_name, value in key_metrics.items():
        log_metric(metric_name, value)

    context["ti"].xcom_push("{}_key_metrics".format(basename), key_metrics)
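# Hedged wiring sketch (not from the source): one way monitor_S3_key might be attached to a
# DAG as a PythonOperator. The import path is the Airflow 1.10-style one; the dag object,
# task id and op_kwargs values are illustrative assumptions, and AWS_CONN_ID, MB and
# log_metric are expected to exist in the surrounding module.
from airflow.operators.python_operator import PythonOperator  # assumed Airflow 1.10 layout

monitor_task = PythonOperator(
    task_id='monitor_s3_key',
    python_callable=monitor_S3_key,
    provide_context=True,  # merges op_kwargs into the context the callable receives
    op_kwargs={
        'target_s3_path': 's3://example-bucket/data/example.csv',  # illustrative path
        'path_basename': 'example.csv',
    },
    dag=dag,  # assumed to be defined elsewhere in the module
)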
def upload(**kwargs):
    """
    Function to upload all of the output files from Kneaddata and Humann2
    and their temporary files
    """
    s3 = S3Hook()
    files = os.listdir(os.path.abspath('output'))
    file_base = kwargs['ti'].xcom_pull(task_ids="parse_filename")
    [
        s3.load_file(os.path.join(os.path.abspath('output'), file_name),
                     os.path.join('output', file_name),
                     bucket_name='airflow-project',
                     replace=True)
        for file_name in files
        if not os.path.isdir(os.path.join(os.path.abspath('output'), file_name))
    ]
    [
        s3.load_file(os.path.join(os.path.abspath('output'),
                                  file_base + '_kneaddata_paired_humann2_temp',
                                  file_name),
                     os.path.join('output', file_name),
                     bucket_name='airflow-project',
                     replace=True)
        for file_name in os.listdir(
            os.path.join(os.path.abspath('output'),
                         file_base + '_kneaddata_paired_humann2_temp'))
    ]
def test_execute(self, mock_hook, mock_hook2):
    mock_hook.return_value.list.return_value = MOCK_FILES
    mock_hook.return_value.download.return_value = b"testing"
    mock_hook2.return_value.list.return_value = MOCK_FILES

    operator = GoogleCloudStorageToS3Operator(task_id=TASK_ID,
                                              bucket=GCS_BUCKET,
                                              prefix=PREFIX,
                                              delimiter=DELIMITER,
                                              dest_aws_conn_id=None,
                                              dest_s3_key=S3_BUCKET)
    # create dest bucket
    hook = S3Hook(aws_conn_id=None)
    b = hook.get_bucket('bucket')
    b.create()
    b.put_object(Key=MOCK_FILES[0], Body=b'testing')

    # we expect MOCK_FILES[1:] to be uploaded
    # and all MOCK_FILES to be present at the S3 bucket
    uploaded_files = operator.execute(None)
    self.assertEqual(sorted(MOCK_FILES[1:]),
                     sorted(uploaded_files))
    self.assertEqual(sorted(MOCK_FILES),
                     sorted(hook.list_keys('bucket', delimiter='/')))
def execute(self, context):
    postgres_hook = PostgresHook(postgres_conn_id=self._postgres_conn_id)
    s3_hook = S3Hook(aws_conn_id=self._s3_conn_id)

    with postgres_hook.get_cursor() as cursor:
        cursor.execute(self._query)
        results = cursor.fetchall()
        headers = [_[0] for _ in cursor.description]

    data_buffer = io.StringIO()
    csv_writer = csv.writer(data_buffer,
                            quoting=csv.QUOTE_ALL,
                            lineterminator=os.linesep)
    csv_writer.writerow(headers)
    csv_writer.writerows(results)
    data_buffer_binary = io.BytesIO(data_buffer.getvalue().encode())

    s3_hook.load_file_obj(
        file_obj=data_buffer_binary,
        bucket_name=self._s3_bucket,
        key=self._s3_key,
        replace=True,
    )
def _setup_dest_conn(self, dest_conn_id, results_bucket_name, results_dest_name):
    """
    Setup results connection. Retrieves s3 connection and makes sure
    we've got location details (bucket, filename)

    :param dest_conn_id:
    :param results_bucket_name:
    :param results_dest_name:
    """
    conn = BaseHook._get_connection_from_env(dest_conn_id)
    if conn.conn_type != 's3':
        raise AttributeError(
            "Only s3 is allowed as a results destination, not {0}".format(
                conn.conn_type))

    self.dest_conn = S3Hook(aws_conn_id=dest_conn_id)
    self.dest_conn_id = dest_conn_id

    if results_bucket_name is None or results_dest_name is None:
        raise AttributeError(
            "Specify bucket name and key name to store results")

    self.results_bucket_name = results_bucket_name
    self.results_dest_name = results_dest_name
def export_variable():
    session = settings.Session()
    s3_hook = S3Hook()
    s3_client = s3_hook.get_conn()
    query = session.query(Variable)
    allrows = query.all()
    k = ["key", "val", "is_encrypted", "description"]
    if len(allrows) > 0:
        outfileStr = ""
        f = StringIO(outfileStr)
        w = csv.DictWriter(f, k)
        for y in allrows:
            w.writerow({
                k[0]: y.key,
                k[1]: y.get_val(),
                k[2]: y.is_encrypted,
                k[3]: None
            })
        outkey = S3_KEY + 'variable.csv'
        s3_client.put_object(Bucket=S3_BUCKET, Key=outkey, Body=f.getvalue())
    session.close()
    return "OK"
def _fetch_file_names(self):
    self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
    self.bucket = self.s3_hook.get_bucket(self.s3_bucket)
    if not self.bucket:
        raise AirflowException("Bucket Does Not Exist")

    s3_keys = self.s3_hook.list_keys(bucket_name=self.s3_bucket, prefix="m&")
    if s3_keys is not None and len(s3_keys) > 0:
        self.s3_path = s3_keys[random.randint(0, len(s3_keys) - 1)]
        key_breaks = self.s3_path.split(".")
        index_files = key_breaks[0].split("&")
        table_name = index_files[1]
        primary_key = index_files[2]
        self.src_table = table_name + "_staging"
        self.dest_table = table_name
        self.src_keys = [primary_key]
        self.dest_keys = [primary_key]
        return True
    return False
def listS3BucketKeys():
    # set necessary Airflow Variables and store them in metastore DB
    Variable.set("s3_bucket", "udacity-dend")
    Variable.set("s3_prefix", "data-pipelines")

    # instantiate S3Hook Class
    # Airflow's S3Hook Docs: https://bit.ly/2B2tHN7
    sampleHook = S3Hook(aws_conn_id='aws_credentials')

    # retrieve Variable values from metastore
    s3_bucket = Variable.get("s3_bucket")
    s3_prefix = Variable.get("s3_prefix")

    # print message
    logging.info(f'Listing Keys from S3 Bucket: {s3_bucket}/{s3_prefix}')

    # use S3Hook's "list_keys()" method to return a List Object of bucket keys
    s3KeyList = sampleHook.list_keys(s3_bucket, prefix=s3_prefix)

    # iterate on "keys" object and print each item
    for key in s3KeyList:
        logging.info(f"- S3://{s3_bucket}/{key}")
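# Hedged wiring sketch (not from the source): running listS3BucketKeys from a DAG. The task
# id and dag object are illustrative assumptions; the 'aws_credentials' connection is the one
# the function above already expects.
from airflow.operators.python_operator import PythonOperator  # assumed Airflow 1.10 layout

list_keys_task = PythonOperator(
    task_id='list_s3_bucket_keys',
    python_callable=listS3BucketKeys,
    dag=dag,  # assumed to be defined elsewhere in the module
)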
def upload_to_s3(**kwargs):
    """
    Generates a CSV that is then uploaded to S3 using the S3Hook.

    This is meant to imitate the first step of a traditional ETL DAG:
    ingesting data from some external source. This shows how this can be
    done with an arbitrary python script.
    """
    df = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)),
                      columns=['col_a', 'col_b', 'col_c', 'col_d'])
    df.to_csv('test_data.csv', index=False)

    hook = S3Hook(aws_conn_id='astronomer-s3')

    hook.load_file(bucket_name='astronomer-workflows-dev',
                   key='test_data.csv',
                   filename='test_data.csv',
                   replace=True)
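# Hedged wiring sketch (not from the source): registering upload_to_s3 as the first task of a
# DAG, matching the "first step of a traditional ETL DAG" described in the docstring above.
# The dag_id, schedule and start date are illustrative assumptions.
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator  # assumed Airflow 1.10 layout

with DAG(dag_id='example_s3_ingest',
         start_date=datetime(2020, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:
    upload_task = PythonOperator(
        task_id='upload_to_s3',
        python_callable=upload_to_s3,
        provide_context=True,  # the callable accepts **kwargs
    )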
def execute(self, context):
    hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
    credentials = s3.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    table = f'{self.schema}.{self.table}' if self.schema is not None else self.table

    copy_query = """
        COPY {table}
        FROM 's3://{s3_bucket}/{s3_key}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(table=table,
               s3_bucket=self.s3_bucket,
               s3_key=self.s3_key,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               copy_options=copy_options)

    self.log.info('Executing COPY command...')
    hook.run(copy_query, self.autocommit)
    self.log.info("COPY command complete...")
def gather_posts_html(**kwargs):
    print('About to gather post index html')
    http_hook = HttpHook(method='GET', http_conn_id=http_local_posts_conn_id)
    res = http_hook.run(post_index_endpoint, headers=headers)
    print('Finished gathering post index html')

    # with the response, now we insert into the bucket
    execution_time = dt.datetime.fromisoformat(kwargs['ts'])
    print(type(execution_time))
    print(execution_time)
    formatted_execution_time = execution_time.strftime('%Y%m%d-%H%M%S')
    key = f"indexes/{formatted_execution_time}-posts.html"

    with tempfile.NamedTemporaryFile() as temp:
        print(temp)
        temp.write(res.content)
        print(f"Writing {temp.name} to html to s3 with key {key}")
        temp.seek(0)
        print(res.content)
        s3_hook = S3Hook(aws_conn_id='s3_posts_html')
        s3_hook.load_file(temp.name, key, bucket_name=posts_bucket_name)
        print('Finished writing html to s3')
def execute(self, context):
    self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
    self.s3 = S3Hook(aws_conn_id=self.aws_conn_id)
    credentials = self.s3.get_credentials()
    copy_options = '\n\t\t\t'.join(self.copy_options)

    copy_query = """
        COPY {table}
        FROM 's3://{s3_bucket}/{s3_key}/{table}'
        with credentials
        'aws_access_key_id={access_key};aws_secret_access_key={secret_key}'
        {copy_options};
    """.format(table=self.table,
               s3_bucket=self.s3_bucket,
               s3_key=self.s3_key,
               access_key=credentials.access_key,
               secret_key=credentials.secret_key,
               copy_options=copy_options)

    self.log.info('Executing COPY command...')
    self.hook.run(copy_query)
    self.log.info("COPY command complete...")
def execute(self, context):
    self.hook = JdbcHook(jdbc_conn_id=self.snowflake_conn_id)
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)

    sql = self.pre_sql
    if self.drop_and_create:
        sql += self._build_pre_sql()

    s3_bucket, s3_key = self.s3.parse_s3_url(self.data_s3_key)
    if s3_bucket != S3_BUCKET:
        raise ValueError(
            'For Snowflake loads the S3 bucket must be {}. Got: {}'.format(
                S3_BUCKET, s3_bucket))

    copy_sql = """
        COPY INTO {table}
        FROM @airflow.{stage}/{s3_key};
    """.format(
        table=self.table,
        stage=self.stage,
        s3_key=s3_key,
    )
    sql.append(copy_sql)

    self.hook.run(['BEGIN;'] + sql + ['COMMIT;'])
def upload_files_to_s3(s3_conn_id: str,
                       s3_bucket: str,
                       max_connections: int = 10,
                       **context) -> str:
    results: List[str] = []
    result_map: Union[Iterator, List] = []
    templates_dict: Dict[str, str] = context.get("templates_dict", {})
    filepaths: str = templates_dict.get("filepaths", "").strip()

    def upload_file(filepath_and_hook: Tuple[str, S3Hook], bucket: str = s3_bucket):
        return _upload_file(filepath_and_hook, bucket)

    if filepaths:
        log.info(f"Connecting to s3 connection: {s3_conn_id}")
        hook = S3Hook(s3_conn_id, verify=False)
        filepath_list = filepaths.split(",")
        paths = [(os.path.abspath(fp), hook) for fp in filepath_list]
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_connections) as executor:
            result_map = executor.map(upload_file, paths)
        results = [r for r in result_map if r is not None]

    return ",".join(results)
def test_parse_s3_url(self):
    parsed = S3Hook.parse_s3_url(self.s3_test_url)
    self.assertEqual(parsed,
                     ("test", "this/is/not/a-real-key.txt"),
                     "Incorrect parsing of the s3 url")
def __init__(self, conn_id):
    S3Hook.__init__(self, s3_conn_id=conn_id)