def poke(self, context):
    self.log.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to)
    return hook.exists(self.bucket, self.object)
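# A minimal usage sketch, assuming this poke() belongs to a GCS object sensor
# such as Airflow 1.x's GoogleCloudStorageObjectSensor. The DAG object, task_id,
# bucket, and object path below are hypothetical.
from airflow.contrib.sensors.gcs_sensor import GoogleCloudStorageObjectSensor

wait_for_file = GoogleCloudStorageObjectSensor(
    task_id='wait_for_file',
    bucket='example-bucket',              # hypothetical bucket
    object='data/input.csv',              # hypothetical object path
    google_cloud_conn_id='google_cloud_default',
    poke_interval=60,
    dag=dag,                              # assumes a DAG defined elsewhere
)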
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    if not gcs_hook.exists(self.gcs_source_bucket, self.gcs_source_uri):
        self.log.error('Source object not found: gs://%s/%s',
                       self.gcs_source_bucket, self.gcs_source_uri)
        # AirflowException does not apply %-style formatting to extra
        # arguments, so build the message explicitly.
        raise AirflowException('Source object not found: gs://{}/{}'.format(
            self.gcs_source_bucket, self.gcs_source_uri))

    tmp = tempfile.NamedTemporaryFile()
    self.log.info('Download gs://%s/%s', self.gcs_source_bucket, self.gcs_source_uri)
    gcs_hook.download(
        bucket=self.gcs_source_bucket,
        object=self.gcs_source_uri,
        filename=tmp.name,
    )
    self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, self.s3_destination_uri)
    s3_hook.load_file(
        filename=tmp.name,
        bucket_name=self.s3_destination_bucket,
        key=self.s3_destination_uri,
        replace=True,
        acl_policy=self.s3_acl_policy
    )
    tmp.close()
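# A hedged sketch of how an operator with this execute() might be wired into a
# DAG. The class name GcsToS3Operator and all bucket/key/connection values are
# hypothetical; only the parameter names mirror the attributes referenced above.
copy_object = GcsToS3Operator(
    task_id='copy_gcs_object_to_s3',
    gcs_source_bucket='example-gcs-bucket',
    gcs_source_uri='exports/data.json',
    s3_destination_bucket='example-s3-bucket',
    s3_destination_uri='imports/data.json',
    dest_aws_conn_id='aws_default',
    s3_acl_policy='bucket-owner-full-control',
    dag=dag,  # assumes a DAG defined elsewhere
)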
def poke(self, context): self.log.info("Sensor checks existence of : %s, %s", self.bucket, self.object) hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.google_cloud_conn_id, delegate_to=self.delegate_to, ) # check file is exist and not zero byte size return ( hook.exists(self.bucket, self.object) and hook.get_size(self.bucket, self.object) > 0 )
def execute(self, context):
    self.log.info('Checking existence of: %s, %s', self.bucket, self.object_to_check)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    object_exists = hook.exists(self.bucket, self.object_to_check)
    # Push the result to XCom under the configured key, or a default key.
    if self.store_to_xcom_key:
        context['ti'].xcom_push(key=self.store_to_xcom_key, value=object_exists)
    else:
        context['ti'].xcom_push(key='object_exists', value=object_exists)
    self.log.info('Object exists: %s', object_exists)
def execute(self, context):
    self.log.info('Checking existence of: %s, %s', self.bucket, self.object_to_check)
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    object_exists = hook.exists(self.bucket, self.object_to_check)
    self.log.info('Object exists: %s', object_exists)

    branch = self.exist_task if object_exists else self.not_exist_task

    # Mark every downstream task that is not on the selected branch as skipped.
    downstream_tasks = context['task'].downstream_list
    self.log.info('Following branch %s', branch)
    self.log.info('Downstream task_ids %s', downstream_tasks)
    skip_tasks = [t for t in downstream_tasks if t.task_id != branch]
    if downstream_tasks:
        self.skip(context['dag_run'], context['ti'].execution_date, skip_tasks)
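# A hedged wiring sketch for this existence branch. The operator class name
# GoogleCloudStorageBranchOperator is hypothetical, as are the task names;
# exist_task and not_exist_task must be the task_ids of direct downstream tasks.
check_object = GoogleCloudStorageBranchOperator(
    task_id='check_object',
    bucket='example-bucket',
    object_to_check='data/input.csv',
    exist_task='process_file',
    not_exist_task='notify_missing',
    dag=dag,  # assumes a DAG defined elsewhere
)
# process_file and notify_missing are hypothetical tasks defined elsewhere.
check_object >> [process_file, notify_missing]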
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    gcs_source_objects = gcs_hook.list(
        bucket=self.gcs_source_bucket,
        prefix=self.gcs_source_prefix,
        maxResults=1000)
    if not gcs_source_objects:
        self.log.warning('SKIP: No objects found matching the prefix "%s"',
                         self.gcs_source_prefix)
        return

    self.log.info('Number of objects to copy: %d', len(gcs_source_objects))
    self.is_failed = False
    for gcs_uri in gcs_source_objects:
        if not gcs_hook.exists(self.gcs_source_bucket, gcs_uri):
            if self.fail_on_missing:
                self.log.error('Execution will fail. Object not found: gs://%s/%s',
                               self.gcs_source_bucket, gcs_uri)
                self.is_failed = True
            else:
                self.log.warning('Skipping. Object not found: gs://%s/%s',
                                 self.gcs_source_bucket, gcs_uri)
            # The object cannot be copied either way, so move on to the next one.
            continue

        # Create the temp file only once the object is known to exist.
        tmp = tempfile.NamedTemporaryFile()
        self.log.info('Download gs://%s/%s', self.gcs_source_bucket, gcs_uri)
        gcs_hook.download(
            bucket=self.gcs_source_bucket,
            object=gcs_uri,
            filename=tmp.name
        )
        self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, gcs_uri)
        s3_hook.load_file(
            filename=tmp.name,
            bucket_name=self.s3_destination_bucket,
            key=gcs_uri,
            replace=True,
            acl_policy=self.s3_acl_policy
        )
        tmp.close()

    if self.is_failed:
        raise AirflowException('Some objects were not found at the source.')
def execute(self, context):
    cloud_storage_conn = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )
    condition = cloud_storage_conn.exists(self.bucket, self.filename)
    self.log.info("Condition result is %s", condition)

    if condition:
        self.log.info('Proceeding with downstream tasks...')
        return

    self.log.info('Skipping downstream tasks...')
    downstream_tasks = context['task'].get_flat_relatives(upstream=False)
    self.log.debug("Downstream task_ids %s", downstream_tasks)
    if downstream_tasks:
        self.skip(context['dag_run'], context['ti'].execution_date, downstream_tasks)
    self.log.info("Done.")
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to
    )
    s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

    self.is_failed = False
    # Source and destination URIs are paired by position.
    for i, gcs_obj in enumerate(self.gcs_source_uris):
        s3_obj = self.s3_destination_uris[i]

        if not gcs_hook.exists(self.gcs_source_bucket, gcs_obj):
            if self.fail_on_missing:
                self.log.error('Execution will fail. Object not found: gs://%s/%s',
                               self.gcs_source_bucket, gcs_obj)
                self.is_failed = True
            else:
                self.log.warning('Skipping. Object not found: gs://%s/%s',
                                 self.gcs_source_bucket, gcs_obj)
            # The object cannot be copied either way, so move on to the next one.
            continue

        # Create the temp file only once the object is known to exist.
        tmp = tempfile.NamedTemporaryFile()
        self.log.info('Download gs://%s/%s', self.gcs_source_bucket, gcs_obj)
        gcs_hook.download(
            bucket=self.gcs_source_bucket,
            object=gcs_obj,
            filename=tmp.name
        )
        self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, s3_obj)
        s3_hook.load_file(
            filename=tmp.name,
            bucket_name=self.s3_destination_bucket,
            key=s3_obj,
            replace=True,
            acl_policy=self.s3_acl_policy
        )
        tmp.close()

    if self.is_failed:
        raise AirflowException('Some objects were not found at the source.')
import logging

from airflow import configuration
from airflow.exceptions import AirflowException


class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except Exception:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def log_exists(self, remote_log_location):
        """
        Check if remote_log_location exists in remote storage.

        :param remote_log_location: log's location in remote storage
        :return: True if location exists else False
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.exists(bkt, blob)
            except Exception:
                pass
        return False

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except Exception:
                pass

        # We only get here if the read failed; log it and return the error.
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except Exception:
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')

        bucket = parsed_url.netloc
        blob = parsed_url.path.strip('/')
        return bucket, blob
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook


def check_gcs_file_exists(file_name, google_cloud_conn_id, bucket):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=google_cloud_conn_id)
    return hook.exists(bucket, file_name)
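# Example call, assuming an Airflow GCP connection named 'google_cloud_default'
# exists; the bucket and file name are hypothetical.
if check_gcs_file_exists(
        file_name='data/input.csv',
        google_cloud_conn_id='google_cloud_default',
        bucket='example-bucket'):
    print('Object is present')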
import logging

from airflow import configuration
from airflow.exceptions import AirflowException


class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except Exception:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def log_exists(self, remote_log_location):
        """
        Check if remote_log_location exists in remote storage.

        :param remote_log_location: log's location in remote storage
        :return: True if location exists else False
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.exists(bkt, blob)
            except Exception:
                pass
        return False

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except Exception:
                pass

        # Return an error message only when the caller asked for one.
        if return_error:
            msg = 'Could not read logs from {}'.format(remote_log_location)
            logging.error(msg)
            return msg
        return ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = '\n'.join([old_log, log])
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except Exception:
                logging.error(
                    'Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')

        bucket = parsed_url.netloc
        blob = parsed_url.path.strip('/')
        return bucket, blob
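# A small usage sketch of GCSLog; the gs:// path is hypothetical.
# parse_gcs_url() splits a URL into a (bucket, blob) tuple exactly as
# implemented above, and log_exists()/read() go through the GCS hook.
gcs_log = GCSLog()
bucket, blob = gcs_log.parse_gcs_url('gs://example-bucket/logs/my_dag/my_task/1.log')
assert (bucket, blob) == ('example-bucket', 'logs/my_dag/my_task/1.log')
if gcs_log.log_exists('gs://example-bucket/logs/my_dag/my_task/1.log'):
    print(gcs_log.read('gs://example-bucket/logs/my_dag/my_task/1.log'))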