Example No. 1
 def poke(self, context):
     self.log.info('Sensor checks existence of: %s, %s', self.bucket,
                   self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return hook.exists(self.bucket, self.object)
Example No. 2
 def execute(self, context):
     gcs_hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to
     )
     s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)
     if not gcs_hook.exists(self.gcs_source_bucket, self.gcs_source_uri):
         self.log.error('Object not found: gs://%s/%s', self.gcs_source_bucket, self.gcs_source_uri)
         raise AirflowException('Object not found: gs://%s/%s' % (self.gcs_source_bucket, self.gcs_source_uri))
     tmp = tempfile.NamedTemporaryFile()
     self.log.info('Download gs://%s/%s', self.gcs_source_bucket, self.gcs_source_uri)
     gcs_hook.download(
         bucket=self.gcs_source_bucket,
         object=self.gcs_source_uri,
         filename=tmp.name,
     )
     self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, self.s3_destination_uri)
     s3_hook.load_file(
         filename=tmp.name,
         bucket_name=self.s3_destination_bucket,
         key=self.s3_destination_uri,
         replace=True,
         acl_policy=self.s3_acl_policy
     )
     tmp.close()
Example No. 3
 def poke(self, context):
     self.log.info("Sensor checks existence of : %s, %s", self.bucket, self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to,
     )
     # check the file exists and is not zero bytes
     return (
         hook.exists(self.bucket, self.object)
         and hook.get_size(self.bucket, self.object) > 0
     )
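A poke() like this normally backs a sensor class derived from BaseSensorOperator. A minimal usage sketch, assuming a hypothetical sensor class named GCSNonEmptyObjectSensor built on the poke() above and illustrative bucket/object names:

from datetime import datetime
from airflow import DAG

with DAG('gcs_wait_example', start_date=datetime(2020, 1, 1), schedule_interval='@daily') as dag:
    wait_for_file = GCSNonEmptyObjectSensor(   # hypothetical class wrapping the poke() above
        task_id='wait_for_file',
        bucket='my-bucket',                    # illustrative bucket
        object='exports/2020-01-01/data.csv',  # illustrative object
        google_cloud_conn_id='google_cloud_default',
        poke_interval=60,                      # re-check every minute
        timeout=60 * 60,                       # give up after an hour
    )

The sensor succeeds only once the object both exists and has a non-zero size, so downstream tasks never see a half-written or empty file.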
Example No. 4
 def execute(self, context):
     self.log.info('Checking existence of: %s, %s', self.bucket,
                   self.object_to_check)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     object_exists = hook.exists(self.bucket, self.object_to_check)
     if self.store_to_xcom_key:
         context['ti'].xcom_push(key=self.store_to_xcom_key,
                                 value=object_exists)
     else:
         context['ti'].xcom_push(key='object_exists', value=object_exists)
     self.log.info(object_exists)
Example No. 5
 def execute(self, context):
     self.log.info('Checking existence of: %s, %s', self.bucket,
                   self.object_to_check)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
         delegate_to=self.delegate_to)
     object_exists = hook.exists(self.bucket, self.object_to_check)
     self.log.info('Object exists : %s', object_exists)
     branch = self.not_exist_task
     if object_exists:
         branch = self.exist_task
     # set tasks in the unselected task list to be skipped
     downstream_tasks = context['task'].downstream_list
     self.log.info('Following branch %s', branch)
     self.log.info('Downstream task_ids %s', downstream_tasks)
     skip_tasks = [t for t in downstream_tasks if t.task_id != branch]
     if downstream_tasks:
         self.skip(context['dag_run'], context['ti'].execution_date,
                   skip_tasks)
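This execute() is a branch-on-existence pattern: exist_task and not_exist_task hold downstream task_ids, and every other direct downstream task is skipped. A minimal wiring sketch, assuming a hypothetical operator class named GCSObjectBranchOperator exposing those parameters:

check = GCSObjectBranchOperator(              # hypothetical class built on the execute() above
    task_id='check_object',
    bucket='my-bucket',                       # illustrative names
    object_to_check='exports/2020-01-01/data.csv',
    exist_task='process_file',                # task_id to follow when the object exists
    not_exist_task='send_alert',              # task_id to follow otherwise
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag,
)
# process_file and send_alert are tasks defined elsewhere with matching task_ids
check >> [process_file, send_alert]

Only direct downstream tasks are considered (downstream_list), so both branch targets must be wired immediately after the check task.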
Example No. 6
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        gcs_source_objects = gcs_hook.list(bucket=self.gcs_source_bucket, prefix=self.gcs_source_prefix, maxResults=1000)
        if not gcs_source_objects:
            self.log.warning('SKIP: No objects found matching the prefix "%s"', self.gcs_source_prefix)
            return

        self.log.info('Number of objects to copy: %d', len(gcs_source_objects))

        for gcs_uri in gcs_source_objects:
            if not gcs_hook.exists(self.gcs_source_bucket, gcs_uri):
                if self.fail_on_missing:
                    self.log.error('Execution will fail. Object not found: gs://%s/%s', self.gcs_source_bucket, gcs_uri)
                    self.is_failed = True
                else:
                    self.log.warning('Skipping. Object not found: gs://%s/%s', self.gcs_source_bucket, gcs_uri)
                continue

            # create the temp file only after confirming the source object exists
            tmp = tempfile.NamedTemporaryFile()
            self.log.info('Download gs://%s/%s', self.gcs_source_bucket, gcs_uri)
            gcs_hook.download(
                bucket=self.gcs_source_bucket,
                object=gcs_uri,
                filename=tmp.name
            )
            self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, gcs_uri)
            s3_hook.load_file(
                filename=tmp.name,
                bucket_name=self.s3_destination_bucket,
                key=gcs_uri,
                replace=True,
                acl_policy=self.s3_acl_policy
            )
            tmp.close()

        # raise only after every object has been processed
        if self.is_failed:
            raise AirflowException('Some objects were not found at the source.')
Example No. 7
    def execute(self, context):
        cloud_storage_conn = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )

        condition = cloud_storage_conn.exists(self.bucket, self.filename)
        self.log.info("Condition result is %s", condition)

        if condition:
            self.log.info('Proceeding with downstream tasks...')
            return

        self.log.info('Skipping downstream tasks...')

        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
        self.log.debug("Downstream task_ids %s", downstream_tasks)

        if downstream_tasks:
            self.skip(context['dag_run'], context['ti'].execution_date, downstream_tasks)

        self.log.info("Done.")
Example No. 8
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to
        )
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify)

        for gcs_obj, s3_obj in zip(self.gcs_source_uris, self.s3_destination_uris):
            if not gcs_hook.exists(self.gcs_source_bucket, gcs_obj):
                if self.fail_on_missing:
                    self.log.error('Execution will fail. Object not found: gs://%s/%s', self.gcs_source_bucket, gcs_obj)
                    self.is_failed = True
                else:
                    self.log.warning('Skipping. Object not found: gs://%s/%s', self.gcs_source_bucket, gcs_obj)
                continue

            # create the temp file only after confirming the source object exists
            tmp = tempfile.NamedTemporaryFile()
            self.log.info('Download gs://%s/%s', self.gcs_source_bucket, gcs_obj)
            gcs_hook.download(
                bucket=self.gcs_source_bucket,
                object=gcs_obj,
                filename=tmp.name
            )
            self.log.info('Upload s3://%s/%s', self.s3_destination_bucket, s3_obj)
            s3_hook.load_file(
                filename=tmp.name,
                bucket_name=self.s3_destination_bucket,
                key=s3_obj,
                replace=True,
                acl_policy=self.s3_acl_policy
            )
            tmp.close()

        # raise only after every object has been processed
        if self.is_failed:
            raise AirflowException('Some objects were not found at the source.')
Example No. 9
 def poke(self, context):
     self.log.info('Sensor checks existence of: %s, %s', self.bucket, self.object)
     hook = GoogleCloudStorageHook(
         google_cloud_storage_conn_id=self.google_cloud_conn_id,
         delegate_to=self.delegate_to)
     return hook.exists(self.bucket, self.object)
Example No. 10
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except Exception:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def log_exists(self, remote_log_location):
        """
        Check if remote_log_location exists in remote storage
        :param remote_log_location: log's location in remote storage
        :return: True if location exists else False
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.exists(bkt, blob)
            except Exception:
                pass
        return False

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except Exception:
                pass

        # log the error and return it (or '') if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except Exception:
                # log the failure; write() fails silently by design
                logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
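GCSLog is driven entirely by airflow.cfg: REMOTE_LOG_CONN_ID names the GCS connection and REMOTE_BASE_LOG_FOLDER the gs:// prefix. A minimal sketch of reading and appending a log, assuming remote logging is configured and using an illustrative gs:// path:

gcs_log = GCSLog()  # picks up REMOTE_LOG_CONN_ID from airflow.cfg

location = 'gs://my-log-bucket/dag_id/task_id/2020-01-01/attempt-1.log'  # illustrative path
if gcs_log.log_exists(location):
    print(gcs_log.read(location, return_error=True))

gcs_log.write('task finished', location, append=True)

# parse_gcs_url splits a gs:// URL into (bucket, blob)
assert gcs_log.parse_gcs_url(location) == ('my-log-bucket', 'dag_id/task_id/2020-01-01/attempt-1.log')

Because write() appends by reading the existing log first, each append costs a full download plus a full upload; for large logs, append=False avoids the extra download.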
Example No. 11
def check_gcs_file_exists(file_name, google_cloud_conn_id, bucket):
    hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=google_cloud_conn_id)
    return hook.exists(bucket, file_name)
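Since the helper just returns a boolean, it drops straight into a ShortCircuitOperator, which skips downstream tasks whenever the callable returns False. A minimal sketch, with illustrative bucket and object names and the default GCS connection:

from airflow.operators.python_operator import ShortCircuitOperator

gate = ShortCircuitOperator(
    task_id='gate_on_gcs_file',
    python_callable=check_gcs_file_exists,
    op_kwargs={
        'file_name': 'exports/2020-01-01/data.csv',   # illustrative object name
        'google_cloud_conn_id': 'google_cloud_default',
        'bucket': 'my-bucket',                        # illustrative bucket
    },
    dag=dag,
)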
Example No. 12
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS. Requires
    airflow[gcp_api] and setting the REMOTE_BASE_LOG_FOLDER and
    REMOTE_LOG_CONN_ID configuration options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcp_api].
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.hook = None

        try:
            from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
            self.hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=remote_conn_id)
        except Exception:
            logging.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"{}". Please make sure that airflow[gcp_api] is installed '
                'and the GCS connection exists.'.format(remote_conn_id))

    def log_exists(self, remote_log_location):
        """
        Check if remote_log_location exists in remote storage
        :param remote_log_location: log's location in remote storage
        :return: True if location exists else False
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.exists(bkt, blob)
            except Exception:
                pass
        return False

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                return self.hook.download(bkt, blob).decode()
            except Exception:
                pass

        # return error if needed
        if return_error:
            msg = 'Could not read logs from {}'.format(remote_log_location)
            logging.error(msg)
            return msg

        return ''

    def write(self, log, remote_log_location, append=True):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool
        """
        if self.hook:
            if append:
                old_log = self.read(remote_log_location)
                log = '\n'.join([old_log, log])

            try:
                bkt, blob = self.parse_gcs_url(remote_log_location)
                from tempfile import NamedTemporaryFile
                with NamedTemporaryFile(mode='w+') as tmpfile:
                    tmpfile.write(log)
                    # Force the file to be flushed, since we're doing the
                    # upload from within the file context (it hasn't been
                    # closed).
                    tmpfile.flush()
                    self.hook.upload(bkt, blob, tmpfile.name)
            except Exception:
                logging.error(
                    'Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)