Example #1
0
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS.
    Requires either airflow[gcloud] or airflow[gcp_api] and
    setting the REMOTE_BASE_LOG_FOLDER and REMOTE_LOG_CONN_ID configuration
    options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcloud] (and set
        use_gcloud = True), otherwise uses airflow[gcp_api]
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.use_gcloud = False

        try:
            from airflow.contrib.hooks import GCSHook
            self.hook = GCSHook(remote_conn_id)
            self.use_gcloud = True
        except:
            try:
                from airflow.contrib.hooks import GoogleCloudStorageHook
                self.hook = GoogleCloudStorageHook(remote_conn_id)
            except:
                self.hook = None
                logging.error(
                    'Could not create a GCSHook with connection id "{}". '
                    'Please make sure that either airflow[gcloud] or '
                    'airflow[gcp_api] is installed and the GCS connection '
                    'exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=True):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                if self.use_gcloud:
                    gcs_blob = self.hook.get_blob(remote_log_location)
                    if gcs_blob:
                        return gcs_blob.download_as_string().decode()
                else:
                    bkt, blob = remote_log_location.lstrip('gs:/').split('/', 1)
                    return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:

            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                if self.use_gcloud:
                    self.hook.upload_from_string(
                        log,
                        blob=remote_log_location,
                        replace=True)
                    return
                else:
                    bkt, blob = remote_log_location.lstrip('gs:/').split('/', 1)
                    from tempfile import NamedTemporaryFile
                    with NamedTemporaryFile(mode='w+') as tmpfile:
                        tmpfile.write(log)
                        self.hook.upload(bkt, blob, tmpfile.name)
                    return
            except:
                pass

        # raise/return error if we get here
        logging.error('Could not write logs to {}'.format(remote_log_location))
Example #2
0
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS.
    Requires either airflow[gcloud] or airflow[gcp_api] and
    setting the REMOTE_BASE_LOG_FOLDER and REMOTE_LOG_CONN_ID configuration
    options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcloud] (and set
        use_gcloud = True), otherwise uses airflow[gcp_api]
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.use_gcloud = False

        try:
            from airflow.contrib.hooks import GCSHook
            self.hook = GCSHook(remote_conn_id)
            self.use_gcloud = True
        except:
            try:
                from airflow.contrib.hooks import GoogleCloudStorageHook
                self.hook = GoogleCloudStorageHook(remote_conn_id)
            except:
                self.hook = None
                logging.error(
                    'Could not create a GCSHook with connection id "{}". '
                    'Please make sure that either airflow[gcloud] or '
                    'airflow[gcp_api] is installed and the GCS connection '
                    'exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=True):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                if self.use_gcloud:
                    gcs_blob = self.hook.get_blob(remote_log_location)
                    if gcs_blob:
                        return gcs_blob.download_as_string().decode()
                else:
                    bkt, blob = self.parse_gcs_url(remote_log_location)
                    return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:

            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                if self.use_gcloud:
                    self.hook.upload_from_string(log,
                                                 blob=remote_log_location,
                                                 replace=True)
                    return
                else:
                    bkt, blob = self.parse_gcs_url(remote_log_location)
                    from tempfile import NamedTemporaryFile
                    with NamedTemporaryFile(mode='w+') as tmpfile:
                        tmpfile.write(log)
                        self.hook.upload(bkt, blob, tmpfile.name)
                    return
            except:
                pass

        # raise/return error if we get here
        logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)
Example #3
0
class GCSLog(object):
    """
    Utility class for reading and writing logs in GCS.
    Requires either airflow[gcloud] or airflow[gcp_api] and
    setting the REMOTE_BASE_LOG_FOLDER and REMOTE_LOG_CONN_ID configuration
    options in airflow.cfg.
    """
    def __init__(self):
        """
        Attempt to create hook with airflow[gcloud] (and set
        use_gcloud = True), otherwise uses airflow[gcp_api]
        """
        remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID')
        self.use_gcloud = False

        try:
            from airflow.contrib.hooks import GCSHook
            self.hook = GCSHook(remote_conn_id)
            self.use_gcloud = True
        except:
            try:
                from airflow.contrib.hooks import GoogleCloudStorageHook
                self.hook = GoogleCloudStorageHook(
                    scope='https://www.googleapis.com/auth/devstorage.read_write',
                    google_cloud_storage_conn_id=remote_conn_id)
            except:
                self.hook = None
                logging.error(
                    'Could not create a GCSHook with connection id "{}". '
                    'Please make sure that either airflow[gcloud] or '
                    'airflow[gcp_api] is installed and the GCS connection '
                    'exists.'.format(remote_conn_id))

    def read(self, remote_log_location, return_error=False):
        """
        Returns the log found at the remote_log_location.

        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param return_error: if True, returns a string error message if an
            error occurs. Otherwise returns '' when an error occurs.
        :type return_error: bool
        """
        if self.hook:
            try:
                if self.use_gcloud:
                    gcs_blob = self.hook.get_blob(remote_log_location)
                    if gcs_blob:
                        return gcs_blob.download_as_string().decode()
                else:
                    bkt, blob = self.parse_gcs_url(remote_log_location)
                    return self.hook.download(bkt, blob).decode()
            except:
                pass

        # raise/return error if we get here
        err = 'Could not read logs from {}'.format(remote_log_location)
        logging.error(err)
        return err if return_error else ''

    def write(self, log, remote_log_location, append=False):
        """
        Writes the log to the remote_log_location. Fails silently if no hook
        was created.

        :param log: the log to write to the remote_log_location
        :type log: string
        :param remote_log_location: the log's location in remote storage
        :type remote_log_location: string (path)
        :param append: if False, any existing log file is overwritten. If True,
            the new log is appended to any existing logs.
        :type append: bool

        """
        if self.hook:

            if append:
                old_log = self.read(remote_log_location)
                log = old_log + '\n' + log

            try:
                if self.use_gcloud:
                    self.hook.upload_from_string(
                        log,
                        blob=remote_log_location,
                        replace=True)
                    return
                else:
                    bkt, blob = self.parse_gcs_url(remote_log_location)
                    from tempfile import NamedTemporaryFile
                    with NamedTemporaryFile(mode='w+') as tmpfile:
                        tmpfile.write(log)
                        # Force the file to be flushed, since we're doing the
                        # upload from within the file context (it hasn't been
                        # closed).
                        tmpfile.flush()
                        self.hook.upload(bkt, blob, tmpfile.name)
                    return
            except:
                pass

        # raise/return error if we get here
        logging.error('Could not write logs to {}'.format(remote_log_location))

    def parse_gcs_url(self, gsurl):
        """
        Given a Google Cloud Storage URL (gs://<bucket>/<blob>), returns a
        tuple containing the corresponding bucket and blob.
        """
        # Python 3
        try:
            from urllib.parse import urlparse
        # Python 2
        except ImportError:
            from urlparse import urlparse

        parsed_url = urlparse(gsurl)
        if not parsed_url.netloc:
            raise AirflowException('Please provide a bucket name')
        else:
            bucket = parsed_url.netloc
            blob = parsed_url.path.strip('/')
            return (bucket, blob)