    # Sensor poke: returns True once any object matching the prefix exists in the bucket.
    def poke(self, context):
        self.log.info('Sensor checks existence of objects: %s, %s',
                      self.bucket, self.prefix)
        hook = GCSHook(google_cloud_storage_conn_id=self.google_cloud_conn_id,
                       delegate_to=self.delegate_to)
        self._matches = hook.list(self.bucket, prefix=self.prefix)
        return bool(self._matches)
Example #2
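    # Copies objects from a GCS bucket to Google Drive, expanding a single '*' wildcard in source_object when present.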
    def execute(self, context):

        self.gcs_hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                                delegate_to=self.delegate_to)
        self.gdrive_hook = GoogleDriveHook(gcp_conn_id=self.gcp_conn_id,
                                           delegate_to=self.delegate_to)

        if WILDCARD in self.source_object:
            total_wildcards = self.source_object.count(WILDCARD)
            if total_wildcards > 1:
                error_msg = (
                    "Only one wildcard '*' is allowed in source_object parameter. "
                    "Found {} in {}.".format(total_wildcards,
                                             self.source_object))

                raise AirflowException(error_msg)

            prefix, delimiter = self.source_object.split(WILDCARD, 1)
            objects = self.gcs_hook.list(self.source_bucket,
                                         prefix=prefix,
                                         delimiter=delimiter)

            for source_object in objects:
                if self.destination_object is None:
                    destination_object = source_object
                else:
                    destination_object = source_object.replace(
                        prefix, self.destination_object, 1)

                self._copy_single_object(source_object=source_object,
                                         destination_object=destination_object)
        else:
            self._copy_single_object(
                source_object=self.source_object,
                destination_object=self.destination_object)
Example #3
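    # Copies objects from a GCS bucket to an SFTP server, expanding a single '*' wildcard in source_object when present.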
    def execute(self, context):
        gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)

        sftp_hook = SFTPHook(self.sftp_conn_id)

        if WILDCARD in self.source_object:
            total_wildcards = self.source_object.count(WILDCARD)
            if total_wildcards > 1:
                raise AirflowException(
                    "Only one wildcard '*' is allowed in source_object parameter. "
                    "Found {} in {}.".format(total_wildcards,
                                             self.source_object))

            prefix, delimiter = self.source_object.split(WILDCARD, 1)
            objects = gcs_hook.list(self.source_bucket,
                                    prefix=prefix,
                                    delimiter=delimiter)

            for source_object in objects:
                destination_path = os.path.join(self.destination_path,
                                                source_object)
                self._copy_single_object(gcs_hook, sftp_hook, source_object,
                                         destination_path)

            self.log.info("Done. Uploaded '%d' files to %s", len(objects),
                          self.destination_path)
        else:
            destination_path = os.path.join(self.destination_path,
                                            self.source_object)
            self._copy_single_object(gcs_hook, sftp_hook, self.source_object,
                                     destination_path)
            self.log.info("Done. Uploaded '%s' file to %s", self.source_object,
                          destination_path)
Example #4
    # Sensor poke: returns True once the object was updated after the timestamp supplied by ts_func.
    def poke(self, context):
        self.log.info('Sensor checks existence of: %s, %s', self.bucket,
                      self.object)
        hook = GCSHook(google_cloud_storage_conn_id=self.google_cloud_conn_id,
                       delegate_to=self.delegate_to)
        return hook.is_updated_after(self.bucket, self.object,
                                     self.ts_func(context))
Example #5
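    # Copies files from an SFTP path to GCS, expanding a single '*' wildcard in source_path when present.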
    def execute(self, context):
        gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)

        sftp_hook = SFTPHook(self.sftp_conn_id)

        if WILDCARD in self.source_path:
            total_wildcards = self.source_path.count(WILDCARD)
            if total_wildcards > 1:
                raise AirflowException(
                    "Only one wildcard '*' is allowed in source_path parameter. "
                    "Found {} in {}.".format(total_wildcards,
                                             self.source_path))

            prefix, delimiter = self.source_path.split(WILDCARD, 1)
            base_path = os.path.dirname(prefix)

            files, _, _ = sftp_hook.get_tree_map(base_path,
                                                 prefix=prefix,
                                                 delimiter=delimiter)

            for file in files:
                destination_path = file.replace(base_path,
                                                self.destination_path, 1)
                self._copy_single_object(gcs_hook, sftp_hook, file,
                                         destination_path)

        else:
            destination_object = (self.destination_path
                                  if self.destination_path else
                                  self.source_path.rsplit("/", 1)[1])
            self._copy_single_object(gcs_hook, sftp_hook, self.source_path,
                                     destination_object)
Example #6
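    # Adds an ACL entry (entity and role) to an existing object in the bucket.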
    def execute(self, context):
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id)
        hook.insert_object_acl(bucket_name=self.bucket,
                               object_name=self.object_name,
                               entity=self.entity,
                               role=self.role,
                               generation=self.generation,
                               user_project=self.user_project)
Example #7
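    # Uploads the prepared temporary JSON files to the target bucket, one object per file handle.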
    def _upload_to_gcs(self, files_to_upload: Dict[str, Any]):
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)
        for obj, tmp_file_handle in files_to_upload.items():
            hook.upload(bucket_name=self.bucket,
                        object_name=obj,
                        filename=tmp_file_handle.name,
                        mime_type='application/json',
                        gzip=self.gzip)
Example #8
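    # Creates a new GCS bucket with the configured storage class, location, project and labels.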
    def execute(self, context):
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)

        hook.create_bucket(bucket_name=self.bucket_name,
                           resource=self.resource,
                           storage_class=self.storage_class,
                           location=self.location,
                           project_id=self.project_id,
                           labels=self.labels)
Example #9
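    # Downloads prediction.summary.json from the gs:// prediction_path and passes the parsed summary to validate_fn.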
    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(
                "Wrong format prediction_path: {}".format(prediction_path))
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GCSHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)
Example #10
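    # Synchronizes the source bucket (or object) with the destination bucket, honoring the recursive, overwrite and delete-extra options.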
    def execute(self, context):
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)
        hook.sync(source_bucket=self.source_bucket,
                  destination_bucket=self.destination_bucket,
                  source_object=self.source_object,
                  destination_object=self.destination_object,
                  recursive=self.recursive,
                  delete_extra_files=self.delete_extra_files,
                  allow_overwrite=self.allow_overwrite)
Example #11
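    # Returns the list of objects in the bucket that match the given prefix and delimiter.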
    def execute(self, context):

        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)

        self.log.info(
            'Getting list of the files. Bucket: %s; Delimiter: %s; Prefix: %s',
            self.bucket, self.delimiter, self.prefix)

        return hook.list(bucket_name=self.bucket,
                         prefix=self.prefix,
                         delimiter=self.delimiter)
Example #12
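    # Builds the GCS hook used for remote task logging, based on the connection id configured under [logging].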
    def hook(self):
        """
        Returns GCS hook.
        """
        remote_conn_id = conf.get('logging', 'REMOTE_LOG_CONN_ID')
        try:
            from airflow.gcp.hooks.gcs import GCSHook
            return GCSHook(google_cloud_storage_conn_id=remote_conn_id)
        except Exception as e:  # pylint: disable=broad-except
            self.log.error(
                'Could not create a GoogleCloudStorageHook with connection id '
                '"%s". %s\n\nPlease make sure that airflow[gcp] is installed '
                'and the GCS connection exists.', remote_conn_id, str(e))
Example #13
    def _upload_to_gcs(self, files_to_upload):
        """
        Upload all of the file splits (and optionally the schema .json file) to
        Google Cloud Storage.
        """
        hook = GCSHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)
        for tmp_file in files_to_upload:
            hook.upload(self.bucket, tmp_file.get('file_name'),
                        tmp_file.get('file_handle').name,
                        mime_type=tmp_file.get('file_mime_type'),
                        gzip=self.gzip if tmp_file.get('file_name') == self.schema_filename else False)
Example #14
    def execute(self, context):
        """
        Uploads the file to Google Cloud Storage
        """
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)

        hook.upload(
            bucket_name=self.bucket,
            object_name=self.dst,
            mime_type=self.mime_type,
            filename=self.src,
            gzip=self.gzip,
        )
Example #15
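    # Deletes the given objects (or every object under the prefix) from the bucket.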
    def execute(self, context):
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)

        if self.objects:
            objects = self.objects
        else:
            objects = hook.list(bucket_name=self.bucket_name,
                                prefix=self.prefix)

        self.log.info("Deleting %s objects from %s", len(objects),
                      self.bucket_name)
        for object_name in objects:
            hook.delete(bucket_name=self.bucket_name, object_name=object_name)
Example #16
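    # Synthesizes speech with Cloud Text-to-Speech and uploads the resulting audio file to GCS.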
    def execute(self, context):
        hook = CloudTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)
        result = hook.synthesize_speech(
            input_data=self.input_data,
            voice=self.voice,
            audio_config=self.audio_config,
            retry=self.retry,
            timeout=self.timeout,
        )
        with NamedTemporaryFile() as temp_file:
            temp_file.write(result.audio_content)
            cloud_storage_hook = GCSHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
            cloud_storage_hook.upload(bucket_name=self.target_bucket_name,
                                      object_name=self.target_filename,
                                      filename=temp_file.name)
Example #17
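    # Downloads the latest Display & Video 360 report file from its Cloud Storage URL and uploads it to the target bucket as CSV.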
    def execute(self, context: Dict):
        hook = GoogleDisplayVideo360Hook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            api_version=self.api_version,
        )
        gcs_hook = GCSHook(
            google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to
        )

        resource = hook.get_query(query_id=self.report_id)
        # Check if report is ready
        if resource["metadata"]["running"]:
            raise AirflowException('Report {} is still running'.format(self.report_id))

        # If no custom report_name provided, use DV360 name
        file_url = resource["metadata"]["googleCloudStoragePathForLatestReport"]
        report_name = self.report_name or urlparse(file_url).path.split('/')[2]
        report_name = self._resolve_file_name(report_name)

        # Download the report
        self.log.info("Starting downloading report %s", self.report_id)
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            with urllib.request.urlopen(file_url) as response:
                shutil.copyfileobj(response, temp_file, length=self.chunk_size)

            temp_file.flush()
            # Upload the local file to bucket
            gcs_hook.upload(
                bucket_name=self.bucket_name,
                object_name=report_name,
                gzip=self.gzip,
                filename=temp_file.name,
                mime_type="text/csv",
            )
        self.log.info(
            "Report %s was saved in bucket %s as %s.",
            self.report_id,
            self.bucket_name,
            report_name,
        )
        self.xcom_push(context, key='report_name', value=report_name)
Example #18
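    # Downloads a GCS object either into XCom (when it fits under MAX_XCOM_SIZE) or to a local file.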
    def execute(self, context):
        self.log.info('Executing download: %s, %s, %s', self.bucket,
                      self.object, self.filename)
        hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                       delegate_to=self.delegate_to)

        if self.store_to_xcom_key:
            file_bytes = hook.download(bucket_name=self.bucket,
                                       object_name=self.object)
            if sys.getsizeof(file_bytes) < MAX_XCOM_SIZE:
                context['ti'].xcom_push(key=self.store_to_xcom_key,
                                        value=file_bytes)
            else:
                raise AirflowException(
                    'The size of the downloaded file is too large to push to XCom!'
                )
        else:
            hook.download(bucket_name=self.bucket,
                          object_name=self.object,
                          filename=self.filename)
Example #19
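    # Copies files from GCS to S3, optionally skipping keys already present under the destination prefix.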
    def execute(self, context):
        # use the super method to list all files in a Google Cloud Storage bucket
        files = super().execute(context)
        s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id,
                         verify=self.dest_verify)

        if not self.replace:
            # if we are not replacing -> list all files in the S3 bucket
            # and only keep those files which are present in
            # Google Cloud Storage and not in S3
            bucket_name, prefix = S3Hook.parse_s3_url(self.dest_s3_key)
            # look for the bucket and the prefix to avoid looking into
            # parent directories/keys
            existing_files = s3_hook.list_keys(bucket_name, prefix=prefix)
            # in case no files exist, use an empty list to avoid errors
            existing_files = existing_files if existing_files is not None else []
            # remove the prefix for the existing files to allow the match
            existing_files = [
                file.replace(prefix, '', 1) for file in existing_files
            ]
            files = list(set(files) - set(existing_files))

        if files:
            hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)

            for file in files:
                file_bytes = hook.download(self.bucket, file)

                dest_key = self.dest_s3_key + file
                self.log.info("Saving file to %s", dest_key)

                s3_hook.load_bytes(file_bytes,
                                   key=dest_key,
                                   replace=self.replace)

            self.log.info("All done, uploaded %d files to S3", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to S3")

        return files
Example #20
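    # Downloads a ready Search Ads report fragment by fragment and uploads the assembled file to GCS.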
    def execute(self, context: Dict):
        hook = GoogleSearchAdsHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            api_version=self.api_version,
        )

        gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)

        # Resolve file name of the report
        report_name = self.report_name or self.report_id
        report_name = self._resolve_file_name(report_name)

        response = hook.get(report_id=self.report_id)
        if not response['isReportReady']:
            raise AirflowException('Report {} is not ready yet'.format(
                self.report_id))

        # Resolve report fragments
        fragments_count = len(response["files"])

        # Download chunks of report's data
        self.log.info("Downloading Search Ads report %s", self.report_id)
        with NamedTemporaryFile() as temp_file:
            for i in range(fragments_count):
                byte_content = hook.get_file(report_fragment=i,
                                             report_id=self.report_id)
                fragment = (byte_content if i == 0 else
                            self._handle_report_fragment(byte_content))
                temp_file.write(fragment)

            temp_file.flush()

            gcs_hook.upload(
                bucket_name=self.bucket_name,
                object_name=report_name,
                gzip=self.gzip,
                filename=temp_file.name,
            )
        self.xcom_push(context, key="file_name", value=report_name)
Example #21
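    # Copies files from Azure Data Lake Storage to GCS, optionally skipping files that already exist.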
    def execute(self, context):
        # use the super method to list all files in an Azure Data Lake path
        files = super().execute(context)
        g_hook = GCSHook(
            google_cloud_storage_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the ADLS path
            # and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket_name=bucket_name, prefix=prefix)
            files = set(files) - set(existing_files)

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(
                        bucket_name=dest_gcs_bucket,
                        object_name=dest_path,
                        filename=f.name,
                        gzip=self.gzip
                    )

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
Example #22
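    # Downloads a Campaign Manager report file in chunks and uploads it to the target GCS bucket as CSV.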
    def execute(self, context: Dict):
        hook = GoogleCampaignManagerHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            api_version=self.api_version,
        )
        gcs_hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)
        # Get name of the report
        report = hook.get_report(file_id=self.file_id,
                                 profile_id=self.profile_id,
                                 report_id=self.report_id)
        report_name = self.report_name or report.get("fileName",
                                                     str(uuid.uuid4()))
        report_name = self._resolve_file_name(report_name)

        # Download the report
        self.log.info("Starting downloading report %s", self.report_id)
        request = hook.get_report_file(profile_id=self.profile_id,
                                       report_id=self.report_id,
                                       file_id=self.file_id)
        with tempfile.NamedTemporaryFile() as temp_file:
            downloader = http.MediaIoBaseDownload(fd=temp_file,
                                                  request=request,
                                                  chunksize=self.chunk_size)
            download_finished = False
            while not download_finished:
                _, download_finished = downloader.next_chunk()

            temp_file.flush()
            # Upload the local file to bucket
            gcs_hook.upload(
                bucket_name=self.bucket_name,
                object_name=report_name,
                gzip=self.gzip,
                filename=temp_file.name,
                mime_type="text/csv",
            )

        self.xcom_push(context, key="report_name", value=report_name)
Example #23
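    # Downloads a GCS object, pipes it through a local transform script, and uploads the result to the destination bucket.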
    def execute(self, context: Dict):
        hook = GCSHook(gcp_conn_id=self.gcp_conn_id)

        with NamedTemporaryFile() as source_file, \
                NamedTemporaryFile() as destination_file:
            self.log.info("Downloading file from %s", self.source_bucket)
            hook.download(bucket_name=self.source_bucket,
                          object_name=self.source_object,
                          filename=source_file.name)

            self.log.info("Starting the transformation")
            cmd = [self.transform_script] if isinstance(
                self.transform_script, str) else self.transform_script
            cmd += [source_file.name, destination_file.name]
            process = subprocess.Popen(args=cmd,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       close_fds=True)
            self.log.info("Process output:")
            for line in iter(process.stdout.readline, b''):
                self.log.info(line.decode(self.output_encoding).rstrip())

            process.wait()
            if process.returncode > 0:
                raise AirflowException("Transform script failed: {0}".format(
                    process.returncode))

            self.log.info(
                "Transformation succeeded. Output temporarily located at %s",
                destination_file.name)

            self.log.info("Uploading file to %s as %s",
                          self.destination_bucket, self.destination_object)
            hook.upload(bucket_name=self.destination_bucket,
                        object_name=self.destination_object,
                        filename=destination_file.name)
Example #24
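    # Loads GCS objects into BigQuery (or defines an external table) and optionally logs the maximum value of max_id_key.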
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to,
                               location=self.location)

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                    delegate_to=self.delegate_to)
                schema_fields = json.loads(
                    gcs_hook.download(self.bucket,
                                      self.schema_object).decode("utf-8"))
            elif self.schema_object is None and self.autodetect is False:
                raise AirflowException(
                    'At least one of `schema_fields`, '
                    '`schema_object`, or `autodetect` must be passed.')
            else:
                schema_fields = None

        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration)
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info('Loaded BQ data with max %s.%s=%s',
                          self.destination_project_dataset_table,
                          self.max_id_key, max_id)
Example #25
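    # Caches a GCSHook created from the given connection id and delegation account for later use.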
    def __init__(self,
                 gcp_conn_id: str = 'google_cloud_default',
                 delegate_to: Optional[str] = None) -> None:
        self._gcs_hook = GCSHook(gcp_conn_id, delegate_to)
Example #26
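    # Copies files from S3 to GCS, skipping objects that already exist under the destination prefix unless replace is set.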
    def execute(self, context):
        # use the super method to list all the files in an S3 bucket/key
        files = super().execute(context)

        gcs_hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id,
                           delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(bucket_name,
                                                    prefix=object_prefix)

            existing_files = []

            if existing_files_prefixed:
                # Remove the object prefix itself when it appears as an empty directory marker
                if object_prefix in existing_files_prefixed:
                    existing_files_prefixed.remove(object_prefix)

                # Remove the object prefix from all object string paths
                for f in existing_files_prefixed:
                    if f.startswith(object_prefix):
                        existing_files.append(f[len(object_prefix):])
                    else:
                        existing_files.append(f)

            files = list(set(files) - set(existing_files))
            if len(files) > 0:
                self.log.info('%s files are going to be synced: %s.',
                              len(files), files)
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                        self.dest_gcs)
                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # Sync is sequential and the hook already logs too much
                    # so skip this for now
                    # self.log.info(
                    #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                    #     ' as object {3}'.format(file, self.bucket,
                    #                             dest_gcs_bucket,
                    #                             dest_gcs_object))

                    gcs_hook.upload(dest_gcs_bucket,
                                    dest_gcs_object,
                                    f.name,
                                    gzip=self.gzip)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
Example #27
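    # Sensor poke: passes the current number of objects under the prefix to is_bucket_updated.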
    def poke(self, context):
        hook = GCSHook()
        return self.is_bucket_updated(len(hook.list(self.bucket, prefix=self.prefix)))