def _copy_single_object(
    self,
    gcs_hook: GCSHook,
    sftp_hook: SFTPHook,
    source_path: str,
    destination_object: str,
) -> None:
    """Helper function to copy a single object from SFTP to GCS."""
    self.log.info(
        "Executing copy of %s to gs://%s/%s",
        source_path,
        self.destination_bucket,
        destination_object,
    )

    with NamedTemporaryFile("w") as tmp:
        # Fetch the remote file into a local temporary file, then upload it.
        sftp_hook.retrieve_file(source_path, tmp.name)

        gcs_hook.upload(
            bucket_name=self.destination_bucket,
            object_name=destination_object,
            filename=tmp.name,
            mime_type=self.mime_type,
        )

    if self.move_object:
        self.log.info("Executing delete of %s", source_path)
        sftp_hook.delete_file(source_path)
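# A minimal sketch of how an operator's execute() might drive the helper above
# over several files; the hook construction and the source_paths / sftp_conn_id
# attributes are assumptions for illustration, not taken from the source.
def execute(self, context):
    gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id)
    sftp_hook = SFTPHook(self.sftp_conn_id)
    for source_path, destination_object in self.source_paths:
        self._copy_single_object(
            gcs_hook=gcs_hook,
            sftp_hook=sftp_hook,
            source_path=source_path,
            destination_object=destination_object,
        )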
def _upload_to_gcs(self, files_to_upload: Dict[str, Any]):
    """Upload each temporary file to Google Cloud Storage as a JSON object."""
    hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)
    for obj, tmp_file_handle in files_to_upload.items():
        hook.upload(
            bucket_name=self.bucket,
            object_name=obj,
            filename=tmp_file_handle.name,
            mime_type='application/json',
            gzip=self.gzip,
        )
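# A hedged sketch of the files_to_upload mapping this method expects: GCS
# object names keyed to temp-file handles that already contain the serialized
# JSON. The object name and payload below are illustrative only.
from tempfile import NamedTemporaryFile

tmp_file_handle = NamedTemporaryFile(delete=True)
tmp_file_handle.write(b'{"id": 1}\n')
tmp_file_handle.flush()

files_to_upload = {"exports/data_0.json": tmp_file_handle}
# self._upload_to_gcs(files_to_upload)  # called from inside the operator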
def _upload_to_gcs(self, files_to_upload):
    """
    Upload all of the file splits (and optionally the schema .json file) to
    Google Cloud Storage.
    """
    hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)
    for tmp_file in files_to_upload:
        hook.upload(
            self.bucket,
            tmp_file.get('file_name'),
            tmp_file.get('file_handle').name,
            mime_type=tmp_file.get('file_mime_type'),
            gzip=self.gzip if tmp_file.get('file_name') == self.schema_filename else False,
        )
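# By contrast, this variant expects files_to_upload as a list of dicts with
# 'file_name', 'file_handle', and 'file_mime_type' keys (the keys come from the
# .get() calls above; the names and values below are illustrative only).
from tempfile import NamedTemporaryFile

data_file_handle = NamedTemporaryFile(delete=True)
schema_file_handle = NamedTemporaryFile(delete=True)

files_to_upload = [
    {
        'file_name': 'export/part_000.csv',
        'file_handle': data_file_handle,        # NamedTemporaryFile holding one file split
        'file_mime_type': 'text/csv',
    },
    {
        'file_name': 'export/schema.json',      # optionally, the schema .json file
        'file_handle': schema_file_handle,
        'file_mime_type': 'application/json',
    },
]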
def execute(self, context):
    """Uploads the file to Google Cloud Storage."""
    hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

    hook.upload(
        bucket_name=self.bucket,
        object_name=self.dst,
        mime_type=self.mime_type,
        filename=self.src,
        gzip=self.gzip,
    )
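# A minimal usage sketch, assuming this execute() belongs to a local-file-to-GCS
# operator that exposes src, dst, bucket, mime_type, gzip, and gcp_conn_id as
# constructor arguments; the class name and values below are illustrative, not
# taken from the source.
upload_file = LocalFilesystemToGCSOperator(
    task_id="upload_report",
    src="/tmp/report.csv",
    dst="reports/report.csv",
    bucket="my-bucket",
    mime_type="text/csv",
    gzip=False,
    gcp_conn_id="google_cloud_default",
)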
def execute(self, context):
    hook = CloudTextToSpeechHook(gcp_conn_id=self.gcp_conn_id)
    result = hook.synthesize_speech(
        input_data=self.input_data,
        voice=self.voice,
        audio_config=self.audio_config,
        retry=self.retry,
        timeout=self.timeout,
    )
    with NamedTemporaryFile() as temp_file:
        temp_file.write(result.audio_content)
        # Flush buffered bytes to disk before handing the path to the GCS hook.
        temp_file.flush()
        cloud_storage_hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id)
        cloud_storage_hook.upload(
            bucket_name=self.target_bucket_name,
            object_name=self.target_filename,
            filename=temp_file.name,
        )
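# A hedged sketch of the input_data, voice, and audio_config values that the
# Cloud Text-to-Speech API accepts as plain dicts; the exact values below are
# illustrative only.
input_data = {"text": "Hello from Airflow"}
voice = {"language_code": "en-US", "ssml_gender": "NEUTRAL"}
audio_config = {"audio_encoding": "LINEAR16"}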
def execute(self, context: Dict):
    hook = GoogleDisplayVideo360Hook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        api_version=self.api_version,
    )
    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to
    )

    resource = hook.get_query(query_id=self.report_id)
    # Check if report is ready
    if resource["metadata"]["running"]:
        raise AirflowException('Report {} is still running'.format(self.report_id))

    # If no custom report_name provided, use DV360 name
    file_url = resource["metadata"]["googleCloudStoragePathForLatestReport"]
    report_name = self.report_name or urlparse(file_url).path.split('/')[2]
    report_name = self._resolve_file_name(report_name)

    # Download the report
    self.log.info("Starting downloading report %s", self.report_id)
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        with urllib.request.urlopen(file_url) as response:
            shutil.copyfileobj(response, temp_file, length=self.chunk_size)

        temp_file.flush()
        # Upload the local file to bucket
        gcs_hook.upload(
            bucket_name=self.bucket_name,
            object_name=report_name,
            gzip=self.gzip,
            filename=temp_file.name,
            mime_type="text/csv",
        )
    self.log.info(
        "Report %s was saved in bucket %s as %s.",
        self.report_id,
        self.bucket_name,
        report_name,
    )
    self.xcom_push(context, key='report_name', value=report_name)
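# _resolve_file_name is referenced above (and in later snippets) but not shown.
# A plausible sketch, assuming it normalizes the report name to a .csv file and
# appends .gz when the upload is gzipped; this is an assumption, not the source
# implementation.
def _resolve_file_name(self, name: str) -> str:
    if not name.endswith(".csv"):
        name += ".csv"
    if self.gzip:
        name += ".gz"
    return name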
def execute(self, context: Dict):
    hook = GoogleSearchAdsHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        api_version=self.api_version,
    )
    gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to)

    # Resolve file name of the report
    report_name = self.report_name or self.report_id
    report_name = self._resolve_file_name(report_name)

    response = hook.get(report_id=self.report_id)
    if not response['isReportReady']:
        raise AirflowException('Report {} is not ready yet'.format(self.report_id))

    # Resolve report fragments
    fragments_count = len(response["files"])

    # Download chunks of report's data
    self.log.info("Downloading Search Ads report %s", self.report_id)
    with NamedTemporaryFile() as temp_file:
        for i in range(fragments_count):
            byte_content = hook.get_file(report_fragment=i, report_id=self.report_id)
            fragment = byte_content if i == 0 else self._handle_report_fragment(byte_content)
            temp_file.write(fragment)
        temp_file.flush()

        gcs_hook.upload(
            bucket_name=self.bucket_name,
            object_name=report_name,
            gzip=self.gzip,
            filename=temp_file.name,
        )
    self.xcom_push(context, key="file_name", value=report_name)
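# _handle_report_fragment is referenced above but not shown. A plausible sketch,
# assuming each non-first fragment repeats the CSV header row, which is dropped
# so the fragments concatenate into one valid file; this is an assumption, not
# the source implementation.
@staticmethod
def _handle_report_fragment(fragment: bytes) -> bytes:
    fragment_records = fragment.split(b"\n", 1)
    return fragment_records[1] if len(fragment_records) > 1 else b""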
def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super().execute(context)
    g_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket_name=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(
                    bucket_name=dest_gcs_bucket,
                    object_name=dest_path,
                    filename=f.name,
                    gzip=self.gzip
                )

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
def execute(self, context: Dict):
    hook = GoogleCampaignManagerHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        api_version=self.api_version,
    )
    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to
    )
    # Get name of the report
    report = hook.get_report(
        file_id=self.file_id, profile_id=self.profile_id, report_id=self.report_id
    )
    report_name = self.report_name or report.get("fileName", str(uuid.uuid4()))
    report_name = self._resolve_file_name(report_name)

    # Download the report
    self.log.info("Starting downloading report %s", self.report_id)
    request = hook.get_report_file(
        profile_id=self.profile_id, report_id=self.report_id, file_id=self.file_id
    )
    with tempfile.NamedTemporaryFile() as temp_file:
        downloader = http.MediaIoBaseDownload(
            fd=temp_file, request=request, chunksize=self.chunk_size
        )
        download_finished = False
        while not download_finished:
            _, download_finished = downloader.next_chunk()
        temp_file.flush()

        # Upload the local file to bucket
        gcs_hook.upload(
            bucket_name=self.bucket_name,
            object_name=report_name,
            gzip=self.gzip,
            filename=temp_file.name,
            mime_type="text/csv",
        )

    self.xcom_push(context, key="report_name", value=report_name)
def execute(self, context: Dict):
    hook = GCSHook(gcp_conn_id=self.gcp_conn_id)

    with NamedTemporaryFile() as source_file, NamedTemporaryFile() as destination_file:
        self.log.info("Downloading file from %s", self.source_bucket)
        hook.download(
            bucket_name=self.source_bucket,
            object_name=self.source_object,
            filename=source_file.name,
        )

        self.log.info("Starting the transformation")
        cmd = [self.transform_script] if isinstance(self.transform_script, str) else self.transform_script
        cmd += [source_file.name, destination_file.name]
        process = subprocess.Popen(
            args=cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True
        )
        self.log.info("Process output:")
        for line in iter(process.stdout.readline, b''):
            self.log.info(line.decode(self.output_encoding).rstrip())

        process.wait()
        if process.returncode > 0:
            raise AirflowException("Transform script failed: {0}".format(process.returncode))

        self.log.info(
            "Transformation succeeded. Output temporarily located at %s", destination_file.name
        )

        self.log.info("Uploading file to %s as %s", self.destination_bucket, self.destination_object)
        hook.upload(
            bucket_name=self.destination_bucket,
            object_name=self.destination_object,
            filename=destination_file.name,
        )
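# The transform script receives the downloaded source path and the destination
# path as its two command-line arguments (see the `cmd += [...]` line above).
# A minimal standalone example script, assuming a line-oriented text transform;
# the upper-casing is purely illustrative.
import sys

source_path, destination_path = sys.argv[1], sys.argv[2]
with open(source_path) as src, open(destination_path, "w") as dst:
    for line in src:
        dst.write(line.upper())  # illustrative transformation: upper-case each line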
def execute(self, context):
    # use the super method to list all the files in an S3 bucket/key
    files = super().execute(context)

    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to
    )

    if not self.replace:
        # if we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info('%s files are going to be synced: %s.', len(files), files)
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

        for file in files:
            # GCS hook builds its own in-memory file, so we have to create
            # and pass the path
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)
                # There will always be a '/' before file because it is
                # enforced at instantiation time
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much
                # so skip this for now
                # self.log.info(
                #     'Saving file {0} from S3 bucket {1} in GCS bucket {2}'
                #     ' as object {3}'.format(file, self.bucket,
                #                             dest_gcs_bucket,
                #                             dest_gcs_object))

                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name, gzip=self.gzip)

        self.log.info("All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files
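# _parse_gcs_url is used in several snippets above to split a gs:// URL into a
# (bucket, prefix) pair, e.g. "gs://my-bucket/some/prefix/" -> ("my-bucket",
# "some/prefix/"). A minimal sketch consistent with that usage; this is an
# assumption, not necessarily the library implementation.
from urllib.parse import urlparse

def _parse_gcs_url(gsurl: str):
    parsed = urlparse(gsurl)
    if not parsed.netloc:
        raise AirflowException('Please provide a bucket name in the gs:// URL')
    return parsed.netloc, parsed.path.lstrip('/')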