def execute(self, context):
    """Sync files from the S3 bucket/key to the destination GCS bucket.

    Returns the list of files that were copied.
    """
    # Use the super method to list all the files in the S3 bucket/key.
    files = super().execute(context)

    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    if not self.replace:
        # If we are not replacing -> list all files in the GCS bucket
        # and only keep those files which are present in
        # S3 and not in Google Cloud Storage.
        bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
        existing_files_prefixed = gcs_hook.list(bucket_name, prefix=object_prefix)

        existing_files = []

        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found.
            if object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(object_prefix)

            # Remove the object prefix from all object string paths.
            for f in existing_files_prefixed:
                if f.startswith(object_prefix):
                    existing_files.append(f[len(object_prefix):])
                else:
                    existing_files.append(f)

        files = list(set(files) - set(existing_files))
        if len(files) > 0:
            self.log.info('%s files are going to be synced: %s.', len(files), files)
        else:
            self.log.info('There are no new files to sync. Have a nice day!')

    if files:
        hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)
        # The destination is the same for every file: parse it once here
        # instead of once per file as the original did inside the loop.
        dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

        for file in files:
            # GCS hook builds its own in-memory file so we have to create
            # and pass the path.
            file_object = hook.get_key(file, self.bucket)
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                file_object.download_fileobj(f)
                f.flush()

                # There will always be a '/' before file because it is
                # enforced at instantiation time.
                dest_gcs_object = dest_gcs_object_prefix + file

                # Sync is sequential and the hook already logs too much,
                # so per-file logging is intentionally skipped here.
                gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name, gzip=self.gzip)

        self.log.info("All done, uploaded %d files to Google Cloud Storage", len(files))
    else:
        # BUG FIX: the original implicit string concatenation
        # 'Google Cloud' 'Storage' logged "Google CloudStorage".
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files
def execute(self, context: dict) -> list:
    """Download source blobs for the task's timespan, transform them with the
    configured script, and upload the script's output to the destination bucket.

    Returns the list of uploaded object names.
    (BUG FIX: the original annotated the return type as ``None`` even though
    it returns the uploaded file names.)
    """
    # Define intervals and prefixes.
    timespan_start = context["execution_date"]
    timespan_end = context["dag"].following_schedule(timespan_start)
    if timespan_end is None:
        # BUG FIX: the original logged before assigning, so the warning
        # always printed "None" instead of the max timestamp actually used.
        timespan_end = datetime.datetime.max
        self.log.warning(
            "No following schedule found, setting timespan end to max %s", timespan_end)

    # Normalize both bounds to UTC-aware datetimes.
    timespan_start = timespan_start.replace(tzinfo=timezone.utc)
    timespan_end = timespan_end.replace(tzinfo=timezone.utc)

    source_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
        self.source_prefix,
        timespan_start,
    )
    destination_prefix_interp = GCSTimeSpanFileTransformOperator.interpolate_prefix(
        self.destination_prefix,
        timespan_start,
    )

    source_hook = GCSHook(
        gcp_conn_id=self.source_gcp_conn_id,
        impersonation_chain=self.source_impersonation_chain,
    )
    destination_hook = GCSHook(
        gcp_conn_id=self.destination_gcp_conn_id,
        impersonation_chain=self.destination_impersonation_chain,
    )

    # Fetch list of files that changed within the timespan.
    blobs_to_transform = source_hook.list_by_timespan(
        bucket_name=self.source_bucket,
        prefix=source_prefix_interp,
        timespan_start=timespan_start,
        timespan_end=timespan_end,
    )

    with TemporaryDirectory() as temp_input_dir, TemporaryDirectory() as temp_output_dir:
        temp_input_dir = Path(temp_input_dir)
        temp_output_dir = Path(temp_output_dir)

        # TODO: download in parallel.
        for blob_to_transform in blobs_to_transform:
            destination_file = temp_input_dir / blob_to_transform
            destination_file.parent.mkdir(parents=True, exist_ok=True)
            try:
                source_hook.download(
                    bucket_name=self.source_bucket,
                    object_name=blob_to_transform,
                    filename=str(destination_file),
                    chunk_size=self.chunk_size,
                    num_max_attempts=self.download_num_attempts,
                )
            except GoogleCloudError:
                # Best-effort mode: skip blobs that fail to download.
                if self.download_continue_on_fail:
                    continue
                raise

        self.log.info("Starting the transformation")
        cmd = ([self.transform_script]
               if isinstance(self.transform_script, str)
               else self.transform_script)
        # The script receives: input dir, output dir, timespan start/end (ISO).
        cmd += [
            str(temp_input_dir),
            str(temp_output_dir),
            timespan_start.replace(microsecond=0).isoformat(),
            timespan_end.replace(microsecond=0).isoformat(),
        ]
        with subprocess.Popen(
                args=cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                close_fds=True) as process:
            self.log.info("Process output:")
            if process.stdout:
                for line in iter(process.stdout.readline, b''):
                    self.log.info(line.decode(self.output_encoding).rstrip())
            process.wait()
            if process.returncode:
                raise AirflowException(
                    f"Transform script failed: {process.returncode}")

        self.log.info(
            "Transformation succeeded. Output temporarily located at %s", temp_output_dir)

        files_uploaded = []

        # TODO: upload in parallel.
        for upload_file in temp_output_dir.glob("**/*"):
            if upload_file.is_dir():
                continue

            upload_file_name = str(upload_file.relative_to(temp_output_dir))

            if self.destination_prefix is not None:
                upload_file_name = f"{destination_prefix_interp}/{upload_file_name}"

            self.log.info("Uploading file %s to %s", upload_file, upload_file_name)

            try:
                destination_hook.upload(
                    bucket_name=self.destination_bucket,
                    object_name=upload_file_name,
                    filename=str(upload_file),
                    chunk_size=self.chunk_size,
                    num_max_attempts=self.upload_num_attempts,
                )
                files_uploaded.append(str(upload_file_name))
            except GoogleCloudError:
                # Best-effort mode: skip files that fail to upload.
                if self.upload_continue_on_fail:
                    continue
                raise

        return files_uploaded
def execute(self, context):
    """Add the configured ACL entry (entity + role) to the GCS bucket."""
    gcs_hook = GCSHook(google_cloud_storage_conn_id=self.gcp_conn_id)
    gcs_hook.insert_bucket_acl(
        bucket_name=self.bucket,
        entity=self.entity,
        role=self.role,
        user_project=self.user_project,
    )
def execute(self, context):
    """Delete the configured GCS bucket, force-removing contents when set."""
    storage_hook = GCSHook(gcp_conn_id=self.gcp_conn_id)
    storage_hook.delete_bucket(bucket_name=self.bucket_name, force=self.force)
def execute(self, context: 'Context'):
    """Execute the Apache Beam Pipeline."""
    self.beam_hook = BeamHook(runner=self.runner)
    # Copy so the operator's default options are never mutated.
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
    dataflow_job_name: Optional[str] = None
    if is_dataflow:
        # Dataflow needs a unique job name plus a callback that extracts the
        # Dataflow job id from the pipeline process output.
        dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
            pipeline_options=pipeline_options, job_name_variable_key="job_name")
    # User-supplied options take precedence over the defaults.
    pipeline_options.update(self.pipeline_options)

    # Convert argument names from lowerCamelCase to snake case.
    formatted_pipeline_options = {
        convert_camel_to_snake(key): pipeline_options[key]
        for key in pipeline_options
    }

    with ExitStack() as exit_stack:
        if self.py_file.lower().startswith("gs://"):
            # Stage a GCS-hosted pipeline file into a local temp file that
            # lives for the duration of the run.
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(
                gcs_hook.provide_file(object_url=self.py_file))
            self.py_file = tmp_gcs_file.name

        if is_dataflow and self.dataflow_hook:
            with self.dataflow_hook.provide_authorized_gcloud():
                self.beam_hook.start_python_pipeline(
                    variables=formatted_pipeline_options,
                    py_file=self.py_file,
                    py_options=self.py_options,
                    py_interpreter=self.py_interpreter,
                    py_requirements=self.py_requirements,
                    py_system_site_packages=self.py_system_site_packages,
                    process_line_callback=process_line_callback,
                )
            if dataflow_job_name and self.dataflow_config.location:
                # Block until the submitted Dataflow job reaches a terminal state.
                self.dataflow_hook.wait_for_done(
                    job_name=dataflow_job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=False,
                )
        else:
            # Non-Dataflow runners: the call returns when the local
            # pipeline process finishes.
            self.beam_hook.start_python_pipeline(
                variables=formatted_pipeline_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=process_line_callback,
            )

    # dataflow_job_id is None for non-Dataflow runners.
    return {"dataflow_job_id": self.dataflow_job_id}
def execute(self, context):
    """Execute the Apache Beam Pipeline."""
    self.beam_hook = BeamHook(runner=self.runner)
    # Copy so the operator's default options are never mutated.
    pipeline_options = self.default_pipeline_options.copy()
    process_line_callback: Optional[Callable] = None
    is_dataflow = self.runner.lower() == BeamRunnerType.DataflowRunner.lower()
    dataflow_job_name: Optional[str] = None

    if isinstance(self.dataflow_config, dict):
        # Accept a plain dict and normalize it into the config object.
        self.dataflow_config = DataflowConfiguration(**self.dataflow_config)

    if is_dataflow:
        dataflow_job_name, pipeline_options, process_line_callback = self._set_dataflow(
            pipeline_options=pipeline_options, job_name_variable_key=None
        )

    # User-supplied options take precedence over the defaults.
    pipeline_options.update(self.pipeline_options)

    with ExitStack() as exit_stack:
        if self.jar.lower().startswith("gs://"):
            # Stage the GCS-hosted jar into a local temp file for the run.
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(  # pylint: disable=no-member
                gcs_hook.provide_file(object_url=self.jar)
            )
            self.jar = tmp_gcs_file.name

        if is_dataflow:
            is_running = False
            if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                is_running = (
                    # The reason for disable=no-value-for-parameter is that project_id parameter is
                    # required but here is not passed, moreover it cannot be passed here.
                    # This method is wrapped by @_fallback_to_project_id_from_variables decorator which
                    # fallback project_id value from variables and raise error if project_id is
                    # defined both in variables and as parameter (here is already defined in variables)
                    self.dataflow_hook.is_job_dataflow_running(  # pylint: disable=no-value-for-parameter
                        name=self.dataflow_config.job_name,
                        variables=pipeline_options,
                    )
                )
                while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                    # Poll until the previously-running job leaves the running
                    # state.  Same project_id-fallback note as above applies.
                    # pylint: disable=no-value-for-parameter
                    is_running = self.dataflow_hook.is_job_dataflow_running(
                        name=self.dataflow_config.job_name,
                        variables=pipeline_options,
                    )

            if not is_running:
                pipeline_options["jobName"] = dataflow_job_name
                self.beam_hook.start_java_pipeline(
                    variables=pipeline_options,
                    jar=self.jar,
                    job_class=self.job_class,
                    process_line_callback=process_line_callback,
                )
                # Block until the submitted Dataflow job reaches a terminal state.
                self.dataflow_hook.wait_for_done(
                    job_name=dataflow_job_name,
                    location=self.dataflow_config.location,
                    job_id=self.dataflow_job_id,
                    multiple_jobs=self.dataflow_config.multiple_jobs,
                    project_id=self.dataflow_config.project_id,
                )
        else:
            # Non-Dataflow runners: just run the jar locally.
            self.beam_hook.start_java_pipeline(
                variables=pipeline_options,
                jar=self.jar,
                job_class=self.job_class,
                process_line_callback=process_line_callback,
            )

    # dataflow_job_id is None for non-Dataflow runners.
    return {"dataflow_job_id": self.dataflow_job_id}
def execute(self, context):
    """Sync files from an Azure FileShare directory to a GCS bucket.

    Returns the list of files that were copied.
    """
    self._check_inputs()
    azure_fileshare_hook = AzureFileShareHook(self.azure_fileshare_conn_id)
    files = azure_fileshare_hook.list_files(
        share_name=self.share_name, directory_name=self.directory_name
    )

    gcs_hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        impersonation_chain=self.google_impersonation_chain,
    )

    dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(self.dest_gcs)

    if not self.replace:
        # If we are not replacing -> list all files in the GCS bucket and only
        # keep those present on the Azure FileShare but missing from GCS.
        # (Comment fixed: the original said "S3", left over from the copied
        # S3-to-GCS operator.)
        existing_files_prefixed = gcs_hook.list(dest_gcs_bucket, prefix=dest_gcs_object_prefix)

        existing_files = []

        # Guard the listing before processing, for consistency with the
        # sibling S3-to-GCS sync implementation.
        if existing_files_prefixed:
            # Remove the object prefix itself, an empty directory was found.
            if dest_gcs_object_prefix in existing_files_prefixed:
                existing_files_prefixed.remove(dest_gcs_object_prefix)

            # Remove the object prefix from all object string paths.
            for file in existing_files_prefixed:
                if file.startswith(dest_gcs_object_prefix):
                    existing_files.append(file[len(dest_gcs_object_prefix) :])
                else:
                    existing_files.append(file)

        files = list(set(files) - set(existing_files))

    if files:
        self.log.info('%s files are going to be synced.', len(files))
    else:
        self.log.info('There are no new files to sync. Have a nice day!')

    for file in files:
        with NamedTemporaryFile() as temp_file:
            azure_fileshare_hook.get_file_to_stream(
                stream=temp_file,
                share_name=self.share_name,
                directory_name=self.directory_name,
                file_name=file,
            )
            temp_file.flush()

            # There will always be a '/' before file because it is
            # enforced at instantiation time.
            dest_gcs_object = dest_gcs_object_prefix + file
            gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, temp_file.name, gzip=self.gzip)

    if files:
        self.log.info("All done, uploaded %d files to Google Cloud Storage.", len(files))
    else:
        self.log.info('In sync, no files needed to be uploaded to Google Cloud Storage')

    return files
def execute(self, context: 'Context'):
    """Execute the python dataflow job."""
    self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
    self.dataflow_hook = DataflowHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        poll_sleep=self.poll_sleep,
        impersonation_chain=None,
        drain_pipeline=self.drain_pipeline,
        cancel_timeout=self.cancel_timeout,
        wait_until_finished=self.wait_until_finished,
    )

    job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)

    # Assemble the effective pipeline options: defaults first, then the
    # job/project/region bindings, then user-supplied overrides.
    pipeline_options = self.dataflow_default_options.copy()
    pipeline_options["job_name"] = job_name
    pipeline_options["project"] = self.project_id or self.dataflow_hook.project_id
    pipeline_options["region"] = self.location
    pipeline_options.update(self.options)

    def _camel_to_snake(name):
        # Convert argument names from lowerCamelCase to snake case.
        return re.sub(r"[A-Z]", lambda match: "_" + match.group(0).lower(), name)

    snake_case_options = {
        _camel_to_snake(key): value for key, value in pipeline_options.items()
    }

    def set_current_job_id(job_id):
        self.job_id = job_id

    process_line_callback = process_line_and_extract_dataflow_job_id_callback(
        on_new_job_id_callback=set_current_job_id
    )

    with ExitStack() as exit_stack:
        if self.py_file.lower().startswith("gs://"):
            # Stage a GCS-hosted pipeline file into a local temp file.
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(
                gcs_hook.provide_file(object_url=self.py_file)
            )
            self.py_file = tmp_gcs_file.name

        with self.dataflow_hook.provide_authorized_gcloud():
            self.beam_hook.start_python_pipeline(
                variables=snake_case_options,
                py_file=self.py_file,
                py_options=self.py_options,
                py_interpreter=self.py_interpreter,
                py_requirements=self.py_requirements,
                py_system_site_packages=self.py_system_site_packages,
                process_line_callback=process_line_callback,
            )

        # Block until the Dataflow job reaches a terminal state.
        self.dataflow_hook.wait_for_done(
            job_name=job_name,
            location=self.location,
            job_id=self.job_id,
            multiple_jobs=False,
        )

    return {"job_id": self.job_id}
def execute(self, context: 'Context'):
    """Execute the Apache Beam Pipeline."""
    # Resolve runner flavor, job name, merged options and the callback that
    # extracts the Dataflow job id from process output.
    (
        is_dataflow,
        dataflow_job_name,
        pipeline_options,
        process_line_callback,
    ) = self._init_pipeline_options()

    if not self.beam_hook:
        raise AirflowException("Beam hook is not defined.")

    with ExitStack() as exit_stack:
        if self.jar.lower().startswith("gs://"):
            # Stage the GCS-hosted jar into a local temp file for the run.
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(
                gcs_hook.provide_file(object_url=self.jar))
            self.jar = tmp_gcs_file.name

        if is_dataflow and self.dataflow_hook:
            is_running = False
            if self.dataflow_config.check_if_running != CheckJobRunning.IgnoreJob:
                is_running = (
                    # The reason for disable=no-value-for-parameter is that project_id parameter is
                    # required but here is not passed, moreover it cannot be passed here.
                    # This method is wrapped by @_fallback_to_project_id_from_variables decorator which
                    # fallback project_id value from variables and raise error if project_id is
                    # defined both in variables and as parameter (here is already defined in variables)
                    self.dataflow_hook.is_job_dataflow_running(
                        name=self.dataflow_config.job_name,
                        variables=pipeline_options,
                    ))
                while is_running and self.dataflow_config.check_if_running == CheckJobRunning.WaitForRun:
                    # Poll until the previously-running job leaves the running
                    # state.  Same project_id-fallback note as above applies.
                    is_running = self.dataflow_hook.is_job_dataflow_running(
                        name=self.dataflow_config.job_name,
                        variables=pipeline_options,
                    )

            if not is_running:
                pipeline_options["jobName"] = dataflow_job_name
                with self.dataflow_hook.provide_authorized_gcloud():
                    self.beam_hook.start_java_pipeline(
                        variables=pipeline_options,
                        jar=self.jar,
                        job_class=self.job_class,
                        process_line_callback=process_line_callback,
                    )
                if dataflow_job_name and self.dataflow_config.location:
                    multiple_jobs = (self.dataflow_config.multiple_jobs
                                     if self.dataflow_config.multiple_jobs else False)
                    # Record a console link to the Dataflow job on the task instance.
                    DataflowJobLink.persist(
                        self,
                        context,
                        self.dataflow_config.project_id,
                        self.dataflow_config.location,
                        self.dataflow_job_id,
                    )
                    # Block until the job reaches a terminal state.
                    self.dataflow_hook.wait_for_done(
                        job_name=dataflow_job_name,
                        location=self.dataflow_config.location,
                        job_id=self.dataflow_job_id,
                        multiple_jobs=multiple_jobs,
                        project_id=self.dataflow_config.project_id,
                    )
            return {"dataflow_job_id": self.dataflow_job_id}
        else:
            # Non-Dataflow runners: run the jar locally; nothing is returned.
            self.beam_hook.start_java_pipeline(
                variables=pipeline_options,
                jar=self.jar,
                job_class=self.job_class,
                process_line_callback=process_line_callback,
            )
def poke(self, context):
    """Return True once the target object exists in the bucket."""
    self.log.info('Sensor checks existence of : %s, %s', self.bucket, self.object)
    gcs_hook = GCSHook(
        google_cloud_storage_conn_id=self.google_cloud_conn_id,
        delegate_to=self.delegate_to,
    )
    return gcs_hook.exists(self.bucket, self.object)
def __init__(self, gcp_conn_id: str = 'google_cloud_default',
             delegate_to: Optional[str] = None) -> None:
    """Store a GCS hook built from the given connection id and delegation."""
    self._gcs_hook = GCSHook(gcp_conn_id, delegate_to)
def poke(self, context):
    """Check whether the bucket changed, based on the object count under the prefix."""
    gcs_hook = GCSHook()
    current_object_count = len(gcs_hook.list(self.bucket, prefix=self.prefix))
    return self.is_bucket_updated(current_object_count)
def execute(self, context: 'Context'):
    """Start the Java Dataflow job (unless an equivalent job is already
    running, per ``check_if_running``) and wait for it to finish.

    Returns a dict with the Dataflow ``job_id``.
    """
    self.beam_hook = BeamHook(runner=BeamRunnerType.DataflowRunner)
    self.dataflow_hook = DataflowHook(
        gcp_conn_id=self.gcp_conn_id,
        delegate_to=self.delegate_to,
        poll_sleep=self.poll_sleep,
        cancel_timeout=self.cancel_timeout,
        wait_until_finished=self.wait_until_finished,
    )
    job_name = self.dataflow_hook.build_dataflow_job_name(job_name=self.job_name)

    pipeline_options = copy.deepcopy(self.dataflow_default_options)
    # The raw (non-uniquified) name is used for the is-already-running check;
    # the uniquified job_name is swapped in just before submission below.
    pipeline_options["jobName"] = self.job_name
    pipeline_options["project"] = self.project_id or self.dataflow_hook.project_id
    pipeline_options["region"] = self.location
    pipeline_options.update(self.options)
    pipeline_options.setdefault("labels", {}).update(
        {"airflow-version": "v" + version.replace(".", "-").replace("+", "-")}
    )
    # BUG FIX: the original repeated pipeline_options.update(self.options)
    # here; the second call was redundant (the labels dict is mutated in
    # place, so re-applying self.options changed nothing) and was removed.

    def set_current_job_id(job_id):
        self.job_id = job_id

    process_line_callback = process_line_and_extract_dataflow_job_id_callback(
        on_new_job_id_callback=set_current_job_id
    )

    with ExitStack() as exit_stack:
        if self.jar.lower().startswith("gs://"):
            # Stage the GCS-hosted jar into a local temp file for the run.
            gcs_hook = GCSHook(self.gcp_conn_id, self.delegate_to)
            tmp_gcs_file = exit_stack.enter_context(
                gcs_hook.provide_file(object_url=self.jar)
            )
            self.jar = tmp_gcs_file.name

        is_running = False
        if self.check_if_running != CheckJobRunning.IgnoreJob:
            is_running = self.dataflow_hook.is_job_dataflow_running(
                name=self.job_name,
                variables=pipeline_options,
            )
            while is_running and self.check_if_running == CheckJobRunning.WaitForRun:
                # Poll until the previously-running job finishes.
                is_running = self.dataflow_hook.is_job_dataflow_running(
                    name=self.job_name,
                    variables=pipeline_options,
                )

        if not is_running:
            # Submit under the uniquified name, then block until done.
            pipeline_options["jobName"] = job_name
            with self.dataflow_hook.provide_authorized_gcloud():
                self.beam_hook.start_java_pipeline(
                    variables=pipeline_options,
                    jar=self.jar,
                    job_class=self.job_class,
                    process_line_callback=process_line_callback,
                )
            self.dataflow_hook.wait_for_done(
                job_name=job_name,
                location=self.location,
                job_id=self.job_id,
                multiple_jobs=self.multiple_jobs,
            )

    return {"job_id": self.job_id}
def execute(self, context) -> None:
    """Delete the target GCS bucket, force-removing contents when configured."""
    gcs_hook = GCSHook(
        gcp_conn_id=self.gcp_conn_id,
        impersonation_chain=self.impersonation_chain,
    )
    gcs_hook.delete_bucket(bucket_name=self.bucket_name, force=self.force)
def get_gs_hook(google_cloud_storage_conn_id='google_cloud_default', delegate_to=None):
    """Build and return a GCSHook for the given connection id and delegation."""
    hook_kwargs = {
        'google_cloud_storage_conn_id': google_cloud_storage_conn_id,
        'delegate_to': delegate_to,
    }
    return GCSHook(**hook_kwargs)
def execute(self, context):
    """Load data from GCS into BigQuery, either as an external table or via a
    load job, then optionally log the MAX of ``max_id_key`` from the result."""
    bq_hook = BigQueryHook(
        bigquery_conn_id=self.bigquery_conn_id,
        delegate_to=self.delegate_to,
        location=self.location,
        impersonation_chain=self.impersonation_chain,
    )

    if not self.schema_fields:
        # No inline schema: try a schema object stored in GCS.  Not applicable
        # to DATASTORE_BACKUP, which carries its own schema.
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GCSHook(
                gcp_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to,
                impersonation_chain=self.impersonation_chain,
            )
            blob = gcs_hook.download(
                bucket_name=self.bucket,
                object_name=self.schema_object,
            )
            schema_fields = json.loads(blob.decode("utf-8"))
        elif self.schema_object is None and self.autodetect is False:
            # No schema source at all and autodetect is disabled: fail fast.
            raise AirflowException(
                'At least one of `schema_fields`, `schema_object`, or `autodetect` must be passed.'
            )
        else:
            # Rely on autodetect (or DATASTORE_BACKUP's built-in schema).
            schema_fields = None
    else:
        schema_fields = self.schema_fields

    source_uris = [
        f'gs://{self.bucket}/{source_object}' for source_object in self.source_objects
    ]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        # Define an external table over the GCS files (no data is copied).
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            src_fmt_configs=self.src_fmt_configs,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )
    else:
        # Run a load job that copies the data into a native BigQuery table.
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            autodetect=self.autodetect,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration,
            labels=self.labels,
            description=self.description,
        )

    # Table-name quoting differs: legacy SQL uses [...], standard uses `...`.
    if cursor.use_legacy_sql:
        escaped_table_name = f'[{self.destination_project_dataset_table}]'
    else:
        escaped_table_name = f'`{self.destination_project_dataset_table}`'

    if self.max_id_key:
        # NOTE(review): max_id is only logged here, never returned or pushed —
        # presumably callers rely on the log output; confirm before changing.
        cursor.execute(
            f'SELECT MAX({self.max_id_key}) FROM {escaped_table_name}')
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table,
            self.max_id_key,
            max_id,
        )
def _get_gcs_hook(self):
    """Lazily create and memoize the GCS hook on this instance."""
    hook = self.hook
    if not hook:
        hook = GCSHook()
        self.hook = hook
    return hook