def set_object(self, key, obj, serialization_strategy=DEFAULT_SERIALIZATION_STRATEGY):
    """Serialize ``obj`` to the file at path ``key``.

    Any file already present at ``key`` is unlinked first (with a warning),
    and the parent directory is created if needed.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record whose ``key`` is the
        path that was written.
    """
    check.str_param(key, 'key')
    # ``obj`` may be any Python object, so there is nothing to type-check.
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    already_present = os.path.exists(key)
    if already_present:
        logging.warning('Removing existing path {path}'.format(path=key))
        os.unlink(key)

    # The destination directory has to exist before the strategy writes to it.
    parent_dir = os.path.dirname(key)
    mkdir_p(parent_dir)

    serialization_strategy.serialize_to_file(obj, key)

    strategy_name = serialization_strategy.name
    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=key,
        dest_key=None,
        obj=obj,
        serialization_strategy_name=strategy_name,
        object_store_name=self.name,
    )
def get_intermediate_from_address(
    self, context, dagster_type=None, step_output_handle=None, address=None,
):
    """Load an intermediate value directly from ``address`` (experimental).

    This method will likely be merged into `get_intermediate_object`; doing so
    requires `get_intermediate_object` to accept `address` as an argument.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the loaded
        object and the uri reported by the object store.

    Raises:
        DagsterAddressIOError: if an ``IOError`` / ``OSError`` is raised
        while loading.
    """
    dagster_type = resolve_dagster_type(dagster_type)
    check.opt_inst_param(context, "context", SystemExecutionContext)
    check.inst_param(dagster_type, "dagster_type", DagsterType)
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)
    check.str_param(address, "address")

    strategy = dagster_type.serialization_strategy

    # type_storage_plugin_registry is not supported on this path yet
    try:
        loaded, uri = self.object_store.get_object(
            key=address, serialization_strategy=strategy
        )
        return ObjectStoreOperation(
            op=ObjectStoreOperationType.GET_OBJECT,
            key=uri,
            dest_key=None,
            obj=loaded,
            serialization_strategy_name=strategy.name,
            object_store_name=self.object_store.name,
        )
    except (IOError, OSError) as e:
        raise DagsterAddressIOError(str(e))
def set_object(self, key, obj, serialization_strategy=None):
    """Serialize ``obj`` and upload it to the GCS bucket under ``key``.

    An existing object at ``key`` is removed first. Both the removal and the
    upload go through ``backoff`` and are retried on ``TooManyRequests``.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record keyed by the object's uri.
    """
    check.str_param(key, 'key')

    logging.info('Writing GCS object at: ' + self.uri_for_key(key))

    # obj is not checked: it can be an arbitrary Python object.
    # serialization_strategy, however, must not be None at this point.
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    if self.has_object(key):
        logging.warning('Removing existing GCS key: {key}'.format(key=key))
        backoff(self.rm_object, args=[key], retry_on=(TooManyRequests,))

    # Binary strategies (and everything on Python 2) get a bytes buffer,
    # text strategies get a text buffer.
    wants_bytes = serialization_strategy.write_mode == 'wb' or sys.version_info < (3, 0)
    buffer_factory = BytesIO if wants_bytes else StringIO

    with buffer_factory() as buffer:
        serialization_strategy.serialize(obj, buffer)
        buffer.seek(0)
        backoff(
            self.bucket_obj.blob(key).upload_from_file,
            args=[buffer],
            retry_on=(TooManyRequests,),
        )

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def set_intermediate_object(self, dagster_type, step_output_handle, value, version=None):
    """Store ``value`` as the intermediate for ``step_output_handle``.

    The key is built from ``self.root`` plus the paths derived from the
    handle, and the value is written with the dagster type's serialization
    strategy.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record for the stored value.

    Raises:
        DagsterObjectStoreError: wraps any exception raised by the object
        store's ``set_object``; the original exception is chained as the cause.
    """
    check.inst_param(dagster_type, "dagster_type", DagsterType)
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)

    path_parts = self._get_paths(step_output_handle)
    check.param_invariant(len(path_parts) > 0, "paths")
    key = self.object_store.key_for_paths([self.root] + path_parts)

    strategy = dagster_type.serialization_strategy

    try:
        uri = self.object_store.set_object(key, value, serialization_strategy=strategy)
    except Exception as error:  # pylint: disable=broad-except
        message = _object_store_operation_error_message(
            step_output_handle=step_output_handle,
            op=ObjectStoreOperationType.SET_OBJECT,
            object_store_name=self.object_store.name,
            serialization_strategy_name=strategy.name,
        )
        raise DagsterObjectStoreError(message) from error

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=uri,
        dest_key=None,
        obj=value,
        serialization_strategy_name=strategy.name,
        object_store_name=self.object_store.name,
        version=version,
    )
def _set_intermediates(step_context, step_output, step_output_handle, output, version):
    """Persist a step output and yield the event describing the write.

    Outputs that carry an ``asset_store_handle`` are written through
    ``_set_addressable_asset``; every other output goes to the step context's
    intermediate storage. An event is yielded only when the write returns the
    matching operation type.
    """
    handle = step_output.asset_store_handle

    if handle:
        # An asset store was configured for this output, so it takes precedence.
        result = _set_addressable_asset(step_context, step_output_handle, handle, output.value)
        if isinstance(result, AssetStoreOperation):
            yield DagsterEvent.asset_store_operation(step_context, result)
        return

    result = step_context.intermediate_storage.set_intermediate(
        context=step_context,
        dagster_type=step_output.dagster_type,
        step_output_handle=step_output_handle,
        value=output.value,
        version=version,
    )
    if isinstance(result, ObjectStoreOperation):
        serializable_op = ObjectStoreOperation.serializable(result, value_name=output.output_name)
        yield DagsterEvent.object_store_operation(step_context, serializable_op)
def rm_object(self, key):
    """Delete every S3 object in the bucket whose key starts with ``key``.

    Nothing is deleted when ``has_object(key)`` is false. Listings are
    consumed one page at a time, each page being deleted before the next is
    requested.

    Returns:
        ObjectStoreOperation: an ``RM_OBJECT`` record keyed by the uri of ``key``.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')

    if self.has_object(key):
        page = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=key)
        while True:
            doomed = [{'Key': entry['Key']} for entry in page['Contents']]
            self.s3.delete_objects(Bucket=self.bucket, Delete={'Objects': doomed})

            if not page['IsTruncated']:
                break

            # More keys remain under this prefix: fetch the next page.
            next_token = page['NextContinuationToken']
            page = self.s3.list_objects_v2(
                Bucket=self.bucket, Prefix=key, ContinuationToken=next_token
            )

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.RM_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=self.name,
    )
def get_object(self, key, serialization_strategy=None):
    """Download the S3 object at ``key`` and deserialize it.

    The body is handed to the strategy as a ``BytesIO`` when its
    ``read_mode`` is ``'rb'``; otherwise it is decoded with the strategy's
    ``encoding`` and wrapped in a ``StringIO``.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')
    # A strategy is mandatory here despite the None default.
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    # FIXME we need better error handling for object store
    binary_mode = serialization_strategy.read_mode == 'rb'
    payload = self.s3.get_object(Bucket=self.bucket, Key=key)['Body'].read()
    if binary_mode:
        stream = BytesIO(payload)
    else:
        stream = StringIO(payload.decode(serialization_strategy.encoding))

    obj = serialization_strategy.deserialize(stream)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def set_intermediate_object(self, dagster_type, step_output_handle, value, version=None):
    """Store ``value`` as the intermediate for ``step_output_handle``.

    The key is ``self.root`` joined with the paths derived from the handle;
    the dagster type's serialization strategy is used for the write.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record for the stored value,
        tagged with ``version``.
    """
    check.inst_param(dagster_type, "dagster_type", DagsterType)
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)

    path_parts = self._get_paths(step_output_handle)
    check.param_invariant(len(path_parts) > 0, "paths")

    store = self.object_store
    strategy = dagster_type.serialization_strategy
    target_key = store.key_for_paths([self.root] + path_parts)
    uri = store.set_object(target_key, value, serialization_strategy=strategy)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=uri,
        dest_key=None,
        obj=value,
        serialization_strategy_name=strategy.name,
        object_store_name=store.name,
        version=version,
    )
def set_object(self, key, obj, serialization_strategy=None):
    """Serialize ``obj`` and upload it to the S3 bucket under ``key``.

    An existing object at ``key`` is removed first. Text-mode strategies
    (``write_mode == "w"`` on Python 3) are serialized into a text buffer and
    encoded as UTF-8 before upload; all other strategies write bytes directly.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record keyed by the object's uri.
    """
    check.str_param(key, "key")

    logging.info("Writing S3 object at: " + self.uri_for_key(key))

    # cannot check obj since could be arbitrary Python object
    check.inst_param(
        serialization_strategy, "serialization_strategy", SerializationStrategy
    )  # cannot be none here

    if self.has_object(key):
        logging.warning("Removing existing S3 key: {key}".format(key=key))
        self.rm_object(key)

    with BytesIO() as bytes_io:
        if serialization_strategy.write_mode == "w" and sys.version_info >= (3, 0):
            # Use the context-managed buffer itself. The previous code rebound
            # the name to a second StringIO inside the ``with``, so the buffer
            # actually written to was never closed.
            with StringIO() as string_io:
                serialization_strategy.serialize(obj, string_io)
                bytes_io.write(string_io.getvalue().encode("utf-8"))
        else:
            serialization_strategy.serialize(obj, bytes_io)

        # Rewind so the upload reads from the start of the payload.
        bytes_io.seek(0)
        self.s3.put_object(Bucket=self.bucket, Key=key, Body=bytes_io)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def _set_intermediates(step_context, step_output, step_output_handle, output, version):
    """Persist a step output and yield the events describing the write.

    When the step context reports that an asset store is in use for this
    output handle, the value is written through ``_set_addressable_asset`` and
    each ``AssetStoreOperation`` / ``AssetMaterialization`` it produces is
    turned into an event. Otherwise the value goes to intermediate storage.
    """
    if not step_context.using_asset_store(step_output_handle):
        stored = step_context.intermediate_storage.set_intermediate(
            context=step_context,
            dagster_type=step_output.dagster_type,
            step_output_handle=step_output_handle,
            value=output.value,
            version=version,
        )
        if isinstance(stored, ObjectStoreOperation):
            serializable_op = ObjectStoreOperation.serializable(
                stored, value_name=output.output_name
            )
            yield DagsterEvent.object_store_operation(step_context, serializable_op)
        return

    produced = _set_addressable_asset(step_context, step_output_handle, output.value)
    for item in produced:
        if isinstance(item, AssetStoreOperation):
            yield DagsterEvent.asset_store_operation(step_context, item)
        if isinstance(item, AssetMaterialization):
            yield DagsterEvent.step_materialization(step_context, item)
def set_object(self, key, obj, serialization_strategy=None):
    """Serialize ``obj`` into a bytes buffer and upload it to GCS under ``key``.

    An existing object at ``key`` is removed first.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record keyed by the object's uri.
    """
    check.str_param(key, 'key')

    logging.info('Writing GCS object at: ' + self.uri_for_key(key))

    # obj is left unchecked because it may be any Python object;
    # the strategy, however, must be supplied.
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    if self.has_object(key):
        logging.warning('Removing existing GCS key: {key}'.format(key=key))
        self.rm_object(key)

    with BytesIO() as payload:
        serialization_strategy.serialize(obj, payload)
        # Rewind so the upload starts at the first byte.
        payload.seek(0)
        target_blob = self.bucket_obj.blob(key)
        target_blob.upload_from_file(payload)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def get_object(self, key, serialization_strategy=None):
    """Download the GCS blob at ``key`` and deserialize it.

    Strategies with ``read_mode == 'rb'`` receive the raw bytes; any other
    strategy receives the content decoded with its ``encoding``.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')
    # A strategy is mandatory here despite the None default.
    check.inst_param(serialization_strategy, 'serialization_strategy', SerializationStrategy)

    binary_mode = serialization_strategy.read_mode == 'rb'
    blob = self.bucket_obj.blob(key)

    if binary_mode:
        file_obj = BytesIO()
        blob.download_to_file(file_obj)
    else:
        text = blob.download_as_string().decode(serialization_strategy.encoding)
        file_obj = StringIO(text)

    # Rewind before handing the buffer to the strategy.
    file_obj.seek(0)
    obj = serialization_strategy.deserialize(file_obj)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def _set_objects(step_context, step_output, step_output_handle, output):
    """Hand a step output to its output manager and yield the resulting events.

    If the manager returns an ``ObjectStoreOperation``, a single object-store
    event is yielded. Otherwise the returned materializations are converted to
    events, followed by one ``SET_ASSET`` asset-store event.
    """
    from dagster.core.storage.asset_store import AssetStoreHandle

    output_def = step_output.output_def
    manager = step_context.get_output_manager(step_output_handle)
    manager_context = step_context.get_output_context(step_output_handle)
    handled = manager.handle_output(manager_context, output.value)

    # TODO yuhan retire ObjectStoreOperation https://github.com/dagster-io/dagster/issues/3043
    if isinstance(handled, ObjectStoreOperation):
        serializable_op = ObjectStoreOperation.serializable(
            handled, value_name=step_output_handle.output_name
        )
        yield DagsterEvent.object_store_operation(step_context, serializable_op)
        return

    for event in _materializations_to_events(step_context, step_output_handle, handled):
        yield event

    # SET_ASSET operation by AssetStore
    set_asset_op = AssetStoreOperation(
        AssetStoreOperationType.SET_ASSET,
        step_output_handle,
        AssetStoreHandle(output_def.manager_key, output_def.metadata),
    )
    yield DagsterEvent.asset_store_operation(step_context, set_asset_op)
def _set_intermediates(step_context, step_output, step_output_handle, output):
    """Write a step output to intermediate storage and yield its event.

    An object-store event is yielded only when the storage returns an
    ``ObjectStoreOperation``; any other result produces no events.
    """
    storage = step_context.intermediate_storage
    outcome = storage.set_intermediate(
        context=step_context,
        dagster_type=step_output.dagster_type,
        step_output_handle=step_output_handle,
        value=output.value,
    )

    if not isinstance(outcome, ObjectStoreOperation):
        return

    serializable_op = ObjectStoreOperation.serializable(outcome, value_name=output.output_name)
    yield DagsterEvent.object_store_operation(step_context, serializable_op)
def cp_object(self, src, dst):
    """Copy the GCS blob at key ``src`` to key ``dst`` within the same bucket.

    Returns:
        ObjectStoreOperation: a ``CP_OBJECT`` record with the source and
        destination uris.
    """
    check.str_param(src, 'src')
    check.str_param(dst, 'dst')

    bucket = self.bucket_obj
    # Source and destination bucket are the same object.
    bucket.copy_blob(bucket.blob(src), bucket, dst)

    source_uri = self.uri_for_key(src)
    destination_uri = self.uri_for_key(dst)
    return ObjectStoreOperation(
        op=ObjectStoreOperationType.CP_OBJECT,
        key=source_uri,
        dest_key=destination_uri,
        object_store_name=self.name,
    )
def cp_object(self, src, dst):
    """Copy the S3 object at key ``src`` to key ``dst`` within the same bucket.

    Returns:
        ObjectStoreOperation: a ``CP_OBJECT`` record with the source and
        destination uris.
    """
    check.str_param(src, 'src')
    check.str_param(dst, 'dst')

    copy_source = {'Bucket': self.bucket, 'Key': src}
    self.s3.copy_object(Bucket=self.bucket, Key=dst, CopySource=copy_source)

    source_uri = self.uri_for_key(src)
    destination_uri = self.uri_for_key(dst)
    return ObjectStoreOperation(
        op=ObjectStoreOperationType.CP_OBJECT,
        key=source_uri,
        dest_key=destination_uri,
        object_store_name=self.name,
    )
def rm_object(self, key):
    """Delete the GCS blob at ``key`` if it exists.

    A missing blob is not an error; the operation record is returned either way.

    Returns:
        ObjectStoreOperation: an ``RM_OBJECT`` record keyed by the uri of ``key``.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')

    blob_present = self.bucket_obj.blob(key).exists()
    if blob_present:
        self.bucket_obj.blob(key).delete()

    removed_uri = self.uri_for_key(key)
    return ObjectStoreOperation(
        op=ObjectStoreOperationType.RM_OBJECT,
        key=removed_uri,
        dest_key=None,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=self.name,
    )
def get_object(self, key, serialization_strategy=DEFAULT_SERIALIZATION_STRATEGY):
    """Deserialize and return the object stored in the file at path ``key``.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object, keyed by the path that was read.
    """
    check.str_param(key, "key")
    check.param_invariant(len(key) > 0, "key")
    check.inst_param(serialization_strategy, "serialization_strategy", SerializationStrategy)

    loaded = serialization_strategy.deserialize_from_file(key)
    strategy_name = serialization_strategy.name

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=key,
        dest_key=None,
        obj=loaded,
        serialization_strategy_name=strategy_name,
        object_store_name=self.name,
    )
def rm_object(self, key):
    """Delete ``key`` through the file system client.

    Returns:
        ObjectStoreOperation: an ``RM_OBJECT`` record keyed by the uri of ``key``.
    """
    check.str_param(key, "key")
    check.param_invariant(len(key) > 0, "key")

    # delete_file already operates recursively, so one call is enough.
    client = self.file_system_client
    client.delete_file(key)

    removed_uri = self.uri_for_key(key)
    return ObjectStoreOperation(
        op=ObjectStoreOperationType.RM_OBJECT,
        key=removed_uri,
        dest_key=None,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=self.name,
    )
def get_object(self, key, serialization_strategy=None):
    """Download the S3 object at ``key`` and deserialize it from its raw bytes.

    Args:
        key: non-empty S3 key to read.
        serialization_strategy: strategy used to deserialize the body. The
            default of ``None`` is only a placeholder; a
            ``SerializationStrategy`` instance is required.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')
    # Validate up front so a missing strategy is reported as a parameter error
    # rather than as an AttributeError on ``None.deserialize``.
    check.inst_param(
        serialization_strategy, 'serialization_strategy', SerializationStrategy
    )  # cannot be none here

    # FIXME we need better error handling for object store
    body = self.s3.get_object(Bucket=self.bucket, Key=key)['Body'].read()
    obj = serialization_strategy.deserialize(BytesIO(body))

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def copy_intermediate_from_run(self, context, run_id, step_output_handle):
    """Copy an intermediate from run ``run_id`` into this storage's root.

    Source and destination share the paths derived from
    ``step_output_handle``; only the root differs.

    Returns:
        ObjectStoreOperation: a ``CP_OBJECT`` record with the source and
        destination uris reported by the object store.
    """
    check.opt_inst_param(context, "context", SystemExecutionContext)
    check.str_param(run_id, "run_id")
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)

    store = self.object_store
    path_parts = self._get_paths(step_output_handle)

    source_key = store.key_for_paths([self.root_for_run_id(run_id)] + path_parts)
    target_key = store.key_for_paths([self.root] + path_parts)

    src_uri, dst_uri = store.cp_object(source_key, target_key)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.CP_OBJECT,
        key=src_uri,
        dest_key=dst_uri,
        object_store_name=store.name,
    )
def rm_intermediate(self, context, step_output_handle):
    """Remove the stored intermediate for ``step_output_handle``.

    Returns:
        ObjectStoreOperation: an ``RM_OBJECT`` record whose key is the value
        returned by the object store's ``rm_object``.
    """
    check.opt_inst_param(context, "context", SystemExecutionContext)
    check.inst_param(step_output_handle, "step_output_handle", StepOutputHandle)

    path_parts = self._get_paths(step_output_handle)
    check.param_invariant(len(path_parts) > 0, "paths")

    store = self.object_store
    target_key = store.key_for_paths([self.root] + path_parts)
    removed_uri = store.rm_object(target_key)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.RM_OBJECT,
        key=removed_uri,
        dest_key=None,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=store.name,
    )
def get_object(self, key, serialization_strategy=None):
    """Download the GCS blob at ``key`` and deserialize it from its raw bytes.

    Args:
        key: non-empty blob name to read.
        serialization_strategy: strategy used to deserialize the content. The
            default of ``None`` is only a placeholder; a
            ``SerializationStrategy`` instance is required.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')
    # Validate before downloading: otherwise a missing strategy would only
    # surface as an AttributeError after the blob had already been fetched.
    check.inst_param(
        serialization_strategy, 'serialization_strategy', SerializationStrategy
    )  # cannot be none here

    file_obj = BytesIO()
    self.bucket_obj.blob(key).download_to_file(file_obj)
    # Rewind so the strategy reads from the first byte.
    file_obj.seek(0)

    obj = serialization_strategy.deserialize(file_obj)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def copy_required_intermediates_for_execution(pipeline_context, execution_plan):
    """Copy reusable intermediates from the parent run and yield their events.

    Output handles that appear both in the current execution plan and in the
    parent run's event logs are copied into the current run's intermediate
    storage, unless the storage already has them. One object-store event is
    yielded per copy. Nothing happens when the run has no parent run id.
    """
    check.inst_param(pipeline_context, "pipeline_context", SystemExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    parent_run_id = pipeline_context.pipeline_run.parent_run_id
    if not parent_run_id:
        return

    parent_run_logs = pipeline_context.instance.all_logs(parent_run_id)

    current_handles = output_handles_from_execution_plan(execution_plan)
    previous_handles = output_handles_from_event_logs(parent_run_logs)
    shared_handles = current_handles.intersection(previous_handles)

    # Group the handles to copy by the step that produced them.
    handles_by_step = defaultdict(list)
    for shared in shared_handles:
        handles_by_step[shared.step_key].append(shared)

    storage = pipeline_context.intermediate_storage
    for step in execution_plan.get_all_steps_in_topo_order():
        step_handles = handles_by_step.get(step.key, [])

        # Skip steps with nothing to copy before building a step context, so
        # that no context is ever requested for an UnresolvedExecutionStep.
        if not step_handles:
            continue

        step_context = pipeline_context.for_step(step)
        for handle in step_handles:
            if not storage.has_intermediate(pipeline_context, handle):
                copy_op = storage.copy_intermediate_from_run(
                    pipeline_context, parent_run_id, handle
                )
                serializable_op = ObjectStoreOperation.serializable(
                    copy_op, value_name=handle.output_name
                )
                yield DagsterEvent.object_store_operation(step_context, serializable_op)
def rm_object(self, key):
    """Remove the file or directory tree at path ``key`` if the store has it.

    Files are unlinked and directories are removed recursively. When
    ``has_object(key)`` is false nothing is touched.

    Returns:
        ObjectStoreOperation: an ``RM_OBJECT`` record keyed by ``key``.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')

    present = self.has_object(key)
    if present and os.path.isfile(key):
        os.unlink(key)
    elif present and os.path.isdir(key):
        shutil.rmtree(key)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.RM_OBJECT,
        key=key,
        dest_key=None,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=self.name,
    )
def _set_objects(step_context, step_output, step_output_handle, output):
    """Hand a step output to its output manager and yield the resulting events.

    ``handle_output`` runs inside a user-code error boundary configured with
    ``DagsterExecutionHandleOutputError``; ``Failure`` and ``RetryRequested``
    are passed to the boundary as control-flow exceptions.

    If the manager returns an ``ObjectStoreOperation``, a single object-store
    event is yielded. Otherwise the returned materializations are converted to
    events, followed by one serializable ``SET_ASSET`` asset-store event.
    """
    from dagster.core.storage.asset_store import AssetStoreHandle

    output_def = step_output.output_def
    output_manager = step_context.get_output_manager(step_output_handle)
    output_context = step_context.get_output_context(step_output_handle)

    def _describe_failure():
        # Built lazily: the boundary takes this as its ``msg_fn`` callback.
        # (Fixes the duplicated word in the original "during the the handling".)
        return (
            "Error occurred during the handling of step output:"
            f' step key: "{step_context.step.key}"'
            f' output name: "{output_context.name}"'
        )

    with user_code_error_boundary(
        DagsterExecutionHandleOutputError,
        control_flow_exceptions=[Failure, RetryRequested],
        msg_fn=_describe_failure,
        step_key=step_context.step.key,
        output_name=output_context.name,
    ):
        materializations = output_manager.handle_output(output_context, output.value)

    # TODO yuhan retire ObjectStoreOperation https://github.com/dagster-io/dagster/issues/3043
    if isinstance(materializations, ObjectStoreOperation):
        yield DagsterEvent.object_store_operation(
            step_context,
            ObjectStoreOperation.serializable(
                materializations, value_name=step_output_handle.output_name
            ),
        )
    else:
        yield from _materializations_to_events(
            step_context, step_output_handle, materializations
        )

        # SET_ASSET operation by AssetStore
        yield DagsterEvent.asset_store_operation(
            step_context,
            AssetStoreOperation.serializable(
                AssetStoreOperation(
                    AssetStoreOperationType.SET_ASSET,
                    step_output_handle,
                    AssetStoreHandle(output_def.manager_key, output_def.metadata),
                )
            ),
        )
def get_object(self, key, serialization_strategy=DEFAULT_SERIALIZATION_STRATEGY):
    """Read the object stored in the file at path ``key``.

    Args:
        key: non-empty filesystem path to read.
        serialization_strategy: strategy used to deserialize the file. When a
            falsy value (e.g. ``None``) is passed, the file's raw bytes are
            returned instead.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the loaded
        object. ``serialization_strategy_name`` is ``None`` when no strategy
        was used.
    """
    check.str_param(key, 'key')
    check.param_invariant(len(key) > 0, 'key')

    if serialization_strategy:
        obj = serialization_strategy.deserialize_from_file(key)
        strategy_name = serialization_strategy.name
    else:
        with open(key, 'rb') as f:
            obj = f.read()
        # No strategy was used. The previous code dereferenced
        # ``serialization_strategy.name`` here and raised AttributeError,
        # which made this raw-bytes branch unusable.
        strategy_name = None

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=key,
        dest_key=None,
        obj=obj,
        serialization_strategy_name=strategy_name,
        object_store_name=self.name,
    )
def get_object(self, key, serialization_strategy=None):
    """Download the ADLS2 file at ``key`` and deserialize it.

    The content is handed to the strategy as a ``BytesIO`` when its
    ``read_mode`` is ``"rb"``; otherwise it is decoded with the strategy's
    ``encoding`` and wrapped in a ``StringIO``.

    Returns:
        ObjectStoreOperation: a ``GET_OBJECT`` record carrying the
        deserialized object.
    """
    check.str_param(key, "key")
    check.param_invariant(len(key) > 0, "key")
    # A strategy is mandatory here despite the None default.
    check.inst_param(serialization_strategy, "serialization_strategy", SerializationStrategy)

    # FIXME we need better error handling for object store
    file_client = self.file_system_client.get_file_client(key)
    download = file_client.download_file()

    binary_mode = serialization_strategy.read_mode == "rb"
    content = download.readall()
    if binary_mode:
        buffer = BytesIO(content)
    else:
        buffer = StringIO(content.decode(serialization_strategy.encoding))

    obj = serialization_strategy.deserialize(buffer)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.GET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )
def cp_object(self, src, dst):
    """Copy the file or directory tree at ``src`` to the new path ``dst``.

    ``dst`` must not exist yet; its parent directory is created if needed.
    A ``src`` that is neither a file nor a directory is reported through
    ``check.failed``.

    Returns:
        ObjectStoreOperation: a ``CP_OBJECT`` record with ``src`` as key and
        ``dst`` as destination key.
    """
    destination_taken = os.path.exists(dst)
    check.invariant(not destination_taken, "Path already exists {}".format(dst))

    # Make sure the directory that will contain dst exists
    mkdir_p(os.path.dirname(dst))

    if os.path.isdir(src):
        shutil.copytree(src, dst)
    elif os.path.isfile(src):
        shutil.copy(src, dst)
    else:
        check.failed("should not get here")

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.CP_OBJECT,
        key=src,
        dest_key=dst,
        obj=None,
        serialization_strategy_name=None,
        object_store_name=self.name,
    )
def set_object(self, key, obj, serialization_strategy=None):
    """Serialize ``obj`` and upload it to ADLS2 under ``key``.

    An existing object at ``key`` is removed first. The file is created, a
    lease of ``self.lease_duration`` is acquired on it, and the serialized
    payload is uploaded under that lease. Text-mode strategies
    (``write_mode == "w"`` on Python 3) are serialized into a text buffer and
    encoded as UTF-8; all other strategies write bytes directly.

    Returns:
        ObjectStoreOperation: a ``SET_OBJECT`` record keyed by the object's uri.
    """
    check.str_param(key, "key")

    logging.info("Writing ADLS2 object at: " + self.uri_for_key(key))

    # cannot check obj since could be arbitrary Python object
    check.inst_param(
        serialization_strategy, "serialization_strategy", SerializationStrategy
    )  # cannot be none here

    if self.has_object(key):
        logging.warning("Removing existing ADLS2 key: {key}".format(key=key))
        self.rm_object(key)

    file_client = self.file_system_client.create_file(key)
    with file_client.acquire_lease(self.lease_duration) as lease:
        with BytesIO() as bytes_io:
            if serialization_strategy.write_mode == "w" and sys.version_info >= (3, 0):
                # Use the context-managed buffer itself. The previous code
                # rebound the name to a second StringIO inside the ``with``,
                # so the buffer actually written to was never closed.
                with StringIO() as string_io:
                    serialization_strategy.serialize(obj, string_io)
                    bytes_io.write(string_io.getvalue().encode("utf-8"))
            else:
                serialization_strategy.serialize(obj, bytes_io)

            # Rewind so the upload reads from the start of the payload.
            bytes_io.seek(0)
            file_client.upload_data(bytes_io, lease=lease, overwrite=True)

    return ObjectStoreOperation(
        op=ObjectStoreOperationType.SET_OBJECT,
        key=self.uri_for_key(key),
        dest_key=None,
        obj=obj,
        serialization_strategy_name=serialization_strategy.name,
        object_store_name=self.name,
    )