def publish_input_data(self, expected_input_id: str,
                       metadata: InputMetadata,
                       input_data_stream: BinaryIO) -> None:
    """Store the streamed input data under its content hash.

    The stream is copied into a temporary file inside ``self.temp_data_dir``
    while its SHA-256 digest is computed. If the digest equals
    ``expected_input_id`` the temporary file is renamed to
    ``self.input_file(expected_input_id)``; otherwise the temporary file is
    removed and the upload is rejected.

    If the destination file already exists, the stream is closed without
    being read and nothing else happens.

    :param expected_input_id: hex SHA-256 digest the caller claims the data
        has
    :param metadata: metadata to associate with the input ID; it is only
        stored when ``metadata.has_all_args()`` is true
    :param input_data_stream: binary stream to read the input data from
    :raises IncorrectInputIDException: if the digest of the data read does
        not match ``expected_input_id``
    """
    input_file_path = self.input_file(expected_input_id)
    if os.path.exists(input_file_path):
        input_data_stream.close()
        return

    file_hash = hashlib.sha256()
    fd, temp_file_path = tempfile.mkstemp(dir=self.temp_data_dir)
    try:
        with os.fdopen(fd, 'wb') as f:
            bytes_read = 0
            while True:
                data = input_data_stream.read(READ_BUFFER_SIZE)
                bytes_read += len(data)
                log.debug(f'{bytes_read} bytes of input read')
                if not data:
                    break
                f.write(data)
                file_hash.update(data)

        input_id = file_hash.hexdigest()
        if input_id != expected_input_id:
            raise IncorrectInputIDException()
        os.rename(temp_file_path, input_file_path)
    except Exception:
        # Best-effort cleanup: a failure to remove the temporary file must
        # never replace the error that brought us here
        try:
            os.remove(temp_file_path)
        except OSError:
            log.debug(f'Could not remove temporary file {temp_file_path}')
        raise

    # Deliberately outside the guarded region above: once the rename has
    # happened the temporary file is gone, so attempting to remove it on a
    # failure here would only raise FileNotFoundError and hide the real error
    if metadata.has_all_args():
        self._store_input_id(metadata, input_id)
def _has_input(self, input_id: str) -> bool:
    """Ask the controller whether it has the data for ``input_id``.

    The metadata sent along with the query is built from this object's
    ``user``, ``project``, ``path`` and ``timestamp_millis`` attributes.

    :param input_id: ID of the input to look for
    :return: whatever ``self.controller.check_input_data`` answers
    """
    metadata_fields = dict(
        user=self.user,
        project=self.project,
        path=self.path,
        timestamp_millis=self.timestamp_millis,
    )
    metadata = InputMetadata.of(**metadata_fields)
    return self.controller.check_input_data(input_id, metadata)
def put_input(self, input_id: str, input_metadata: InputMetadata,
              input_data_stream: BinaryIO) -> None:
    """Validate the metadata and publish the given input data.

    The data is read from ``input_data_stream`` (the stream handed in by the
    caller), so this method does not depend on being executed inside a web
    request.

    :param input_id: ID the caller expects the data to have
    :param input_metadata: metadata describing the input; must satisfy
        ``has_all_args_or_none()``
    :param input_data_stream: binary stream with the input data
    :raises BadInputMetadataException: if ``input_metadata`` does not
        satisfy ``has_all_args_or_none()``
    """
    if not input_metadata.has_all_args_or_none():
        raise BadInputMetadataException(input_metadata.__dict__)
    # Forward the stream we were given. Building an HTTP response is the job
    # of the web layer, so nothing is returned (as the signature declares)
    self.input_data_configuration.publish_input_data(
        input_id, input_metadata, input_data_stream)
def get_input_id_or_none(self, input_metadata: InputMetadata) \
        -> Optional[str]:
    """Look up the input ID associated with the given metadata.

    :param input_metadata: metadata to look up; must satisfy
        ``has_all_args_or_none()``
    :return: the result of
        ``input_data_configuration.get_input_id_from_metadata_or_none``
    :raises BadInputMetadataException: if ``input_metadata`` does not
        satisfy ``has_all_args_or_none()``
    """
    metadata_is_valid = input_metadata.has_all_args_or_none()
    if not metadata_is_valid:
        raise BadInputMetadataException(input_metadata.__dict__)
    configuration = self.input_data_configuration
    return configuration.get_input_id_from_metadata_or_none(input_metadata)
def _get_input_metadata_from_request() -> InputMetadata:
    """Build an ``InputMetadata`` from the current request's query string.

    Each of ``user``, ``project``, ``path`` and ``timestamp_millis`` is read
    as a string from ``request.args``; a parameter that is absent leaves the
    corresponding attribute set to ``None``.
    """
    metadata: InputMetadata = InputMetadata()
    for field_name in ('user', 'project', 'path', 'timestamp_millis'):
        value = request.args.get(field_name, default=None, type=str)
        setattr(metadata, field_name, value)
    return metadata
def _put_tarball(self, input_id: str) -> None:
    """Send the tarball to the controller under ``input_id``.

    The tarball is rewound first so that it is read from its beginning. The
    accompanying metadata is built from this object's ``user``, ``project``,
    ``path`` and ``timestamp_millis`` attributes.

    :param input_id: ID to publish the tarball under
    """
    self.tarball.seek(0)
    metadata_fields = dict(
        user=self.user,
        project=self.project,
        path=self.path,
        timestamp_millis=self.timestamp_millis,
    )
    metadata = InputMetadata.of(**metadata_fields)
    self.controller.put_input(
        input_id=input_id,
        input_metadata=metadata,
        input_data_stream=self.tarball,
    )
def check_input_data(self, input_id: str, metadata: InputMetadata) -> bool:
    """Tell whether the data for ``input_id`` is already stored.

    When the file exists and the metadata is complete, the association
    between the metadata and the input ID is (re)stored as a side effect.

    :param input_id: ID of the input to look for
    :param metadata: metadata the caller has for this input
    :return: ``True`` if the input file exists, ``False`` otherwise
    """
    if not self._input_file_exists(input_id):
        return False
    if metadata.has_all_args():
        # Files can get a new timestamp while keeping the same hash (a
        # `touch` is enough). Without recording the new metadata here, the
        # timestamp lookup would keep missing and the client would rebuild
        # the tarball every single time. This has been seen in practice.
        self._store_input_id(metadata, input_id)
    return True
def get_input_id_from_metadata_or_none(self, metadata: InputMetadata) \
        -> Optional[str]:
    """Return the input ID recorded for ``metadata``, if still usable.

    The ID is read from the ``_INPUT_ID_KEY`` hash in Redis, using
    ``metadata.redis_field()`` as the field.

    :param metadata: metadata to look up
    :return: the stored input ID, or ``None`` when nothing is stored for
        this metadata or the corresponding input file does not exist
    """
    raw_input_id = self.redis.hget(_INPUT_ID_KEY, metadata.redis_field())
    if not raw_input_id:
        return None
    cached_input_id = str(raw_input_id, 'utf-8')
    # An entry may outlive the file it points to; treat that as a miss so
    # the cache stays resilient
    file_is_present = self._input_file_exists(cached_input_id)
    return cached_input_id if file_is_present else None
def __enter__(self):
    """Make sure there is either an input ID or a tarball to upload.

    In order:

    * if ``self.input_id`` is already set, nothing is done;
    * otherwise the controller is asked for an input ID matching this
      input's metadata, and if it has one, that ID is adopted;
    * otherwise every file under ``self.path`` is packed into a bz2
      tarball kept in ``self.tarball``.

    :return: ``self``
    :raises ValueError: if neither an input ID nor a path is set
    """
    # The controller already holds the data for a known input ID, so there
    # is no context to set up
    if self.input_id:
        return self
    if self.path is None:
        raise ValueError(
            'For input data, neither path nor input id were given')

    input_metadata = InputMetadata.of(
        user=self.user,
        project=self.project,
        path=self.path,
        timestamp_millis=self.timestamp_millis,
    )
    # Building the tarball is what we want to avoid: if the controller
    # already has an input recorded for this metadata (timestamp included),
    # reuse it
    known_input_id = self.controller.get_input_id_or_none(input_metadata)
    log_debug(f'Input ID from the controller: {known_input_id}')
    if known_input_id:
        log_info('Input files not changed according to modification times')
        self.input_id = known_input_id
        return self

    log_debug('Building the tarball!')
    walker = os.walk(self.path)
    self.tarball = tempfile.NamedTemporaryFile()
    with tarfile.open(self.tarball.name, mode='w:bz2') as tar:
        for directory, _, file_names in walker:
            for file_name in file_names:
                file_path = os.path.join(directory, file_name)
                # Entries carry only a name relative to the input path and
                # a size; no other file attribute is copied into the tar
                member = tarfile.TarInfo(
                    name=os.path.relpath(file_path, self.path))
                member.size = os.stat(file_path).st_size
                with open(file_path, 'rb') as f:
                    tar.addfile(member, fileobj=f)
    return self
def _store_input_id(self, metadata: InputMetadata, input_id: str) -> None:
    """Record in Redis that ``metadata`` corresponds to ``input_id``.

    The entry goes into the ``_INPUT_ID_KEY`` hash under the field given by
    ``metadata.redis_field()``. The value is then read back through
    ``get_input_id_from_metadata_or_none`` and written to the debug log.

    :param metadata: metadata to use as the lookup key
    :param input_id: input ID to associate with the metadata
    """
    redis_field = metadata.redis_field()
    self.redis.hset(_INPUT_ID_KEY, redis_field, input_id)
    # Read the entry back so the log shows what a later lookup would return
    stored_id = self.get_input_id_from_metadata_or_none(metadata)
    log.debug(redis_field + ': ' + str(stored_id))
def check_input_data(self, input_id: str,
                     input_metadata: InputMetadata) -> bool:
    """Validate the metadata and check whether the input data is stored.

    :param input_id: ID of the input to look for
    :param input_metadata: metadata describing the input; must satisfy
        ``has_all_args_or_none()``
    :return: the result of ``input_data_configuration.check_input_data``
    :raises BadInputMetadataException: if ``input_metadata`` does not
        satisfy ``has_all_args_or_none()``
    """
    metadata_is_valid = input_metadata.has_all_args_or_none()
    if not metadata_is_valid:
        raise BadInputMetadataException(input_metadata.__dict__)
    configuration = self.input_data_configuration
    return configuration.check_input_data(input_id, input_metadata)