def __init__(self, file_):
    """Wrap *file_* (a File product) and set up lazy status caches."""
    # caches for the outdated checks; None means "not computed yet"
    self._is_outdated_status_local = None
    self._is_outdated_status_remote = None
    self._exists = None
    # product whose remote copy is being checked
    self._local_file = file_
    # metadata values are loaded on demand through this wrapper
    self._metadata = Metadata(self)
def test_update_with_non_string_keys():
    """Metadata.update must accept params dicts with non-string keys."""
    metadata = Metadata(FakeProduct(identifier='fake-product'))

    metadata.update('new code', params={1: 1})

    assert metadata.params == {1: 1}
def test_delete():
    """Metadata.delete must call the product's delete_metadata exactly once."""
    product = Mock(wraps=FakeProduct(identifier='fake-product'))
    metadata = Metadata(product)

    # nothing should have been deleted yet
    assert product.delete_metadata.call_count == 0

    metadata.delete()

    assert product.delete_metadata.call_count == 1
def test_update():
    """Metadata.update must store both the source code and the params."""
    product = FakeProduct(identifier='fake-product')
    metadata = Metadata(product)

    metadata.update('new code', params={'a': 1})

    # check code was updated
    assert metadata.stored_source_code == 'new code'
    assert metadata.params == {'a': 1}
def test_file(tmp_directory):
    """Metadata saved through a File product must survive a reload."""
    Path('file').touch()
    product = File('file')

    writer = Metadata(product)
    writer.update('some_source_code', {'a': 1})

    # a fresh Metadata instance must read back what was persisted
    reader = Metadata(product)
    assert reader.stored_source_code == 'some_source_code'
    assert reader.timestamp
    assert reader.params == {'a': 1}
def test_update_with_resource(tmp_directory):
    """resources_ entries must be replaced by the file content's MD5 hash."""
    Path('file.txt').touch()
    metadata = Metadata(FakeProduct(identifier='fake-product'))

    metadata.update('new code', params={'resources_': {'file': 'file.txt'}})

    assert metadata.stored_source_code == 'new code'
    # d41d8cd9... is the MD5 digest of an empty file
    assert metadata.params == {
        'resources_': {
            'file': 'd41d8cd98f00b204e9800998ecf8427e'
        }
    }
def test_warns_on_corruped_metadata(tmp_directory):
    """A fetch_metadata that raises must warn and yield empty metadata."""
    # NOTE(review): "corruped" in the test name is a typo for "corrupted";
    # left unchanged so any external references to the test keep working
    Path('file').touch()
    product = File('file')
    product.fetch_metadata = Mock(side_effect=ValueError)

    metadata = Metadata(product)

    with pytest.warns(UserWarning) as record:
        metadata._get()

    assert len(record) == 1
    assert 'corrupted metadata, ignoring' in record[0].message.args[0]
    # all fields fall back to empty values
    assert metadata.stored_source_code is None
    assert metadata.timestamp is None
    assert metadata.params is None
def test_warns_on_unserializable_params(tmp_directory):
    """Unserializable params must trigger a warning and be stored as None."""
    Path('file').touch()
    product = File('file')
    metadata = Metadata(product)

    with pytest.warns(UserWarning) as record:
        metadata.update('some_source_code', {'a': object()})

    # reload from disk to check what was actually persisted
    reloaded = Metadata(product)

    assert len(record) == 1
    assert 'are not serializable' in record[0].message.args[0]
    assert reloaded.stored_source_code == 'some_source_code'
    assert reloaded.timestamp
    assert reloaded.params is None
def test_warns_on_unserializable_params(tmp_directory, num_warnings, params):
    """Each unserializable entry warns; serializable entries are kept."""
    # NOTE(review): this re-uses the name of another test in this file; the
    # later definition shadows the earlier one at import time — confirm the
    # intended name (a distinguishing suffix is likely missing)
    params.update(final_metadata='this')
    Path('file').touch()
    metadata = Metadata(File('file'))

    with pytest.warns(UserWarning) as records:
        metadata.update('some_source_code', params)

    assert len(records) == num_warnings
    assert all('contains an unserializable object' in record.message.args[0]
               for record in records)
    # serializable entries survive, unserializable ones are dropped
    assert metadata.stored_source_code == 'some_source_code'
    assert metadata.params == {'final_metadata': 'this'}
def test_clear():
    """clear() drops the in-memory copy, forcing a re-fetch on next access."""
    product = Mock(wraps=FakeProduct(identifier='fake-product'))
    # fetch_metadata is skipped for non-existing products, so force "exists"
    product.exists.return_value = True
    product.fetch_metadata.return_value = dict(timestamp=None,
                                               stored_source_code=None)
    metadata = Metadata(product)

    # first attribute access triggers one fetch
    metadata.timestamp
    assert product.fetch_metadata.call_count == 1

    # drop the in-memory copy
    metadata.clear()

    # the next access must fetch again
    metadata.timestamp
    assert product.fetch_metadata.call_count == 2
def test_cache_flags_are_cleared_up(method, kwargs):
    """Mutating Metadata methods must reset the product's cached flags."""
    product = FakeProduct(identifier='fake-product')
    # pre-populate the caches with sentinel values
    product._outdated_data_dependencies_status = 1
    product._outdated_code_dependency_status = 1
    metadata = Metadata(product)

    getattr(metadata, method)(**kwargs)

    # check cache flags were cleared up
    assert product._outdated_data_dependencies_status is None
    assert product._outdated_code_dependency_status is None
def __init__(self, identifier):
    """Initialize the product from *identifier* and set up status caches.

    Raises
    ------
    TypeError
        If the subclass hook ``_init_identifier`` returns None.
    """
    # subclasses implement _init_identifier; a None result is a bug there
    self._identifier = self._init_identifier(identifier)

    if self._identifier is None:
        raise TypeError('_init_identifier must return a value, returned '
                        'None')

    # set by the owning task when the product is assigned to it
    self.task = None
    self.logger = logging.getLogger('{}.{}'.format(__name__,
                                                   type(self).__name__))

    # cached status flags; None means "not computed yet"
    self._outdated_data_dependencies_status = None
    self._outdated_code_dependency_status = None
    self._is_outdated_status = None

    # not all products have clients, but they should still have a client
    # property to keep the API consistent
    self._client = None

    self.metadata = Metadata(self)
    self.prepare_metadata = _prepare_metadata
class _RemoteFile:
    """
    A product-like object to check status using remote metadata. Since it
    partially conforms to the Product API, it can use the same Metadata
    implementation (like File). This is used to determine whether a task
    should be executed or downloaded from remote storage.

    Parameters
    ----------
    file_ : ploomber.products.File
        Product to check status

    Notes
    -----
    Must be used in a context manager
    """
    def __init__(self, file_):
        self._local_file = file_
        self._metadata = Metadata(self)
        # caches for the outdated checks; None means "not computed yet"
        self._is_outdated_status_local = None
        self._is_outdated_status_remote = None
        self._exists = None

    def _fetch_remote_metadata(self):
        # download the remote metadata once, load it, then delete the copy
        if self.exists() and not self._metadata._did_fetch:
            self._local_file.client.download(
                self._local_file._path_to_metadata,
                destination=self._path_to_metadata)

            # load from values from file
            self._metadata._fetch()

            try:
                self._path_to_metadata.unlink()
            except FileNotFoundError:
                pass

    def exists(self):
        """
        Checks if remote File exists. This is used by Metadata to determine
        whether to use the existing remote metadata (if any) or ignore it:
        if this returns False, remote metadata is ignored even if it exists
        """
        if self._exists is None:
            # TODO remove checking if file exists and just make the API
            # call directly
            self._exists = (self._local_file.client is not None
                            and self._local_file.client._remote_exists(
                                self._local_file._path_to_metadata)
                            and self._local_file.client._remote_exists(
                                self._local_file._path_to_file))

        return self._exists

    def fetch_metadata(self):
        # Product API hook used by Metadata to load stored values
        return _fetch_metadata_from_file_product(self,
                                                 check_file_exists=False)

    @property
    def metadata(self):
        # metadata is fetched lazily, on first access
        self._fetch_remote_metadata()
        return self._metadata

    @property
    def _path_to_metadata(self):
        """
        Path to download remote metadata
        """
        name = f'.{self._local_file._path_to_file.name}.metadata.remote'
        return self._local_file._path_to_file.with_name(name)

    def _reset_cached_outdated_status(self):
        # FIX: clear the attributes that _is_outdated actually caches.
        # The previous implementation set self._is_outdated_status, an
        # attribute this class never reads, so the cached local/remote
        # flags were never invalidated
        self._is_outdated_status_local = None
        self._is_outdated_status_remote = None

    def _is_equal_to_local_copy(self):
        """
        Check if local metadata is the same as the remote copy
        """
        return self._local_file.metadata == self.metadata

    # TODO: _is_outdated, _outdated_code_dependency and
    # _outdated_data_dependencies are very similar to the implementations
    # in Product, check what we can abstract to avoid repetition

    def _is_outdated(self, with_respect_to_local, outdated_by_code=True):
        """
        Determines outdated status using remote metadata, to decide
        whether to download the remote file or not

        with_respect_to_local : bool
            If True, determines status by comparing timestamps with upstream
            local metadata, otherwise it uses upstream remote metadata
        """
        if with_respect_to_local:
            if self._is_outdated_status_local is None:
                self._is_outdated_status_local = self._check_is_outdated(
                    with_respect_to_local, outdated_by_code)
            return self._is_outdated_status_local
        else:
            if self._is_outdated_status_remote is None:
                self._is_outdated_status_remote = self._check_is_outdated(
                    with_respect_to_local, outdated_by_code)
            return self._is_outdated_status_remote

    def _check_is_outdated(self, with_respect_to_local, outdated_by_code):
        # outdated if data dependencies changed or (optionally) code changed
        outdated_data = self._outdated_data_dependencies(
            with_respect_to_local)
        outdated_code = (outdated_by_code
                         and self._outdated_code_dependency())
        return outdated_data or outdated_code

    def _outdated_code_dependency(self):
        """
        Determine if the source code has changed by looking at the remote
        metadata
        """
        outdated, _ = self._local_file.task.dag.differ.is_different(
            a=self.metadata.stored_source_code,
            b=str(self._local_file.task.source),
            a_params=self.metadata.params,
            b_params=self._local_file.task.params.to_json_serializable(
                params_only=True),
            extension=self._local_file.task.source.extension)

        return outdated

    def _outdated_data_dependencies(self, with_respect_to_local):
        """
        Determine if the product is outdated by checking upstream timestamps
        """
        upstream_outdated = [
            self._is_outdated_due_to_upstream(up, with_respect_to_local)
            for up in self._local_file.task.upstream.values()
        ]

        # special case: if all upstream dependencies are waiting for download
        # or up-to-date, mark this as up-to-date
        if set(upstream_outdated) <= {TaskStatus.WaitingDownload, False}:
            return False

        return any(upstream_outdated)

    def __del__(self):
        # best-effort cleanup of the downloaded metadata copy
        if self._path_to_metadata.exists():
            self._path_to_metadata.unlink()

    def _is_outdated_due_to_upstream(self, upstream, with_respect_to_local):
        """
        A task becomes data outdated if an upstream product has a higher
        timestamp or if an upstream product is outdated
        """
        if (upstream.exec_status == TaskStatus.WaitingDownload
                or not with_respect_to_local):
            # TODO: delete ._remote will never be None
            if upstream.product._remote:
                upstream_timestamp = (
                    upstream.product._remote.metadata.timestamp)
            else:
                upstream_timestamp = None
        else:
            upstream_timestamp = upstream.product.metadata.timestamp

        # missing timestamps on either side means we cannot compare: assume
        # outdated
        if (self.metadata.timestamp is None or upstream_timestamp is None):
            return True
        else:
            more_recent_upstream = (
                upstream_timestamp > self.metadata.timestamp)

            if with_respect_to_local:
                outdated_upstream_prod = upstream.product._is_outdated()
            else:
                outdated_upstream_prod = (
                    upstream.product._is_remote_outdated(True))

            return more_recent_upstream or outdated_upstream_prod

    def __repr__(self):
        return f'{type(self).__name__}({self._local_file!r})'