Exemple #1
0
    def __init__(self, file_):
        """
        Keep a reference to ``file_`` and build a Metadata object bound to
        this instance.

        Parameters
        ----------
        file_
            Object stored as ``self._local_file``
        """
        self._local_file = file_
        # Metadata receives this instance (not file_)
        self._metadata = Metadata(self)

        # all three flags start out as None
        self._exists = None
        self._is_outdated_status_remote = None
        self._is_outdated_status_local = None
Exemple #2
0
def test_update_with_non_string_keys():
    """Params whose keys are integers are readable back after update."""
    product = FakeProduct(identifier='fake-product')
    meta = Metadata(product)

    # the key is an int on purpose
    meta.update('new code', params={1: 1})

    expected = {1: 1}
    assert meta.params == expected
Exemple #3
0
def test_delete():
    """Metadata.delete results in exactly one delete_metadata call."""
    product = Mock(wraps=FakeProduct(identifier='fake-product'))
    meta = Metadata(product)

    # building the Metadata object must not delete anything by itself
    assert product.delete_metadata.call_count == 0

    meta.delete()

    assert product.delete_metadata.call_count == 1
Exemple #4
0
def test_update():
    """After update, both the source code and the params are exposed."""
    product = FakeProduct(identifier='fake-product')
    meta = Metadata(product)

    meta.update('new code', params={'a': 1})

    # the new code and the new params must both be visible
    expected_params = {'a': 1}
    assert meta.stored_source_code == 'new code'
    assert meta.params == expected_params
Exemple #5
0
def test_file(tmp_directory):
    """A second Metadata on the same File sees what the first one stored."""
    Path('file').touch()
    file_product = File('file')

    writer = Metadata(file_product)
    writer.update('some_source_code', {'a': 1})

    # a brand new Metadata object for the same product
    reader = Metadata(file_product)

    assert reader.stored_source_code == 'some_source_code'
    assert reader.timestamp
    assert reader.params == {'a': 1}
Exemple #6
0
def test_update_with_resource(tmp_directory):
    """
    A path under the 'resources_' key is replaced by a hash string in the
    stored params.
    """
    Path('file.txt').touch()

    product = FakeProduct(identifier='fake-product')
    meta = Metadata(product)

    resources = {'file': 'file.txt'}
    meta.update('new code', params={'resources_': resources})

    # value expected in place of the path to the (empty) file
    expected_hash = 'd41d8cd98f00b204e9800998ecf8427e'

    assert meta.stored_source_code == 'new code'
    assert meta.params == {'resources_': {'file': expected_hash}}
Exemple #7
0
def test_warns_on_corruped_metadata(tmp_directory):
    """
    If fetching metadata raises ValueError, a single UserWarning is emitted
    and every metadata field reads as None.
    """
    Path('file').touch()
    product = File('file')
    # make every fetch attempt fail
    product.fetch_metadata = Mock(side_effect=ValueError)

    meta = Metadata(product)

    with pytest.warns(UserWarning) as caught:
        meta._get()

    assert len(caught) == 1
    first_message = caught[0].message.args[0]
    assert 'corrupted metadata, ignoring' in first_message

    assert meta.stored_source_code is None
    assert meta.timestamp is None
    assert meta.params is None
Exemple #8
0
def test_warns_on_unserializable_params(tmp_directory):
    """
    Updating with a param that is a plain ``object()`` emits one UserWarning;
    a fresh Metadata then reads the source code and a timestamp, but params
    is None.
    """
    Path('file').touch()
    product = File('file')

    writer = Metadata(product)
    bad_params = {'a': object()}

    with pytest.warns(UserWarning) as caught:
        writer.update('some_source_code', bad_params)

    # new object for the same product
    reader = Metadata(product)

    assert len(caught) == 1
    first_message = caught[0].message.args[0]
    assert 'are not serializable' in first_message

    assert reader.stored_source_code == 'some_source_code'
    assert reader.timestamp
    assert reader.params is None
Exemple #9
0
def test_warns_on_unserializable_params(tmp_directory, num_warnings, params):
    """
    Each warning emitted by update mentions an unserializable object, and
    only the 'final_metadata' entry remains in the params afterwards.
    """
    # add one extra entry to the incoming params (in place)
    params.update(final_metadata='this')

    Path('file').touch()
    product = File('file')
    meta = Metadata(product)

    with pytest.warns(UserWarning) as caught:
        meta.update('some_source_code', params)

    assert len(caught) == num_warnings

    expected_fragment = 'contains an unserializable object'
    assert all(expected_fragment in warning.message.args[0]
               for warning in caught)

    assert meta.stored_source_code == 'some_source_code'
    assert meta.params == {'final_metadata': 'this'}
Exemple #10
0
def test_clear():
    """Metadata.clear forces a new fetch on the next attribute access."""
    product = Mock(wraps=FakeProduct(identifier='fake-product'))
    # we need this because if it doesn't exist, fetch_metadata is skipped
    product.exists.return_value = True
    product.fetch_metadata.return_value = {
        'timestamp': None,
        'stored_source_code': None,
    }
    meta = Metadata(product)

    # first access: one fetch call expected
    _ = meta.timestamp
    assert product.fetch_metadata.call_count == 1

    # drop the in-memory copy...
    meta.clear()
    # ...so the next access has to fetch again
    _ = meta.timestamp
    assert product.fetch_metadata.call_count == 2
Exemple #11
0
def test_cache_flags_are_cleared_up(method, kwargs):
    """
    Calling the Metadata method named by ``method`` resets the product's
    cached outdated flags to None.
    """
    product = FakeProduct(identifier='fake-product')
    # set both flags to a non-None value first
    product._outdated_code_dependency_status = 1
    product._outdated_data_dependencies_status = 1

    meta = Metadata(product)
    bound_method = getattr(meta, method)
    bound_method(**kwargs)

    # both flags must be None again
    assert product._outdated_data_dependencies_status is None
    assert product._outdated_code_dependency_status is None
Exemple #12
0
    def __init__(self, identifier):
        """
        Initialize the identifier, logger, cached status flags and metadata.

        Parameters
        ----------
        identifier
            Value passed to ``self._init_identifier``; its result is stored
            as ``self._identifier``

        Raises
        ------
        TypeError
            If ``_init_identifier`` returns None
        """
        self._identifier = self._init_identifier(identifier)

        if self._identifier is None:
            message = '_init_identifier must return a value, returned None'
            raise TypeError(message)

        self.task = None

        # logger named after the module and the concrete class
        logger_name = f'{__name__}.{type(self).__name__}'
        self.logger = logging.getLogger(logger_name)

        # cached status flags, all start as None
        self._outdated_data_dependencies_status = None
        self._outdated_code_dependency_status = None
        self._is_outdated_status = None

        # not all products have clients, but they should still have a client
        # property to keep the API consistent
        self._client = None

        self.metadata = Metadata(self)
        self.prepare_metadata = _prepare_metadata
Exemple #13
0
class _RemoteFile:
    """
    A product-like object to check status using remote metadata. Since it
    partially conforms to the Product API, it can use the same Metadata
    implementation (like File). This is used to determine whether a
    task should be executed or downloaded from remote storage.

    Parameters
    ----------
    file_ : ploomber.products.File
        Product to check status

    Notes
    -----
    Must be used in a context manager

    NOTE(review): this class does not define ``__enter__``/``__exit__``;
    confirm how the context manager requirement is meant to be fulfilled.
    """
    def __init__(self, file_):
        self._local_file = file_
        self._metadata = Metadata(self)

        # cached results of _is_outdated, one per comparison mode
        self._is_outdated_status_local = None
        self._is_outdated_status_remote = None
        # cached result of exists()
        self._exists = None

    def _fetch_remote_metadata(self):
        """
        Download the remote metadata file and load it into ``self._metadata``
        (only if the remote copy exists and it has not been fetched yet).
        The downloaded file is always removed afterwards, even if the
        download or the load fails.
        """
        if self.exists() and not self._metadata._did_fetch:
            try:
                self._local_file.client.download(
                    self._local_file._path_to_metadata,
                    destination=self._path_to_metadata)

                # load values from the downloaded file
                self._metadata._fetch()
            finally:
                # the local copy is only needed while loading
                try:
                    self._path_to_metadata.unlink()
                except FileNotFoundError:
                    pass

    def exists(self):
        """
        Checks if remote File exists. This is used by Metadata to determine
        whether to use the existing remote metadata (if any) or ignore it: if
        this returns False, remote metadata is ignored even if it exists
        """
        if self._exists is None:
            # TODO remove checking if file exists and just make the API
            # call directly
            self._exists = (self._local_file.client is not None
                            and self._local_file.client._remote_exists(
                                self._local_file._path_to_metadata)
                            and self._local_file.client._remote_exists(
                                self._local_file._path_to_file))

        return self._exists

    def fetch_metadata(self):
        """
        Load metadata through _fetch_metadata_from_file_product, passing
        check_file_exists=False
        """
        return _fetch_metadata_from_file_product(self, check_file_exists=False)

    @property
    def metadata(self):
        """
        Metadata object for the remote copy; triggers the remote fetch
        before returning it
        """
        self._fetch_remote_metadata()
        return self._metadata

    @property
    def _path_to_metadata(self):
        """
        Path to download remote metadata
        """
        name = f'.{self._local_file._path_to_file.name}.metadata.remote'
        return self._local_file._path_to_file.with_name(name)

    def _reset_cached_outdated_status(self):
        """
        Discard cached outdated status so the next _is_outdated call
        computes it again
        """
        # these two are the caches that _is_outdated actually reads
        self._is_outdated_status_local = None
        self._is_outdated_status_remote = None
        # kept for backwards compatibility: earlier versions only reset
        # this attribute
        self._is_outdated_status = None

    def _is_equal_to_local_copy(self):
        """
        Check if local metadata is the same as the remote copy
        """
        return self._local_file.metadata == self.metadata

    # TODO: _is_outdated, _outdated_code_dependency and
    # _outdated_data_dependencies are very similar to the implementations
    # in Product, check what we can abstract to avoid repetition

    def _is_outdated(self, with_respect_to_local, outdated_by_code=True):
        """
        Determines outdated status using remote metadata, to decide
        whether to download the remote file or not

        with_respect_to_local : bool
            If True, determines status by comparing timestamps with upstream
            local metadata, otherwise it uses upstream remote metadata

        outdated_by_code : bool
            If False, source code differences are not taken into account

        Notes
        -----
        The result is cached per value of ``with_respect_to_local``; the
        cache is not keyed by ``outdated_by_code``
        """
        if with_respect_to_local:
            if self._is_outdated_status_local is None:
                self._is_outdated_status_local = self._check_is_outdated(
                    with_respect_to_local, outdated_by_code)
            return self._is_outdated_status_local
        else:
            if self._is_outdated_status_remote is None:
                self._is_outdated_status_remote = self._check_is_outdated(
                    with_respect_to_local, outdated_by_code)
            return self._is_outdated_status_remote

    def _check_is_outdated(self, with_respect_to_local, outdated_by_code):
        """
        Compute (without caching) the outdated status: outdated by data or,
        if ``outdated_by_code`` is true, outdated by code
        """
        outdated_data = self._outdated_data_dependencies(with_respect_to_local)
        outdated_code = (outdated_by_code and self._outdated_code_dependency())
        return outdated_data or outdated_code

    def _outdated_code_dependency(self):
        """
        Determine if the source code has changed by looking at the remote
        metadata
        """
        outdated, _ = self._local_file.task.dag.differ.is_different(
            a=self.metadata.stored_source_code,
            b=str(self._local_file.task.source),
            a_params=self.metadata.params,
            b_params=self._local_file.task.params.to_json_serializable(
                params_only=True),
            extension=self._local_file.task.source.extension)

        return outdated

    def _outdated_data_dependencies(self, with_respect_to_local):
        """
        Determine if the product is outdated by checking upstream timestamps
        """
        upstream_outdated = [
            self._is_outdated_due_to_upstream(up, with_respect_to_local)
            for up in self._local_file.task.upstream.values()
        ]

        # special case: if all upstream dependencies are waiting for download
        # or up-to-date, mark this as up-to-date
        if set(upstream_outdated) <= {TaskStatus.WaitingDownload, False}:
            return False

        return any(upstream_outdated)

    def __del__(self):
        """
        Best-effort removal of the downloaded metadata file
        """
        # __init__ may not have finished (or never ran), in which case there
        # is no path to compute and nothing to clean up
        if getattr(self, '_local_file', None) is None:
            return

        # try to delete and ignore a missing file, instead of checking
        # existence first (the file may disappear between check and delete)
        try:
            self._path_to_metadata.unlink()
        except FileNotFoundError:
            pass

    def _is_outdated_due_to_upstream(self, upstream, with_respect_to_local):
        """
        A task becomes data outdated if an upstream product has a higher
        timestamp or if an upstream product is outdated
        """
        if (upstream.exec_status == TaskStatus.WaitingDownload
                or not with_respect_to_local):
            # TODO: delete ._remote will never be None
            if upstream.product._remote:
                upstream_timestamp = (
                    upstream.product._remote.metadata.timestamp)
            else:
                upstream_timestamp = None
        else:
            upstream_timestamp = upstream.product.metadata.timestamp

        # a missing timestamp on either side means we cannot compare, so
        # this is treated as outdated
        if (self.metadata.timestamp is None or upstream_timestamp is None):
            return True
        else:
            more_recent_upstream = upstream_timestamp > self.metadata.timestamp

            if with_respect_to_local:
                outdated_upstream_prod = upstream.product._is_outdated()
            else:
                outdated_upstream_prod = upstream.product._is_remote_outdated(
                    True)

            return more_recent_upstream or outdated_upstream_prod

    def __repr__(self):
        return f'{type(self).__name__}({self._local_file!r})'