Beispiel #1
0
def test_all_same():
    """Exercise ``all_same`` on plain lists and on lazily produced items."""
    # Lists: identical items pass, while mixed types and empty input do not
    ok_(all_same([0, 0, 0]))
    ok_(not all_same([0, 0, '0']))
    ok_(not all_same([]))

    def never_get_to_not_needed():
        # The third item differs from the first two, so the consumer is
        # expected to stop before the generator reaches the raise below
        for item in ('a', 'a', 'b'):
            yield item
        raise ValueError("Should not get here since on b should return")

    ok_(not all_same(never_get_to_not_needed()))

    def gen1(n):
        # Produce 'a' exactly n times
        for _ in range(n):
            yield 'a'

    # An exhausted (empty) generator is not considered "all same"
    ok_(not all_same(gen1(0)))
    for count in (1, 2, 10):
        ok_(all_same(gen1(count)))
Beispiel #2
0
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across datasets

    Compares files by content where content is available locally, or by the
    checksum recorded in the annex key where content is not present.

    Parameters
    ----------
    relpath: str
        path within datasets
    *dss: Datasets
        datasets in which the file at `relpath` is compared

    Returns
    -------
    bool
      True if identical, False if not

    Raises
    ------
    RuntimeError
      If it cannot be decided without obtaining content: the files lacking
      content use different git-annex backends, or their backend is not
      among `Digester.DEFAULT_DIGESTS`.
    """
    from datalad.utils import md5sum, unique
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- exist in all and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # Find the problematic ones, which are annexed and have no content
    # locally, and collect their keys and backends.
    # keys and presents must stay aligned with paths (they are zipped
    # together below), so every dataset contributes exactly one entry.
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            # Without a key the file is not annexed: keep key=None and
            # present=True so it is compared by its content.
            # For now the rest (e.g. not tracked) remains an error
            if 'key' in annexprops:
                key = annexprops['key']
                if not annexprops['has_content']:
                    present = False
                    backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    # so some files are missing content!
    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least some
        # of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, b) in zip(paths, presents) if not b)
        )
    backend = backends[0].lower()
    # The name was lower-cased above, so the "E" variants of a backend
    # (e.g. SHA256E) end with a lower-case 'e'. Strip it to get the name of
    # the underlying digest.
    if backend.endswith('e'):
        backend = backend[:-1]

    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    # The reference checksum comes from the keys of the files which have no
    # content -- those are the only ones whose content cannot be digested.
    missing_checksums = {
        split_ext(key).split('--', 1)[1]
        for present, key in zip(presents, keys)
        if not present
    }
    if len(missing_checksums) > 1:
        # Different checksum (with the same backend)
        return False
    elif not missing_checksums:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    # A set cannot be indexed -- unpack its single element
    (thechecksum,) = missing_checksums

    # Content of every present file must digest to the checksum extracted
    # from the keys. If no file is present at all, all keys agreeing on the
    # same checksum (established above) means the files are identical.
    digester = Digester([backend])
    for present, path in zip(presents, paths):
        if present and digester(path)[backend] != thechecksum:
            return False
    return True
Beispiel #3
0
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across datasets

    Compares files by content where content is available locally, or by the
    checksum recorded in the annex key where content is not present.

    Parameters
    ----------
    relpath: str
        path within datasets
    *dss: Datasets
        datasets in which the file at `relpath` is compared

    Returns
    -------
    bool
      True if identical, False if not

    Raises
    ------
    RuntimeError
      If it cannot be decided without obtaining content: the files lacking
      content use different git-annex backends, or their backend is not
      among `Digester.DEFAULT_DIGESTS`.
    ValueError
      If an annex repository reports an empty key for the file.
    """
    from datalad.utils import md5sum, unique
    from datalad.support.exceptions import FileInGitError
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- exist in all and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # Find the problematic ones, which are annexed and have no content
    # locally, and collect their keys and backends.
    # keys and presents must stay aligned with paths (they are zipped
    # together below), so every dataset contributes exactly one entry.
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            try:
                key = repo.get_file_key(relpath)
            except FileInGitError:
                # Presumably the file is tracked by git rather than annex:
                # keep key=None and present=True so it is compared by its
                # content.
                pass
            else:
                if not key:
                    raise ValueError(
                        "Must have got a key, unexpectedly got %r for %s within %s"
                        % (key, relpath, ds)
                    )
                # For now the rest (e.g. not tracked) remains an error
                if not repo.file_has_content(relpath):
                    present = False
                    backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    # so some files are missing content!
    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least some
        # of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, b) in zip(paths, presents) if not b)
        )
    backend = backends[0].lower()
    # The name was lower-cased above, so the "E" variants of a backend
    # (e.g. SHA256E) end with a lower-case 'e'. Strip it to get the name of
    # the underlying digest.
    if backend.endswith('e'):
        backend = backend[:-1]

    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    # The reference checksum comes from the keys of the files which have no
    # content -- those are the only ones whose content cannot be digested.
    missing_checksums = {
        split_ext(key).split('--', 1)[1]
        for present, key in zip(presents, keys)
        if not present
    }
    if len(missing_checksums) > 1:
        # Different checksum (with the same backend)
        return False
    elif not missing_checksums:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    # A set cannot be indexed -- unpack its single element
    (thechecksum,) = missing_checksums

    # Content of every present file must digest to the checksum extracted
    # from the keys. If no file is present at all, all keys agreeing on the
    # same checksum (established above) means the files are identical.
    digester = Digester([backend])
    for present, path in zip(presents, paths):
        if present and digester(path)[backend] != thechecksum:
            return False
    return True