def test_all_same():
    """Exercise all_same() on plain sequences and on lazy generators."""
    # Value identity matters, not mere truthiness equivalence
    ok_(all_same([0, 0, 0]))
    ok_(not all_same([0, 0, '0']))
    ok_(not all_same([]))

    def short_circuiting_gen():
        # all_same must stop consuming as soon as it sees a mismatch,
        # so the trailing raise is never reached
        yield 'a'
        yield 'a'
        yield 'b'
        raise ValueError("Should not get here since on b should return")

    ok_(not all_same(short_circuiting_gen()))

    def constant_gen(n):
        # n copies of the same value
        for _ in range(n):
            yield 'a'

    ok_(not all_same(constant_gen(0)))
    ok_(all_same(constant_gen(1)))
    ok_(all_same(constant_gen(2)))
    ok_(all_same(constant_gen(10)))
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    relpath: str
      path within datasets
    *dss: Datasets

    Returns
    -------
    bool or None
      True if identical, False if not, None if cannot be decided
      (e.g. different git-annex backend used)
    """
    from datalad.utils import md5sum, unique
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- exist in both and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find problematic ones which are annexed and
    # have no content locally, and take their keys
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            annexprops = repo.get_file_annexinfo(
                relpath, eval_availability=True)
            if 'key' not in annexprops:
                # not annexed (e.g. tracked straight in git) -- nothing to record
                continue
            key = annexprops['key']
            # For now the rest (e.g. not tracked) remains an error
            if not annexprops['has_content']:
                present = False
                backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least some
        # of those files to do the check!...
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, present) in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    # E-backends (e.g. MD5E) embed the file extension into the key; strip the
    # trailing 'e' to obtain the bare digest name.
    # BUG FIX: was .endswith('E') on an already-lowercased string (never true)
    # and backend[':-1'] (str index with a str -> TypeError) instead of a slice
    if backend.endswith('e'):
        backend = backend[:-1]
    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    # Annex keys look like BACKEND-s<size>--<checksum>[.<ext>]; drop any
    # extension (E-backends) and take everything past the first '--'.
    # BUG FIX: split_ext returns a (root, ext) pair, so the checksum must be
    # taken from its first element, not from the pair itself
    checksums = [
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    # Checksums recorded in the keys of the files whose content is missing;
    # those are the only ones we cannot hash directly.
    # BUG FIX: was `if present`, which ignored the missing files entirely and
    # could inject None (git-tracked files have no key) into the set
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if not present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    # BUG FIX: sets do not support indexing; thechecksum[0] raised TypeError
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    # NOTE(review): nothing present anywhere yet all keys agree -- arguably
    # True (same key => same content); preserving the original False here
    return False
def _the_same_across_datasets(relpath, *dss):
    """Check if the file (present content or not) is identical across datasets

    Compares files by content if under git, or by checksum if under annex

    Parameters
    ----------
    relpath: str
      path within datasets
    *dss: Datasets

    Returns
    -------
    bool or None
      True if identical, False if not, None if cannot be decided
      (e.g. different git-annex backend used)
    """
    # NOTE(review): this redefines _the_same_across_datasets and shadows the
    # get_file_annexinfo-based variant defined above -- likely a merge
    # leftover; confirm which implementation should remain
    from datalad.utils import md5sum, unique
    from datalad.support.exceptions import FileInGitError
    from datalad.support.digests import Digester

    paths = [op.join(ds.path, relpath) for ds in dss]
    # The simplest check first -- exist in both and content is the same.
    # Even if content is just a symlink file on windows, the same content
    # condition would be correct
    if all(map(op.exists, paths)) and all_same(map(md5sum, paths)):
        return True

    # We first need to find problematic ones which are annexed and
    # have no content locally, and take their keys
    keys = []
    backends = []
    presents = []
    for ds in dss:
        repo = ds.repo
        key = None
        present = True
        if isinstance(repo, AnnexRepo):
            try:
                key = repo.get_file_key(relpath)
            except FileInGitError:
                # tracked straight in git -- nothing annex-specific to record
                continue
            if not key:
                raise ValueError(
                    "Must have got a key, unexpectedly got %r for %s within %s"
                    % (key, relpath, ds)
                )
            # For now the rest (e.g. not tracked) remains an error
            if not repo.file_has_content(relpath):
                present = False
                backends.append(repo.get_key_backend(key))
        keys.append(key)
        presents.append(present)

    if all(presents):
        return all_same(map(md5sum, paths))

    backends = unique(backends)
    assert backends, "Since not all present - some must be under annex, and thus must have a backend!"
    # so some files are missing!
    assert not all(presents)
    NeedContentError = RuntimeError
    if len(backends) > 1:
        # TODO: or signal otherwise somehow that we just need to get at least some
        # of those files to do the check!...
        # BUG FIX: typo "conent" -> "content" in the user-facing message
        raise NeedContentError(
            "Following paths are missing content and have different annex "
            "backends: %s. Cannot determine here if the same or not!"
            % ", ".join(p for (p, present) in zip(paths, presents) if not present)
        )
    backend = backends[0].lower()
    # E-backends (e.g. MD5E) embed the file extension into the key; strip the
    # trailing 'e' to obtain the bare digest name.
    # BUG FIX: was .endswith('E') on an already-lowercased string (never true)
    # and backend[':-1'] (str index with a str -> TypeError) instead of a slice
    if backend.endswith('e'):
        backend = backend[:-1]
    if backend not in Digester.DEFAULT_DIGESTS:
        raise NeedContentError(
            "Do not know how to figure out content check for backend %s" % backend
        )

    # Annex keys look like BACKEND-s<size>--<checksum>[.<ext>]; drop any
    # extension (E-backends) and take everything past the first '--'.
    # BUG FIX: split_ext returns a (root, ext) pair, so the checksum must be
    # taken from its first element, not from the pair itself
    checksums = [
        split_ext(key)[0].split('--', 1)[1] if key else key
        for key in keys
    ]
    # Checksums recorded in the keys of the files whose content is missing;
    # those are the only ones we cannot hash directly.
    # BUG FIX: was `if present`, which ignored the missing files entirely and
    # could inject None (git-tracked files have no key) into the set
    thechecksum = set(
        checksum
        for present, checksum in zip(presents, checksums)
        if not present
    )
    if len(thechecksum) > 1:
        # Different checksum (with the same backend)
        return False
    elif not thechecksum:
        raise RuntimeError("We must have had at least one key since prior logic"
                           " showed that not all files have content here")
    # BUG FIX: sets do not support indexing; thechecksum[0] raised TypeError
    thechecksum = thechecksum.pop()
    if any(presents):
        # We do need to extract checksum from the key and check the present
        # files' content to match
        digester = Digester([backend])
        for present, path in zip(presents, paths):
            if present and digester(path)[backend] != thechecksum:
                return False
        return True
    # NOTE(review): nothing present anywhere yet all keys agree -- arguably
    # True (same key => same content); preserving the original False here
    return False