def setUpClass(cls):
    """Create random test images (flat and nested) plus two JSON hash-map fixtures.

    Seeds numpy's RNG once so the generated images are reproducible; the
    order of createRandomTestImages calls therefore matters and is preserved.
    """
    np.random.seed(0)
    cls._imagePaths = createRandomTestImages(cls._testBasePath, 10, 10, 0, 5)
    # Simple map covers only the top-level images: MD5 hash -> file path.
    cls._simpleFilesMap1 = {}
    for imgPath in cls._imagePaths:
        cls._simpleFilesMap1[dff.calculateMD5Hash(imgPath)] = imgPath
    cls._simpleFilesMap1File = os.path.join(cls._testBasePath, 'filesmap1.json')
    with open(cls._simpleFilesMap1File, 'w') as fp:
        json.dump(cls._simpleFilesMap1, fp)
    # Populate nested sub-folders with more images, accumulating all paths.
    nestedFolders = (
        ('folder1',),
        ('folder2',),
        ('folder1', 'folder3'),
        ('folder2', 'folder4'),
    )
    for parts in nestedFolders:
        folder = os.path.join(cls._testBasePath, *parts)
        cls._imagePaths.extend(createRandomTestImages(folder, 10, 10, 0, 5))
    # Complex map covers every image created above.
    cls._complexFilesMap1 = {
        dff.calculateMD5Hash(imgPath): imgPath for imgPath in cls._imagePaths
    }
    cls._complexFilesMap1File = os.path.join(cls._testBasePath,
                                             'complexFilesMap.json')
    with open(cls._complexFilesMap1File, 'w') as fp:
        json.dump(cls._complexFilesMap1, fp)
def test_calculateMD5HashWorks(self):
    """MD5 digests of the reference test images match known-good values."""
    expectedDigests = [
        "5b4f89add29ae4ad253ce90b24ca132c",  # small image
        "7c4143dee5870f2dc5aebc7be1a42e32",  # big image
    ]
    for img, digest in zip(self._testImgs, expectedDigests):
        self.assertEqual(dff.calculateMD5Hash(img), digest)
def test_checkForDuplicatesWorksWithNoDuplicateFiles(self):
    """Scanning folders of unseen files adds them all and records no duplicates."""
    known = {dff.calculateMD5Hash(p): dff.File(p) for p in self._imagePaths}
    scanFolders = [
        os.path.join(self._testBasePath, 'folder1', 'folder3'),
        os.path.join(self._testBasePath, 'folder2', 'folder4'),
    ]
    inputFiles = dff.buildInputFilesList(scanFolders, known)
    # Work on a deep copy so the known map itself stays untouched.
    resultMap = copy.deepcopy(known)
    dff.checkForDuplicates(inputFiles, resultMap)
    # Every new file plus every previously known file ends up in the map.
    expectedCount = len(self._imagePaths3) + len(self._imagePaths4) + len(known)
    self.assertEqual(expectedCount, len(resultMap))
    knownOriginals = [entry._original for entry in known.values()]
    self.assertEqual(
        sorted(self._imagePaths3 + self._imagePaths4 + knownOriginals),
        sorted([entry._original for entry in resultMap.values()]))
    for entry in resultMap.values():
        cameFromKnown = entry._original in knownOriginals
        cameFromScan = (entry._original in self._imagePaths3
                        or entry._original in self._imagePaths4)
        self.assertTrue(cameFromKnown or cameFromScan)
        # No file content repeats, so no entry may carry duplicates.
        self.assertFalse(entry._duplicates)
def test_checkForDuplicatesWorksWithDuplicateFilesInDifferentFolders(self):
    """A full scan groups repeated content under its original across folders."""
    filesMap = {}
    inputFiles = dff.buildInputFilesList([self._testBasePath], {})
    dff.checkForDuplicates(inputFiles, filesMap)
    # 40 unique contents exist in the whole tree.
    self.assertEqual(len(filesMap), 40)
    originalHashes = [dff.calculateMD5Hash(p) for p in self._originals]
    # Sanity: each designated original has a distinct content hash.
    self.assertEqual(len(set(originalHashes)), len(originalHashes))
    self.assertEqual(sorted(originalHashes), sorted(list(filesMap.keys())))
    self.assertEqual(sorted([entry._original for entry in filesMap.values()]),
                     sorted(self._originals))
    # Collect every recorded duplicate and check the expected total of 45.
    duplicateFiles = []
    for entry in filesMap.values():
        duplicateFiles.extend(entry._duplicates)
    self.assertEqual(len(duplicateFiles), 45)
    for dup in duplicateFiles:
        self.assertNotIn(dup, self._originals)
def setUpClass(cls):
    """Generate image fixtures in the base folder and four nested folders.

    numpy's RNG is seeded once, so the creation calls below must run in
    this exact order to reproduce the same images every run.
    """
    np.random.seed(0)
    cls._imagePaths = createRandomTestImages(cls._testBasePath, 10, 10, 0, 5)
    # (attribute name, sub-folder parts, last two createRandomTestImages
    # args) — arguments preserved verbatim from the original fixture setup.
    folderSpecs = [
        ('_imagePaths1', ('folder1',), 5, 5),
        ('_imagePaths2', ('folder2',), 10, 5),
        ('_imagePaths3', ('folder1', 'folder3'), 15, 5),
        ('_imagePaths4', ('folder2', 'folder4'), 20, 10),
    ]
    for attr, parts, startArg, countArg in folderSpecs:
        folder = os.path.join(cls._testBasePath, *parts)
        setattr(cls, attr,
                createRandomTestImages(folder, 10, 10, startArg, countArg))
    # Known-files map covers only the top-level images.
    cls._knownFilesMap = {
        dff.calculateMD5Hash(p): dff.File(p) for p in cls._imagePaths
    }
def test_calculateMD5HashWorksOnEmptyFiles(self):
    """An empty file hashes to the well-known MD5 of the empty byte string."""
    emptyMD5 = "d41d8cd98f00b204e9800998ecf8427e"  # md5(b'')
    self.assertEqual(dff.calculateMD5Hash(self._emptyTestImg), emptyMD5)