def test_moveTheDuplicatesWithDuplicatesInSameFoldersWorks(self):
    """moveTheDuplicates relocates same-folder duplicates and keeps originals."""
    scan_roots = [
        os.path.join(self._testBasePath, 'folder3'),
        os.path.join(self._testBasePath, 'folder4'),
    ]
    files_map = {}
    dff.checkForDuplicates(dff.buildInputFilesList(scan_roots, {}), files_map)
    self.assertEqual(len(files_map), 10)

    pre_move_map = copy.deepcopy(files_map)
    dff.moveTheDuplicates(files_map, '/tmp/duplicates')

    # Originals stay put; every duplicate now lives under the target folder.
    for entry in files_map.values():
        self.assertTrue(os.path.exists(entry._original))
        for dup in entry._duplicates:
            self.assertTrue(os.path.exists(dup))
            self.assertTrue(dup.startswith('/tmp/duplicates/'))
    # The pre-move duplicate locations are all gone.
    for entry in pre_move_map.values():
        for dup in entry._duplicates:
            self.assertFalse(os.path.exists(dup))
def test_checkForDuplicatesWorksWithNoDuplicateFiles(self):
    """With only unique new files, every scanned file gets its own map entry."""
    known = {dff.calculateMD5Hash(p): dff.File(p) for p in self._imagePaths}
    scan_list = dff.buildInputFilesList(
        [
            os.path.join(self._testBasePath, 'folder1', 'folder3'),
            os.path.join(self._testBasePath, 'folder2', 'folder4'),
        ],
        known,
    )
    result_map = copy.deepcopy(known)
    dff.checkForDuplicates(scan_list, result_map)

    expected_count = len(self._imagePaths3) + len(self._imagePaths4) + len(known)
    self.assertEqual(expected_count, len(result_map))

    expected_originals = sorted(
        self._imagePaths3
        + self._imagePaths4
        + [entry._original for entry in known.values()])
    self.assertEqual(expected_originals,
                     sorted(entry._original for entry in result_map.values()))

    previously_known = [entry._original for entry in known.values()]
    for entry in result_map.values():
        self.assertTrue(entry._original in previously_known
                        or entry._original in self._imagePaths3
                        or entry._original in self._imagePaths4)
        # No duplicates should have been recorded anywhere.
        self.assertFalse(entry._duplicates)
def test_buildInputFilesListWorksWhenSingleInputPathIsGiven(self):
    """A single input folder yields exactly its five expected images."""
    folder = os.path.join(self._testBasePath, 'folder1', 'folder3')
    found = dff.buildInputFilesList([folder], None)
    self.assertEqual(len(found), 5)
    # Fixture images in this folder are numbered 15..19.
    for idx in range(15, 20):
        self.assertIn(os.path.join(folder, 'img_{}.png'.format(idx)), found)
def test_buildInputFilesListDoesntReturnKnownFilePaths(self):
    """Paths already present in the known-files map are filtered out."""
    roots = [
        self._testBasePath,
        os.path.join(self._testBasePath, 'folder1'),
        os.path.join(self._testBasePath, 'folder2'),
        os.path.join(self._testBasePath, 'folder1', 'folder3'),
        os.path.join(self._testBasePath, 'folder2', 'folder4'),
    ]
    found = dff.buildInputFilesList(roots, self._knownFilesMap)
    # The scan result and the known files must be disjoint.
    overlap = set(found).intersection(self._knownFilesMap.values())
    self.assertFalse(overlap)
def main():
    """Profiling driver: create test images (the second batch re-uses the
    RNG seed so its images duplicate the first), then scan for duplicates
    and move them to /tmp/duplicates.
    """
    base_path = '/tmp/profiler'
    # Identical seed for both calls: the second batch reproduces the same
    # pixel data, producing duplicates for checkForDuplicates to find.
    # TODO(review): confirm the 4th argument of createRandomTestImages is a
    # filename-index offset — inferred from usage, not visible here.
    np.random.seed(0)
    createRandomTestImages(base_path, 100, 100, 0, 10)
    np.random.seed(0)
    createRandomTestImages(base_path, 100, 100, 50000, 10)

    filesMap = {}
    inputFiles = dff.buildInputFilesList([base_path], {})
    dff.checkForDuplicates(inputFiles, filesMap)
    dff.moveTheDuplicates(filesMap, '/tmp/duplicates')
def test_checkForDuplicatesWorksWithDuplicateFilesInSameFolder(self):
    """Duplicates found within one folder are attached to their originals."""
    scan_list = dff.buildInputFilesList(
        [os.path.join(self._testBasePath, 'folder7')], {})
    files_map = {}
    dff.checkForDuplicates(scan_list, files_map)
    self.assertEqual(len(files_map), 10)

    originals = sorted(entry._original for entry in files_map.values())
    self.assertEqual(originals, sorted(self._imagePaths7Originals))

    duplicates = [
        dup for entry in files_map.values() for dup in entry._duplicates
    ]
    self.assertEqual(len(duplicates), 20)
    # No file may be recorded as both an original and a duplicate.
    for dup in duplicates:
        self.assertNotIn(dup, self._imagePaths7Originals)
def test_buildInputFilesListWorksWithNestedPath(self):
    """Overlapping parent/child input folders do not duplicate entries."""
    nested = os.path.join(self._testBasePath, 'folder1', 'folder3')
    parent = os.path.join(self._testBasePath, 'folder1')
    found = dff.buildInputFilesList([nested, parent], None)
    self.assertEqual(len(found), 10)
    # folder3 holds images 15..19, folder1 itself holds images 5..9.
    for idx in range(15, 20):
        self.assertIn(os.path.join(nested, 'img_{}.png'.format(idx)), found)
    for idx in range(5, 10):
        self.assertIn(os.path.join(parent, 'img_{}.png'.format(idx)), found)
def test_buildInputFilesListWorksWhenMultipleInputPathsAreGiven(self):
    """Files from every given folder appear in the combined list.

    Renamed from ...AteGiven (typo); unittest discovery keys only on the
    test_ prefix, so no runner or caller is affected.
    """
    inputFilesList = dff.buildInputFilesList([
        os.path.join(self._testBasePath, 'folder1', 'folder3'),
        os.path.join(self._testBasePath, 'folder2', 'folder4')
    ], None)
    self.assertEqual(len(inputFilesList), 15)
    # folder3 holds images 15..19, folder4 holds images 20..24.
    basepath = os.path.join(self._testBasePath, 'folder1', 'folder3')
    for i in range(5):
        self.assertIn(os.path.join(basepath, 'img_{}.png'.format(15 + i)),
                      inputFilesList)
    basepath = os.path.join(self._testBasePath, 'folder2', 'folder4')
    for i in range(5):
        self.assertIn(os.path.join(basepath, 'img_{}.png'.format(20 + i)),
                      inputFilesList)
def test_checkForDuplicatesWorksWithDuplicateFilesInDifferentFolders(self):
    """Scanning the full tree groups every duplicate under its original."""
    files_map = {}
    dff.checkForDuplicates(
        dff.buildInputFilesList([self._testBasePath], {}), files_map)
    self.assertEqual(len(files_map), 40)

    original_hashes = [dff.calculateMD5Hash(p) for p in self._originals]
    # Sanity: the fixture originals are pairwise distinct by content.
    self.assertEqual(len(set(original_hashes)), len(original_hashes))
    self.assertEqual(sorted(original_hashes), sorted(files_map.keys()))
    self.assertEqual(
        sorted(entry._original for entry in files_map.values()),
        sorted(self._originals))

    # Duplicate file count matches, and no duplicate is an original.
    duplicates = [
        dup for entry in files_map.values() for dup in entry._duplicates
    ]
    self.assertEqual(len(duplicates), 45)
    for dup in duplicates:
        self.assertNotIn(dup, self._originals)
def test_saveFilesMapWorks(self):
    """saveFileList writes both JSON files and loadKnownFilesMap round-trips
    the known-files map (same size, same hash keys).
    """
    inputFilesList = dff.buildInputFilesList([self._testBasePath], {})
    filesMap = {}
    dff.checkForDuplicates(inputFilesList, filesMap)
    dff.moveTheDuplicates(filesMap, '/tmp/duplicates')

    knownFilesPath = '/tmp/logs/knownFiles.json'
    allFilesPath = '/tmp/logs/allFiles.json'
    # Clean slate: remove stale outputs from earlier runs.
    for stale in (knownFilesPath, allFilesPath):
        if os.path.exists(stale):
            os.remove(stale)
    # exist_ok=True replaces the racy exists()-then-makedirs() pattern.
    os.makedirs(os.path.dirname(knownFilesPath), exist_ok=True)
    self.assertFalse(os.path.exists(knownFilesPath))
    self.assertFalse(os.path.exists(allFilesPath))

    dff.saveFileList(filesMap, knownFilesPath, allFilesPath)
    self.assertTrue(os.path.exists(knownFilesPath))
    self.assertTrue(os.path.exists(allFilesPath))

    # The known-files JSON must be loadable and preserve the hash keys.
    filesMapNew = dff.loadKnownFilesMap(knownFilesPath)
    self.assertEqual(len(filesMap), len(filesMapNew))
    self.assertEqual(sorted(filesMap.keys()), sorted(filesMapNew.keys()))
def test_moveTheDuplicatesWithDuplicatesInDifferentFoldersWorks(self):
    """moveTheDuplicates relocates cross-folder duplicates, keeps originals,
    and the moved files are real files that can be deleted from disk.
    """
    scan_roots = [
        os.path.join(self._testBasePath, 'folder1'),
        os.path.join(self._testBasePath, 'folder2'),
    ]
    files_map = {}
    dff.checkForDuplicates(dff.buildInputFilesList(scan_roots, {}), files_map)
    self.assertEqual(len(files_map), 10)

    pre_move_map = copy.deepcopy(files_map)
    dff.moveTheDuplicates(files_map, '/tmp/duplicates')

    # Originals stay put; duplicates now live under the target folder.
    for entry in files_map.values():
        self.assertTrue(os.path.exists(entry._original))
        for dup in entry._duplicates:
            self.assertTrue(os.path.exists(dup))
            self.assertTrue(dup.startswith('/tmp/duplicates/'))
    # The pre-move duplicate locations are all gone.
    for entry in pre_move_map.values():
        for dup in entry._duplicates:
            self.assertFalse(os.path.exists(dup))

    # Deleting one moved duplicate on disk really removes that file.
    removed_key = None
    for key, entry in files_map.items():
        if entry._duplicates:
            removed_key = key
            os.remove(entry._duplicates[0])
            break
    self.assertFalse(os.path.exists(files_map[removed_key]._duplicates[0]))
def test_buildInputFilesListFailsWhenInputPathsIsNotAList(self):
    """Passing a bare string instead of a list of paths raises TypeError."""
    self.assertRaises(TypeError, dff.buildInputFilesList,
                      self._testBasePath, self._knownFilesMap)
def test_buildInputFilesListFailsWithInvalidPath(self):
    """A nonexistent input folder raises OSError("Invalid input folder...").

    The original bound the result to an unused local; the call is expected
    to raise, so the return value is intentionally discarded.
    """
    with self.assertRaisesRegex(OSError, "Invalid input folder"):
        dff.buildInputFilesList([
            os.path.join(self._testBasePath, 'folder1', 'folder3'),
            os.path.join(self._testBasePath, 'folder5'),
        ], None)
def test_buildInputFilesListReturnsEmptyListWhenNoPathsAreGivenAsInput(
        self):
    """An empty list of input folders yields a falsy (empty) result."""
    result = dff.buildInputFilesList([], {})
    self.assertFalse(result)