def testDstWithSubdirectory(self):
    """dstWithSubdirectory grafts the non-common suffix of the source
    path onto the destination directory.

    Covers: a relative source path, an absolute path with no common
    prefix, and an absolute path sharing a prefix with the destination.
    """
    newdst = os.path.join(os.path.dirname(os.path.abspath(self._sha1file)), "newsubdir")
    # BUGFIX: the expected value was hard-coded to one developer's
    # checkout ("/home/belisarius/github/fuse-sha1/..."), so the test
    # failed everywhere else.  Derive it from newdst instead, which is
    # exactly what the hard-coded string encoded on that machine.
    expected = os.path.join(newdst, "subdir", "file.txt")
    self.assertEqual(expected, fsu.dstWithSubdirectory("subdir/file.txt", newdst))
    # absolute source, disjoint destination: full source path is appended
    self.assertEqual("/media/cdrom/usr/local/test.txt", fsu.dstWithSubdirectory("/usr/local/test.txt", "/media/cdrom"))
    # common prefix "/media/cdrom" is stripped before grafting
    self.assertEqual("/media/cdrom/subdir/othersubdir/test.txt", fsu.dstWithSubdirectory("/media/cdrom/othersubdir/test.txt", "/media/cdrom/subdir"))
def testDstWithSubdirectoryBad(self):
    """Every invalid (path, destination) combination raises IOError."""
    # assertRaises accepts the callable plus its arguments directly,
    # so the lambda wrappers are unnecessary; drive the nine cases
    # from a table instead of repeating the call.
    bad_cases = [
        ("", ""),
        ("", None),
        ("", "subdir"),
        (None, None),
        (None, ""),
        ("uouoeuoaeuu", ""),
        (self._sha1file, ""),
        (self._sha1file, None),
        ("/media/cdrom/test.txt", "/media/cdrom"),
    ]
    for src, dst in bad_cases:
        self.assertRaises(IOError, fsu.dstWithSubdirectory, src, dst)
def dedup(self, dupdir, doSymlink):
    """Move duplicate entries (based on checksum) into dupdir.

    Each entry's path is used to reconstruct a subdirectory hierarchy
    inside dupdir, stripping any common prefix between dupdir and the
    file path so the resulting structure is useful.

    For every group of duplicates the first surviving path is kept in
    place as the canonical copy; only the remaining copies are moved.
    If doSymlink is true, the original path of each moved file is
    symlinked back to that canonical file and its database row is kept
    (flagged as a symlink) instead of being removed.

    Raises Exception if dupdir exists and is not empty, and re-raises
    any error encountered while moving files or updating the database.
    """
    logging.info("De-duping database")
    # Refuse to run into a non-empty dupdir so we never clobber files.
    if os.path.exists(dupdir) and len(os.listdir(dupdir)) > 0:
        raise Exception("%s is not empty; refusing to move files" % dupdir)
    try:
        pathmap = {}  # store duplicate paths keyed by file checksum
        with sqliteConn(self.database) as cursor:
            cursor.execute("""select chksum, path, link from files where chksum in( select chksum from files where symlink = 0 group by chksum having count(chksum) > 1) and symlink = 0 and link = 1 order by chksum, link;""")
            for (chksum, path, islink) in cursor:
                pathmap.setdefault(chksum, []).append(path)
            for chksum, paths in pathmap.iteritems():
                # the query above will result in single rows for symlinked
                # files, so fix that here rather than mucking about with
                # temp tables
                paths = [p for p in paths if not os.path.islink(p)]
                # the islink filter may have left fewer than two real
                # files; nothing to de-dup for this checksum then
                if len(paths) < 2:
                    continue
                # BUGFIX: the original moved every copy and then
                # referenced an undefined 'canonicalPath' (NameError)
                # in the symlink branch.  Keep the first path as the
                # canonical copy and only move the rest.
                canonicalPath = paths[0]
                for path in paths[1:]:
                    dst = dstWithSubdirectory(path, dupdir)
                    # don't rm empty dirs if we are symlinking
                    moveFile(path, dst, not doSymlink)
                    if not doSymlink:
                        cursor.execute(REMOVE_ROW, (path, ))
                    else:
                        cursor.execute("update files set symlink = 1 where path = ?;", (path, ))
                        symlinkFile(canonicalPath, path)
        logging.info("De-duping complete")
    except Exception as einst:
        logging.error("Unable to de-dup database: %s" % einst)
        raise
def dedup(self, dupdir, doSymlink):
    """Move duplicate entries (based on checksum) into dupdir.

    Each entry's path is used to reconstruct a subdirectory hierarchy
    inside dupdir, stripping any common prefix between dupdir and the
    file path so the resulting structure is useful.

    For every group of duplicates the first surviving path is kept in
    place as the canonical copy; only the remaining copies are moved.
    If doSymlink is true, the original path of each moved file is
    symlinked back to that canonical file and its database row is kept
    (flagged as a symlink) instead of being removed.

    Raises Exception if dupdir exists and is not empty, and re-raises
    any error encountered while moving files or updating the database.
    """
    logging.info("De-duping database")
    # Refuse to run into a non-empty dupdir so we never clobber files.
    if os.path.exists(dupdir) and len(os.listdir(dupdir)) > 0:
        raise Exception("%s is not empty; refusing to move files" % dupdir)
    try:
        pathmap = {}  # store duplicate paths keyed by file checksum
        with sqliteConn(self.database) as cursor:
            cursor.execute("""select chksum, path, link from files where chksum in( select chksum from files where symlink = 0 group by chksum having count(chksum) > 1) and symlink = 0 and link = 1 order by chksum, link;""")
            for (chksum, path, islink) in cursor:
                pathmap.setdefault(chksum, []).append(path)
            for chksum, paths in pathmap.iteritems():
                # the query above will result in single rows for symlinked
                # files, so fix that here rather than mucking about with
                # temp tables
                paths = [p for p in paths if not os.path.islink(p)]
                # the islink filter may have left fewer than two real
                # files; nothing to de-dup for this checksum then
                if len(paths) < 2:
                    continue
                # BUGFIX: the original moved every copy and then
                # referenced an undefined 'canonicalPath' (NameError)
                # in the symlink branch.  Keep the first path as the
                # canonical copy and only move the rest.
                canonicalPath = paths[0]
                for path in paths[1:]:
                    dst = dstWithSubdirectory(path, dupdir)
                    # don't rm empty dirs if we are symlinking
                    moveFile(path, dst, not doSymlink)
                    if not doSymlink:
                        cursor.execute(REMOVE_ROW, (path, ))
                    else:
                        cursor.execute("update files set symlink = 1 where path = ?;", (path, ))
                        symlinkFile(canonicalPath, path)
        logging.info("De-duping complete")
    except Exception as einst:
        logging.error("Unable to de-dup database: %s" % einst)
        raise