def _sameFileTestHelper(self, data1, data2):
    # Make two temporary files
    fd1, path1 = tempfile.mkstemp()
    fd2, path2 = tempfile.mkstemp()
    file1 = os.fdopen(fd1, 'wb')
    file2 = os.fdopen(fd2, 'wb')

    # Put the test data in them, and close them
    file1.write(data1)
    file2.write(data2)
    file1.close()
    file2.close()

    # Do the test, and clean up afterwards
    try:
        return _sameFile(path1, path2)
    finally:
        os.remove(path1)
        os.remove(path2)
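
# _sameFile, exercised by the helper above, is defined elsewhere in this
# module. The function below is only an illustrative sketch of a chunked
# byte-for-byte comparison (an assumption, not the module's actual
# implementation), so it deliberately uses a distinct name.
def _sameFile_sketch(path1, path2):
    """Return True if the two files have identical contents."""
    file1 = open(path1, 'rb')
    file2 = open(path2, 'rb')
    try:
        while True:
            # Read matching chunks from both files; any mismatch (including
            # one file ending before the other) means they differ.
            chunk1 = file1.read(65536)
            chunk2 = file2.read(65536)
            if chunk1 != chunk2:
                return False
            if not chunk1:
                # Both files exhausted without a mismatch.
                return True
    finally:
        file1.close()
        file2.close()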
def merge_duplicates(con):
    """Merge duplicate LibraryFileContent rows

    This is the first step in a full garbage collection run. We assume
    files are identical if their sha1 hashes and filesizes are identical.
    For every duplicate detected, we make all LibraryFileAlias entries
    point to one of them and delete the unnecessary duplicates from the
    filesystem and the database.
    """
    # Get a list of all (sha1, filesize) that are duplicated in
    # LibraryFileContent
    cur = con.cursor()
    cur.execute("""
        SELECT sha1, filesize
        FROM LibraryFileContent
        GROUP BY sha1, filesize
        HAVING COUNT(*) > 1
        """)
    rows = list(cur.fetchall())

    # Merge the duplicate entries, each one in a separate transaction
    for sha1, filesize in rows:
        cur = con.cursor()

        sha1 = sha1.encode('US-ASCII')  # Can't pass Unicode to execute (yet)

        # Get a list of our dupes. Where multiple files exist, we return
        # the most recently added one first, because this is the version
        # most likely to exist on the staging server (it should be
        # irrelevant on production).
        cur.execute("""
            SELECT id
            FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
            ORDER BY datecreated DESC
            """, vars())
        dupes = [row[0] for row in cur.fetchall()]

        if debug:
            log.debug("Found duplicate LibraryFileContents")
            # Spit out more info in case it helps work out where
            # dupes are coming from.
            for dupe_id in dupes:
                cur.execute("""
                    SELECT id, filename, mimetype FROM LibraryFileAlias
                    WHERE content = %(dupe_id)s
                    """, vars())
                for id, filename, mimetype in cur.fetchall():
                    log.debug("> %d %s %s" % (id, filename, mimetype))

        # Make sure the first file exists on disk. Don't merge if it
        # doesn't. This shouldn't happen on production, so we don't try
        # to cope - just report and skip. However, on staging this will
        # be more common because the database records have been synced
        # from production but the actual librarian contents have not.
        dupe1_id = dupes[0]
        dupe1_path = get_file_path(dupe1_id)
        if not os.path.exists(dupe1_path):
            if config.instance_name == 'staging':
                log.debug(
                    "LibraryFileContent %d data is missing (%s)",
                    dupe1_id, dupe1_path)
            else:
                log.warning(
                    "LibraryFileContent %d data is missing (%s)",
                    dupe1_id, dupe1_path)
            continue

        # Do a manual check that they really are identical, because we
        # employ paranoids. And we might as well cope with someone breaking
        # SHA1 enough that it becomes possible to create a SHA1 collision
        # with an identical filesize to an existing file. Which is pretty
        # unlikely. Where did I leave my tin foil hat?
        for dupe2_id in dupes[1:]:
            dupe2_path = get_file_path(dupe2_id)
            # Check paths exist, because on staging they may not!
            if (os.path.exists(dupe2_path)
                    and not _sameFile(dupe1_path, dupe2_path)):
                log.error(
                    "SHA-1 collision found. LibraryFileContent %d and "
                    "%d have the same SHA1 and filesize, but are not "
                    "byte-for-byte identical.",
                    dupe1_id, dupe2_id)
                sys.exit(1)

        # Update all the LibraryFileAlias entries to point to a single
        # LibraryFileContent
        prime_id = dupes[0]
        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
        log.debug(
            "Making LibraryFileAliases referencing %s reference %s instead",
            other_ids, prime_id)
        for other_id in dupes[1:]:
            cur.execute("""
                UPDATE LibraryFileAlias SET content=%(prime_id)s
                WHERE content = %(other_id)s
                """, vars())

        log.debug("Committing")
        con.commit()
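
# merge_duplicates() relies on get_file_path() to map a LibraryFileContent id
# to its location on the librarian filesystem. The helper below is only a
# sketch of that idea under an assumed hashed directory layout: the
# storage_root default and the 8-hex-digit split are illustrative
# assumptions, not the module's actual scheme, hence the distinct name.
def get_file_path_sketch(content_id, storage_root='/var/tmp/librarian'):
    """Return an illustrative on-disk path for a content id."""
    # Render the id as 8 hex digits and fan it out into subdirectories so
    # no single directory holds every file.
    hex_id = '%08x' % content_id
    return os.path.join(
        storage_root, hex_id[:2], hex_id[2:4], hex_id[4:6], hex_id[6:])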