Example #1
    def _sameFileTestHelper(self, data1, data2):
        # Make two temporary files
        fd1, path1 = tempfile.mkstemp()
        fd2, path2 = tempfile.mkstemp()
        file1 = os.fdopen(fd1, 'wb')
        file2 = os.fdopen(fd2, 'wb')

        # Put the test data in them, and close them
        file1.write(data1)
        file2.write(data2)
        file1.close()
        file2.close()

        # Do the test, and clean up afterwards
        try:
            return _sameFile(path1, path2)
        finally:
            os.remove(path1)
            os.remove(path2)
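
The helper above exercises a _sameFile(path1, path2) predicate from the librarian garbage-collection module, which is not included in this excerpt. Below is a minimal sketch of what such a byte-for-byte comparison could look like, assuming chunked reads; the chunk size and exact structure are illustrative assumptions, not the original implementation.

def _sameFile(path1, path2):
    """Return True if the two files are byte-for-byte identical."""
    # Sketch under assumptions: compare fixed-size chunks until either a
    # mismatch is found or both files are exhausted at the same point.
    with open(path1, 'rb') as file1, open(path2, 'rb') as file2:
        while True:
            chunk1 = file1.read(64 * 1024)
            chunk2 = file2.read(64 * 1024)
            if chunk1 != chunk2:
                return False
            if not chunk1:
                # Both reads returned b'' at the same time: files match.
                return True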
Example #3
def merge_duplicates(con):
    """Merge duplicate LibraryFileContent rows

    This is the first step in a full garbage collection run. We assume files
    are identical if their sha1 hashes and filesizes are identical. For every
    duplicate detected, we make all LibraryFileAlias entries point to one of
    them and delete the unnecessary duplicates from the filesystem and the
    database.
    """

    # Get a list of all (sha1, filesize) that are duplicated in
    # LibraryFileContent
    cur = con.cursor()
    cur.execute("""
        SELECT sha1, filesize
        FROM LibraryFileContent
        GROUP BY sha1, filesize
        HAVING COUNT(*) > 1
        """)
    rows = list(cur.fetchall())

    # Merge the duplicate entries, each one in a separate transaction
    for sha1, filesize in rows:
        cur = con.cursor()

        sha1 = sha1.encode('US-ASCII')  # Can't pass Unicode to execute (yet)

        # Get a list of our dupes. Where multiple files exist, we return
        # the most recently added one first, because this is the version
        # most likely to exist on the staging server (it should be
        # irrelevant on production).
        cur.execute("""
            SELECT id
            FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
            ORDER BY datecreated DESC
            """, vars())
        dupes = [row[0] for row in cur.fetchall()]

        if debug:
            log.debug("Found duplicate LibraryFileContents")
            # Spit out more info in case it helps work out where
            # dupes are coming from.
            for dupe_id in dupes:
                cur.execute("""
                    SELECT id, filename, mimetype FROM LibraryFileAlias
                    WHERE content = %(dupe_id)s
                    """, vars())
                for id, filename, mimetype in cur.fetchall():
                    log.debug("> %d %s %s" % (id, filename, mimetype))

        # Make sure the first file exists on disk. Don't merge if it
        # doesn't. This shouldn't happen on production, so we don't try
        # and cope - just report and skip. However, on staging this will
        # be more common because database records have been synced from
        # production but the actual librarian contents have not.
        dupe1_id = dupes[0]
        dupe1_path = get_file_path(dupe1_id)
        if not os.path.exists(dupe1_path):
            if config.instance_name == 'staging':
                log.debug(
                        "LibraryFileContent %d data is missing (%s)",
                        dupe1_id, dupe1_path
                        )
            else:
                log.warning(
                        "LibraryFileContent %d data is missing (%s)",
                        dupe1_id, dupe1_path
                        )
            continue

        # Do a manual check that they really are identical, because we
        # employ paranoids. And we might as well cope with someone breaking
        # SHA1 enough that it becomes possible to create a SHA1 collision
        # with an identical filesize to an existing file. Which is pretty
        # unlikely. Where did I leave my tin foil hat?
        for dupe2_id in dupes[1:]:
            dupe2_path = get_file_path(dupe2_id)
            # Check paths exist, because on staging they may not!
            if (os.path.exists(dupe2_path)
                and not _sameFile(dupe1_path, dupe2_path)):
                log.error(
                        "SHA-1 collision found. LibraryFileContent %d and "
                        "%d have the same SHA1 and filesize, but are not "
                        "byte-for-byte identical.",
                        dupe1_id, dupe2_id
                        )
                sys.exit(1)

        # Update all the LibraryFileAlias entries to point to a single
        # LibraryFileContent
        prime_id = dupes[0]
        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
        log.debug(
            "Making LibraryFileAliases referencing %s reference %s instead",
            other_ids, prime_id
            )
        for other_id in dupes[1:]:
            cur.execute("""
                UPDATE LibraryFileAlias SET content=%(prime_id)s
                WHERE content = %(other_id)s
                """, vars())

        log.debug("Committing")
        con.commit()
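
merge_duplicates assumes a get_file_path() helper that maps a LibraryFileContent id to its location on disk. The sketch below is a hedged illustration only: the storage root and the hex-bucketed directory layout are assumptions made for this example, not the librarian's actual scheme.

import os

LIBRARIAN_ROOT = '/srv/librarian'  # assumed storage root, illustrative only

def get_file_path(content_id):
    """Return the assumed on-disk path for a LibraryFileContent row id."""
    # Hypothetical layout: bucket files by the zero-padded hex form of the
    # id so that no single directory grows unmanageably large.
    hex_id = '%08x' % content_id
    return os.path.join(
        LIBRARIAN_ROOT, hex_id[:2], hex_id[2:4], hex_id[4:6], hex_id[6:])

Any layout works for the merge step as long as it matches the scheme the files were stored under in the first place.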