def test_deleteUnwantedFiles(self):
    """Files on disk with no LibraryFileContent row must be removed.

    We may find files in the LibraryFileContent repository
    that do not have a corresponding LibraryFileContent row.
    """
    self.ztm.begin()
    cur = cursor()

    # Find a content_id we can easily delete and do so. This row is
    # removed from the database, leaving an orphaned file on the
    # filesystem that should be removed.
    cur.execute("""
        SELECT LibraryFileContent.id
        FROM LibraryFileContent
        LEFT OUTER JOIN LibraryFileAlias
            ON LibraryFileContent.id = content
        WHERE LibraryFileAlias.id IS NULL
        LIMIT 1
        """)
    content_id = cur.fetchone()[0]
    cur.execute("""
        DELETE FROM LibraryFileContent WHERE id=%s
        """, (content_id,))
    self.ztm.commit()

    path = librariangc.get_file_path(content_id)
    self.assertTrue(os.path.exists(path))

    # Ensure delete_unreferenced_files does not remove the file, because
    # it will have just been created (has a recent date_created). There
    # is a window between file creation and the garbage collector
    # bothering to remove the file to avoid the race condition where the
    # garbage collector is run whilst a file is being uploaded.
    librariangc.delete_unwanted_files(self.con)
    self.assertTrue(os.path.exists(path))

    # To test removal does occur when we want it to, we need to trick
    # the garbage collector into thinking it is tomorrow.
    org_time = librariangc.time

    def tomorrow_time():
        # One second past the 24-hour stay of execution.
        return org_time() + 24 * 60 * 60 + 1

    try:
        librariangc.time = tomorrow_time
        librariangc.delete_unwanted_files(self.con)
    finally:
        # Always restore the real clock, even if the gc run raises.
        librariangc.time = org_time

    self.assertFalse(os.path.exists(path))

    # Make sure nothing else has been removed from disk
    self.ztm.begin()
    cur = cursor()
    cur.execute("""
        SELECT id FROM LibraryFileContent
        """)
    for content_id in (row[0] for row in cur.fetchall()):
        path = librariangc.get_file_path(content_id)
        self.assertTrue(os.path.exists(path))
def setUp(self):
    """Create duplicate librarian files and open a gc database connection.

    Also ensures that every file the database knows about exists on
    disk, so individual tests start from a consistent filesystem state.
    """
    super(TestLibrarianGarbageCollection, self).setUp()
    self.client = LibrarianClient()
    self.patch(librariangc, 'log', BufferLogger())

    # A value we use in a number of tests. This represents the
    # stay of execution hard coded into the garbage collector.
    # We don't destroy any data unless it has been waiting to be
    # destroyed for longer than this period. We pick a value
    # that is close enough to the stay of execution so that
    # forgetting timezone information will break things, but
    # far enough so that how long it takes the test to run
    # is not an issue. 'stay_of_excution - 1 hour' fits these
    # criteria.
    self.recent_past = utc_now() - timedelta(days=6, hours=23)
    # A time beyond the stay of execution.
    self.ancient_past = utc_now() - timedelta(days=30)

    self.f1_id, self.f2_id = self._makeDupes()

    switch_dbuser(config.librarian_gc.dbuser)
    self.ztm = self.layer.txn

    # Make sure the files exist. We do this in setup, because we
    # need to use the get_file_path method later in the setup and we
    # want to be sure it is working correctly.
    path = librariangc.get_file_path(self.f1_id)
    self.assertTrue(os.path.exists(path), "Librarian uploads failed")

    # Make sure that every file the database knows about exists on disk.
    # We manually remove them for tests that need to cope with missing
    # library items.
    self.ztm.begin()
    cur = cursor()
    cur.execute("SELECT id FROM LibraryFileContent")
    for content_id in (row[0] for row in cur.fetchall()):
        path = librariangc.get_file_path(content_id)
        if not os.path.exists(path):
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            # Use a context manager so the file handle is closed
            # promptly instead of being leaked until gc.
            with open(path, 'w') as f:
                f.write('whatever')
    self.ztm.abort()

    self.con = connect(
        user=config.librarian_gc.dbuser,
        isolation=ISOLATION_LEVEL_AUTOCOMMIT)
def test_DeleteUnreferencedContent(self):
    """delete_unreferenced_content removes db rows and their disk files."""
    # Merge the duplicates. This creates an
    # unreferenced LibraryFileContent
    librariangc.merge_duplicates(self.con)
    self.ztm.begin()

    # Locate the unreferenced LibraryFileContent. Use query
    # parameters rather than string interpolation, matching the
    # style used elsewhere in this suite.
    cur = cursor()
    cur.execute("""
        SELECT LibraryFileContent.id
        FROM LibraryFileContent
        LEFT OUTER JOIN LibraryFileAlias
            ON LibraryFileContent.id = LibraryFileAlias.content
        WHERE LibraryFileAlias.id IS NULL
            AND LibraryFileContent.id IN (%s, %s)
        """, (self.f1_id, self.f2_id))
    results = cur.fetchall()
    self.assertEqual(len(results), 1)
    unreferenced_id = results[0][0]

    self.ztm.abort()

    # Make sure the file exists on disk
    path = librariangc.get_file_path(unreferenced_id)
    self.assertTrue(os.path.exists(path))

    # Delete unreferenced content
    librariangc.delete_unreferenced_content(self.con)

    # Make sure the file is gone
    self.assertFalse(os.path.exists(path))

    # delete_unreferenced_content should have committed
    self.ztm.begin()

    # Make sure the unreferenced entries have all gone
    cur = cursor()
    cur.execute("""
        SELECT LibraryFileContent.id
        FROM LibraryFileContent
        LEFT OUTER JOIN LibraryFileAlias
            ON LibraryFileContent.id = LibraryFileAlias.content
        WHERE LibraryFileAlias.id IS NULL
        """)
    results = list(cur.fetchall())
    self.assertEqual(
        len(results), 0, 'Too many results %r' % (results,))
def setUp(self):
    """Create sample blobs: expired-unclaimed, expired-claimed, unexpired."""
    super(TestBlobCollection, self).setUp()
    # Add in some sample data
    cur = cursor()

    # First a blob that has been unclaimed and expired.
    cur.execute("""
        INSERT INTO LibraryFileContent (filesize, sha1, md5, sha256)
        VALUES (666, 'whatever', 'whatever', 'whatever')
        """)
    cur.execute("""SELECT currval('libraryfilecontent_id_seq')""")
    self.expired_lfc_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO LibraryFileAlias (
            content, filename, mimetype, expires)
        VALUES (
            %s, 'whatever', 'whatever',
            CURRENT_TIMESTAMP - '1 day'::interval
            )
        """, (self.expired_lfc_id,))
    cur.execute("""SELECT currval('libraryfilealias_id_seq')""")
    self.expired_lfa_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO TemporaryBlobStorage (uuid, file_alias)
        VALUES ('uuid', %s)
        """, (self.expired_lfa_id,))
    cur.execute("""SELECT currval('temporaryblobstorage_id_seq')""")
    self.expired_blob_id = cur.fetchone()[0]

    # Add ApportJob and Job entries - these need to be removed
    # too.
    cur.execute("""
        INSERT INTO Job (status, date_finished)
        VALUES (0, CURRENT_TIMESTAMP - interval '2 days')
        RETURNING id
        """)
    self.expired_job_id = cur.fetchone()[0]
    cur.execute("""
        INSERT INTO ApportJob (job, blob, job_type)
        VALUES (%s, %s, 0)
        RETURNING id
        """, (self.expired_job_id, self.expired_blob_id))
    self.expired_apportjob_id = cur.fetchone()[0]

    # Next a blob that has expired, but claimed and now linked to
    # elsewhere in the database
    cur.execute("""
        INSERT INTO LibraryFileContent (filesize, sha1, md5, sha256)
        VALUES (666, 'whatever', 'whatever', 'whatever')
        """)
    cur.execute("""SELECT currval('libraryfilecontent_id_seq')""")
    self.expired2_lfc_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO LibraryFileAlias (
            content, filename, mimetype, expires)
        VALUES (
            %s, 'whatever', 'whatever',
            CURRENT_TIMESTAMP - '1 day'::interval
            )
        """, (self.expired2_lfc_id,))
    cur.execute("""SELECT currval('libraryfilealias_id_seq')""")
    self.expired2_lfa_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO TemporaryBlobStorage (uuid, file_alias)
        VALUES ('uuid2', %s)
        """, (self.expired2_lfa_id,))
    cur.execute("""SELECT currval('temporaryblobstorage_id_seq')""")
    self.expired2_blob_id = cur.fetchone()[0]

    # Link it somewhere else, unexpired
    cur.execute("""
        INSERT INTO LibraryFileAlias (content, filename, mimetype)
        VALUES (%s, 'whatever', 'whatever')
        """, (self.expired2_lfc_id,))
    cur.execute("""
        UPDATE Person SET mugshot=currval('libraryfilealias_id_seq')
        WHERE name='stub'
        """)

    # And a non expired blob
    cur.execute("""
        INSERT INTO LibraryFileContent (filesize, sha1, md5, sha256)
        VALUES (666, 'whatever', 'whatever', 'whatever')
        """)
    cur.execute("""SELECT currval('libraryfilecontent_id_seq')""")
    self.unexpired_lfc_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO LibraryFileAlias (
            content, filename, mimetype, expires)
        VALUES (
            %s, 'whatever', 'whatever',
            CURRENT_TIMESTAMP + '1 day'::interval
            )
        """, (self.unexpired_lfc_id,))
    cur.execute("""SELECT currval('libraryfilealias_id_seq')""")
    self.unexpired_lfa_id = cur.fetchone()[0]

    cur.execute("""
        INSERT INTO TemporaryBlobStorage (uuid, file_alias)
        VALUES ('uuid3', %s)
        """, (self.unexpired_lfa_id,))
    cur.execute("""SELECT currval('temporaryblobstorage_id_seq')""")
    self.unexpired_blob_id = cur.fetchone()[0]
    self.layer.txn.commit()

    # Make sure all the librarian files actually exist on disk
    cur = cursor()
    cur.execute("SELECT id FROM LibraryFileContent")
    for content_id in (row[0] for row in cur.fetchall()):
        path = librariangc.get_file_path(content_id)
        if not os.path.exists(path):
            if not os.path.exists(os.path.dirname(path)):
                os.makedirs(os.path.dirname(path))
            # Use a context manager so the file handle is closed
            # promptly instead of being leaked until gc.
            with open(path, 'w') as f:
                f.write('whatever')
    self.layer.txn.abort()

    switch_dbuser(config.librarian_gc.dbuser)

    # Open a connection for our test
    self.con = connect(
        user=config.librarian_gc.dbuser,
        isolation=ISOLATION_LEVEL_AUTOCOMMIT)

    self.patch(librariangc, 'log', BufferLogger())
def test_DeleteUnreferencedContent2(self):
    """Unreferenced content is removed even if its disk file is gone.

    Like test_DeleteUnreferencedContent, except that the file is
    removed from disk before attempting to remove the unreferenced
    LibraryFileContent.

    Because the garbage collector will remove an unreferenced file from
    disk before it commits the database changes, it is possible that the
    db removal will fail (eg. an exception was raised on COMMIT) leaving
    the rows untouched in the database but no file on disk.
    This is fine, as the next gc run will attempt it again and
    nothing can use unreferenced files anyway. This test ensures
    that this all works.
    """
    # Merge the duplicates. This creates an
    # unreferenced LibraryFileContent
    librariangc.merge_duplicates(self.con)
    self.ztm.begin()

    # Locate the unreferenced LibraryFileContent. Use query
    # parameters rather than string interpolation, matching the
    # style used elsewhere in this suite.
    cur = cursor()
    cur.execute("""
        SELECT LibraryFileContent.id
        FROM LibraryFileContent
        LEFT OUTER JOIN LibraryFileAlias
            ON LibraryFileContent.id = LibraryFileAlias.content
        WHERE LibraryFileAlias.id IS NULL
            AND LibraryFileContent.id IN (%s, %s)
        """, (self.f1_id, self.f2_id))
    results = cur.fetchall()
    self.assertEqual(len(results), 1)
    unreferenced_id = results[0][0]

    self.ztm.abort()

    # Make sure the file exists on disk
    path = librariangc.get_file_path(unreferenced_id)
    self.assertTrue(os.path.exists(path))

    # Remove the file from disk
    os.unlink(path)
    self.assertFalse(os.path.exists(path))

    # Delete unreferenced content
    librariangc.delete_unreferenced_content(self.con)

    # Make sure the file is gone
    self.assertFalse(os.path.exists(path))

    # delete_unreferenced_content should have committed
    self.ztm.begin()

    # Make sure the unreferenced entries have all gone
    cur = cursor()
    cur.execute("""
        SELECT LibraryFileContent.id
        FROM LibraryFileContent
        LEFT OUTER JOIN LibraryFileAlias
            ON LibraryFileContent.id = LibraryFileAlias.content
        WHERE LibraryFileAlias.id IS NULL
        """)
    results = list(cur.fetchall())
    self.assertEqual(
        len(results), 0, 'Too many results %r' % (results,))