def migrate2(self):
    # fix the dirnames and basenames column types for postgresql.
    # Strategy: copy the current data aside, drop and recreate the tables
    # from the (corrected) schema definition, then bulk-reload the data.
    cu = self.db.cursor()
    if self.db.driver == 'postgresql':
        logMe(2, "fixing column types for pathfields")
        # save the current contents before dropping the tables
        cu.execute("create table saveDirnames as select dirnameId, dirname from Dirnames")
        cu.execute("create table saveBasenames as select basenameId, basename from Basenames")
        # FilePaths references both tables, so its FKs must be dropped first
        self.db.dropForeignKey("FilePaths", "dirnameId")
        self.db.dropForeignKey("FilePaths", "basenameId")
        cu.execute("drop table Dirnames")
        cu.execute("drop table Basenames")
        self.db.loadSchema()
        # recreate the tables with the new column types; index creation is
        # deferred until after the bulk reload
        schema.createTroves(self.db, createIndex = False)
        cu.execute("select dirnameId, dirname from saveDirnames")
        self.db.bulkload("Dirnames",
                         ( (x[0], cu.binary(x[1])) for x in cu.fetchall() ),
                         ["dirnameId", "dirname"])
        cu.execute("select basenameId, basename from saveBasenames")
        self.db.bulkload("Basenames",
                         ( (x[0], cu.binary(x[1])) for x in cu.fetchall() ),
                         ["basenameId", "basename"])
        # now build the indexes and restore the FK constraints
        schema.createTroves(self.db, createIndex = True)
        self.db.addForeignKey("FilePaths", "dirnameId", "Dirnames", "dirnameId")
        self.db.addForeignKey("FilePaths", "basenameId", "Basenames", "basenameId")
        cu.execute("drop table saveDirnames")
        cu.execute("drop table saveBasenames")
        self.db.analyze("Dirnames")
        self.db.analyze("Basenames")
        # the ids were reloaded explicitly, so resync the sequences
        self.db.setAutoIncrement("Dirnames", "dirnameId")
        self.db.setAutoIncrement("Basenames", "basenameId")
    return True
def migrate2(self):
    # fix the dirnames and basenames column types for postgresql.
    # Strategy: copy the current data aside, drop and recreate the tables
    # from the (corrected) schema definition, then bulk-reload the data.
    # Afterwards, repair any dirnames that are missing Prefixes links.
    cu = self.db.cursor()
    if self.db.driver == 'postgresql':
        logMe(2, "fixing column types for pathfields")
        # save the current contents before dropping the tables
        cu.execute("create table saveDirnames as select dirnameId, dirname from Dirnames")
        cu.execute("create table saveBasenames as select basenameId, basename from Basenames")
        cu.execute("create table savePrefixes as select dirnameId, prefixId from Prefixes")
        # FilePaths references both tables, so its FKs must be dropped first
        self.db.dropForeignKey("FilePaths", "dirnameId")
        self.db.dropForeignKey("FilePaths", "basenameId")
        cu.execute("drop table Prefixes")
        cu.execute("drop table Dirnames")
        cu.execute("drop table Basenames")
        self.db.loadSchema()
        # recreate the tables with the new column types; index creation is
        # deferred until after the bulk reload
        schema.createTroves(self.db, createIndex = False)
        cu.execute("select dirnameId, dirname from saveDirnames")
        self.db.bulkload("Dirnames",
                         ( (x[0], cu.binary(x[1])) for x in cu.fetchall() ),
                         ["dirnameId", "dirname"])
        cu.execute("select basenameId, basename from saveBasenames")
        self.db.bulkload("Basenames",
                         ( (x[0], cu.binary(x[1])) for x in cu.fetchall() ),
                         ["basenameId", "basename"])
        cu.execute("insert into Prefixes(dirnameId, prefixId) "
                   "select dirnameId, prefixId from savePrefixes")
        # now build the indexes and restore the FK constraints
        schema.createTroves(self.db, createIndex = True)
        self.db.addForeignKey("FilePaths", "dirnameId", "Dirnames", "dirnameId")
        self.db.addForeignKey("FilePaths", "basenameId", "Basenames", "basenameId")
        cu.execute("drop table saveDirnames")
        cu.execute("drop table saveBasenames")
        cu.execute("drop table savePrefixes")
        self.db.analyze("Dirnames")
        self.db.analyze("Basenames")
        # the ids were reloaded explicitly, so resync the sequences
        self.db.setAutoIncrement("Dirnames", "dirnameId")
        self.db.setAutoIncrement("Basenames", "basenameId")
    # fix the missing dirnames/prefixes links
    schema.setupTempTables(self.db)
    logMe(2, "looking for missing dirnames/prefixes links")
    cu = self.db.cursor()
    # find dirnames that are referenced by FilePaths but have no Prefixes row
    cu.execute("""select distinct d.dirnameId, d.dirname
    from Dirnames as d
    join ( select fp.dirnameId as dirnameId
           from FilePaths as fp
           left join Prefixes as p using(dirnameId)
           where p.dirnameId is null ) as dq
    using(dirnameId) """)
    ret = cu.fetchall()
    if ret:
        logMe(2, "fixing missing dirnames/prefixes links in %d dirnames" % (len(ret),))
        trovestore.addPrefixesFromList(self.db, ret)
        self.db.analyze("Prefixes")
    return True
def fixRedirects(self, repos):
    # Remove dependency provisions from redirect troves and recompute
    # their signatures via fixTroveSig.
    logMe(2, "removing dep provisions from redirects...")
    self.db.loadSchema()
    # avoid creating this index until we had a chance to check the path indexes
    self.db.tables["TroveFiles"].append("TroveFilesPathIdx")
    # remove dependency provisions from redirects -- the conary 1.0
    # branch would set redirects to provide their own names. this doesn't
    # clean up the dependency table; that would only matter on a trove
    # which was cooked as only a redirect in the repository; any other
    # instances would still need the depId anyway
    cu = self.db.cursor()
    cu.execute("delete from provides where instanceId in "
               "(select instanceId from instances "
               "where troveType=? and isPresent=1)",
               trove.TROVE_TYPE_REDIRECT)
    # need to make sure TroveRedirects is defined...
    schema.createTroves(self.db)
    # loop over redirects...
    cu.execute("select instanceId from instances "
               "where troveType=? and isPresent=1",
               trove.TROVE_TYPE_REDIRECT)
    for (instanceId,) in cu:
        # the provides set changed above, so the trove signature must be
        # recomputed for each redirect
        self.fixTroveSig(repos, instanceId)
def fixRedirects(self, repos):
    """Strip dependency provisions from redirect troves and refresh
    their signatures.

    The conary 1.0 branch made redirects provide their own names; those
    provisions are deleted here.  The Dependencies table itself is not
    cleaned up -- a stale depId would only matter for a trove cooked
    purely as a redirect, and any other instance still needs it.
    """
    logMe(2, "removing dep provisions from redirects...")
    self.db.loadSchema()
    # avoid creating this index until we had a chance to check the path indexes
    self.db.tables["TroveFiles"].append("TroveFilesPathIdx")
    cursor = self.db.cursor()
    cursor.execute(
        "delete from provides where instanceId in "
        "(select instanceId from instances "
        "where troveType=? and isPresent=1)", trove.TROVE_TYPE_REDIRECT)
    # make sure TroveRedirects is defined before fixing signatures
    schema.createTroves(self.db)
    # walk every present redirect and recompute its signature, since its
    # provides set changed above
    cursor.execute(
        "select instanceId from instances "
        "where troveType=? and isPresent=1", trove.TROVE_TYPE_REDIRECT)
    for row in cursor:
        self.fixTroveSig(repos, row[0])
def _createFilePaths(self):
    # Split the legacy FilePaths.path column into normalized Dirnames and
    # Basenames tables, then rebuild FilePaths to reference them by id.
    logMe(2, "splitting paths in dirnames and basenames")
    cu = self.db.cursor()
    # working table: one row per filePathId with its path split in two
    cu.execute("""
    create table tmpDirnames (
        filePathId %(PRIMARYKEY)s,
        dirname %(PATHTYPE)s,
        basename %(PATHTYPE)s
    ) """ % self.db.keywords)
    # save a copy of FilePaths before updating the table definition
    cu.execute("""create table oldFilePaths as
    select filePathId, pathId, path from FilePaths""")
    self.db.createIndex("oldFilePaths", "oldFilePathsIdx", "filePathId",
                        check=False, unique=True)
    self.db.analyze("oldFilePaths")
    # drop the FK constraint from TroveFiles into FilePaths
    self.db.loadSchema()
    self.db.dropForeignKey("TroveFiles", name = "TroveFiles_filePathId_fk")
    # create Dirnames, Basenames and the new FilePaths tables
    cu.execute("drop table FilePaths")
    self.db.loadSchema()
    schema.createTroves(self.db, createIndex=False)
    # this is to avoid processing too many entries at once...
    sliceSize = 200000
    analyze = 1
    while True:
        # next slice of paths that have not been split yet
        cu.execute("""
        select fp.filePathId, fp.path
        from oldFilePaths as fp
        left join tmpDirnames as d using(filePathId)
        where d.filePathId is null
        limit ?""", sliceSize)
        tmpl = [ (_fpid, os.path.split(_path))
                 for _fpid, _path in cu.fetchall() ]
        if not tmpl:
            break # no more entries found
        self.db.bulkload("tmpDirnames",
                         [ (x[0], x[1][0], x[1][1]) for x in tmpl ],
                         ["filePathId", "dirname", "basename"])
        # don't analyze too often for speed reasons
        analyze -= 1
        if not analyze:
            analyze = 5
            self.db.analyze("tmpDirnames")
    self.db.createIndex("tmpDirnames", "tmpDirnames_dirname_idx", "dirname",
                        check = False)
    logMe(2, "extracting unique dirnames and basenames...")
    self.db.analyze("tmpDirnames")
    # the '' and '/' dirnames should already be in the Dirnames table
    cu.execute("""
    insert into Dirnames(dirname)
    select distinct dirname from tmpDirnames
    order by dirname """)
    self.db.analyze("Dirnames")
    cu.execute("""
    insert into Basenames(basename)
    select distinct basename from tmpDirnames
    order by basename """)
    self.db.analyze("Basenames")
    logMe(2, "generating the new FilePaths table...")
    # rebuild FilePaths by joining each old path back to its new ids
    cu.execute("""insert into FilePaths(filePathId, dirnameId, basenameId, pathId)
    select fp.filePathId, d.dirnameId, b.basenameId, fp.pathId
    from oldFilePaths as fp
    join tmpDirnames as td using(filePathId)
    join Dirnames as d on td.dirname = d.dirname
    join Basenames as b on td.basename = b.basename """)
    cu.execute("drop table oldFilePaths")
    cu.execute("drop table tmpDirnames")
    # fix the autoincrement primary key value on the new FilePaths
    cu.execute("select max(filePathId) from FilePaths")
    maxId = cu.fetchone()[0]
    if maxId:
        self.db.setAutoIncrement("FilePaths", "filePathId", maxId)
    self.db.analyze("FilePaths")
    # re-enable the FK constraint and create indexes
    logMe(3, "adding foreign key constraints...")
    self.db.addForeignKey("TroveFiles", "filePathId", "FilePaths", "filePathId")
    self.db.analyze("TroveFiles")
    schema.createTroves(self.db)
def fixDuplicatePaths(self, repos):
    # Find (instanceId, path) pairs that appear more than once in
    # TroveFiles, keep a single row for each, and recompute trove
    # signatures where the duplicate rows differed.
    logMe(2, "checking database for duplicate path entries...")
    cu = self.db.cursor()
    # we'll have to do a full table scan on TroveFiles. no way
    # around it...
    cu.execute("""
    create temporary table tmpDups(
        instanceId integer,
        path %(PATHTYPE)s,
        counter integer
    ) %(TABLEOPTS)s""" % self.db.keywords)
    logMe(2, "looking for troves with duplicate paths...")
    # sqlite has a real challenge dealing with large datasets
    if self.db.driver == 'sqlite':
        cu2 = self.db.cursor()
        cu.execute("select distinct instanceId from TroveFiles")
        # so we split this in very little tasks. lots of them
        for (instanceId,) in cu:
            cu2.execute("""
            insert into tmpDups (instanceId, path, counter)
            select instanceId, path, count(*)
            from TroveFiles
            where instanceId = ?
            group by instanceId, path
            having count(*) > 1""", instanceId)
    else:
        # other backends should be able to process this in one shot
        cu.execute("""
        insert into tmpDups (instanceId, path, counter)
        select instanceId, path, count(*)
        from TroveFiles
        group by instanceId, path
        having count(*) > 1""")
    counter = cu.execute("select count(*) from tmpDups").fetchall()[0][0]
    if counter > 0:
        # drop the old index, if any
        self.db.loadSchema()
        self.db.dropIndex("TroveFiles", "TroveFilesPathIdx")
        logMe(3, "detected %d duplicates" % (counter,))
        # loop over every duplicate and apply the appropriate fix
        cu.execute("select instanceId, path from tmpDups")
        for (instanceId, path) in cu.fetchall():
            cu.execute("""select distinct
            instanceId, streamId, versionId, pathId, path
            from trovefiles where instanceId = ? and path = ?
            order by streamId, versionId, pathId""", (instanceId, path))
            ret = cu.fetchall()
            # delete all the duplicates and put the first one back
            cu.execute("delete from trovefiles "
                       "where instanceId = ? and path = ?",
                       (instanceId, path))
            # in case they are different, we pick the oldest, chances are it is
            # more "original"
            cu.execute("insert into trovefiles "
                       "(instanceId, streamId, versionId, pathId, path) "
                       "values (?,?,?,?,?)", tuple(ret[0]))
            if len(ret) > 1:
                # need to recompute the sha1 - we might have changed the trove manifest
                # if the records were different
                self.fixTroveSig(repos, instanceId)
    # recreate the indexes and triggers - including new path
    # index for TroveFiles. Also recreates the indexes table.
    logMe(2, 'Recreating indexes... (this could take a while)')
    cu.execute("drop table tmpDups")
    self.db.loadSchema()
    schema.createTroves(self.db)
    logMe(2, 'Indexes created.')
def migrate2(self):
    # fix the dirnames and basenames column types for postgresql.
    # Strategy: copy the current data aside, drop and recreate the tables
    # from the (corrected) schema definition, then bulk-reload the data.
    # Afterwards, repair any dirnames that are missing Prefixes links.
    cu = self.db.cursor()
    if self.db.driver == 'postgresql':
        logMe(2, "fixing column types for pathfields")
        # save the current contents before dropping the tables
        cu.execute(
            "create table saveDirnames as select dirnameId, dirname from Dirnames"
        )
        cu.execute(
            "create table saveBasenames as select basenameId, basename from Basenames"
        )
        cu.execute(
            "create table savePrefixes as select dirnameId, prefixId from Prefixes"
        )
        # FilePaths references both tables, so its FKs must be dropped first
        self.db.dropForeignKey("FilePaths", "dirnameId")
        self.db.dropForeignKey("FilePaths", "basenameId")
        cu.execute("drop table Prefixes")
        cu.execute("drop table Dirnames")
        cu.execute("drop table Basenames")
        self.db.loadSchema()
        # recreate the tables with the new column types; index creation is
        # deferred until after the bulk reload
        schema.createTroves(self.db, createIndex=False)
        cu.execute("select dirnameId, dirname from saveDirnames")
        self.db.bulkload("Dirnames",
                         ((x[0], cu.binary(x[1])) for x in cu.fetchall()),
                         ["dirnameId", "dirname"])
        cu.execute("select basenameId, basename from saveBasenames")
        self.db.bulkload("Basenames",
                         ((x[0], cu.binary(x[1])) for x in cu.fetchall()),
                         ["basenameId", "basename"])
        cu.execute("insert into Prefixes(dirnameId, prefixId) "
                   "select dirnameId, prefixId from savePrefixes")
        # now build the indexes and restore the FK constraints
        schema.createTroves(self.db, createIndex=True)
        self.db.addForeignKey("FilePaths", "dirnameId", "Dirnames",
                              "dirnameId")
        self.db.addForeignKey("FilePaths", "basenameId", "Basenames",
                              "basenameId")
        cu.execute("drop table saveDirnames")
        cu.execute("drop table saveBasenames")
        cu.execute("drop table savePrefixes")
        self.db.analyze("Dirnames")
        self.db.analyze("Basenames")
        # the ids were reloaded explicitly, so resync the sequences
        self.db.setAutoIncrement("Dirnames", "dirnameId")
        self.db.setAutoIncrement("Basenames", "basenameId")
    # fix the missing dirnames/prefixes links
    schema.setupTempTables(self.db)
    logMe(2, "looking for missing dirnames/prefixes links")
    cu = self.db.cursor()
    # find dirnames that are referenced by FilePaths but have no Prefixes row
    cu.execute("""select distinct d.dirnameId, d.dirname
    from Dirnames as d
    join ( select fp.dirnameId as dirnameId
           from FilePaths as fp
           left join Prefixes as p using(dirnameId)
           where p.dirnameId is null ) as dq
    using(dirnameId)
    """)
    ret = cu.fetchall()
    if ret:
        logMe(
            2, "fixing missing dirnames/prefixes links in %d dirnames" %
            (len(ret), ))
        trovestore.addPrefixesFromList(self.db, ret)
        self.db.analyze("Prefixes")
    return True
def _createFilePaths(self):
    # Split the legacy FilePaths.path column into normalized Dirnames and
    # Basenames tables, then rebuild FilePaths to reference them by id.
    logMe(2, "splitting paths in dirnames and basenames")
    cu = self.db.cursor()
    # working table: one row per filePathId with its path split in two
    cu.execute("""
    create table tmpDirnames (
        filePathId %(PRIMARYKEY)s,
        dirname %(PATHTYPE)s,
        basename %(PATHTYPE)s
    ) """ % self.db.keywords)
    # save a copy of FilePaths before updating the table definition
    cu.execute("""create table oldFilePaths as
    select filePathId, pathId, path from FilePaths""")
    self.db.createIndex("oldFilePaths",
                        "oldFilePathsIdx",
                        "filePathId",
                        check=False,
                        unique=True)
    self.db.analyze("oldFilePaths")
    # drop the FK constraint from TroveFiles into FilePaths
    self.db.loadSchema()
    self.db.dropForeignKey("TroveFiles", name="TroveFiles_filePathId_fk")
    # create Dirnames, Basenames and the new FilePaths tables
    cu.execute("drop table FilePaths")
    self.db.loadSchema()
    schema.createTroves(self.db, createIndex=False)
    # this is to avoid processing too many entries at once...
    sliceSize = 200000
    analyze = 1
    while True:
        # next slice of paths that have not been split yet
        cu.execute(
            """
        select fp.filePathId, fp.path
        from oldFilePaths as fp
        left join tmpDirnames as d using(filePathId)
        where d.filePathId is null
        limit ?""", sliceSize)
        tmpl = [(_fpid, os.path.split(_path))
                for _fpid, _path in cu.fetchall()]
        if not tmpl:
            break  # no more entries found
        self.db.bulkload("tmpDirnames",
                         [(x[0], x[1][0], x[1][1]) for x in tmpl],
                         ["filePathId", "dirname", "basename"])
        # don't analyze too often for speed reasons
        analyze -= 1
        if not analyze:
            analyze = 5
            self.db.analyze("tmpDirnames")
    self.db.createIndex("tmpDirnames",
                        "tmpDirnames_dirname_idx",
                        "dirname",
                        check=False)
    logMe(2, "extracting unique dirnames and basenames...")
    self.db.analyze("tmpDirnames")
    # the '' and '/' dirnames should already be in the Dirnames table
    cu.execute("""
    insert into Dirnames(dirname)
    select distinct dirname from tmpDirnames
    order by dirname """)
    self.db.analyze("Dirnames")
    cu.execute("""
    insert into Basenames(basename)
    select distinct basename from tmpDirnames
    order by basename """)
    self.db.analyze("Basenames")
    logMe(2, "generating the new FilePaths table...")
    # rebuild FilePaths by joining each old path back to its new ids
    cu.execute(
        """insert into FilePaths(filePathId, dirnameId, basenameId, pathId)
    select fp.filePathId, d.dirnameId, b.basenameId, fp.pathId
    from oldFilePaths as fp
    join tmpDirnames as td using(filePathId)
    join Dirnames as d on td.dirname = d.dirname
    join Basenames as b on td.basename = b.basename """)
    cu.execute("drop table oldFilePaths")
    cu.execute("drop table tmpDirnames")
    # fix the autoincrement primary key value on the new FilePaths
    cu.execute("select max(filePathId) from FilePaths")
    maxId = cu.fetchone()[0]
    if maxId:
        self.db.setAutoIncrement("FilePaths", "filePathId", maxId)
    self.db.analyze("FilePaths")
    # re-enable the FK constraint and create indexes
    logMe(3, "adding foreign key constraints...")
    self.db.addForeignKey("TroveFiles", "filePathId", "FilePaths",
                          "filePathId")
    self.db.analyze("TroveFiles")
    schema.createTroves(self.db)
def fixDuplicatePaths(self, repos):
    # Find (instanceId, path) pairs that appear more than once in
    # TroveFiles, keep a single row for each, and recompute trove
    # signatures where the duplicate rows differed.
    logMe(2, "checking database for duplicate path entries...")
    cu = self.db.cursor()
    # we'll have to do a full table scan on TroveFiles. no way
    # around it...
    cu.execute("""
    create temporary table tmpDups(
        instanceId integer,
        path %(PATHTYPE)s,
        counter integer
    ) %(TABLEOPTS)s""" % self.db.keywords)
    logMe(2, "looking for troves with duplicate paths...")
    # sqlite has a real challenge dealing with large datasets
    if self.db.driver == 'sqlite':
        cu2 = self.db.cursor()
        cu.execute("select distinct instanceId from TroveFiles")
        # so we split this in very little tasks. lots of them
        for (instanceId, ) in cu:
            cu2.execute(
                """
            insert into tmpDups (instanceId, path, counter)
            select instanceId, path, count(*)
            from TroveFiles
            where instanceId = ?
            group by instanceId, path
            having count(*) > 1""", instanceId)
    else:
        # other backends should be able to process this in one shot
        cu.execute("""
        insert into tmpDups (instanceId, path, counter)
        select instanceId, path, count(*)
        from TroveFiles
        group by instanceId, path
        having count(*) > 1""")
    counter = cu.execute("select count(*) from tmpDups").fetchall()[0][0]
    if counter > 0:
        # drop the old index, if any
        self.db.loadSchema()
        self.db.dropIndex("TroveFiles", "TroveFilesPathIdx")
        logMe(3, "detected %d duplicates" % (counter, ))
        # loop over every duplicate and apply the appropriate fix
        cu.execute("select instanceId, path from tmpDups")
        for (instanceId, path) in cu.fetchall():
            cu.execute(
                """select distinct
                instanceId, streamId, versionId, pathId, path
                from trovefiles where instanceId = ? and path = ?
                order by streamId, versionId, pathId""", (instanceId, path))
            ret = cu.fetchall()
            # delete all the duplicates and put the first one back
            cu.execute(
                "delete from trovefiles "
                "where instanceId = ? and path = ?", (instanceId, path))
            # in case they are different, we pick the oldest, chances are it is
            # more "original"
            cu.execute(
                "insert into trovefiles "
                "(instanceId, streamId, versionId, pathId, path) "
                "values (?,?,?,?,?)", tuple(ret[0]))
            if len(ret) > 1:
                # need to recompute the sha1 - we might have changed the trove manifest
                # if the records were different
                self.fixTroveSig(repos, instanceId)
    # recreate the indexes and triggers - including new path
    # index for TroveFiles. Also recreates the indexes table.
    logMe(2, 'Recreating indexes... (this could take a while)')
    cu.execute("drop table tmpDups")
    self.db.loadSchema()
    schema.createTroves(self.db)
    logMe(2, 'Indexes created.')