Ejemplo n.º 1
0
    def __init__(self, database, useMd5=False):
        self.database = database

        dbExists = os.path.exists(database)

        usingMd5 = useMd5
        if not dbExists:
            logging.info("Sha1DB initialized with connection string %s" %
                         database)
            self._execSql("""create table if not exists files(
path varchar not null primary key,
chksum varchar not null,
symlink boolean default 0);""")
            self._execSql("create index csum_idx on files(chksum);")
            self._execSql(
                "create table if not exists versioning(chksum_type varchar not null)"
            )
            self._execSql("insert into versioning(chksum_type) values(?)",
                          ("md5" if useMd5 else "sha1", ))
        else:
            # pull the checksum type out of the database
            with sqliteConn(self.database) as cursor:
                cursor.execute("select chksum_type from versioning")
                for row in cursor:
                    (chksum_type, ) = row
                    usingMd5 = "md5" == chksum_type

        self.checksum = hashlib.md5 if usingMd5 else hashlib.sha1
Ejemplo n.º 2
0
 def updateChecksum(self, path):
   """Refresh the stored checksum entry for a single path.

   The work is delegated to _updateChecksumAndLink inside a dedicated sqliteConn
   block; per the original description, an entry whose path is a symlink is marked
   as such by that helper.  Any failure is logged and then re-raised.
   """
   try:
     with sqliteConn(self.database) as cur:
       self._updateChecksumAndLink(path, cur)
   except Exception as err:
     message = "Unable to update checksum for %s: %s" % (path, err)
     logging.error(message)
     raise
Ejemplo n.º 3
0
 def updatePath(self, old, new):
   """Rewrite stored paths after a rename from old to new.

   Intended for rename-like operations, where old and new may name directories
   rather than single files.  The PATH_UPDATE statement receives old, new and old
   with a trailing '%' (presumably a LIKE prefix pattern -- confirm against
   PATH_UPDATE).  Any failure is logged and then re-raised.
   """
   try:
     with sqliteConn(self.database) as cur:
       prefixPattern = old + '%'
       cur.execute(PATH_UPDATE, (old, new, prefixPattern))
   except Exception as err:
     message = "Unable to update path for %s to %s: %s" % (old, new, err)
     logging.error(message)
     raise
Ejemplo n.º 4
0
 def updateChecksum(self, path):
     """Refresh the stored checksum entry for a single path.

     The work is delegated to _updateChecksumAndLink inside a dedicated
     sqliteConn block; per the original description, an entry whose path is a
     symlink is marked as such by that helper.  Any failure is logged and
     then re-raised.
     """
     try:
         with sqliteConn(self.database) as cur:
             self._updateChecksumAndLink(path, cur)
     except Exception as err:
         message = "Unable to update checksum for %s: %s" % (path, err)
         logging.error(message)
         raise
Ejemplo n.º 5
0
    def dedup(self, dupdir, doSymlink):
        """ Moves duplicate entries (based on checksum) into the dupdir.  Uses the entry's path to 
    reconstruct a subdirectory hierarchy in dupdir.  This will remove any common prefixes
    between dupdir and the file path itself so as to make a useful subdirectory structure.
    If doSymlink is true, then the original paths of the files that were moved will be symlinked 
    back to the canonical file; in addition, it will keep the file entry in the database rather than
    removing it."""
        logging.info("De-duping database")

        if os.path.exists(dupdir) and not len(os.listdir(dupdir)) <= 0:
            raise Exception("%s is not empty; refusing to move files" % dupdir)

        try:
            pathmap = {}  # store duplicate paths keyed by file checksum

            with sqliteConn(self.database) as cursor:
                cursor.execute("""select chksum, path, link from files 
where chksum in(
select chksum from files where symlink = 0 group by chksum having count(chksum) > 1) 
and symlink = 0 
and link = 1
order by chksum, link;""")
                for row in cursor:
                    (chksum, path, islink) = row
                    if not chksum in pathmap:
                        # ensure existence of list for checksum
                        pathmap[chksum] = []
                    paths = pathmap[chksum]
                    paths.append(path)

                for chksum, paths in pathmap.iteritems():
                    # the query above will result in single rows for symlinked files, so fix that here
                    # rather than mucking about with temp tables
                    paths = filter(lambda path: not os.path.islink(path),
                                   paths)

                    # we'll have at least two elements due to the inner part of the query above
                    for path in paths:
                        dst = dstWithSubdirectory(path, dupdir)
                        moveFile(
                            path, dst,
                            (not doSymlink
                             ))  # don't rm empty dirs if we are symlinking
                        if not doSymlink:
                            cursor.execute(REMOVE_ROW, (path, ))
                        else:
                            cursor.execute(
                                "update files set symlink = 1 where path = ?;",
                                (path, ))
                            symlinkFile(canonicalPath, path)
            logging.info("De-duping complete")
        except Exception as einst:
            logging.error("Unable to de-dup database: %s" % einst)
            raise
Ejemplo n.º 6
0
 def updatePath(self, old, new):
     """Rewrite stored paths after a rename from old to new.

     Intended for rename-like operations, where old and new may name
     directories rather than single files.  The PATH_UPDATE statement
     receives old, new and old with a trailing '%' (presumably a LIKE prefix
     pattern -- confirm against PATH_UPDATE).  Any failure is logged and then
     re-raised.
     """
     try:
         with sqliteConn(self.database) as cur:
             prefixPattern = old + '%'
             cur.execute(PATH_UPDATE, (old, new, prefixPattern))
     except Exception as err:
         message = "Unable to update path for %s to %s: %s" % (old, new, err)
         logging.error(message)
         raise
Ejemplo n.º 7
0
 def _execSql(self, sql, sqlargs = None):
   """Run one SQL statement inside its own sqliteConn block.

   sql     -- statement text; it is passed through self._formatSql before execution.
   sqlargs -- optional statement parameters; when None the statement is executed
              without a parameter argument.

   Any failure is logged together with the statement and its arguments, then re-raised.
   """
   sql = self._formatSql(sql)
   logging.debug("Running SQL %s with args %s" % (sql, sqlargs))

   try:
     with sqliteConn(self.database) as cursor:
       # identity test against None: an empty parameter sequence is still passed through
       if sqlargs is not None:
         cursor.execute(sql, sqlargs)
       else:
         cursor.execute(sql)
   except Exception as einst:
     logging.error("Unable to exec %s with args %s: %s" % (sql, sqlargs, einst))
     raise
Ejemplo n.º 8
0
    def _execSql(self, sql, sqlargs=None):
        """Run one SQL statement inside its own sqliteConn block.

        sql     -- statement text; it is passed through self._formatSql
                   before execution.
        sqlargs -- optional statement parameters; when None the statement is
                   executed without a parameter argument.

        Any failure is logged together with the statement and its arguments,
        then re-raised.
        """
        sql = self._formatSql(sql)
        logging.debug("Running SQL %s with args %s" % (sql, sqlargs))

        try:
            with sqliteConn(self.database) as cursor:
                # identity test against None: an empty parameter sequence is
                # still passed through
                if sqlargs is not None:
                    cursor.execute(sql, sqlargs)
                else:
                    cursor.execute(sql)
        except Exception as einst:
            logging.error("Unable to exec %s with args %s: %s" %
                          (sql, sqlargs, einst))
            raise
Ejemplo n.º 9
0
 def updateAllChecksums(self, fsroot):
   """Update/insert checksums for all of the files located under fsroot.

   Meant as an optimization for rescanning the database: every file found by os.walk
   is handed to _updateChecksumAndLink with the same cursor, obtained from a single
   sqliteConn block (per the original note, one connection and transaction).

   Any failure is logged with the path being processed and then re-raised.
   """
   logging.info("Updating all checksums under %s" % fsroot)
   # start from fsroot so the error log below always has a value, even when the walk
   # fails before the first file is reached
   path = fsroot
   with sqliteConn(self.database) as cursor:
     try:
       for root, _dirs, files in os.walk(fsroot):
         for name in files:
           path = os.path.join(root, name)
           logging.info("Updating %s" % path)
           self._updateChecksumAndLink(path, cursor)
     except Exception as einst:
       logging.error("Unable to update checksum for %s: %s" % (path, einst))
       raise
   logging.info("Done updating all checksums")
Ejemplo n.º 10
0
  def dedup(self, dupdir, doSymlink):
    """ Moves duplicate entries (based on checksum) into the dupdir.  Uses the entry's path to 
    reconstruct a subdirectory hierarchy in dupdir.  This will remove any common prefixes
    between dupdir and the file path itself so as to make a useful subdirectory structure.
    If doSymlink is true, then the original paths of the files that were moved will be symlinked 
    back to the canonical file; in addition, it will keep the file entry in the database rather than
    removing it."""
    logging.info("De-duping database")
  
    if os.path.exists(dupdir) and not len(os.listdir(dupdir)) <= 0:
      raise Exception("%s is not empty; refusing to move files" % dupdir)
      
    try:
      pathmap = {} # store duplicate paths keyed by file checksum
      
      with sqliteConn(self.database) as cursor:
        cursor.execute("""select chksum, path, link from files 
where chksum in(
select chksum from files where symlink = 0 group by chksum having count(chksum) > 1) 
and symlink = 0 
and link = 1
order by chksum, link;""")
        for row in cursor:
          (chksum, path, islink) = row
          if not chksum in pathmap: 
            # ensure existence of list for checksum
            pathmap[chksum] = [] 
          paths = pathmap[chksum]
          paths.append(path)
          
        for chksum, paths in pathmap.iteritems():
          # the query above will result in single rows for symlinked files, so fix that here
          # rather than mucking about with temp tables
          paths = filter(lambda path: not os.path.islink(path), paths)
          
          # we'll have at least two elements due to the inner part of the query above
          for path in paths: 
            dst = dstWithSubdirectory(path, dupdir)
            moveFile(path, dst, (not doSymlink)) # don't rm empty dirs if we are symlinking
            if not doSymlink:
              cursor.execute(REMOVE_ROW, (path, ))
            else:
              cursor.execute("update files set symlink = 1 where path = ?;", (path, ))
              symlinkFile(canonicalPath, path)
      logging.info("De-duping complete")
    except Exception as einst:
      logging.error("Unable to de-dup database: %s" % einst)
      raise
Ejemplo n.º 11
0
 def updateAllChecksums(self, fsroot):
     """Update/insert checksums for all of the files located under fsroot.

     Meant as an optimization for rescanning the database: every file found
     by os.walk is handed to _updateChecksumAndLink with the same cursor,
     obtained from a single sqliteConn block (per the original note, one
     connection and transaction).

     Any failure is logged with the path being processed and then re-raised.
     """
     logging.info("Updating all checksums under %s" % fsroot)
     # start from fsroot so the error log below always has a value, even when
     # the walk fails before the first file is reached
     path = fsroot
     with sqliteConn(self.database) as cursor:
         try:
             for root, _dirs, files in os.walk(fsroot):
                 for name in files:
                     path = os.path.join(root, name)
                     logging.info("Updating %s" % path)
                     self._updateChecksumAndLink(path, cursor)
         except Exception as einst:
             logging.error("Unable to update checksum for %s: %s" %
                           (path, einst))
             raise
     logging.info("Done updating all checksums")
Ejemplo n.º 12
0
 def vacuum(self):
   """Drop database rows whose path no longer exists on disk.

   All paths are read from the files table first; the rows for paths that fail
   os.path.exists are then deleted one by one on the same cursor.  Any failure is
   logged and then re-raised.
   """
   logging.info("Vacuuming database")

   try:
     with sqliteConn(self.database) as cursor:
       cursor.execute("select path from files;")
       # finish reading the select before issuing any delete on this cursor
       missing = [entry for (entry, ) in cursor if not os.path.exists(entry)]

       for entry in missing:
         logging.info("Removing entry for %s; file does not exist" % entry)
         cursor.execute("delete from files where path = ?;", (entry, ))
       logging.info("Vacuum complete")
   except Exception as err:
     logging.error("Unable to vacuum database: %s" % err)
     raise
Ejemplo n.º 13
0
    def vacuum(self):
        """Drop database rows whose path no longer exists on disk.

        All paths are read from the files table first; the rows for paths
        that fail os.path.exists are then deleted one by one on the same
        cursor.  Any failure is logged and then re-raised.
        """
        logging.info("Vacuuming database")

        try:
            with sqliteConn(self.database) as cursor:
                cursor.execute("select path from files;")
                # finish reading the select before issuing any delete on
                # this cursor
                missing = [
                    entry for (entry, ) in cursor
                    if not os.path.exists(entry)
                ]

                for entry in missing:
                    logging.info("Removing entry for %s; file does not exist" %
                                 entry)
                    cursor.execute("delete from files where path = ?;",
                                   (entry, ))
                logging.info("Vacuum complete")
        except Exception as err:
            logging.error("Unable to vacuum database: %s" % err)
            raise
Ejemplo n.º 14
0
  def __init__(self, database, useMd5=False):
    self.database = database

    dbExists = os.path.exists(database)
    
    usingMd5 = useMd5
    if not dbExists:
      logging.info("Sha1DB initialized with connection string %s" % database)
      self._execSql("""create table if not exists files(
path varchar not null primary key,
chksum varchar not null,
symlink boolean default 0);""")
      self._execSql("create index csum_idx on files(chksum);")
      self._execSql("create table if not exists versioning(chksum_type varchar not null)");
      self._execSql("insert into versioning(chksum_type) values(?)", ("md5" if useMd5 else "sha1", ));
    else:
      # pull the checksum type out of the database
      with sqliteConn(self.database) as cursor:
        cursor.execute("select chksum_type from versioning")
        for row in cursor:
          (chksum_type, ) = row
          usingMd5 = "md5" == chksum_type

    self.checksum = hashlib.md5 if usingMd5 else hashlib.sha1