def init(cwd):
    """Create a new database inside *cwd* and return its file name.

    If a regular file named ``_db_name`` is already present in *cwd*, an
    error is logged and the process exits with status 1 rather than
    overwriting it.
    """
    candidate = os.path.join(cwd, _db_name)
    if os.path.isfile(candidate):
        logger.error("Cowardly refusing to overwrite existing %s", _db_name)
        sys.exit(1)

    # createdb() hands back two values; only the file name is used here.
    # NOTE(review): the second value looks like an open connection that is
    # never closed in this function -- confirm whether that is intended.
    dbfname, _conn = createdb(cwd)
    logger.info("Created %s", dbfname)
    return dbfname
def printprogress(s, fname):
    """Log one progress line: items done, total, percentage, action, file.

    Reads ``updates``, ``news``, ``deletes`` and ``progresstotal`` from the
    enclosing scope; ``progresstotal`` must be non-zero.

    s     -- short description of the action being performed
    fname -- name of the file being processed
    """
    done = updates + news + deletes
    fraction = float(done) / progresstotal
    logger.info("%d/%d (%.1f%%) %s: %s",
                done, progresstotal, fraction * 100, s, fname)
def sync(conn, path, prefix, files=None):
    """Reconcile the ``files`` table with what is currently on disk.

    The files found on disk are recorded in a temporary ``ondisk`` table
    (filled in by ``visitor``), which is then compared against ``files`` to
    work out which documents to delete, update and add.  The temporary
    tables and views are dropped again before returning normally.

    conn   -- database connection; two cursors are opened on it, one for
              reading result sets and one for writing
    path   -- must be a full path on disk
    prefix -- must be the full path on disk that we're syncing (or empty)
    files  -- when None, the tree under the sync root is walked and
              documents whose files have disappeared are removed; otherwise
              this list is handed straight to ``visitor`` for the sync root
              and the deletion pass is skipped

    Files that cannot be read because they vanished or are not accessible
    (ENOENT, EPERM, EACCES) are skipped with a warning; any other IOError
    propagates to the caller.

    Per-file progress lines are only emitted when DEBUG logging is enabled,
    because the totals they need are only computed at that level.
    """
    start = time.time()
    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing

    # errno values that mean "skip this one file" rather than "abort the
    # sync".  A permission-denied open() reports EACCES, so it is accepted
    # alongside EPERM.
    skippable = (errno.ENOENT, errno.EPERM, errno.EACCES)

    # the counting queries below are only worth running for debug output
    debug = logger.getEffectiveLevel() <= logging.DEBUG

    with Cursor(conn) as c, Cursor(conn) as cu:
        c.execute("""
            CREATE TEMPORARY TABLE ondisk (
                path          TEXT PRIMARY KEY COLLATE BINARY,
                dbpath        TEXT COLLATE BINARY,
                last_modified INTEGER,
                size          INTEGER
            );
        """)

        # load the exclusion rules, pre-compiling the regular expressions
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if debug:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
                SELECT f.docid AS docid,
                       od.path AS path,
                       od.last_modified AS last_modified,
                       od.size AS size
                  FROM ondisk od, files f
                 WHERE od.dbpath = f.path
                       AND f.last_modified < od.last_modified
        """)
        if debug:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
                SELECT od.path AS path,
                       od.dbpath AS dbpath,
                       od.last_modified,
                       od.size AS size
                  FROM ondisk od
                 WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if debug:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't
            # allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                    SELECT f.docid AS docid,
                           f.path AS path
                      FROM files f
                     WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                           AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if debug:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure; it stays a no-op
        # unless there is a non-zero total to report against
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s",
                                total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)
                deletes += 1

        # files whose on-disk copy is newer than the indexed one
        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno not in skippable:
                    raise
                logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                continue
            updates += 1

        # new files to create
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno not in skippable:
                    raise
                logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs",
                    news, deletes, updates, time.time() - start)

        # deletedocs only exists when a full walk was done, hence IF EXISTS
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
def printprogress(s, fname):
    """Report sync progress through the module logger.

    The number of items handled so far is the sum of the ``updates``,
    ``news`` and ``deletes`` counters found in the enclosing scope, and it
    is shown against ``progresstotal`` (which must be non-zero) both as a
    count and as a percentage.

    s     -- label for the current action
    fname -- file the action applies to
    """
    handled = updates + news + deletes
    share = float(handled) / progresstotal
    pct = share * 100
    logger.info("%d/%d (%.1f%%) %s: %s", handled, progresstotal, pct, s, fname)
def sync(conn, path, prefix, files=None):
    """Reconcile the ``files`` table with what is currently on disk.

    The files found on disk are recorded in a temporary ``ondisk`` table
    (filled in by ``visitor``), which is then compared against ``files`` to
    work out which documents to delete, update and add.  The temporary
    tables and views are dropped again before returning normally.

    conn   -- database connection; two cursors are opened on it, one for
              reading result sets and one for writing
    path   -- must be a full path on disk
    prefix -- must be the full path on disk that we're syncing (or empty)
    files  -- when None, the tree under the sync root is walked and
              documents whose files have disappeared are removed; otherwise
              this list is handed straight to ``visitor`` for the sync root
              and the deletion pass is skipped

    Files that cannot be read because they vanished or are not accessible
    (ENOENT, EPERM, EACCES) are skipped with a warning; any other IOError
    propagates to the caller.

    Per-file progress lines are only emitted when DEBUG logging is enabled,
    because the totals they need are only computed at that level.
    """
    start = time.time()
    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing

    # errno values that mean "skip this one file" rather than "abort the
    # sync".  A permission-denied open() reports EACCES, so it is accepted
    # alongside EPERM.
    skippable = (errno.ENOENT, errno.EPERM, errno.EACCES)

    # the counting queries below are only worth running for debug output
    debug = logger.getEffectiveLevel() <= logging.DEBUG

    with Cursor(conn) as c, Cursor(conn) as cu:
        c.execute("""
            CREATE TEMPORARY TABLE ondisk (
                path          TEXT PRIMARY KEY COLLATE BINARY,
                dbpath        TEXT COLLATE BINARY,
                last_modified INTEGER,
                size          INTEGER
            );
        """)

        # load the exclusion rules, pre-compiling the regular expressions
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if debug:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
                SELECT f.docid AS docid,
                       od.path AS path,
                       od.last_modified AS last_modified,
                       od.size AS size
                  FROM ondisk od, files f
                 WHERE od.dbpath = f.path
                       AND f.last_modified < od.last_modified
        """)
        if debug:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
                SELECT od.path AS path,
                       od.dbpath AS dbpath,
                       od.last_modified,
                       od.size AS size
                  FROM ondisk od
                 WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if debug:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't
            # allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                    SELECT f.docid AS docid,
                           f.path AS path
                      FROM files f
                     WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                           AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if debug:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure; it stays a no-op
        # unless there is a non-zero total to report against
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s",
                                total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)
                deletes += 1

        # files whose on-disk copy is newer than the indexed one
        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno not in skippable:
                    raise
                logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                continue
            updates += 1

        # new files to create
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno not in skippable:
                    raise
                logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs",
                    news, deletes, updates, time.time() - start)

        # deletedocs only exists when a full walk was done, hence IF EXISTS
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")