def visitor(path, prefix, exclusions, cu, dirname, fnames): if logger.getEffectiveLevel() <= logging.DEBUG: logger.debug("Walking %s", dirname) fnames.sort( ) # makes the child 'walking' messages come out in an order the user expects remove = [] for basename in fnames: fname = os.path.join(dirname, basename) assert fname.startswith(path) if prefix: assert fname.startswith(os.path.join(path, prefix)) dbfname = fname[len(path) + 1:] if exclusions and not should_allow(exclusions, basename, dbfname): remove.append(basename) continue try: st = os.stat(fname) mode = st.st_mode size = st[stat.ST_SIZE] if stat.S_ISDIR(mode): continue if not stat.S_ISREG(mode): logger.warn("Skipping non-regular file %s (%s)", dbfname, stat.S_IFMT(mode)) continue except IOError as e: if e.errno == errno.ENOENT: # it was deleted in between continue raise cu.execute( "INSERT INTO ondisk(path, dbpath, last_modified, size) VALUES (?, ?, ?, ?)", (fname, dbfname, int(st[stat.ST_MTIME]), size)) if remove and logger.getEffectiveLevel() <= logging.DEBUG: logger.debug("Removing %r from walk", list(remove)) for r in remove: fnames.remove(r)
def visitor(path, prefix, exclusions, cu, dirname, fnames):
    if logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Walking %s", dirname)

    # sort so the child 'walking' messages come out in an order the user expects
    fnames.sort()

    remove = []
    for basename in fnames:
        fname = os.path.join(dirname, basename)
        assert fname.startswith(path)
        if prefix:
            assert fname.startswith(os.path.join(path, prefix))

        dbfname = fname[len(path) + 1:]

        if exclusions and not should_allow(exclusions, basename, dbfname):
            remove.append(basename)
            continue

        try:
            st = os.stat(fname)
            mode = st.st_mode
            size = st.st_size
            if stat.S_ISDIR(mode):
                continue
            if not stat.S_ISREG(mode):
                logger.warning("Skipping non-regular file %s (%s)",
                               dbfname, stat.S_IFMT(mode))
                continue
        except OSError as e:  # os.stat raises OSError, not IOError
            if e.errno == errno.ENOENT:
                # it was deleted in between
                continue
            raise

        cu.execute("INSERT INTO ondisk(path, dbpath, last_modified, size)"
                   " VALUES (?, ?, ?, ?)",
                   (fname, dbfname, int(st.st_mtime), size))

    if remove and logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Removing %r from walk", remove)

    # os.path.walk recurses into whatever is left in fnames, so removing the
    # excluded entries in place prunes the walk
    for r in remove:
        fnames.remove(r)
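
# `should_allow` is defined elsewhere in the module. For orientation, here is
# a minimal sketch of the contract visitor() relies on, assuming the
# (type, expression) pairs built in sync(): 're' entries arrive pre-compiled,
# 'simple' entries are literal filenames, and 'glob' entries are
# fnmatch-style patterns. The matching details below (what each type is
# matched against) are guesses for illustration, not the project's actual
# implementation.
import fnmatch

def should_allow(exclusions, basename, dbfname):
    """Return False if any exclusion matches the candidate file."""
    for typ, expression in exclusions:
        if typ == 're':
            # compiled in sync(); assumed to match the db-relative path
            if expression.search(dbfname):
                return False
        elif typ == 'simple':
            # assumed to be an exact basename match
            if basename == expression:
                return False
        elif typ == 'glob':
            if fnmatch.fnmatch(basename, expression):
                return False
    return True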
def sync(conn, path, prefix, files=None):
    # `path` must be the full path on disk to the database root; `prefix` is
    # the path of the directory being synced, relative to `path` (or '' to
    # sync everything)
    start = time.time()

    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # totals, for progress printing

    with Cursor(conn) as c, Cursor(conn) as cu:
        c.execute("""
            CREATE TEMPORARY TABLE ondisk (
                path TEXT PRIMARY KEY COLLATE BINARY,
                dbpath TEXT COLLATE BINARY,
                last_modified INTEGER,
                size INTEGER
            );
        """)

        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid,
                   od.path AS path,
                   od.last_modified AS last_modified,
                   od.size AS size
              FROM ondisk od, files f
             WHERE od.dbpath = f.path
               AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path,
                   od.dbpath AS dbpath,
                   od.last_modified,
                   od.size AS size
              FROM ondisk od
             WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but that don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't
            # allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid, f.path AS path
                  FROM files f
                 WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                   AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s",
                                total, progresstotal, percent, s, fname)

        # remove the documents that have disappeared from disk
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)
                deletes += 1

        # update the ones that have changed
        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # add the new files
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs",
                    news, deletes, updates, time.time() - start)

        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
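
# `Cursor`, `tcount`, and `prefix_expr` are defined elsewhere in the module.
# Minimal sketches of plausible implementations follow, inferred from how
# sync() uses them; treat the details as assumptions, not the project's
# actual code.
from contextlib import contextmanager

@contextmanager
def Cursor(conn):
    # a cursor that closes itself when the `with` block exits
    c = conn.cursor()
    try:
        yield c
    finally:
        c.close()

def tcount(cu, table):
    # row count of an internal table; table names can't be bound parameters,
    # so this interpolates (safe here: `table` is never user input)
    cu.execute("SELECT COUNT(*) FROM %s;" % table)
    return cu.fetchone()[0]

def prefix_expr(prefix):
    # turn a prefix into a LIKE pattern matching paths underneath it, with
    # LIKE's metacharacters escaped -- which is why the deletedocs query
    # needs ESCAPE '\'. The trailing '/%' is a guess at the real pattern.
    escaped = (prefix.replace('\\', '\\\\')
                     .replace('%', '\\%')
                     .replace('_', '\\_'))
    return escaped + '/%'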
def main():
    ap = argparse.ArgumentParser(
        'fts', description="a command-line full-text search engine")
    ap.add_argument('--logging', default='warn', help=argparse.SUPPRESS,
                    choices=('error', 'warn', 'info', 'debug'))
    ap.add_argument("--init", action="store_true",
                    help="create a new .fts.db in the current directory")
    ap.add_argument("--no-sync", dest='nosync', action="store_true",
                    help="don't sync the database when making a new one;"
                         " only valid with --init")
    ap.add_argument("--sync", action="store_true",
                    help="sync the fts database with the files on disk")
    ap.add_argument("--optimize", action="store_true",
                    help="optimize the sqlite database for size and performance")
    ap.add_argument('--sync-one', metavar='filename',
                    help="sync a single file (unlike the other commands, this"
                         " one doesn't care about the current directory)")
    ap.add_argument("--list-ignores", action='store_true')
    ap.add_argument("--rm-ignore", type=int, metavar='ignoreid',
                    action='append', default=[])
    ap.add_argument("--ignore-re", metavar='re', action='append', default=[])
    ap.add_argument("--ignore-simple", metavar='filename', action='append',
                    default=[])
    ap.add_argument('--ignore', "--ignore-glob", dest='ignore_glob',
                    metavar='pattern', action='append', default=[])
    ap.add_argument('-r', '--re', '--regex', '--regexp', dest='searchmode',
                    default='MATCH', action="store_const", const='REGEXP',
                    help="search using a regex instead of MATCH syntax. Much slower!")
    ap.add_argument('-l', dest='display_mode', action='store_const',
                    const='filename_only',
                    help="print only the matching filenames")
    ap.add_argument('--color-mode', dest='color_mode',
                    choices=('yes', 'no', 'auto'), default='auto')
    ap.add_argument('--color', dest='color_mode', action='store_const',
                    const='yes')
    ap.add_argument("search", nargs="*")

    args = ap.parse_args()

    logger.setLevel(getattr(logging, args.logging.upper()))

    if args.color_mode == 'yes':
        color = True
    elif args.color_mode == 'no':
        color = False
    else:  # it's 'auto'
        color = (os.isatty(sys.stdout.fileno())
                 and args.display_mode != 'filename_only'
                 # no snippet support for regex searches yet
                 and args.searchmode != 'REGEXP')

    cwd = os.getcwd()
    didsomething = False
    exitval = 0

    if args.init:
        didsomething = True
        init(cwd)

    if args.sync_one:
        # this is designed to be called by tools like procmail or IDEs'
        # on-save hooks, so rather than making them play games with the cwd we
        # have special finddb logic for it. Note that because of this we are
        # vulnerable to .fts.db files that shadow the intended one, and we may
        # operate on a different .fts.db than other commands run in the same
        # session.
        # TODO: maybe we should refuse to run other commands in the same
        # session for this reason
        fpath = args.sync_one
        if not fpath.startswith('/'):
            fpath = os.path.join(cwd, fpath)
        assert os.path.isfile(fpath)

        dirname, basename = os.path.dirname(fpath), os.path.basename(fpath)
        froot, fprefix, conn = finddb(dirname)
        assert fpath.startswith(os.path.join(froot, fprefix))
        with conn:
            sync(conn, froot, fprefix, files=[basename])
        didsomething = True

    root, prefix, conn = finddb(cwd)

    with conn:
        # all other top-level functions operate in one global transaction
        for a in args.rm_ignore:
            didsomething = True
            rm_ignore(conn, a)

        for a in args.ignore_re:
            didsomething = True
            try:
                re.compile(a)
            except re.error:
                logger.error("Couldn't compile regex %r, are you sure it's valid?", a)
                raise
            add_ignore(conn, 're', a)

        for a in args.ignore_simple:
            didsomething = True
            add_ignore(conn, 'simple', a)

        for a in args.ignore_glob:
            didsomething = True
            add_ignore(conn, 'glob', a)

        if args.list_ignores:
            didsomething = True
            list_ignores(conn)

        dosync = args.sync or (args.init and not args.nosync)
        if dosync:
            didsomething = True
            sync(conn, root, prefix)

        if args.optimize:
            didsomething = True
            with Cursor(conn) as c:
                logger.debug("OPTIMIZE")
                c.execute("INSERT INTO files_fts(files_fts) VALUES('optimize');")
                # sqlite has no combined "VACUUM ANALYZE"; run them separately
                logger.debug("VACUUM")
                c.execute("VACUUM;")
                logger.debug("ANALYZE")
                c.execute("ANALYZE;")

        if args.search:
            # exit nonzero unless at least one search term returns a result
            exitval = 1
        for term in args.search:
            # for now, ANY search matching a document will return it, and a
            # document may be returned more than once
            didsomething = True
            for sr in search(conn, prefix, term, args.searchmode,
                             checksync=dosync, color=color):
                if args.display_mode == 'filename_only':
                    print sr.filename
                else:
                    print sr.format(color=color)
                # at least one result was returned
                exitval = 0

    if not didsomething:
        ap.print_usage()
        sys.exit(1)

    sys.exit(exitval)
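
# `finddb`, `init`, `search`, and the ignore helpers live elsewhere in the
# module. For orientation, here is a sketch of the finddb() contract main()
# depends on: walk upward from the starting directory until a .fts.db is
# found, and return (root, prefix, connection), where prefix is the relative
# path from the database root back down to the starting directory. The body
# below illustrates that contract and is not the real code; in particular,
# the real version presumably also registers the REGEXP function needed by
# --regexp searches before handing back the connection.
import sqlite3

def finddb(start):
    d = start
    while True:
        dbpath = os.path.join(d, '.fts.db')
        if os.path.isfile(dbpath):
            prefix = os.path.relpath(start, d)
            if prefix == '.':
                prefix = ''
            return d, prefix, sqlite3.connect(dbpath)
        parent = os.path.dirname(d)
        if parent == d:
            # hit the filesystem root without finding a database
            raise IOError("no .fts.db found; run fts --init first")
        d = parent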