def search(conn, prefix, term, mode, checksync=True, color=False):
    """Query the full-text index for *term*, yielding SearchResult rows.

    conn      -- sqlite connection usable with Cursor()
    prefix    -- optional path prefix restricting results; matched paths
                 are yielded with the prefix stripped
    mode      -- "MATCH" or "REGEXP"; interpolated into the SQL, so it is
                 whitelisted first
    checksync -- when True, stat() each hit and count files that are
                 missing or newer on disk than in the index; a warning is
                 logged at the end. Pass False if a sync was just done.
    color     -- when True, wrap snippet highlights in color escapes
    """
    assert mode in ("MATCH", "REGEXP")

    stale = 0
    with Cursor(conn) as cursor:
        prefix = prefix or ""

        sql = """
            SELECT f.path, f.last_modified,
                   offsets(ft.files_fts),
                   snippet(ft.files_fts, ?, ?, ?, -1, -10)
            FROM files f, files_fts ft
            WHERE f.docid = ft.docid
              AND (? = '' OR f.path LIKE ? ESCAPE '\\')
                  -- use the prefix if present
                  -- ESCAPE disables the LIKE optimization :(
              AND ft.body %(mode)s ?
            -- TODO: this runs simple_rank, which calls a Python function, many
            -- times per row. we can decompose this to a subselect to avoid this
            ORDER BY simple_rank(matchinfo(ft.files_fts))
        """ % dict(mode=mode)

        params = (
            # snippet() delimiters: color escapes when requested,
            # plain-text markers otherwise
            snippet_color if color else "",
            snippet_end_color if color else "",
            snippet_elipsis if color else "...",
            prefix,
            prefix_expr(prefix),
            term,
        )
        cursor.execute(sql, params)

        for row in cursor:
            path, last_modified, offsets, snippet = row

            if prefix:
                assert path.startswith(prefix)
                # results under a prefix are reported relative to it
                short = path[len(prefix) + 1:]
            else:
                short = path

            if checksync:
                # Check whether this hit is stale on disk. NOTE(review):
                # this stats the de-prefixed path, which assumes the
                # process cwd is the prefix directory -- confirm callers.
                try:
                    if int(os.stat(short)[stat.ST_MTIME]) > last_modified:
                        stale += 1
                except OSError:
                    stale += 1

            yield SearchResult(short, offsets, snippet)

    if stale:
        logger.warning(
            "%d files were missing or out-of-date, you may need to resync",
            stale)
def search(conn, prefix, term, mode, checksync=True, color=False):
    """Search the FTS index for *term*, yielding SearchResult tuples.

    conn      -- sqlite connection usable with Cursor()
    prefix    -- optional path prefix to restrict results to; matching
                 paths are yielded with the prefix stripped
    mode      -- 'MATCH' or 'REGEXP'; interpolated into the SQL below,
                 hence the whitelist assert
    checksync -- when True, stat() each result and count files missing or
                 newer on disk than the index; logs a warning at the end.
                 Pass False when a sync was done before searching.
    color     -- when True, wrap snippet highlights in color escapes
    """
    # mode goes into the SQL via %-interpolation, so whitelist it first
    assert mode in ('MATCH', 'REGEXP')
    with Cursor(conn) as c:
        prefix = prefix or ''
        prefixexpr = prefix_expr(prefix)
        needsync = 0
        c.execute(
            """
            SELECT f.path, f.last_modified,
                   offsets(ft.files_fts),
                   snippet(ft.files_fts, ?, ?, ?, -1, -10)
            FROM files f, files_fts ft
            WHERE f.docid = ft.docid
              AND (? = '' OR f.path LIKE ? ESCAPE '\\')
                  -- use the prefix if present
                  -- ESCAPE disables the LIKE optimization :(
              AND ft.body %(mode)s ?
            -- TODO: this runs simple_rank, which calls a Python function, many
            -- times per row. we can decompose this to a subselect to avoid this
            ORDER BY simple_rank(matchinfo(ft.files_fts))
            """ % dict(mode=mode),
            (
                # snippet() delimiters: color escapes when requested,
                # plain-text defaults otherwise
                snippet_color if color else '',
                snippet_end_color if color else '',
                snippet_elipsis if color else '...',
                prefix,
                prefixexpr,
                term,
            ))
        for (path, last_modified, offsets, snippet) in c:
            if prefix:
                assert path.startswith(prefix)
            # if they're in a subdirectory, deprefix the filename
            shortpath = path[len(prefix) + 1:] if prefix else path
            if checksync:
                # check if the returned files are known to be out of date. this
                # can be skipped when checksync is False (which means that a
                # sync was done before starting the search)
                # NOTE(review): stats the de-prefixed relative path, which
                # assumes cwd is the prefix directory -- confirm callers
                try:
                    st = os.stat(shortpath)
                    if int(st[stat.ST_MTIME]) > last_modified:
                        needsync += 1
                except OSError:
                    # missing file also counts as out of sync
                    needsync += 1
            yield SearchResult(shortpath, offsets, snippet)
    if needsync:
        logger.warning(
            "%d files were missing or out-of-date, you may need to resync",
            needsync)
def sync(conn, path, prefix, files=None):
    """Bring the files/files_fts index in sync with the filesystem.

    path   -- root directory being indexed; must be a full path on disk
    prefix -- subdirectory under *path* to restrict the sync to, or '' to
              sync everything; must be the full path on disk that we're
              syncing (or empty)
    files  -- optional explicit list of files to sync. When None the tree
              is walked and rows for vanished files are deleted; when
              given, only those files are visited and nothing is deleted.
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)
    start = time.time()
    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing
    with Cursor(conn) as c, Cursor(conn) as cu:
        # snapshot of what's currently on disk, filled in by visitor()
        c.execute("""
            CREATE TEMPORARY TABLE ondisk (
                path TEXT PRIMARY KEY COLLATE BINARY,
                dbpath TEXT COLLATE BINARY,
                last_modified INTEGER,
                size INTEGER
            );
        """)

        # load exclusion rules; compile regex ones once, up front
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # walk the whole tree; visitor() inserts rows into ondisk via cu
            # NOTE(review): os.path.walk exists only on Python 2
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            # explicit file list: single visitor call, no tree walk
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid, od.path AS path,
                   od.last_modified AS last_modified, od.size AS size
            FROM ondisk od, files f
            WHERE od.dbpath = f.path
              AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path, od.dbpath AS dbpath, od.last_modified,
                   od.size AS size
            FROM ondisk od
            WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't
            # allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid, f.path AS path
                FROM files f
                WHERE (? = '' OR f.path LIKE ? ESCAPE '\\')
                      -- ESCAPE disables the LIKE optimization :(
                  AND NOT EXISTS(SELECT 1 FROM ondisk od
                                 WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure
        def printprogress(*a):
            # default: no-op unless INFO logging is on and there is work
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    # reads the counters that the loops below keep mutating
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s",
                                total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)
                deletes += 1

        c.execute(
            "SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # file vanished or became unreadable since the walk;
                    # best-effort skip rather than aborting the whole sync
                    logger.warning("Skipping %s: %s",
                                   fname, os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute(
            "SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # same best-effort skip as the update loop above
                    logger.warning("Skipping %s: %s",
                                   fname, os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs",
                    news, deletes, updates, time.time() - start)

        # drop the temporary objects so a later sync on the same connection
        # can recreate them (deletedocs only exists when files is None)
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
def sync(conn, path, prefix, files = None):
    """Bring the files/files_fts index in sync with the filesystem.

    path   -- root directory being indexed; must be a full path on disk
    prefix -- subdirectory under *path* to restrict the sync to, or '' to
              sync everything; must be the full path on disk that we're
              syncing (or empty)
    files  -- optional explicit list of files to sync. When None the tree
              is walked and rows for vanished files are deleted; when
              given, only those files are visited and nothing is deleted.
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)
    start = time.time()
    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing
    with Cursor(conn) as c, Cursor(conn) as cu:
        # snapshot of what's currently on disk, filled in by visitor()
        c.execute("""
            CREATE TEMPORARY TABLE ondisk (
                path TEXT PRIMARY KEY COLLATE BINARY,
                dbpath TEXT COLLATE BINARY,
                last_modified INTEGER,
                size INTEGER
            );
        """)

        # load exclusion rules; compile regex ones once, up front
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # walk the whole tree; visitor() inserts rows into ondisk via cu
            # NOTE(review): os.path.walk exists only on Python 2
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            # explicit file list: single visitor call, no tree walk
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid, od.path AS path,
                   od.last_modified AS last_modified, od.size AS size
            FROM ondisk od, files f
            WHERE od.dbpath = f.path
              AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path, od.dbpath AS dbpath, od.last_modified,
                   od.size AS size
            FROM ondisk od
            WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't
            # allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid, f.path AS path
                FROM files f
                WHERE (? = '' OR f.path LIKE ? ESCAPE '\\')
                      -- ESCAPE disables the LIKE optimization :(
                  AND NOT EXISTS(SELECT 1 FROM ondisk od
                                 WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure
        def printprogress(*a):
            # default: no-op unless INFO logging is on and there is work
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    # reads the counters that the loops below keep mutating
                    total = updates+news+deletes
                    percent = float(total)/progresstotal*100
                    logger.info("%d/%d (%.1f%%) %s: %s",
                                total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs");
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)
                deletes += 1

        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size/1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # file vanished or became unreadable since the walk;
                    # best-effort skip rather than aborting the whole sync
                    logger.warning("Skipping %s: %s",
                                   fname, os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size/1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # same best-effort skip as the update loop above
                    logger.warning("Skipping %s: %s",
                                   fname, os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs",
                    news, deletes, updates, time.time()-start)

        # drop the temporary objects so a later sync on the same connection
        # can recreate them (deletedocs only exists when files is None)
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")