Ejemplo n.º 1
0
def search(conn, prefix, term, mode, checksync=True, color=False):
    """Yield a ``SearchResult`` for every indexed file whose body matches ``term``.

    This is a generator: nothing runs (argument validation included) until
    the first result is requested, and the "out-of-date" warning is only
    logged once the results have been consumed to the end.

    Args:
        conn: database connection handed to ``Cursor``.
        prefix: restricts the search to paths matching ``prefix_expr(prefix)``;
            ``None`` or ``""`` searches every indexed file.
        term: search expression, bound as the right-hand side of the
            ``MATCH``/``REGEXP`` operator.
        mode: SQL operator applied to ``ft.body``: ``"MATCH"`` or ``"REGEXP"``.
        checksync: when true, ``os.stat`` every hit and count the ones that
            are missing or have a newer mtime than the indexed one.
        color: when true, the snippet is decorated with the module-level
            ``snippet_color`` / ``snippet_end_color`` / ``snippet_elipsis``
            markers; otherwise no markers are used and the ellipsis is "...".

    Yields:
        ``SearchResult(shortpath, offsets, snippet)`` in the order produced by
        ``ORDER BY simple_rank(matchinfo(...))``.

    Raises:
        ValueError: if ``mode`` is not one of the supported operators.
    """
    # ``mode`` is spliced into the SQL text below (an operator cannot be bound
    # as a parameter), so it has to be checked with a real conditional: a bare
    # ``assert`` is stripped under ``python -O`` and would let arbitrary SQL
    # through.
    if mode not in ("MATCH", "REGEXP"):
        raise ValueError("mode must be 'MATCH' or 'REGEXP', got %r" % (mode,))

    with Cursor(conn) as c:
        prefix = prefix or ""
        prefixexpr = prefix_expr(prefix)
        needsync = 0
        c.execute(
            """
            SELECT f.path, f.last_modified,
                   offsets(ft.files_fts),
                   snippet(ft.files_fts, ?, ?, ?, -1, -10)
              FROM files f, files_fts ft
             WHERE f.docid = ft.docid
               AND (? = '' OR f.path LIKE ? ESCAPE '\\') -- use the prefix if present -- ESCAPE disables the LIKE optimization :(
               AND ft.body %(mode)s ?
          -- TODO: this runs simple_rank, which calls a Python function, many
          -- times per row. we can decompose this to a subselect to avoid this
          ORDER BY simple_rank(matchinfo(ft.files_fts))
        """
            % dict(mode=mode),
            (
                # the first three parameters feed snippet(): start marker,
                # end marker and ellipsis text
                snippet_color if color else "",
                snippet_end_color if color else "",
                snippet_elipsis if color else "...",
                # the same prefix is bound twice: once for the "is it empty"
                # test and once (as a LIKE pattern) for the actual filter
                prefix,
                prefixexpr,
                term,
            ),
        )
        for (path, last_modified, offsets, snippet) in c:

            # internal sanity check on the LIKE filter, not input validation
            if prefix:
                assert path.startswith(prefix)

            # if they're in a subdirectory, deprefix the filename. the extra
            # +1 drops one more character after the prefix (presumably the
            # path separator -- confirm against prefix_expr)
            shortpath = path[len(prefix) + 1 :] if prefix else path

            if checksync:
                # check if the returned files are known to be out of date. this
                # can be skipped when checksync is False (which means that a
                # sync was done before starting the search)
                try:
                    st = os.stat(shortpath)
                    if int(st[stat.ST_MTIME]) > last_modified:
                        needsync += 1
                except OSError:
                    # a file we cannot stat counts as out of date
                    needsync += 1

            yield SearchResult(shortpath, offsets, snippet)

        if needsync:
            logger.warning("%d files were missing or out-of-date, you may need to resync", needsync)
Ejemplo n.º 2
0
def search(conn, prefix, term, mode, checksync=True, color=False):
    """Search the full-text index and yield one ``SearchResult`` per hit.

    The function is a generator, so the body (including the ``mode`` check)
    only starts executing when the caller asks for the first result, and the
    resync warning is emitted only after the last result has been consumed.

    Args:
        conn: database connection handed to ``Cursor``.
        prefix: limits hits to paths matching ``prefix_expr(prefix)``; a falsy
            value (``None`` or ``''``) disables the restriction.
        term: the expression bound to the ``MATCH``/``REGEXP`` operator.
        mode: which SQL operator to apply to ``ft.body``; must be ``'MATCH'``
            or ``'REGEXP'``.
        checksync: when true, every hit is ``os.stat``-ed and files that are
            missing, or whose mtime is newer than the indexed one, are
            counted for the final warning.
        color: when true, snippets use the module-level ``snippet_color``,
            ``snippet_end_color`` and ``snippet_elipsis`` strings; otherwise
            empty markers and ``'...'``.

    Yields:
        ``SearchResult(shortpath, offsets, snippet)``, ordered by
        ``simple_rank(matchinfo(...))``.

    Raises:
        ValueError: if ``mode`` is not ``'MATCH'`` or ``'REGEXP'``.
    """
    # ``mode`` becomes part of the SQL text (operators cannot be bound as
    # parameters). Validate it unconditionally: ``assert`` statements are
    # removed when Python runs with -O, which would leave the interpolation
    # below unguarded.
    if mode not in ('MATCH', 'REGEXP'):
        raise ValueError("mode must be 'MATCH' or 'REGEXP', got %r" % (mode,))

    with Cursor(conn) as c:
        prefix = prefix or ''
        prefixexpr = prefix_expr(prefix)
        needsync = 0
        c.execute(
            """
            SELECT f.path, f.last_modified,
                   offsets(ft.files_fts),
                   snippet(ft.files_fts, ?, ?, ?, -1, -10)
              FROM files f, files_fts ft
             WHERE f.docid = ft.docid
               AND (? = '' OR f.path LIKE ? ESCAPE '\\') -- use the prefix if present -- ESCAPE disables the LIKE optimization :(
               AND ft.body %(mode)s ?
          -- TODO: this runs simple_rank, which calls a Python function, many
          -- times per row. we can decompose this to a subselect to avoid this
          ORDER BY simple_rank(matchinfo(ft.files_fts))
        """ % dict(mode=mode), (
                # snippet() arguments: start marker, end marker, ellipsis
                snippet_color if color else '',
                snippet_end_color if color else '',
                snippet_elipsis if color else '...',
                # prefix is bound twice: raw for the emptiness test, and as
                # a LIKE pattern for the filter itself
                prefix,
                prefixexpr,
                term,
            ))
        for (path, last_modified, offsets, snippet) in c:

            # invariant of the LIKE filter above, not user-input validation
            if prefix:
                assert path.startswith(prefix)

            # if they're in a subdirectory, deprefix the filename (the +1
            # also skips the character following the prefix, presumably the
            # path separator -- confirm against prefix_expr)
            shortpath = path[len(prefix) + 1:] if prefix else path

            if checksync:
                # check if the returned files are known to be out of date. this
                # can be skipped when checksync is False (which means that a
                # sync was done before starting the search)
                try:
                    st = os.stat(shortpath)
                    if int(st[stat.ST_MTIME]) > last_modified:
                        needsync += 1
                except OSError:
                    # an unstat-able file is treated as out of date
                    needsync += 1

            yield SearchResult(shortpath, offsets, snippet)

        if needsync:
            logger.warning(
                "%d files were missing or out-of-date, you may need to resync",
                needsync)
Ejemplo n.º 3
0
def sync(conn, path, prefix, files=None):
    """Synchronise the index stored in ``conn`` with the files on disk.

    The files found on disk are first recorded in a temporary ``ondisk``
    table (filled in by ``visitor``, which is defined elsewhere). That table
    is compared with the ``files`` table to derive three groups -- documents
    to delete, to update and to create -- which are then applied through
    ``remove_document``, ``update_document`` and ``add_document``. A summary
    is logged at INFO level and the temporary objects are dropped at the end.

    Args:
        conn: database connection. Two ``Cursor`` objects are opened on it:
            ``c`` drives the result sets being iterated, while ``cu`` is
            handed to the helpers that are called during that iteration.
        path: must be a full path on disk.
        prefix: must be the full path on disk that we're syncing (or empty).
            When non-empty it is joined onto ``path`` to obtain the directory
            to scan, and it limits which indexed documents may be deleted.
        files: ``None`` to walk the whole tree below the scan directory.
            Otherwise the value is passed to ``visitor`` in one single call
            for the scan directory, and no deletions are performed.

    Raises:
        IOError: re-raised from reading a file, unless its errno is ``ENOENT``
            or ``EPERM``, in which case the file is skipped with a warning.
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)

    start = time.time()

    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing

    with Cursor(conn) as c, Cursor(conn) as cu:
        c.execute("""
                  CREATE TEMPORARY TABLE
                  ondisk (
                     path          TEXT PRIMARY KEY COLLATE BINARY,
                     dbpath        TEXT COLLATE BINARY,
                     last_modified INTEGER,
                     size          INTEGER
                  );
                  """)

        # load the exclusion rules, pre-compiling the regular-expression ones
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # os.path.walk() only exists on Python 2, whereas os.walk() is
            # available on both 2 and 3. Reproduce the old callback contract:
            # the visitor receives every entry name of a directory in one
            # mutable list and may remove names from it to stop the walk
            # from descending into them.
            for dirname, subdirs, filenames in os.walk(wpath):
                names = subdirs + filenames
                visitor(path, prefix, exclusions, cu, dirname, names)
                kept = set(names)
                subdirs[:] = [d for d in subdirs if d in kept]
        else:
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        # the row counts below cost a query each, so they are only computed
        # when DEBUG output is actually enabled
        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid,
                   od.path AS path,
                   od.last_modified AS last_modified,
                   od.size AS size
              FROM ondisk od, files f
             WHERE od.dbpath = f.path
               AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path,
                   od.dbpath AS dbpath,
                   od.last_modified,
                   od.size AS size
              FROM ondisk od
             WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore. only
        # meaningful after a full walk: with an explicit ``files`` list,
        # absence from ``ondisk`` does not mean the file is gone
        if files is None:
            # has to be a table instead of a view because parameters aren't allowed in views
            cu.execute(
                """
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid,
                       f.path AS path
                  FROM files f
                 WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                   AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure. the default is a
        # no-op; it is only replaced when the totals are known, and those are
        # only counted at DEBUG level (see above)
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:

                def printprogress(s, fname):
                    # reads the running counters of the enclosing function at
                    # call time, i.e. before the current file is counted
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s", total, progresstotal,
                                percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)

                deletes += 1

        c.execute(
            "SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                # a file that vanished or is not accessible is skipped (and
                # not counted); any other I/O failure aborts the sync
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname,
                                   os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute(
            "SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                # same skip policy as for updates
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname,
                                   os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs", news,
                    deletes, updates,
                    time.time() - start)

        # NOTE(review): this cleanup only runs on the success path; if an
        # exception escapes above, the temporary objects stay behind unless
        # Cursor's exit handling undoes them -- confirm
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
Ejemplo n.º 4
0
def sync(conn, path, prefix, files=None):
    """Update the index in ``conn`` so that it reflects the files on disk.

    Steps, in order:

    1. create a temporary ``ondisk`` table and let ``visitor`` (defined
       elsewhere) record the files found on disk in it;
    2. derive the ``updated_files`` and ``created_files`` views and, for a
       full walk only, the ``deletedocs`` table;
    3. apply the three groups with ``remove_document``, ``update_document``
       and ``add_document``;
    4. log a summary at INFO level and drop the temporary objects.

    Args:
        conn: database connection; two ``Cursor`` objects are opened on it so
            that one (``c``) can be iterated while the other (``cu``) is
            passed to the helpers called inside the loops.
        path: must be a full path on disk.
        prefix: must be the full path on disk that we're syncing (or empty).
            A non-empty prefix is joined onto ``path`` to get the directory
            that is scanned and restricts which documents may be deleted.
        files: ``None`` walks the whole tree below the scan directory. Any
            other value is handed to ``visitor`` in a single call for the
            scan directory, and the deletion step is skipped.

    Raises:
        IOError: propagated from reading a file when its errno is neither
            ``ENOENT`` nor ``EPERM``; those two are logged and skipped.
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)

    start = time.time()

    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing

    with Cursor(conn) as c, Cursor(conn) as cu:
        c.execute("""
                  CREATE TEMPORARY TABLE
                  ondisk (
                     path          TEXT PRIMARY KEY COLLATE BINARY,
                     dbpath        TEXT COLLATE BINARY,
                     last_modified INTEGER,
                     size          INTEGER
                  );
                  """)

        # read the exclusion rules; 're' rules are compiled once up front
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # os.path.walk() was removed in Python 3; os.walk() exists on
            # both major versions. Keep the callback contract of the old
            # API: one mutable list holding all entry names of the directory,
            # from which the visitor may delete names to prevent the walk
            # from descending into them.
            for dirname, subdirs, filenames in os.walk(wpath):
                names = subdirs + filenames
                visitor(path, prefix, exclusions, cu, dirname, names)
                kept = set(names)
                subdirs[:] = [d for d in subdirs if d in kept]
        else:
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        # each tcount() is an extra query, hence the explicit level checks
        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid,
                   od.path AS path,
                   od.last_modified AS last_modified,
                   od.size AS size
              FROM ondisk od, files f
             WHERE od.dbpath = f.path
               AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path,
                   od.dbpath AS dbpath,
                   od.last_modified,
                   od.size AS size
              FROM ondisk od
             WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore. this
        # is skipped for an explicit ``files`` list, where not being in
        # ``ondisk`` does not imply the file was removed
        if files is None:
            # has to be a table instead of a view because parameters aren't allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid,
                       f.path AS path
                  FROM files f
                 WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                   AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure. it stays a no-op
        # unless the totals are non-zero, and the totals are only counted at
        # DEBUG level (see the tcount() calls above)
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    # the counters are looked up in the enclosing scope when
                    # called, so they exclude the file being reported
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s", total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)

                deletes += 1

        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                # missing or inaccessible files are skipped and not counted;
                # every other I/O error aborts the sync
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                # same skip policy as for updates
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs", news, deletes, updates, time.time() - start)

        # NOTE(review): the temporaries are dropped on the success path only;
        # after an exception they remain unless Cursor's exit handling
        # removes them -- confirm
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")