Ejemplo n.º 1
0
def visitor(path, prefix, exclusions, cu, dirname, fnames):
    """os.path.walk() callback: record every regular file under dirname
    into the temporary ``ondisk`` table via cursor ``cu``.

    path       -- root of the tree being synced (full path on disk)
    prefix     -- subdirectory under ``path`` being synced, or '' for all
    exclusions -- (type, expression) pairs consulted by should_allow()
    cu         -- DB cursor used for the INSERTs
    dirname    -- directory currently being visited
    fnames     -- entries of dirname; pruned in place so the walk skips
                  excluded children
    """
    if logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Walking %s", dirname)
        # makes the child 'walking' messages come out in an order the
        # user expects
        fnames.sort()

    remove = []

    for basename in fnames:
        fname = os.path.join(dirname, basename)

        assert fname.startswith(path)
        if prefix:
            assert fname.startswith(os.path.join(path, prefix))

        # path relative to the DB root (strip the root plus its trailing '/')
        dbfname = fname[len(path) + 1:]

        if exclusions and not should_allow(exclusions, basename, dbfname):
            remove.append(basename)
            continue

        try:
            st = os.stat(fname)
            mode = st.st_mode
            size = st[stat.ST_SIZE]
            if stat.S_ISDIR(mode):
                continue
            if not stat.S_ISREG(mode):
                # logger.warn is a deprecated alias for logger.warning
                logger.warning("Skipping non-regular file %s (%s)", dbfname,
                               stat.S_IFMT(mode))
                continue
        except (IOError, OSError) as e:
            # os.stat() raises OSError, not IOError (distinct types in
            # Python 2), so catching IOError alone would miss a file that
            # disappeared mid-walk
            if e.errno == errno.ENOENT:
                # it was deleted in between
                continue
            raise

        cu.execute(
            "INSERT INTO ondisk(path, dbpath, last_modified, size) VALUES (?, ?, ?, ?)",
            (fname, dbfname, int(st[stat.ST_MTIME]), size))

    if remove and logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Removing %r from walk", list(remove))
    for r in remove:
        fnames.remove(r)
Ejemplo n.º 2
0
def visitor(path, prefix, exclusions, cu, dirname, fnames):
    """os.path.walk() callback: record every regular file under dirname
    into the temporary ``ondisk`` table via cursor ``cu``.

    path       -- root of the tree being synced (full path on disk)
    prefix     -- subdirectory under ``path`` being synced, or '' for all
    exclusions -- (type, expression) pairs consulted by should_allow()
    cu         -- DB cursor used for the INSERTs
    dirname    -- directory currently being visited
    fnames     -- entries of dirname; pruned in place so the walk skips
                  excluded children
    """
    if logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Walking %s", dirname)
        # makes the child 'walking' messages come out in an order the
        # user expects
        fnames.sort()

    remove = []

    for basename in fnames:
        fname = os.path.join(dirname, basename)

        assert fname.startswith(path)
        if prefix:
            assert fname.startswith(os.path.join(path, prefix))

        # path relative to the DB root (strip the root plus its trailing '/')
        dbfname = fname[len(path)+1:]

        if exclusions and not should_allow(exclusions, basename, dbfname):
            remove.append(basename)
            continue

        try:
            st = os.stat(fname)
            mode = st.st_mode
            size = st[stat.ST_SIZE]
            if stat.S_ISDIR(mode):
                continue
            if not stat.S_ISREG(mode):
                # logger.warn is a deprecated alias for logger.warning
                logger.warning("Skipping non-regular file %s (%s)", dbfname,
                               stat.S_IFMT(mode))
                continue
        except (IOError, OSError) as e:
            # os.stat() raises OSError, not IOError (distinct types in
            # Python 2), so catching IOError alone would miss a file that
            # disappeared mid-walk
            if e.errno == errno.ENOENT:
                # it was deleted in between
                continue
            raise

        cu.execute("INSERT INTO ondisk(path, dbpath, last_modified, size) VALUES (?, ?, ?, ?)",
                   (fname, dbfname, int(st[stat.ST_MTIME]), size))

    if remove and logger.getEffectiveLevel() <= logging.DEBUG:
        logger.debug("Removing %r from walk", list(remove))
    for r in remove:
        fnames.remove(r)
Ejemplo n.º 3
0
Archivo: fts.py Proyecto: ketralnis/fts
def main():
    ap = argparse.ArgumentParser('fts', description="a command line full text search engine")

    ap.add_argument('--logging', default='warn', help=argparse.SUPPRESS,
                    choices = ('error', 'warn', 'info', 'debug'))

    ap.add_argument("--init", action="store_true", help="Create a new .fts.db in the current directory")
    ap.add_argument("--no-sync", dest='nosync', action="store_true", help="don't sync the database when making a new one. only valid with --init")

    ap.add_argument("--sync", dest='sync', action="store_true", help="sync the fts database with the files on disk")
    ap.add_argument("--optimize", action="store_true", help="optimize the sqlite database for size and performance")

    ap.add_argument('--sync-one', metavar='filename', help="sync a single file (unlike the other commands, this one doesn't care about the current directory)")

    ap.add_argument("--list-ignores", action='store_true', default=[])
    ap.add_argument("--rm-ignore", type=int, metavar='ignoreid', action='append', default=[])
    ap.add_argument("--ignore-re", metavar='re', action='append', default=[])
    ap.add_argument("--ignore-simple", metavar='filename', action='append', default=[])
    ap.add_argument('--ignore', "--ignore-glob", dest='ignore_glob', metavar='pattern', action='append', default=[])

    ap.add_argument('-r', '--re', '--regex', '--regexp', dest='searchmode',
                    default='MATCH', action="store_const", const='REGEXP',
                    help="search using a regex instead of MATCH syntax. Much slower!")

    ap.add_argument('-l', dest='display_mode', action='store_const', const='filename_only', help="print only the matching filenames")
    ap.add_argument('--color-mode', dest='color_mode', choices=('yes', 'no', 'auto'), default='auto')
    ap.add_argument('--color', dest='color_mode', action='store_const', const='yes')

    ap.add_argument("search", nargs="*")

    args = ap.parse_args()

    logger.setLevel(getattr(logging, args.logging.upper()))

    if args.color_mode == 'yes':
        color = True
    elif args.color_mode == 'no':
        color = False
    else:
        # it's 'auto'
        color = (os.isatty(sys.stdout.fileno())
                 and args.display_mode != 'filename_only'
                 and args.searchmode != 'REGEXP' # since we don't have snippets working here yet
                 )

    cwd = os.getcwd()
    didsomething = False
    exitval = 0

    if args.init:
        didsomething = True
        init(cwd)

    if args.sync_one:
        # this is designed to be called by tools like procmail or IDEs' on-save
        # hooks, so rather than making them play games with the cwd we have
        # special finddb logic for it. note that because of this we are
        # vulnerable to .fts.db files that shadow the intended one. Also note
        # that we may operate on a different .fts.db than other commands run in
        # the same session.
        # TODO: Maybe we should refuse to allow other commands to operate in the
        # same session for this reason
        fpath = args.sync_one
        if not fpath.startswith('/'):
            fpath = os.path.join(cwd, fpath)
        assert os.path.isfile(fpath)
        dirname, basename = os.path.dirname(fpath), os.path.basename(fpath)

        froot, fprefix, conn = finddb(dirname)

        assert fpath.startswith(os.path.join(froot, fprefix))

        with conn:
            sync(conn, froot, fprefix, files = [basename])

        didsomething = True


    root, prefix, conn = finddb(cwd)

    with conn:
        # all other top-level functions operate in one global transaction
        for a in args.rm_ignore:
            didsomething = True
            rm_ignore(conn, a)

        for a in args.ignore_re:
            didsomething = True
            try:
                re.compile(a)
            except:
                logging.error("Couldn't compile regex %r, are you sure it's valid?", a)
                raise
            add_ignore(conn, 're', a)
        for a in args.ignore_simple:
            didsomething = True
            add_ignore(conn, 'simple', a)
        for a in args.ignore_glob:
            didsomething = True
            add_ignore(conn, 'glob', a)

        if args.list_ignores:
            didsomething = True
            list_ignores(conn)

        dosync = args.sync or (args.init and not args.nosync)

        if dosync:
            didsomething = True
            sync(conn, root, prefix)

        if args.optimize:
            didsomething = True
            with Cursor(conn) as c:
                logger.debug("OPTIMIZE")
                c.execute("INSERT INTO files_fts(files_fts) values('optimize');")
                logger.debug("VACUUM ANALYZE;")
                c.execute("VACUUM ANALYZE;")

        for term in args.search:
            # for now, ANY search matching a document will return it, and it may be
            # returned twice
            didsomething = True

            for sr in search(conn, prefix, term, args.searchmode,
                             checksync=dosync, color=color):
                if args.display_mode == 'filename_only':
                    print sr.filename
                else:
                    print sr.format(color=color)

                # at least one result was returned
                exitval = 0

    if not didsomething:
        ap.print_usage()
        sys.exit(1)

    sys.exit(exitval)
Ejemplo n.º 4
0
def sync(conn, path, prefix, files=None):
    """Bring the full-text index in sync with the files on disk.

    conn   -- open DB connection; the caller manages the transaction
    path   -- root of the indexed tree; must be a full path on disk
    prefix -- subdirectory under ``path`` to restrict the sync to, or ''
    files  -- optional list of basenames directly under the prefix to sync;
              when given, deletion detection is skipped entirely
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)

    start = time.time()

    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0  # for debug printing

    with Cursor(conn) as c, Cursor(conn) as cu:
        # scratch table: a snapshot of what is on disk right now
        c.execute("""
                  CREATE TEMPORARY TABLE
                  ondisk (
                     path          TEXT PRIMARY KEY COLLATE BINARY,
                     dbpath        TEXT COLLATE BINARY,
                     last_modified INTEGER,
                     size          INTEGER
                  );
                  """)

        # load the ignore rules once; regex rules are pre-compiled here so
        # the walk doesn't recompile per file
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # walk the whole tree; visitor() fills the ondisk table
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            # single-directory mode: only the given basenames are considered
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid,
                   od.path AS path,
                   od.last_modified AS last_modified,
                   od.size AS size
              FROM ondisk od, files f
             WHERE od.dbpath = f.path
               AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path,
                   od.dbpath AS dbpath,
                   od.last_modified,
                   od.size AS size
              FROM ondisk od
             WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't allowed in views
            cu.execute(
                """
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid,
                       f.path AS path
                  FROM files f
                 WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                   AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure
        # (no-op default; replaced below when INFO logging is on and there
        # is actually work to report)
        def printprogress(*a):
            pass

        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:

                def printprogress(s, fname):
                    # reads the enclosing counters at call time, so the
                    # percentage advances as work is done
                    total = updates + news + deletes
                    percent = float(total) / progresstotal * 100
                    logger.info("%d/%d (%.1f%%) %s: %s", total, progresstotal,
                                percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs")
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)

                deletes += 1

        c.execute(
            "SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            # NOTE(review): "Updating %.2f" is missing the 'k' suffix that
            # the "Adding %.1fk" message below has
            printprogress("Updating %.2f" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # file vanished or became unreadable since the walk
                    logger.warning("Skipping %s: %s", fname,
                                   os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute(
            "SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size / 1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname,
                                   os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs", news,
                    deletes, updates,
                    time.time() - start)

        # drop the scratch objects so repeated syncs on the same connection
        # don't collide with the temporary names
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
Ejemplo n.º 5
0
def sync(conn, path, prefix, files = None):
    """Bring the full-text index in sync with the files on disk.

    conn   -- open DB connection; the caller manages the transaction
    path   -- root of the indexed tree; must be a full path on disk
    prefix -- subdirectory under ``path`` to restrict the sync to, or ''
    files  -- optional list of basenames directly under the prefix to sync;
              when given, deletion detection is skipped entirely
    """
    # path must be a full path on disk
    # prefix must be the full path on disk that we're syncing (or empty)

    start = time.time()

    news = updates = deletes = 0
    tnews = tupdates = tdeletes = 0 # for debug printing

    with Cursor(conn) as c, Cursor(conn) as cu:
        # scratch table: a snapshot of what is on disk right now
        c.execute("""
                  CREATE TEMPORARY TABLE
                  ondisk (
                     path          TEXT PRIMARY KEY COLLATE BINARY,
                     dbpath        TEXT COLLATE BINARY,
                     last_modified INTEGER,
                     size          INTEGER
                  );
                  """)

        # load the ignore rules once; regex rules are pre-compiled here so
        # the walk doesn't recompile per file
        exclusions = []
        c.execute("SELECT type AS typ, expression AS e FROM exclusions;")
        for typ, expression in c:
            if typ == 're':
                expression = re.compile(expression)
            exclusions.append((typ, expression))

        wpath = path
        if prefix:
            wpath = os.path.join(path, prefix)

        if files is None:
            # walk the whole tree; visitor() fills the ondisk table
            os.path.walk(wpath, partial(visitor, path, prefix, exclusions), cu)
        else:
            # single-directory mode: only the given basenames are considered
            visitor(path, prefix, exclusions, cu, wpath, files)

        logger.debug("Creating temporary index on ondisk(dbpath)")
        c.execute("CREATE INDEX tmp_ondisk_dbpath_idx ON ondisk(dbpath)")

        if logger.getEffectiveLevel() <= logging.DEBUG:
            logger.debug("Found %d files on disk", tcount(cu, "ondisk"))

        # now build three groups: new files to be added, missing files to be
        # deleted, and old files to be updated

        # updated ones
        cu.execute("""
            CREATE TEMPORARY VIEW updated_files AS
            SELECT f.docid AS docid,
                   od.path AS path,
                   od.last_modified AS last_modified,
                   od.size AS size
              FROM ondisk od, files f
             WHERE od.dbpath = f.path
               AND f.last_modified < od.last_modified
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tupdates = tcount(cu, "updated_files")
            logger.debug("Prepared %d files for updating", tupdates)

        # new files to create
        cu.execute("""
            CREATE TEMPORARY VIEW created_files AS
            SELECT od.path AS path,
                   od.dbpath AS dbpath,
                   od.last_modified,
                   od.size AS size
              FROM ondisk od
             WHERE NOT EXISTS(SELECT 1 FROM files f1 WHERE od.dbpath = f1.path)
        """)
        if logger.getEffectiveLevel() <= logging.DEBUG:
            tnews = tcount(cu, "created_files")
            logger.debug("Prepared %d files for creation", tnews)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            # has to be a table instead of a view because parameters aren't allowed in views
            cu.execute("""
                CREATE TEMPORARY TABLE deletedocs AS
                SELECT f.docid AS docid,
                       f.path AS path
                  FROM files f
                 WHERE (? = '' OR f.path LIKE ? ESCAPE '\\') -- ESCAPE disables the LIKE optimization :(
                   AND NOT EXISTS(SELECT 1 FROM ondisk od WHERE od.dbpath = f.path)
            """, (prefix, prefix_expr(prefix)))
            if logger.getEffectiveLevel() <= logging.DEBUG:
                tdeletes = tcount(cu, "deletedocs")
                logger.debug("Prepared %d files for deletion", tdeletes)

        # set up our debugging progress-printing closure
        # (no-op default; replaced below when INFO logging is on and there
        # is actually work to report)
        def printprogress(*a):
            pass
        if logger.getEffectiveLevel() <= logging.INFO:
            progresstotal = tnews + tupdates + tdeletes
            if progresstotal > 0:
                def printprogress(s, fname):
                    # reads the enclosing counters at call time, so the
                    # percentage advances as work is done
                    total = updates+news+deletes
                    percent = float(total)/progresstotal*100
                    logger.info("%d/%d (%.1f%%) %s: %s", total, progresstotal, percent, s, fname)

        # files that we've indexed in the past but don't exist anymore
        if files is None:
            c.execute("SELECT docid, path FROM deletedocs");
            for (docid, fname) in c:
                printprogress("Deleting", fname)
                remove_document(cu, docid)

                deletes += 1

        c.execute("SELECT docid, path, last_modified, size FROM updated_files;")
        for (docid, fname, last_modified, size) in c:
            # NOTE(review): "Updating %.2f" is missing the 'k' suffix that
            # the "Adding %.1fk" message below has
            printprogress("Updating %.2f" % (size/1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    update_document(cu, docid, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    # file vanished or became unreadable since the walk
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            updates += 1

        # new files to create
        c.execute("SELECT path, dbpath, last_modified, size FROM created_files;")
        for (fname, dbpath, last_modified, size) in c:
            # is it safe to re-use the last_modified that we got before, or do
            # we need to re-stat() the file? reusing it like this could make a
            # race-condition whereby we never re-update that file
            printprogress("Adding %.1fk" % (size/1024.0), fname)
            try:
                with get_bytes(fname, size) as bb:
                    add_document(cu, dbpath, last_modified, bb)
            except IOError as e:
                if e.errno in (errno.ENOENT, errno.EPERM):
                    logger.warning("Skipping %s: %s", fname, os.strerror(e.errno))
                else:
                    raise
                continue
            news += 1

        logger.info("%d new documents, %d deletes, %d updates in %.2fs", news, deletes, updates, time.time()-start)

        # drop the scratch objects so repeated syncs on the same connection
        # don't collide with the temporary names
        cu.execute("DROP VIEW updated_files;")
        cu.execute("DROP VIEW created_files;")
        cu.execute("DROP TABLE IF EXISTS deletedocs;")
        cu.execute("DROP TABLE ondisk;")
Ejemplo n.º 6
0
def main():
    ap = argparse.ArgumentParser(
        'fts', description="a command line full text search engine")

    ap.add_argument('--logging',
                    default='warn',
                    help=argparse.SUPPRESS,
                    choices=('error', 'warn', 'info', 'debug'))

    ap.add_argument("--init",
                    action="store_true",
                    help="Create a new .fts.db in the current directory")
    ap.add_argument(
        "--no-sync",
        dest='nosync',
        action="store_true",
        help=
        "don't sync the database when making a new one. only valid with --init"
    )

    ap.add_argument("--sync",
                    dest='sync',
                    action="store_true",
                    help="sync the fts database with the files on disk")
    ap.add_argument(
        "--optimize",
        action="store_true",
        help="optimize the sqlite database for size and performance")

    ap.add_argument(
        '--sync-one',
        metavar='filename',
        help=
        "sync a single file (unlike the other commands, this one doesn't care about the current directory)"
    )

    ap.add_argument("--list-ignores", action='store_true', default=[])
    ap.add_argument("--rm-ignore",
                    type=int,
                    metavar='ignoreid',
                    action='append',
                    default=[])
    ap.add_argument("--ignore-re", metavar='re', action='append', default=[])
    ap.add_argument("--ignore-simple",
                    metavar='filename',
                    action='append',
                    default=[])
    ap.add_argument('--ignore',
                    "--ignore-glob",
                    dest='ignore_glob',
                    metavar='pattern',
                    action='append',
                    default=[])

    ap.add_argument(
        '-r',
        '--re',
        '--regex',
        '--regexp',
        dest='searchmode',
        default='MATCH',
        action="store_const",
        const='REGEXP',
        help="search using a regex instead of MATCH syntax. Much slower!")

    ap.add_argument('-l',
                    dest='display_mode',
                    action='store_const',
                    const='filename_only',
                    help="print only the matching filenames")
    ap.add_argument('--color-mode',
                    dest='color_mode',
                    choices=('yes', 'no', 'auto'),
                    default='auto')
    ap.add_argument('--color',
                    dest='color_mode',
                    action='store_const',
                    const='yes')

    ap.add_argument("search", nargs="*")

    args = ap.parse_args()

    logger.setLevel(getattr(logging, args.logging.upper()))

    if args.color_mode == 'yes':
        color = True
    elif args.color_mode == 'no':
        color = False
    else:
        # it's 'auto'
        color = (
            os.isatty(sys.stdout.fileno())
            and args.display_mode != 'filename_only' and args.searchmode !=
            'REGEXP'  # since we don't have snippets working here yet
        )

    cwd = os.getcwd()
    didsomething = False
    exitval = 0

    if args.init:
        didsomething = True
        init(cwd)

    if args.sync_one:
        # this is designed to be called by tools like procmail or IDEs' on-save
        # hooks, so rather than making them play games with the cwd we have
        # special finddb logic for it. note that because of this we are
        # vulnerable to .fts.db files that shadow the intended one. Also note
        # that we may operate on a different .fts.db than other commands run in
        # the same session.
        # TODO: Maybe we should refuse to allow other commands to operate in the
        # same session for this reason
        fpath = args.sync_one
        if not fpath.startswith('/'):
            fpath = os.path.join(cwd, fpath)
        assert os.path.isfile(fpath)
        dirname, basename = os.path.dirname(fpath), os.path.basename(fpath)

        froot, fprefix, conn = finddb(dirname)

        assert fpath.startswith(os.path.join(froot, fprefix))

        with conn:
            sync(conn, froot, fprefix, files=[basename])

        didsomething = True

    root, prefix, conn = finddb(cwd)

    with conn:
        # all other top-level functions operate in one global transaction
        for a in args.rm_ignore:
            didsomething = True
            rm_ignore(conn, a)

        for a in args.ignore_re:
            didsomething = True
            try:
                re.compile(a)
            except:
                logging.error(
                    "Couldn't compile regex %r, are you sure it's valid?", a)
                raise
            add_ignore(conn, 're', a)
        for a in args.ignore_simple:
            didsomething = True
            add_ignore(conn, 'simple', a)
        for a in args.ignore_glob:
            didsomething = True
            add_ignore(conn, 'glob', a)

        if args.list_ignores:
            didsomething = True
            list_ignores(conn)

        dosync = args.sync or (args.init and not args.nosync)

        if dosync:
            didsomething = True
            sync(conn, root, prefix)

        if args.optimize:
            didsomething = True
            with Cursor(conn) as c:
                logger.debug("OPTIMIZE")
                c.execute(
                    "INSERT INTO files_fts(files_fts) values('optimize');")
                logger.debug("VACUUM ANALYZE;")
                c.execute("VACUUM ANALYZE;")

        for term in args.search:
            # for now, ANY search matching a document will return it, and it may be
            # returned twice
            didsomething = True

            for sr in search(conn,
                             prefix,
                             term,
                             args.searchmode,
                             checksync=dosync,
                             color=color):
                if args.display_mode == 'filename_only':
                    print sr.filename
                else:
                    print sr.format(color=color)

                # at least one result was returned
                exitval = 0

    if not didsomething:
        ap.print_usage()
        sys.exit(1)

    sys.exit(exitval)