Example #1
File: fsstat.py Project: Byron/bit
    def _update_db(self, args):
        """Update the sqlite database database
        @return error code"""
        log = self.log()

        num_sources = bool(args.directories) + bool(args.merge_paths)
        if num_sources > 1:
            raise AssertionError("Cannot use --from-directories or --merge together")
        elif num_sources and args.remove_duplicates:
            raise AssertionError("--remove-duplicate-paths cannot be used in conjunction with any source")
        elif not (args.fast or args.remove_duplicates) and num_sources == 0:
            raise AssertionError("Specify at least one of the flags specifying from where to update the database")
        # end assure consistency
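        # In effect: --from-directories and --merge exclude each other,
        # --remove-duplicate-paths cannot be combined with either source, and at
        # least one mode of operation must be selected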

        #############
        # INIT DB ##
        ###########
        path = args.update_db
        engine = create_engine(self._url_from_path(path))
        meta = None
        # Reflect the schema only if the database already exists
        if is_url(path) or path.isfile():
            meta = MetaData(engine, reflect=True)
        # end handle file exists
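        # reflect=True makes SQLAlchemy load the definitions of all existing tables,
        # which lets us check below whether the target table is already present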

        if not meta or args.table_name not in meta.tables:
            if args.fast:
                log.warn("Database didn't exist yet - fast implicitly disabled")
                args.fast = False
                if num_sources == 0:
                    raise AssertionError(
                        "Require at least one initial data source, either --from-directories or --merge"
                    )
                # end handle logic
            # end handle fast
            if args.remove_duplicates:
                raise AssertionError("Cannot remove duplicates on non-existing table")
            # end handle remove duplicates

            meta = fsstat_schema.meta
            fsstat_schema.record.name = args.table_name
            meta.bind = engine
            meta.create_all()
            log.info("initalized database at %s", path)
            fsitem = fsstat_schema.record
            # assure we have the meta-data with the proper name - renaming the table before we create_all
            # is kind of a hack
            meta = MetaData(engine, reflect=True)
        else:
            if args.with_index:
                log.info("Cannot create index on exiting table without additional logic - turning index creation off")
            # end
            args.with_index = False

            fsitem = meta.tables[args.table_name]
            log.info("Updating database '%s' at '%s'", path, args.table_name)
        # end initialize table
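        # Either way, fsitem now refers to the target table - freshly created or
        # reflected from the existing database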

        strip = str.strip
        basename = os.path.basename
        connection = engine.connect()
        insert = fsitem.insert()
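        # Local aliases and the prebuilt insert statement keep attribute lookups and
        # statement construction out of the per-record hot loops below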

        st = time()
        nr = 0  # num records handled
        records = list()

        ########################
        # REMOVE DUPLICATES ###
        ######################
        if args.remove_duplicates:
            nr = self._remove_duplicates(connection, fsitem)
        ######################
        # FAST UPDATE ####
        ###############
        elif args.fast:
            nr = self._fast_update_database(engine, args)
        ###########################
        ## DIRECTORY CRAWLING ####
        #########################
        elif args.directories:

            streamer = HashStreamer(hashlib.sha1, lz4dumps)
            join = os.path.join
            normalize = os.path.normpath
            totalbcount = 0  # total amount of bytes processed

            lct = time()
            progress_every = 500
            commit_every_fcount = 15000
            commit_every_seconds = 1 * 60  # commit at least once per minute
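            # Commit on whichever triggers first - 15k new records or one minute -
            # which bounds both the memory held in `records` and the work lost on
            # an interruption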

            def progress():
                elapsed = time() - st
                log.info(
                    "Processed %i files with %s in %.2fs (%.2f files/s | %s MB/s)",
                    nr,
                    int_to_size_string(totalbcount),
                    elapsed,
                    nr / elapsed,
                    mb(totalbcount) / elapsed,
                )

            # end

            for directory in args.directories:
                if not os.path.isdir(directory):
                    log.error("Skipped non-existing directory '%s'", directory)
                    continue
                # end handle failed directory access

                # normalize to prevent extra stuff
                directory = normalize(directory)
                for root, dirs, files in os.walk(directory, followlinks=False):
                    # NOTE: We also take directories, as this allows us to find
                    # directories with many files or with none (empty directories),
                    # and to optimize updates that way. The root itself must be in
                    # the database too, otherwise we can never handle additions
                    # correctly, at least not for the root folder
                    chains = [files, dirs]
                    if root is directory:
                        # an empty string joined with root is root
                        chains.insert(0, [""])
                    # end handle root
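                    # chain(*chains) yields every file, then every subdirectory of
                    # this walk step; for the top-level directory the empty string is
                    # prepended so the root itself gets a record as well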
                    for filename in chain(*chains):
                        nr += 1
                        # Only join if we are not seeing the root; otherwise a slash
                        # gets appended, which we really don't want as it could
                        # hinder later updates
                        path = filename and join(root, filename) or root
                        stat = self._append_path_record(records, path, streamer, log)
                        if stat:
                            totalbcount += stat.st_size

                            if nr % progress_every == 0:
                                progress()
                            # end show progress
                        # end managed to handle file

                        if time() - lct >= commit_every_seconds or nr % commit_every_fcount == 0:
                            lct = time()
                            progress()
                            self.do_execute_records(connection, insert, records, log, st, nr)
                        # end commit
                # end for each file
            # end for each directory to traverse
            # final execute
            progress()
            self.do_execute_records(connection, insert, records, log, st, nr)
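            # NOTE: do_execute_records presumably flushes and clears `records` in
            # place - otherwise each commit would re-insert all earlier batches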
        #########################
        ## Database Merges  ####
        ######################
        elif args.merge_paths:
            ## Commit this amount of records at once
            commit_count = 100000
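            # 100k rows per fetch/insert round trip keeps per-row overhead low while
            # capping how many record dicts are held in memory at once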

            def progress():
                elapsed = time() - st
                log.info("Inserted %i records in %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)

            # end

            for merge_path in args.merge_paths:
                merge_path = Path(merge_path)

                if not is_url(merge_path) and not merge_path.isfile():
                    log.error("Database at '%s' didn't exist - skipping", merge_path)
                    continue
                # end handle missing database

                log.info("Merging DB at '%s' ...", merge_path)
                merge_engine = create_engine(self._url_from_path(merge_path))
                mcon = merge_engine.connect()
                md = MetaData(merge_engine, reflect=True)

                try:
                    for table in md.tables.values():
                        # If id is part of it and we roll back because of a unicode
                        # error, the counter will be offset and we cannot commit
                        # anymore. Just let it be generated automatically, no matter
                        # what
                        column_names = [c.name for c in table.columns if c.name != "id"]
                        try:
                            # Select only the columns we zip with, keeping order aligned
                            cursor = mcon.execute(select([table.c[name] for name in column_names]))

                            # We assume the cursor deals with the query efficiently, and doesn't really fetch everything
                            while True:
                                fst = time()
                                log.info("Fetching %i '%s' records from '%s' ...", commit_count, table.name, merge_path)

                                rows = cursor.fetchmany(commit_count)
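                                # fetchmany returns at most commit_count rows; a
                                # shorter batch signals the result set is exhausted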
                                records.extend(dict(zip(column_names, row)) for row in rows)

                                elapsed = time() - fst
                                log.info(
                                    "Fetched %i records in %.2fs (%.2f records/s)",
                                    len(records),
                                    elapsed,
                                    len(records) / elapsed,
                                )

                                nr += len(records)
                                must_break = len(records) < commit_count

                                ##############
                                self.do_execute_records(connection, insert, records, log, st, nr)
                                progress()
                                ##############

                                # Did we get enough?
                                if must_break:
                                    break
                                # end check for end of iteration
                            # end endless loop
                        finally:
                            cursor.close()
                    # end for each table to merge
                finally:
                    mcon.close()
                # end assure we close resources
            # end for each merge path
        else:
            raise AssertionError("Reached unexpected mode")
        # end handle mode of operation

        ##############################
        # CREATE INDICES AND VIEWS ##
        ############################
        if args.with_index:
            # Create one index per column, which allows fast searches over it
            # Create a custom one that speeds up our common search group by path, order by path, mtime.
            for col in fsitem.columns:
                # id is primary, and thus already indexed
                # path is too big - it needs to be hashed to be useful in an actual index
                # file as well
                if col in (fsitem.c.id, fsitem.c.path, fsitem.c.sha1):
                    continue
                # end handle index creation
                ist = time()
                log.info("Creating index for columns '%s' ...", col)
                try:
                    Index("idx_%s_%s" % (fsitem.name, col.name), col).create(engine)
                except Exception:
                    log.error("Index creation failed", exc_info=True)
                else:
                    elapsed = time() - ist
                    log.info("Created index with %i entries in %.2fs (%.2f entries/s)" % (nr, elapsed, nr / elapsed))
                # end handle creation errors
            # end for each index to create
        # end handle index creation

        if args.sql_directories:
            for sql_dir in args.sql_directories:
                sql_dir = Path(sql_dir)
                for sql_file in sql_dir.files(pattern="*.sql"):
                    try:
                        transaction = connection.begin()
                        log.info("Creating view from '%s'", sql_file)
                        connection.execute(sqlite_view_from_file(sql_file))
                        transaction.commit()
                    except Exception:
                        transaction.rollback()
                        log.error("Failed to create view for file '%s' - it might have existed - skipping", sql_file)
                        continue
                    # end handle transaction per sql view
                # end for each file
            # end for each sql dir
        # end have sql directories
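        # Each *.sql file presumably holds a single CREATE VIEW statement; the
        # per-file transaction lets us skip files whose view already exists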

        # FINAL CLEANUP
        ################
        # If there were unicode errors, we end up with a row whose path is null, which
        # breaks our code. Let's keep the data consistent instead of altering the code
        dst = time()
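        # SQLAlchemy compiles `column == None` to IS NULL, which is exactly what we
        # want here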
        connection.execute(fsitem.delete().where(fsitem.c.path == None))
        log.info("Cleaned dataset after (possible) unicode errors in %fs", time() - dst)

        connection.close()

        ##################
        # FINAL INFO ###
        ###############
        elapsed = time() - st
        log.info("Overall time to process %i records is %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)
        log.info("File written to %s", Path(args.update_db).abspath())

        return self.SUCCESS
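
The `_url_from_path` helper used throughout the listing is not shown. A minimal sketch of what it could look like, assuming plain sqlite files and that existing URLs pass through unchanged (hypothetical, the real project may differ):

    import os

    def _url_from_path(self, path):
        # Hypothetical sketch: return database URLs as-is and treat anything else
        # as a local sqlite file
        path = str(path)
        if "://" in path:
            return path
        return "sqlite:///%s" % os.path.abspath(path)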