def _update_db(self, args):
    """Update the sqlite database
    @return error code"""
    log = self.log()
    num_sources = bool(args.directories) + bool(args.merge_paths)
    if num_sources > 1:
        raise AssertionError("Cannot use --from-directories and --merge together")
    elif num_sources and args.remove_duplicates:
        raise AssertionError("--remove-duplicate-paths cannot be used in conjunction with any source")
    elif not (args.fast or args.remove_duplicates) and num_sources == 0:
        raise AssertionError("Specify at least one flag indicating the source from which to update the database")
    # end assure consistency

    #############
    # INIT DB ##
    ###########
    path = args.update_db
    engine = create_engine(self._url_from_path(path))
    meta = None

    # Assume file exists
    if is_url(path) or path.isfile():
        meta = MetaData(engine, reflect=True)
    # end handle file exists

    if not meta or args.table_name not in meta.tables:
        if args.fast:
            log.warn("Database didn't exist yet - fast implicitly disabled")
            args.fast = False
            if num_sources == 0:
                raise AssertionError("Require at least one initial data source, either --from-directories or --merge")
            # end handle logic
        # end handle fast

        if args.remove_duplicates:
            raise AssertionError("Cannot remove duplicates on non-existing table")
        # end handle remove duplicates

        meta = fsstat_schema.meta
        fsstat_schema.record.name = args.table_name
        meta.bind = engine
        meta.create_all()
        log.info("Initialized database at %s", path)
        fsitem = fsstat_schema.record

        # assure we have the meta-data with the proper name - renaming the table before we create_all
        # is kind of a hack
        meta = MetaData(engine, reflect=True)
    else:
        if args.with_index:
            log.info("Cannot create index on existing table without additional logic - turning index creation off")
            args.with_index = False
        # end handle with_index
        fsitem = meta.tables[args.table_name]
        log.info("Updating table '%s' of database at '%s'", args.table_name, path)
    # end initialize table

    strip = str.strip
    basename = os.path.basename
    connection = engine.connect()
    insert = fsitem.insert()
    st = time()
    nr = 0  # number of records handled
    records = list()

    ########################
    # REMOVE DUPLICATES ###
    ######################
    if args.remove_duplicates:
        nr = self._remove_duplicates(connection, fsitem)
    ######################
    # FAST UPDATE ####
    ###############
    elif args.fast:
        nr = self._fast_update_database(engine, args)
    ###########################
    ## DIRECTORY CRAWLING ####
    #########################
    elif args.directories:
        streamer = HashStreamer(hashlib.sha1, lz4dumps)
        join = os.path.join
        normalize = os.path.normpath
        totalbcount = 0  # total amount of bytes processed
        lct = time()
        progress_every = 500
        commit_every_fcount = 15000
        commit_every_seconds = 1 * 60  ## commit at most once a minute

        def progress():
            elapsed = time() - st
            log.info("Processed %i files with %s in %.2fs (%.2f files/s | %s MB/s)",
                     nr,
                     int_to_size_string(totalbcount),
                     elapsed,
                     nr / elapsed,
                     mb(totalbcount) / elapsed)
        # end

        for directory in args.directories:
            if not os.path.isdir(directory):
                log.error("Skipped non-existing directory '%s'", directory)
                continue
            # end handle failed directory access

            # normalize to prevent extra stuff
            directory = normalize(directory)
            for root, dirs, files in os.walk(directory, followlinks=False):
                # NOTE: We also take directories, as it allows to find directories with many files, or with
                # no files (empty directories). Also, we can optimize updates that way.
                # Just to also handle root:
                # it must be in the database, otherwise we can never
                # handle additions correctly, at least not for the root folder
                chains = [files, dirs]
                if root is directory:
                    # an empty string joined with root, is root
                    chains.insert(0, [""])
                # end handle root
                for filename in chain(*chains):
                    nr += 1
                    # only join if we are not seeing the root. Otherwise we get a slash appended,
                    # which is something we really don't want as it could hinder later updates
                    path = filename and join(root, filename) or root
                    stat = self._append_path_record(records, path, streamer, log)
                    if stat:
                        totalbcount += stat.st_size
                        if nr % progress_every == 0:
                            progress()
                        # end show progress
                    # end managed to handle file
                    if time() - lct >= commit_every_seconds or nr % commit_every_fcount == 0:
                        lct = time()
                        progress()
                        self.do_execute_records(connection, insert, records, log, st, nr)
                    # end commit
                # end for each file
        # end for each directory to traverse

        # final execute
        progress()
        self.do_execute_records(connection, insert, records, log, st, nr)
    #########################
    ## Database Merges ####
    ######################
    elif args.merge_paths:
        ## Commit this amount of records at once
        commit_count = 100000

        def progress():
            elapsed = time() - st
            log.info("Inserted %i records in %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)
        # end

        for merge_path in args.merge_paths:
            merge_path = Path(merge_path)
            if not is_url(merge_path) and not merge_path.isfile():
                log.error("Database at '%s' didn't exist - skipping", merge_path)
                continue
            # end skip missing source databases

            log.info("Merging DB at '%s' ...", merge_path)
            merge_engine = create_engine(self._url_from_path(merge_path))
            mcon = merge_engine.connect()
            md = MetaData(merge_engine, reflect=True)
            try:
                for table in md.tables.itervalues():
                    # If id is part of it, and we rollback because of a unicode error, the counter
                    # will be offset and we cannot commit anymore. Just let the id be assigned
                    # automatically, no matter what
                    column_names = [c.name for c in table.columns if c.name != "id"]
                    cursor = mcon.execute(select([table]))
                    try:
                        # We assume the cursor deals with the query efficiently, and doesn't really fetch everything
                        while True:
                            fst = time()
                            log.info("Fetching %i '%s' records from '%s' ...", commit_count, table.name, merge_path)
                            rows = cursor.fetchmany(commit_count)
                            records.extend(dict(zip(column_names, row)) for row in rows)
                            elapsed = time() - fst
                            log.info("Fetched %i records in %.2fs (%.2f records/s)",
                                     len(records), elapsed, len(records) / elapsed)
                            nr += len(records)
                            must_break = len(records) < commit_count
                            ##############
                            self.do_execute_records(connection, insert, records, log, st, nr)
                            progress()
                            ##############

                            # Did we get enough?
                            if must_break:
                                break
                            # end check for end of iteration
                        # end endless loop
                    finally:
                        cursor.close()
                    # end assure cursor is closed
                # end for each table to merge
            finally:
                mcon.close()
            # end assure we close resources
        # end for each merge path
    else:
        raise AssertionError("Reached unexpected mode")
    # end handle mode of operation

    ##############################
    # CREATE INDICES AND VIEWS ##
    ############################
    if args.with_index:
        # Create one index per column, which allows fast searches over it.
        # Create a custom one that speeds up our common search: group by path, order by path, mtime.
        for col in fsitem.columns:
            # id is primary, and thus already indexed.
            # path is too big - it would need to be hashed to be useful in an actual index,
            # and the same is true for sha1
            if col in (fsitem.c.id, fsitem.c.path, fsitem.c.sha1):
                continue
            # end handle index creation
            ist = time()
            log.info("Creating index for column '%s' ...", col)
            try:
                Index("idx_%s_%s" % (fsitem.name, col.name), col).create(engine)
            except Exception:
                log.error("Index creation failed", exc_info=True)
            else:
                elapsed = time() - ist
                log.info("Created index with %i entries in %.2fs (%.2f entries/s)", nr, elapsed, nr / elapsed)
            # end handle creation errors
        # end for each index to create
    # end handle index creation

    if args.sql_directories:
        for sql_dir in args.sql_directories:
            sql_dir = Path(sql_dir)
            for sql_file in sql_dir.files(pattern="*.sql"):
                try:
                    transaction = connection.begin()
                    log.info("Creating view from '%s'", sql_file)
                    connection.execute(sqlite_view_from_file(sql_file))
                    transaction.commit()
                except Exception:
                    transaction.rollback()
                    log.error("Failed to create view for file '%s' - it might have existed - skipping", sql_file)
                    continue
                # end handle transaction per sql view
            # end for each file
        # end for each sql dir
    # end have sql directories

    # FINAL CLEANUP
    ################
    # If there were unicode errors, we end up having a row with a null-path. This breaks our code.
    # Let's keep the data consistent instead of altering code.
    dst = time()
    connection.execute(fsitem.delete().where(fsitem.c.path == None))
    log.info("Cleaned dataset after (possible) unicode errors in %fs", time() - dst)
    connection.close()

    ##################
    # FINAL INFO ###
    ###############
    elapsed = time() - st
    log.info("Overall time to process %i records is %.2fs (%.2f records/s)", nr, elapsed, nr / elapsed)
    log.info("File written to %s", Path(args.update_db).abspath())

    return self.SUCCESS
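# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): _update_db() expects an argparse-style
# namespace carrying the flags referenced above. The attribute names are the
# ones this method reads; the concrete values and the 'cmd' instance are
# hypothetical placeholders, not part of this codebase.
#
#   import argparse
#
#   args = argparse.Namespace(
#       update_db="/tmp/fsstat.sqlite",    # database file to create or update
#       table_name="fsitem",               # assumed table name
#       directories=["/mnt/projects"],     # crawl mode (--from-directories)
#       merge_paths=[],                    # merge mode (--merge), unused here
#       fast=False,                        # fast update of an existing table
#       remove_duplicates=False,           # --remove-duplicate-paths
#       with_index=True,                   # create per-column indices afterwards
#       sql_directories=[],                # directories with *.sql view definitions
#   )
#   cmd._update_db(args)                   # 'cmd' is an instance of the owning command class
# ---------------------------------------------------------------------------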