def dump_histories(all_histories: List[Tuple[str, List[DbVisit]]]) -> None:
    logger = get_logger()
    output_dir = Path(config.get().output_dir)
    db_path = output_dir / 'promnesia.sqlite'

    def iter_visits():
        for e, h in all_histories:
            # TODO sort them somehow for determinism?
            # TODO what do we do with errors?
            # TODO maybe conform them to schema and dump too?
            # TODO or, dump to a separate table?
            yield from h

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    engine = create_engine(f'sqlite:///{tpath}')
    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    with engine.begin() as conn:
        for chunk in chunked(iter_visits(), n=_CHUNK_BY):
            bound = [binder.to_row(x) for x in chunk]
            conn.execute(table.insert().values(bound))  # pylint: disable=no-value-for-parameter

    shutil.move(str(tpath), str(db_path))
    logger.info('saved database to %s', db_path)
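# Hedged sketch, not part of the original module: the chunked() helper used
# above is assumed to behave like more_itertools.chunked, i.e. split an
# iterable into lists of at most n items. Chunking keeps each multi-row
# INSERT under SQLite's bound-variable limit instead of binding every visit
# in a single statement.
from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')

def chunked_sketch(it: Iterable[T], *, n: int) -> Iterator[List[T]]:
    itr = iter(it)
    while True:
        chunk = list(islice(itr, n))
        if not chunk:
            return
        yield chunk

# e.g. list(chunked_sketch(range(5), n=2)) == [[0, 1], [2, 3], [4]]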
def _get_stuff(outdir: Path):
    db_path = outdir / 'promnesia.sqlite'
    assert db_path.exists()

    engine = create_engine(f'sqlite:///{db_path}')
    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    return engine, binder, table
def _get_stuff(db_path: PathWithMtime):
    get_logger().debug('Reloading DB: %s', db_path)
    # todo how to open read only?
    engine = create_engine(f'sqlite:///{db_path.path}')  # , echo=True)

    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)

    from sqlalchemy import Index  # type: ignore
    idx = Index('index_norm_url', table.c.norm_url)
    try:
        idx.create(bind=engine)
    except exc.OperationalError as e:
        if 'already exists' in str(e):
            # meh, but no idea how to check it properly...
            pass
        else:
            raise e

    # NOTE: apparently it's ok to open a connection on every request? at least my comparisons didn't show anything
    return engine, binder, table
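# Hedged sketch (not from the original module) of why the argument is
# PathWithMtime rather than a plain Path: with the file's mtime baked into
# the cache key, an lru_cache-wrapped loader transparently reloads the DB
# whenever the file on disk changes. PathWithMtime.make is assumed here to
# be a constructor that stats the path; the wrapper names are hypothetical.
from functools import lru_cache

_get_stuff_cached = lru_cache(maxsize=1)(_get_stuff)

def get_stuff(db_path: Path):
    # stat on every request; cache hit unless the mtime changed
    return _get_stuff_cached(PathWithMtime.make(db_path))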
def visits_to_sqlite(vit: Iterable[Res[DbVisit]]) -> List[Exception]:
    logger = get_logger()
    db_path = config.get().db

    now = now_tz()

    ok = 0
    errors = 0

    def vit_ok() -> Iterable[DbVisit]:
        nonlocal errors, ok
        for v in vit:
            if isinstance(v, DbVisit):
                ok += 1
                yield v
            else:
                errors += 1
                # conform to the schema and dump. can't hurt anyway
                ev = DbVisit(
                    norm_url='<error>',
                    orig_url='<error>',
                    dt=now,
                    locator=Loc.make('<error>'),
                    src='error',
                    # todo attach backtrace?
                    context=repr(v),
                )
                yield ev

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    policy_update = update_policy_active()
    if not policy_update:
        engine = create_engine(f'sqlite:///{tpath}')
    else:
        engine = create_engine(f'sqlite:///{db_path}')

    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    cleared: Set[str] = set()
    with engine.begin() as conn:
        for chunk in chunked(vit_ok(), n=_CHUNK_BY):
            srcs = set(v.src or '' for v in chunk)
            new = srcs.difference(cleared)
            for src in new:
                conn.execute(table.delete().where(table.c.src == src))
                cleared.add(src)

            bound = [binder.to_row(x) for x in chunk]
            conn.execute(table.insert().values(bound))  # pylint: disable=no-value-for-parameter

    if not policy_update:
        shutil.move(str(tpath), str(db_path))

    errs = '' if errors == 0 else f', {errors} ERRORS'
    total = ok + errors
    what = 'updated' if policy_update else 'overwritten'
    logger.info('%s database "%s". %d total (%d OK%s)', what, db_path, total, ok, errs)
    res: List[Exception] = []
    if total == 0:
        res.append(RuntimeError('No visits were indexed, something is probably wrong!'))
    return res
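# Hedged illustration (plain sqlite3, names illustrative) of the per-src
# clearing above: in update mode, the first chunk that mentions a source
# wipes that source's old rows before inserting, so re-indexing one source
# replaces exactly its own visits and leaves the rest of the table intact.
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE visits (norm_url TEXT, src TEXT)')
con.executemany('INSERT INTO visits VALUES (?, ?)',
                [('example.com', 'browser'), ('old.org', 'takeout')])
# re-indexing 'takeout': clear it once, then insert the fresh rows
con.execute('DELETE FROM visits WHERE src = ?', ('takeout',))
con.executemany('INSERT INTO visits VALUES (?, ?)',
                [('new.org/1', 'takeout'), ('new.org/2', 'takeout')])
rows = sorted(con.execute('SELECT norm_url, src FROM visits'))
assert rows == [('example.com', 'browser'),
                ('new.org/1', 'takeout'),
                ('new.org/2', 'takeout')]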
def visits_to_sqlite(vit: Iterable[Res[DbVisit]], *, overwrite_db: bool) -> List[Exception]:
    logger = get_logger()
    db_path = config.get().db

    now = now_tz()

    ok = 0
    errors = 0

    def vit_ok() -> Iterable[DbVisit]:
        nonlocal errors, ok
        for v in vit:
            if isinstance(v, DbVisit):
                ok += 1
                yield v
            else:
                errors += 1
                # conform to the schema and dump. can't hurt anyway
                ev = DbVisit(
                    norm_url='<error>',
                    orig_url='<error>',
                    dt=now,
                    locator=Loc.make('<error>'),
                    src='error',
                    # todo attach backtrace?
                    context=repr(v),
                )
                yield ev

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    if overwrite_db:
        # here we don't need timeout, since it's a brand new DB
        engine = create_engine(f'sqlite:///{tpath}')
    else:
        # here we need a timeout, otherwise concurrent indexing might not work
        # (note that this also needs WAL mode)
        # see test_concurrent_indexing
        engine = create_engine(
            f'sqlite:///{db_path}',
            connect_args={'timeout': _CONNECTION_TIMEOUT_SECONDS},
        )

    # using WAL keeps the database readable while we're writing to it
    # this is tested by test_query_while_indexing
    def enable_wal(dbapi_con, con_record):
        dbapi_con.execute('PRAGMA journal_mode = WAL')
    event.listen(engine, 'connect', enable_wal)

    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    cleared: Set[str] = set()
    with engine.begin() as conn:
        for chunk in chunked(vit_ok(), n=_CHUNK_BY):
            srcs = set(v.src or '' for v in chunk)
            new = srcs.difference(cleared)
            for src in new:
                conn.execute(table.delete().where(table.c.src == src))
                cleared.add(src)

            bound = [binder.to_row(x) for x in chunk]
            conn.execute(table.insert().values(bound))  # pylint: disable=no-value-for-parameter

    if overwrite_db:
        shutil.move(str(tpath), str(db_path))

    errs = '' if errors == 0 else f', {errors} ERRORS'
    total = ok + errors
    what = 'overwritten' if overwrite_db else 'updated'
    logger.info('%s database "%s". %d total (%d OK%s)', what, db_path, total, ok, errs)
    res: List[Exception] = []
    if total == 0:
        res.append(RuntimeError('No visits were indexed, something is probably wrong!'))
    return res
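# Standalone sketch of the WAL + timeout combination above (illustrative,
# not from the original module; the path and timeout value are made up):
# WAL lets readers keep querying while indexing writes, and the busy
# timeout makes a blocked connection wait instead of failing immediately
# with 'database is locked'.
from sqlalchemy import create_engine, event

demo_engine = create_engine(
    'sqlite:////tmp/promnesia.demo.sqlite',  # hypothetical path
    connect_args={'timeout': 5},  # seconds to wait on a locked database
)

@event.listens_for(demo_engine, 'connect')
def _enable_wal(dbapi_con, con_record):
    # runs once per new DBAPI connection handed out by the pool
    dbapi_con.execute('PRAGMA journal_mode = WAL')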