Example #1
def dump_histories(all_histories: List[Tuple[str, List[DbVisit]]]) -> None:
    logger = get_logger()
    output_dir = Path(config.get().output_dir)
    db_path = output_dir / 'promnesia.sqlite'

    def iter_visits():
        for e, h in all_histories:
            # TODO sort them somehow for determinism?
            # TODO what do we do with errors?
            # TODO maybe conform them to schema and dump too?
            # TODO or, dump to a separate table?
            yield from h

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    engine = create_engine(f'sqlite:///{tpath}')
    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    with engine.begin() as conn:
        for chunk in chunked(iter_visits(), n=_CHUNK_BY):
            bound = [binder.to_row(x) for x in chunk]
            # pylint: disable=no-value-for-parameter
            conn.execute(table.insert().values(bound))

    shutil.move(str(tpath), str(db_path))

    logger.info('saved database to %s', db_path)
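
Here NTBinder, chunked, and _CHUNK_BY are promnesia's own helpers. As a rough illustration of the chunking step, a minimal sketch of a chunked helper, assuming it behaves like more_itertools.chunked (buffering the iterable into lists of at most n items so each INSERT stays bounded):

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')

def chunked(it: Iterable[T], *, n: int) -> Iterator[List[T]]:
    # yield successive lists of at most n items from the iterable
    sit = iter(it)
    while True:
        chunk = list(islice(sit, n))
        if not chunk:
            return
        yield chunk
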
Example #2
def _get_stuff(outdir: Path):
    db_path = outdir / 'promnesia.sqlite'
    assert db_path.exists()

    engine = create_engine(f'sqlite:///{db_path}')

    binder = NTBinder.make(DbVisit)

    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)

    return engine, binder, table
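
A hypothetical usage sketch (the output directory is illustrative): the returned engine and table can be queried directly with SQLAlchemy Core.

from pathlib import Path

engine, binder, table = _get_stuff(Path('/tmp/promnesia-out'))  # hypothetical dir
with engine.connect() as conn:
    # table.select() works on both SQLAlchemy 1.3 and 1.4
    rows = conn.execute(table.select().limit(5)).fetchall()
    print(len(rows))
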
Example #3
def _get_stuff(db_path: PathWithMtime):
    get_logger().debug('Reloading DB: %s', db_path)
    # todo how to open read only?
    engine = create_engine(f'sqlite:///{db_path.path}')  # , echo=True)

    binder = NTBinder.make(DbVisit)

    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)

    from sqlalchemy import Index  # type: ignore
    idx = Index('index_norm_url', table.c.norm_url)
    try:
        idx.create(bind=engine)
    except exc.OperationalError as e:
        if 'already exists' in str(e):
            # meh, but no idea how to check it properly...
            pass
        else:
            raise e

    # NOTE: apparently it's ok to open a connection on every request? at least my comparisons didn't show any difference
    return engine, binder, table
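
On the "no idea how to check it properly" comment: SQLAlchemy can perform the existence check itself via checkfirst, which would replace the try/except, e.g.:

idx = Index('index_norm_url', table.c.norm_url)
idx.create(bind=engine, checkfirst=True)  # emits CREATE INDEX only if it doesn't already exist

For the read-only todo, on SQLAlchemy 1.4+ the pysqlite dialect accepts the SQLite URI form create_engine('sqlite:///file:/path/to/db?mode=ro&uri=true'); note that this is not available on the 1.3-era API this code appears to target.
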
Example #4
def visits_to_sqlite(vit: Iterable[Res[DbVisit]]) -> List[Exception]:
    logger = get_logger()
    db_path = config.get().db

    now = now_tz()
    ok = 0
    errors = 0

    def vit_ok() -> Iterable[DbVisit]:
        nonlocal errors, ok
        for v in vit:
            if isinstance(v, DbVisit):
                ok += 1
                yield v
            else:
                errors += 1
                # conform to the schema and dump. can't hurt anyway
                ev = DbVisit(
                    norm_url='<error>',
                    orig_url='<error>',
                    dt=now,
                    locator=Loc.make('<error>'),
                    src='error',
                    # todo attach backtrace?
                    context=repr(v),
                )
                yield ev

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    policy_update = update_policy_active()
    if not policy_update:
        engine = create_engine(f'sqlite:///{tpath}')
    else:
        engine = create_engine(f'sqlite:///{db_path}')

    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    cleared: Set[str] = set()
    with engine.begin() as conn:
        for chunk in chunked(vit_ok(), n=_CHUNK_BY):
            srcs = set(v.src or '' for v in chunk)
            new = srcs.difference(cleared)
            for src in new:
                conn.execute(table.delete().where(table.c.src == src))
                cleared.add(src)

            bound = [binder.to_row(x) for x in chunk]
            # pylint: disable=no-value-for-parameter
            conn.execute(table.insert().values(bound))

    if not policy_update:
        shutil.move(str(tpath), str(db_path))

    errs = '' if errors == 0 else f', {errors} ERRORS'
    total = ok + errors
    what = 'updated' if policy_update else 'overwritten'
    logger.info('%s database "%s". %d total (%d OK%s)', what, db_path, total,
                ok, errs)
    res: List[Exception] = []
    if total == 0:
        res.append(
            RuntimeError(
                'No visits were indexed, something is probably wrong!'))
    return res
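
The delete-before-insert loop is what makes update mode idempotent per source: each indexing run replaces that source's visits instead of accumulating duplicates. A self-contained sketch of the pattern, against a throwaway in-memory table (names and values are illustrative):

from sqlalchemy import create_engine, MetaData, Table, Column, Text

engine = create_engine('sqlite://')  # in-memory database
meta = MetaData()
table = Table('visits', meta, Column('src', Text), Column('url', Text))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(table.insert().values([{'src': 'browser', 'url': 'a'}]))
    # re-indexing 'browser': clear its old rows, then insert the fresh batch
    conn.execute(table.delete().where(table.c.src == 'browser'))
    conn.execute(table.insert().values([{'src': 'browser', 'url': 'b'}]))
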
Example #5
def visits_to_sqlite(vit: Iterable[Res[DbVisit]], *,
                     overwrite_db: bool) -> List[Exception]:
    logger = get_logger()
    db_path = config.get().db

    now = now_tz()
    ok = 0
    errors = 0

    def vit_ok() -> Iterable[DbVisit]:
        nonlocal errors, ok
        for v in vit:
            if isinstance(v, DbVisit):
                ok += 1
                yield v
            else:
                errors += 1
                # conform to the schema and dump. can't hurt anyway
                ev = DbVisit(
                    norm_url='<error>',
                    orig_url='<error>',
                    dt=now,
                    locator=Loc.make('<error>'),
                    src='error',
                    # todo attach backtrace?
                    context=repr(v),
                )
                yield ev

    tpath = Path(get_tmpdir().name) / 'promnesia.tmp.sqlite'
    if overwrite_db:
        # here we don't need timeout, since it's a brand new DB
        engine = create_engine(f'sqlite:///{tpath}')
    else:
        # here we need a timeout, otherwise concurrent indexing might not work
        # (note that this also needs WAL mode)
        # see test_concurrent_indexing
        engine = create_engine(
            f'sqlite:///{db_path}',
            connect_args={'timeout': _CONNECTION_TIMEOUT_SECONDS})

    # using WAL keeps database readable while we're writing in it
    # this is tested by test_query_while_indexing
    def enable_wal(dbapi_con, con_record):
        dbapi_con.execute('PRAGMA journal_mode = WAL')

    event.listen(engine, 'connect', enable_wal)

    binder = NTBinder.make(DbVisit)
    meta = MetaData(engine)
    table = Table('visits', meta, *binder.columns)
    meta.create_all()

    cleared: Set[str] = set()
    with engine.begin() as conn:
        for chunk in chunked(vit_ok(), n=_CHUNK_BY):
            srcs = set(v.src or '' for v in chunk)
            new = srcs.difference(cleared)
            for src in new:
                conn.execute(table.delete().where(table.c.src == src))
                cleared.add(src)

            bound = [binder.to_row(x) for x in chunk]
            # pylint: disable=no-value-for-parameter
            conn.execute(table.insert().values(bound))

    if overwrite_db:
        shutil.move(str(tpath), str(db_path))

    errs = '' if errors == 0 else f', {errors} ERRORS'
    total = ok + errors
    what = 'overwritten' if overwrite_db else 'updated'
    logger.info('%s database "%s". %d total (%d OK%s)', what, db_path, total,
                ok, errs)
    res: List[Exception] = []
    if total == 0:
        res.append(
            RuntimeError(
                'No visits were indexed, something is probably wrong!'))
    return res
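
The enable_wal listener runs on every new DBAPI connection, which is what keeps the database readable while indexing writes to it. A quick way to confirm the pragma took effect (the path is illustrative):

import sqlite3

con = sqlite3.connect('/path/to/promnesia.sqlite')  # hypothetical path
print(con.execute('PRAGMA journal_mode').fetchone())  # ('wal',) once WAL is active
con.close()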