Ejemplo n.º 1
0
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef,
                        ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
Ejemplo n.º 2
0
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't necessarily
      see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      keep them in memory, and do a DELETE WHERE NOT IN based on those
      specified values (or, if there are no PKs in the source, delete
      everything from the destination).

    Problems:
    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. -- This is now happening; error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter parkers, but
      2717783 parameters were supplied', 'HY000')

    A better way might be:
    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string) pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":
    - Single field syntax is
        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)
    - Multiple field syntax is
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )
    - In SQLAlchemy, exists():
        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html
    - Furthermore, in SQL NULL = NULL is false, and NULL <> NULL is also false,
      so we have to do an explicit null check.
      You do that with "field == None" (disable
      See http://stackoverflow.com/questions/21668606
      We're aiming, therefore, for:
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_):
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_,
                                      n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit():
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
        })

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables

    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working in "
             "chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [
            x for x in databases if x['session'] == processor.get_session()
        ][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()