Example #1
def show_dest_counts(nlpdef: NlpDefinition) -> None:
    """
    Show the number of records in all destination tables.
    """
    print("DESTINATION TABLE RECORD COUNTS:")
    counts = []  # type: List[Tuple[str, int]]
    for processor in nlpdef.get_processors():
        session = processor.get_session()
        dbname = processor.get_dbname()
        for tablename in processor.get_tablenames():
            n = count_star(session, tablename)
            counts.append(("DESTINATION: {}.{}".format(dbname, tablename), n))
    print_record_counts(counts)
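
Example #1 relies on two helpers that aren't shown here. Below is a minimal sketch of what they might look like, assuming a SQLAlchemy 1.x Session; the bodies are illustrative guesses, not the actual implementations:

from typing import List, Tuple

from sqlalchemy import func, select
from sqlalchemy.orm import Session
from sqlalchemy.sql import table


def count_star(session: Session, tablename: str) -> int:
    # SELECT COUNT(*) FROM <tablename>, via SQLAlchemy Core.
    query = select([func.count()]).select_from(table(tablename))
    return session.execute(query).scalar()


def print_record_counts(counts: List[Tuple[str, int]]) -> None:
    # One "description: count" line per destination table.
    for description, count in counts:
        print("{}: {}".format(description, count))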
Example #2
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef,
                        ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
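
The drop/create calls above use SQLAlchemy's declarative table pattern with checkfirst=True. A minimal self-contained sketch of that pattern, using a hypothetical stand-in model (the real NlpRecord has more columns):

from sqlalchemy import BigInteger, Column, String, create_engine
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class DemoRecord(Base):  # hypothetical stand-in for NlpRecord
    __tablename__ = 'demo_progress'
    srcpkval = Column(BigInteger, primary_key=True)
    srchash = Column(String(64))


engine = create_engine('sqlite://')  # in-memory database for illustration
# With checkfirst=True, drop() is a no-op if the table doesn't exist and
# create() is a no-op if it does. So a full (non-incremental) run can drop
# and recreate safely, while an incremental run just ensures the table exists.
DemoRecord.__table__.drop(engine, checkfirst=True)
DemoRecord.__table__.create(engine, checkfirst=True)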
Example #3
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the GATE app
    (storing the results), and make a note in the progress database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i,
                                proccount=totalcount // ntasks)
                            if ntasks > 1 else "")))
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig,
                                                 pkval,
                                                 pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, should we commit immediately, because
            # other processes may need this table promptly?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1

            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa

            nlpdef.notify_transaction(
                session=session,
                n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()
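
The incremental skip above compares nlpdef.hash(text) with the stored srchash. A sketch of such a content hash, assuming an ordinary (unkeyed) digest; the real NlpDefinition may well use a different or keyed hash function:

import hashlib


def hash_text(text: str) -> str:
    # Stable digest of the source text. If it equals the stored srchash,
    # the source record is unchanged and can be skipped in incremental mode.
    return hashlib.sha256(text.encode('utf-8')).hexdigest()


# Usage mirroring the skip test above:
#   if progrec is not None and progrec.srchash == hash_text(text):
#       continue  # previously processed and unchanged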
Example #4
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't necessarily
      see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      in memory, and do a DELETE WHERE NOT IN based on those specified values
      (or, if there are no PKs in the source, delete everything from the
      destination).

    Problems:
    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. This is now happening; the error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers, but
      2717783 parameters were supplied', 'HY000')

    A better way might be:
    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string) pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":
    - Single field syntax is
        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)
    - Multiple field syntax is
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )
    - In SQLAlchemy, exists():
        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html
    - Furthermore, in SQL, NULL = NULL is not true (and NULL <> NULL is also
      not true), so we have to do an explicit null check.
      In SQLAlchemy, you do that with "field == None" (disabling any linter
      warning about comparison with None at that point).
      See http://stackoverflow.com/questions/21668606
      We're aiming, therefore, for the following (see the SQLAlchemy sketch
      after this example):
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_):
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_,
                                      n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit():
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
            'temptable': None,  # type: Table
        })

    # Make a temporary table in each database (note: Table objects are bound
    # to a specific MetaData, so make a separate one for each database).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables

    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working in "
             "chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [
            x for x in databases if x['session'] == processor.get_session()
        ][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()
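
The null-safe DELETE ... WHERE NOT EXISTS that the docstring aims for can be written in SQLAlchemy Core roughly as below. This is a sketch against two hypothetical Table objects (a destination table and the temporary PK table), not the actual delete_where_srcpk_not implementation; FN_SRCPKVAL and FN_SRCPKSTR are the same column-name constants used in the example above:

from sqlalchemy import and_, exists, or_


def delete_dest_where_src_missing(session, dest, temptable):
    # dest and temptable are SQLAlchemy Table objects that both carry the
    # FN_SRCPKVAL / FN_SRCPKSTR columns created earlier in this example.
    join_cond = and_(
        dest.c[FN_SRCPKVAL] == temptable.c[FN_SRCPKVAL],
        or_(
            dest.c[FN_SRCPKSTR] == temptable.c[FN_SRCPKSTR],
            # NULL = NULL is not true in SQL, so match NULLs explicitly:
            and_(dest.c[FN_SRCPKSTR].is_(None),
                 temptable.c[FN_SRCPKSTR].is_(None)),
        ),
    )
    # DELETE FROM dest WHERE NOT EXISTS (SELECT * FROM temptable WHERE ...)
    session.execute(dest.delete().where(~exists().where(join_cond)))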