Ejemplo n.º 1
0
def show_source_counts(nlpdef: NlpDefinition) -> None:
    """
    Show the number of records in all source tables.
    """
    print("SOURCE TABLE RECORD COUNTS:")
    counts = []  # type: List[Tuple[str, int]]
    for ifconfig in nlpdef.get_ifconfigs():
        session = ifconfig.get_source_session()
        dbname = ifconfig.get_srcdb()
        tablename = ifconfig.get_srctable()
        n = count_star(session, tablename)
        counts.append(("{}.{}".format(dbname, tablename), n))
    print_record_counts(counts)
Ejemplo n.º 2
0
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef,
                        ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
Ejemplo n.º 3
0
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the GATE app
    (storing the results), and make a note in the progress database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        i=i,
                        totalcount=totalcount,
                        thisproc=(" ({i}/~{proccount} this process)".format(
                            i=i, proccount=totalcount //
                            ntasks) if ntasks > 1 else "")))
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig,
                                                 pkval,
                                                 pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, do we commit immediately, because other
            # processes may need this table promptly... ?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1

            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa

            nlpdef.notify_transaction(
                session=session,
                n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()