Example #1
def delete_dest_record(self,
                       ifconfig: InputFieldConfig,
                       srcpkval: int,
                       srcpkstr: Optional[str],
                       commit: bool = False) -> None:
    """
    Used during incremental updates.
    For when a record (specified by srcpkval) has been updated in the
    source; wipe older entries for it in the destination database(s).
    """
    session = self.get_session()
    srcdb = ifconfig.get_srcdb()
    srctable = ifconfig.get_srctable()
    srcfield = ifconfig.get_srcfield()
    destdb_name = self._destdb.name
    nlpdef_name = self._nlpdef.get_name()
    for tablename, desttable in self.tables().items():
        log.debug("delete_dest_record... {}.{} -> {}.{}".format(
            srcdb, srctable, destdb_name, tablename))
        # noinspection PyProtectedMember,PyPropertyAccess
        delquery = (
            desttable.delete()
            .where(desttable.c._srcdb == srcdb)
            .where(desttable.c._srctable == srctable)
            .where(desttable.c._srcfield == srcfield)
            .where(desttable.c._srcpkval == srcpkval)
            .where(desttable.c._nlpdef == nlpdef_name)
        )
        if srcpkstr is not None:
            # noinspection PyProtectedMember,PyPropertyAccess
            delquery = delquery.where(desttable.c._srcpkstr == srcpkstr)
        with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD):
            session.execute(delquery)
        if commit:
            self._nlpdef.commit(session)
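
A standalone sketch of the chained-.where() pattern above: each successive .where() ANDs another condition onto the DELETE. The table and values here are invented for illustration; only SQLAlchemy itself is assumed.

from sqlalchemy import Column, Integer, MetaData, String, Table

metadata = MetaData()
# Hypothetical destination table; the real ones come from self.tables().
dest = Table("dest", metadata,
             Column("_srcdb", String(64)),
             Column("_srctable", String(64)),
             Column("_srcpkval", Integer))
delquery = (
    dest.delete()
    .where(dest.c._srcdb == "mydb")
    .where(dest.c._srctable == "notes")
    .where(dest.c._srcpkval == 123)
)
print(delquery)
# DELETE FROM dest WHERE dest._srcdb = :_srcdb_1
#   AND dest._srctable = :_srctable_1 AND dest._srcpkval = :_srcpkval_1
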
Example #2
def hash32(data: Any, seed=0) -> int:
    """Returns a signed 32-bit integer."""
    with MultiTimerContext(timer, TIMING_HASH):
        c_data = to_str(data)
        if mmh3:
            return mmh3.hash(c_data, seed=seed)
        py_data = to_bytes(c_data)
        py_unsigned = murmur3_x86_32(py_data, seed=seed)
        return twos_comp_to_signed(py_unsigned, n_bits=32)
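
The fallback branch needs the unsigned-to-signed conversion because the pure-Python murmur3_x86_32() returns an unsigned 32-bit value, whereas mmh3.hash() already returns a signed one. A minimal self-contained sketch (standard library only) of what twos_comp_to_signed() must do for 32 bits:

def twos_comp_to_signed_sketch(val: int, n_bits: int = 32) -> int:
    # If the top (sign) bit is set, the two's-complement value is
    # negative: subtract 2**n_bits.
    if val & (1 << (n_bits - 1)):
        return val - (1 << n_bits)
    return val

assert twos_comp_to_signed_sketch(0xFFFFFFFF) == -1
assert twos_comp_to_signed_sketch(0x7FFFFFFF) == 2 ** 31 - 1
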
Example #3
def process(self, text: str,
            starting_fields_values: Dict[str, Any]) -> None:
    if not text:
        return
    starting_fields_values[FN_NLPDEF] = self._nlpdef.get_name()
    session = self.get_session()
    n_values = 0
    with MultiTimerContext(timer, TIMING_PARSE):
        for tablename, nlp_values in self.parse(text):
            with MultiTimerContext(timer, TIMING_HANDLE_PARSED):
                # Merge dictionaries so EXISTING FIELDS/VALUES
                # (starting_fields_values) HAVE PRIORITY.
                nlp_values.update(starting_fields_values)
                sqla_table = self.get_table(tablename)
                # If we have superfluous keys in our dictionary, SQLAlchemy
                # will choke ("Unconsumed column names", reporting the
                # thing that's in our dictionary that it doesn't know
                # about). HOWEVER, note that SQLA column names may be mixed
                # case (e.g. 'Text') while our copy-column names are lower
                # case (e.g. 'text'), so we must have pre-converted the
                # SQLA column names to lower case. That happens in
                # InputFieldConfig.get_copy_columns and
                # InputFieldConfig.get_copy_indexes.
                column_names = [c.name for c in sqla_table.columns]
                final_values = {k: v for k, v in nlp_values.items()
                                if k in column_names}
                # log.critical(repr(sqla_table))
                insertquery = sqla_table.insert().values(final_values)
                with MultiTimerContext(timer, TIMING_INSERT):
                    session.execute(insertquery)
                self._nlpdef.notify_transaction(
                    session,
                    n_rows=1,
                    n_bytes=sys.getsizeof(final_values),
                    force_commit=self._commit
                )  # or we get deadlocks in multiprocess mode  # noqa
                n_values += 1
    log.debug("NLP processor {}/{}: found {} values".format(
        self.get_nlpdef_name(), self.get_parser_name(), n_values))
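
The update() call above is the whole merge-priority mechanism: the dictionary passed to update() wins on key collisions, so the pre-existing fields overwrite anything the parser produced under the same key. A tiny illustration with invented keys:

nlp_values = {"kg": 70.0, "_nlpdef": "from-parser"}
starting_fields_values = {"_nlpdef": "my_definition", "_srcdb": "mydb"}
nlp_values.update(starting_fields_values)  # existing fields take priority
assert nlp_values == {"kg": 70.0, "_nlpdef": "my_definition",
                      "_srcdb": "mydb"}
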
Example #4
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef,
                        ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
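
Both the drop and the create above pass checkfirst=True, which makes SQLAlchemy emit an existence check first, so each call is safe whether or not the table is already there. A standalone sketch with a throwaway in-memory SQLite engine (table name invented):

from sqlalchemy import Column, Integer, MetaData, Table, create_engine

engine = create_engine("sqlite://")  # in-memory database
metadata = MetaData()
progress = Table("progress", metadata,
                 Column("id", Integer, primary_key=True))
progress.drop(engine, checkfirst=True)    # no error: table absent
progress.create(engine, checkfirst=True)  # created
progress.create(engine, checkfirst=True)  # skipped: already exists
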
Example #5
def hash(self, raw: Any) -> str:
    with MultiTimerContext(timer, TIMING_HASH):
        raw_bytes = str(raw).encode('utf-8')
        hmac_obj = hmac.new(key=self.key_bytes, msg=raw_bytes,
                            digestmod=self.digestmod)
        return hmac_obj.hexdigest()
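
A self-contained version of the same HMAC construction, assuming hashlib.md5 as the digestmod (key and value invented; in the class above they come from self.key_bytes and self.digestmod):

import hashlib
import hmac

raw_bytes = str(12345).encode('utf-8')
hmac_obj = hmac.new(key=b"secret key", msg=raw_bytes,
                    digestmod=hashlib.md5)
print(hmac_obj.hexdigest())  # 32 hex characters for an MD5-based HMAC
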
Example #6
def hash(self, raw: Any) -> str:
    with MultiTimerContext(timer, TIMING_HASH):
        raw_bytes = str(raw).encode('utf-8')
        return self.hashfunc(self.salt_bytes + raw_bytes).hexdigest()
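
The equivalent standalone construction, assuming hashlib.sha256 as self.hashfunc (salt and value invented). Unlike the HMAC variant in Example #5, this plain salt-prefix scheme is vulnerable to length-extension with Merkle-Damgård hashes such as SHA-256, which is one reason HMAC is usually preferred for keyed hashing.

import hashlib

salt_bytes = b"mysalt"
raw_bytes = str(12345).encode('utf-8')
print(hashlib.sha256(salt_bytes + raw_bytes).hexdigest())  # 64 hex chars
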
Example #7
def commit(self) -> None:
    with MultiTimerContext(timer, TIMING_COMMIT):
        self._session.commit()
    self._bytes_in_transaction = 0
    self._rows_in_transaction = 0
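
MultiTimerContext recurs throughout these examples but its implementation is not shown. The general pattern, a context manager that accumulates elapsed wall-clock time under a named category, can be sketched as follows (an illustrative stand-in, not the CRATE implementation):

import time
from collections import defaultdict


class SketchTimer:
    """Accumulates elapsed seconds per named category."""
    def __init__(self) -> None:
        self.totals = defaultdict(float)


class SketchTimerContext:
    """Times a with-block and adds the result to a SketchTimer."""
    def __init__(self, timer: SketchTimer, name: str) -> None:
        self.timer = timer
        self.name = name

    def __enter__(self) -> None:
        self.start = time.perf_counter()

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Record elapsed time even if the block raised.
        self.timer.totals[self.name] += time.perf_counter() - self.start


sketch_timer = SketchTimer()
with SketchTimerContext(sketch_timer, "commit"):
    time.sleep(0.01)
print(dict(sketch_timer.totals))  # e.g. {'commit': 0.0101...}
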
Example #8
def main() -> None:
    """
    Command-line entry point.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                        "variable {})".format(NLP_CONFIG_ENV_VAR))
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose")
    parser.add_argument("--nlpdef",
                        nargs="?",
                        default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument('--report_every_fast',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                        "every n rows in verbose "
                        "mode (default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--report_every_nlp',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in verbose "
                        "mode (default "
                        "{})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument('--chunksize',
                        nargs="?",
                        type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when copying"
                        " PKs from one database to another"
                        " (default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process",
                        nargs="?",
                        type=int,
                        default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses",
                        nargs="?",
                        type=int,
                        default=1,
                        help="For multiprocess mode: specify "
                        "total number of processes (launched somehow, of "
                        "which this is to be one)")
    parser.add_argument("--processcluster",
                        default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig",
                        action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors",
                        action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors",
                        action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo",
                        required=False,
                        nargs='?',
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count",
                        action="store_true",
                        help="Count records in source/destination databases, "
                        "then stop")

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i",
                            "--incremental",
                            dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, where "
                            "possible (* default)")
    mode_group.add_argument("-f",
                            "--full",
                            dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake",
                        action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete",
                        dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                        "present in the destination but not the source")
    parser.add_argument("--nlp",
                        action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument("--timing",
                        action="store_true",
                        help="Show detailed timing breakdown")
    args = parser.parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel, extranames=mynames)

    # -------------------------------------------------------------------------

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        nlp_parser = get_nlp_parser_debug_instance(args.showinfo)
        if nlp_parser:
            print("Info for class {}:\n".format(args.showinfo))
            nlp_parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    if args.nlpdef is None:
        raise ValueError("Must specify nlpdef parameter (unless --democonfig, "
                         "--listprocessors, or --describeprocessors used)")

    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------

    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args,
                        config,
                        incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config,
                        incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process,
                        ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    if args.timing:
        timer.report()
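
The -i/-f pair above demonstrates a compact argparse idiom: two flags in a mutually exclusive group write opposite booleans to the same dest, and set_defaults() supplies the default mode. A minimal sketch:

import argparse

p = argparse.ArgumentParser()
g = p.add_mutually_exclusive_group()
g.add_argument("-i", "--incremental", dest="incremental",
               action="store_true")
g.add_argument("-f", "--full", dest="incremental", action="store_false")
p.set_defaults(incremental=True)

assert p.parse_args([]).incremental is True        # default: incremental
assert p.parse_args(["-f"]).incremental is False   # full mode
# p.parse_args(["-i", "-f"]) would exit with an error (mutually exclusive)
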
Example #9
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the GATE app
    (storing the results), and make a note in the progress database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i, proccount=totalcount // ntasks)
                            if ntasks > 1 else "")))
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig,
                                                 pkval,
                                                 pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, should we commit immediately, because
            # other processes may need this table promptly?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1

            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa

            nlpdef.notify_transaction(
                session=session,
                n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()
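
The parallelism here relies on deterministic round-robin partitioning: each process starts its overall counter at tasknum and advances by ntasks, so process k is responsible for overall records k, k + ntasks, k + 2*ntasks, ... (presumably ifconfig.gen_text() yields only that share, given its tasknum/ntasks arguments). A toy sketch of the partition:

ntasks = 3
records = list(range(10))
for tasknum in range(ntasks):
    share = records[tasknum::ntasks]  # same records as recnum += ntasks
    print(tasknum, share)
# 0 [0, 3, 6, 9]
# 1 [1, 4, 7]
# 2 [2, 5, 8]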