def delete_dest_record(self,
                       ifconfig: InputFieldConfig,
                       srcpkval: int,
                       srcpkstr: Optional[str],
                       commit: bool = False) -> None:
    """
    Used during incremental updates. For when a record (specified by
    srcpkval) has been updated in the source; wipe older entries for it
    in the destination database(s).
    """
    session = self.get_session()
    srcdb = ifconfig.get_srcdb()
    srctable = ifconfig.get_srctable()
    srcfield = ifconfig.get_srcfield()
    destdb_name = self._destdb.name
    nlpdef_name = self._nlpdef.get_name()
    for tablename, desttable in self.tables().items():
        log.debug("delete_dest_record... {}.{} -> {}.{}".format(
            srcdb, srctable, destdb_name, tablename))
        # noinspection PyProtectedMember,PyPropertyAccess
        delquery = (
            desttable.delete()
            .where(desttable.c._srcdb == srcdb)
            .where(desttable.c._srctable == srctable)
            .where(desttable.c._srcfield == srcfield)
            .where(desttable.c._srcpkval == srcpkval)
            .where(desttable.c._nlpdef == nlpdef_name)
        )
        if srcpkstr is not None:
            # noinspection PyProtectedMember,PyPropertyAccess
            delquery = delquery.where(desttable.c._srcpkstr == srcpkstr)
        with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD):
            session.execute(delquery)
        if commit:
            self._nlpdef.commit(session)

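# For orientation, the chained .where() clauses above compile to a single
# conjunctive DELETE. A sketch of the generated SQL (with a hypothetical
# destination table "anon_note"; exact quoting and parameter style vary by
# dialect):
#
#   DELETE FROM anon_note
#   WHERE _srcdb = :srcdb
#     AND _srctable = :srctable
#     AND _srcfield = :srcfield
#     AND _srcpkval = :srcpkval
#     AND _nlpdef = :nlpdef
#     [AND _srcpkstr = :srcpkstr]   -- only when srcpkstr is not None
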
def hash32(data: Any, seed: int = 0) -> int:
    """Returns a signed 32-bit integer hash of its input."""
    with MultiTimerContext(timer, TIMING_HASH):
        c_data = to_str(data)
        if mmh3:
            # Use the fast C implementation if available; mmh3.hash()
            # already returns a signed 32-bit integer.
            return mmh3.hash(c_data, seed=seed)
        # Fall back to the pure-Python implementation, which returns an
        # unsigned value, then convert to the signed convention.
        py_data = to_bytes(c_data)
        py_unsigned = murmur3_x86_32(py_data, seed=seed)
        return twos_comp_to_signed(py_unsigned, n_bits=32)

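# Minimal usage sketch for hash32(). Values are not shown because they are
# implementation-dependent; the guarantees illustrated are determinism for a
# given (data, seed) pair and a result in the signed 32-bit range:
#
#   h1 = hash32("hello world")
#   h2 = hash32("hello world")
#   assert h1 == h2                   # same input, same seed -> same hash
#   assert -2**31 <= h1 < 2**31       # signed 32-bit range
#
# The mmh3 C extension and the pure-Python murmur3_x86_32() fallback are
# expected to agree, since twos_comp_to_signed() maps the unsigned result
# onto mmh3's signed convention.
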
def process(self, text: str,
            starting_fields_values: Dict[str, Any]) -> None:
    """
    Runs the NLP processor on the text and inserts each result row into
    the destination table(s).
    """
    if not text:
        return
    starting_fields_values[FN_NLPDEF] = self._nlpdef.get_name()
    session = self.get_session()
    n_values = 0
    with MultiTimerContext(timer, TIMING_PARSE):
        for tablename, nlp_values in self.parse(text):
            with MultiTimerContext(timer, TIMING_HANDLE_PARSED):
                # Merge dictionaries so EXISTING FIELDS/VALUES
                # (starting_fields_values) HAVE PRIORITY.
                nlp_values.update(starting_fields_values)
                sqla_table = self.get_table(tablename)
                # If we have superfluous keys in our dictionary, SQLAlchemy
                # will choke ("Unconsumed column names", reporting the
                # thing that's in our dictionary that it doesn't know
                # about). HOWEVER, note that SQLAlchemy column names may be
                # mixed case (e.g. 'Text') while our copy-column names are
                # lower case (e.g. 'text'), so we must have pre-converted
                # the SQLAlchemy column names to lower case. That happens
                # in InputFieldConfig.get_copy_columns() and
                # InputFieldConfig.get_copy_indexes().
                column_names = [c.name for c in sqla_table.columns]
                final_values = {k: v for k, v in nlp_values.items()
                                if k in column_names}
                # log.critical(repr(sqla_table))
                insertquery = sqla_table.insert().values(final_values)
                with MultiTimerContext(timer, TIMING_INSERT):
                    session.execute(insertquery)
                self._nlpdef.notify_transaction(
                    session, n_rows=1,
                    n_bytes=sys.getsizeof(final_values),
                    force_commit=self._commit
                )  # or we get deadlocks in multiprocess mode  # noqa
                n_values += 1
    log.debug("NLP processor {}/{}: found {} values".format(
        self.get_nlpdef_name(), self.get_parser_name(), n_values))

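# The dictionary-filtering step in process() above, in isolation (a sketch
# with made-up values): only keys that match known column names survive, so
# SQLAlchemy's insert() cannot raise "Unconsumed column names":
#
#   column_names = ['text', 'value']                     # from the table
#   nlp_values = {'text': 'BP 120/80', 'value': 120,
#                 'spurious': 'not a column'}
#   final_values = {k: v for k, v in nlp_values.items()
#                   if k in column_names}
#   # final_values == {'text': 'BP 120/80', 'value': 120}
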
def drop_remake(progargs: argparse.Namespace,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress table")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef, ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()

def hash(self, raw: Any) -> str:
    """Returns the hex digest of an HMAC of the input."""
    with MultiTimerContext(timer, TIMING_HASH):
        raw_bytes = str(raw).encode('utf-8')
        hmac_obj = hmac.new(key=self.key_bytes, msg=raw_bytes,
                            digestmod=self.digestmod)
        return hmac_obj.hexdigest()

def hash(self, raw: Any) -> str:
    """Returns the hex digest of a salted hash of the input."""
    with MultiTimerContext(timer, TIMING_HASH):
        raw_bytes = str(raw).encode('utf-8')
        return self.hashfunc(self.salt_bytes + raw_bytes).hexdigest()

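# The two hash() methods above differ as follows (a standard-library sketch;
# the key, salt, and choice of SHA-256 are illustrative assumptions, since
# the actual digestmod/hashfunc are configurable):
#
#   import hashlib
#   import hmac
#
#   hmac.new(key=b'secret key', msg=b'raw value',
#            digestmod=hashlib.sha256).hexdigest()      # keyed (HMAC) version
#   hashlib.sha256(b'salt' + b'raw value').hexdigest()  # salted version
#
# An HMAC resists length-extension and related attacks that a plain
# salt-prepended hash may not, at the cost of a slightly slower computation.
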
def commit(self) -> None:
    """Commits the session and resets the transaction counters."""
    with MultiTimerContext(timer, TIMING_COMMIT):
        self._session.commit()
        self._bytes_in_transaction = 0
        self._rows_in_transaction = 0

def main() -> None:
    """
    Command-line entry point.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                             "variable {})".format(NLP_CONFIG_ENV_VAR))
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Be verbose")
    parser.add_argument("--nlpdef", nargs="?", default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument("--report_every_fast", nargs="?", type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                             "every n rows in verbose mode "
                             "(default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument("--report_every_nlp", nargs="?", type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in "
                             "verbose mode "
                             "(default {})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument("--chunksize", nargs="?", type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when "
                             "copying PKs from one database to another "
                             "(default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process", nargs="?", type=int, default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses", nargs="?", type=int, default=1,
                        help="For multiprocess mode: specify total number of "
                             "processes (launched somehow, of which this is "
                             "to be one)")
    parser.add_argument("--processcluster", default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig", action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors", action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors", action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo", required=False, nargs="?",
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count", action="store_true",
                        help="Count records in source/destination databases, "
                             "then stop")

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i", "--incremental", dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, "
                                 "where possible (* default)")
    mode_group.add_argument("-f", "--full", dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake", action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete", dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                             "present in the destination but not the source")
    parser.add_argument("--nlp", action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true",
                        help="Echo SQL")
    parser.add_argument("--timing", action="store_true",
                        help="Show detailed timing breakdown")
    args = parser.parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel,
                                extranames=mynames)

    # -------------------------------------------------------------------------
    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        nlp_parser = get_nlp_parser_debug_instance(args.showinfo)
        if nlp_parser:
            print("Info for class {}:\n".format(args.showinfo))
            nlp_parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    if args.nlpdef is None:
        raise ValueError(
            "Must specify nlpdef parameter (unless --democonfig, "
            "--listprocessors, or --describeprocessors used)")

    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------
    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args, config,
                        incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config,
                        incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process,
                        ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    if args.timing:
        timer.report()

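# Example invocations of main() from the command line (a sketch; "crate_nlp"
# and the config/definition names are assumptions about how this entry point
# is installed and configured):
#
#   crate_nlp --config my_nlp.cfg --nlpdef my_definition --full
#   crate_nlp --nlpdef my_definition --incremental \
#       --nprocesses 4 --process 0 --processcluster cluster1
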
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the NLP
    processor(s) (storing the results), and make a note in the progress
    database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i, proccount=totalcount // ntasks)
                            if ntasks > 1 else ""
                        )))
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig, pkval, pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, do we commit immediately, because other
            # processes may need this table promptly... ?
            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1
            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa
            nlpdef.notify_transaction(
                session=session, n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()