Example #1
0
 def __init__(self) -> None:
     """Set up a (initially disabled) multi-way timer."""
     self._timing = False  # whether timing is currently enabled
     self._overallstart = get_now_utc()  # wall-clock start of everything
     self._starttimes = OrderedDict()  # name -> time last (re)started
     self._totaldurations = OrderedDict()  # name -> accumulated duration
     self._count = OrderedDict()  # name -> number of times started
     self._stack = []  # names being timed; innermost (active) one last
Example #2
0
    def start(self, name: str, increment_count: bool = True) -> None:
        """
        Begin (or resume) timing the operation called *name*.

        Only one operation accumulates time at once: any operation
        currently at the top of the stack has its elapsed time banked and
        is paused; it resumes automatically when *name* is stopped.

        Args:
            name: label of the operation being timed
            increment_count: if true, add 1 to the call count for *name*
        """
        if not self._timing:
            return  # timing disabled; do nothing
        now = get_now_utc()

        # Bank the elapsed time of whatever we were timing before.
        if self._stack:
            current = self._stack[-1]
            self._totaldurations[current] += now - self._starttimes[current]

        # First sighting of this name: create its duration/count records.
        if name not in self._starttimes:
            self._totaldurations[name] = datetime.timedelta()
            self._count[name] = 0

        self._starttimes[name] = now
        if increment_count:
            self._count[name] += 1
        self._stack.append(name)
Example #3
0
    def stop(self, name: str) -> None:
        """
        Stop timing the operation called *name* and bank its elapsed time.

        If another operation was running before *name* started, it is
        resumed from this moment.

        Raises:
            AssertionError: if nothing is being timed, or *name* is not the
                innermost (most recently started) operation
        """
        if not self._timing:
            return  # timing disabled; do nothing
        now = get_now_utc()

        # Sanity checks: must be stopping the innermost running operation.
        if not self._stack:
            raise AssertionError("MultiTimer.stop() when nothing running")
        top = self._stack[-1]
        if top != name:
            raise AssertionError(
                "MultiTimer.stop({}) when {} is running".format(
                    repr(name), repr(top)))

        # Bank the elapsed time for *name* and pop it off the stack.
        self._totaldurations[name] += now - self._starttimes[name]
        self._stack.pop()

        # Resume the operation (if any) that was running beneath it.
        if self._stack:
            resumed = self._stack[-1]
            self._starttimes[resumed] = now
Example #4
0
    def report(self) -> None:
        """
        Stop all running timers and report a summary to the log.

        Each named operation is logged with its total time, percentage of
        all metered time, call count, and mean time per call, most
        expensive first. Finally the unmetered time (overall wall clock
        minus everything measured) is logged.

        Fixes over the previous version:
        - ``mean`` was formatted with ``{:.3f}`` even when it was ``None``
          (possible if ``start(name, increment_count=False)`` left a count
          of 0), raising TypeError; now reported as ``n/a``.
        - Division by ``grand_total.total_seconds()`` (and by
          ``overall_duration.total_seconds()``) raised ZeroDivisionError
          when the totals were zero; now guarded.
        """
        # Close any timers still running (innermost first).
        while self._stack:
            self.stop(self._stack[-1])
        now = get_now_utc()
        overall_duration = now - self._overallstart
        # Total of all metered time across all names.
        grand_total = sum(self._totaldurations.values(), datetime.timedelta())
        grand_total_sec = grand_total.total_seconds()

        log.info("Timing summary:")
        summaries = []
        for name, duration in self._totaldurations.items():
            n = self._count[name]
            total_sec = duration.total_seconds()
            # Guard the two divisions: n may be 0, and grand_total_sec may
            # be 0 if everything completed within timer resolution.
            mean_desc = "{:.3f}s".format(total_sec / n) if n > 0 else "n/a"
            pct = (100 * total_sec / grand_total_sec
                   if grand_total_sec > 0 else 0.0)
            summaries.append({
                'total': total_sec,
                'description': (
                    "- {}: {:.3f} s ({:.2f}%, n={}, mean={})".format(
                        name, total_sec, pct, n, mean_desc)),
            })
        # Most expensive first.
        summaries.sort(key=lambda x: x['total'], reverse=True)
        for s in summaries:
            # noinspection PyTypeChecker
            log.info(s["description"])
        if not self._totaldurations:
            log.info("<no timings recorded>")

        unmetered = overall_duration - grand_total
        overall_sec = overall_duration.total_seconds()
        unmetered_pct = (100 * unmetered.total_seconds() / overall_sec
                         if overall_sec > 0 else 0.0)
        log.info("Unmetered time: {:.3f} s ({:.2f}%)".format(
            unmetered.total_seconds(), unmetered_pct))
Example #5
0
 def reset(self) -> None:
     """Throw away all recorded timings and restart the overall clock."""
     self._overallstart = get_now_utc()
     # Empty every per-name record and the stack of running timers.
     for record in (self._starttimes, self._totaldurations,
                    self._count, self._stack):
         record.clear()
Example #6
0
def anonymise(args: Any) -> None:
    """
    Main entry point.

    Validates the parsed command-line arguments, configures the global
    config, handles the early-exit modes (draft/incremental data
    dictionary, counting), then runs the requested anonymisation stages.
    """
    # --- Argument validation -------------------------------------------------
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if not (0 <= args.process < args.nprocesses):
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.nprocesses > 1 and args.dropremake:
        raise ValueError("Can't use nprocesses > 1 with --dropremake")
    if args.incrementaldd and args.draftdd:
        raise ValueError("Can't use --incrementaldd and --draftdd")

    # If no specific stage was requested, run them all.
    requested_stages = [args.dropremake, args.optout, args.nonpatienttables,
                        args.patienttables, args.index]
    everything = not any(requested_stages)

    # --- Load/validate config ------------------------------------------------
    config.report_every_n_rows = args.reportevery
    config.chunksize = args.chunksize
    config.debug_scrubbers = args.debugscrubbers
    config.save_scrubbers = args.savescrubbers
    config.set_echo(args.echo)
    if not args.draftdd:
        config.load_dd(check_against_source_db=not args.skip_dd_check)

    if args.draftdd or args.incrementaldd:
        # For incrementaldd, the data dictionary has been loaded from disk;
        # for draftdd it hasn't, so a completely fresh one is generated.
        config.dd.read_from_source_databases()
        print(config.dd.get_tsv())
        return

    config.check_valid()

    if args.count:
        # Report record counts only, then quit.
        show_source_counts()
        show_dest_counts()
        return

    # Seed the RNG (reproducible scrubbing decisions).
    random.seed(args.seed)

    # -------------------------------------------------------------------------

    log.info(BIGSEP + "Starting")
    started_at = get_now_utc()

    # Stage 1: drop/remake destination tables (single-tasking only).
    if args.dropremake or everything:
        drop_remake(incremental=args.incremental, skipdelete=args.skipdelete)

    # Stage 2: opt-outs.
    if args.optout or everything:
        setup_opt_out(incremental=args.incremental)

    # Stage 3: tables with patient info, processed PER PATIENT across all
    # tables, because scrubbing information must be synthesized across the
    # entirety of that patient's record.
    if args.patienttables or everything:
        process_patient_tables(tasknum=args.process,
                               ntasks=args.nprocesses,
                               incremental=args.incremental)

    # Stage 4: tables without any patient ID (e.g. lookup tables),
    # processed PER TABLE.
    if args.nonpatienttables or everything:
        process_nonpatient_tables(tasknum=args.process,
                                  ntasks=args.nprocesses,
                                  incremental=args.incremental)

    # Stage 5: indexes, PER TABLE. Always fastest to do this last.
    if args.index or everything:
        create_indexes(tasknum=args.process, ntasks=args.nprocesses)

    log.info(BIGSEP + "Finished")
    finished_at = get_now_utc()
    elapsed = finished_at - started_at
    log.info("Time taken: {} seconds".format(elapsed.total_seconds()))
Example #7
0
def main() -> None:
    """
    Command-line entry point.

    Parses command-line arguments; handles purely informational modes
    (--democonfig, --listprocessors, --describeprocessors, --showinfo,
    --count) with an early return; otherwise loads the NLP definition named
    by --nlpdef and runs the drop/remake and NLP processing stages,
    reporting a timing breakdown at the end if --timing was given.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    # ------------------------------------------------------------------------
    # Command-line argument definitions
    # ------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                        "variable {})".format(NLP_CONFIG_ENV_VAR))
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose (use twice for extra verbosity)")
    parser.add_argument("--nlpdef",
                        nargs="?",
                        default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument('--report_every_fast',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                        "every n rows in verbose "
                        "mode (default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--report_every_nlp',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in verbose "
                        "mode (default "
                        "{})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument('--chunksize',
                        nargs="?",
                        type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when copying"
                        " PKs from one database to another"
                        " (default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process",
                        nargs="?",
                        type=int,
                        default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses",
                        nargs="?",
                        type=int,
                        default=1,
                        help="For multiprocess mode: specify "
                        "total number of processes (launched somehow, of "
                        "which this is to be one)")
    parser.add_argument("--processcluster",
                        default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig",
                        action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors",
                        action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors",
                        action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo",
                        required=False,
                        nargs='?',
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count",
                        action="store_true",
                        help="Count records in source/destination databases, "
                        "then stop")

    # Incremental vs. full processing are mutually exclusive; incremental
    # is the default (set via set_defaults below).
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i",
                            "--incremental",
                            dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, where "
                            "possible (* default)")
    mode_group.add_argument("-f",
                            "--full",
                            dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake",
                        action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete",
                        dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                        "present in the destination but not the source")
    parser.add_argument("--nlp",
                        action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument("--timing",
                        action="store_true",
                        help="Show detailed timing breakdown")
    args = parser.parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        # Make --config visible to anything reading the environment variable.
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging
    # Log-line name tags: process cluster and (in multiprocess mode) the
    # process number, so interleaved logs can be told apart.
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel, extranames=mynames)

    # -------------------------------------------------------------------------

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        # NOTE(review): this rebinds 'parser' (previously the
        # ArgumentParser); harmless since the ArgumentParser is not used
        # again, but a distinct name would be clearer.
        parser = get_nlp_parser_debug_instance(args.showinfo)
        if parser:
            print("Info for class {}:\n".format(args.showinfo))
            parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    if args.nlpdef is None:
        raise ValueError("Must specify nlpdef parameter (unless --democonfig, "
                         "--listprocessors, or --describeprocessors used)")

    # If neither stage was explicitly requested, run both.
    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------

    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args,
                        config,
                        incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config,
                        incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process,
                        ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    # Detailed per-operation timing breakdown, if requested.
    if args.timing:
        timer.report()