Example #1
0
def main() -> None:
    """
    Command-line entry point: parse arguments, set up colour logging, load
    the configuration, and run the anonymisation test.
    """
    p = argparse.ArgumentParser(
        description='Test anonymisation',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    p.add_argument('--config', required=True,
                   help='Configuration file name (input)')
    p.add_argument('--dsttable', required=True,
                   help='Destination table')
    p.add_argument('--dstfield', required=True,
                   help='Destination column')
    p.add_argument('--limit', type=int, default=100,
                   help='Limit on number of documents')
    p.add_argument('--rawdir', default='raw',
                   help='Directory for raw output text files')
    p.add_argument('--anondir', default='anon',
                   help='Directory for anonymised output text files')
    p.add_argument('--resultsfile', default='testanon_results.csv',
                   help='Results output CSV file name')
    p.add_argument('--scrubfile', default='testanon_scrubber.txt',
                   help='Scrubbing information text file name')
    p.add_argument('--verbose', '-v', action='store_true',
                   help="Be verbose")

    # Where should the document PKs come from? (Mutually exclusive flags
    # writing to a single boolean, 'from_src'.)
    pk_group = p.add_mutually_exclusive_group(required=False)
    pk_group.add_argument('--pkfromsrc', dest='from_src',
                          action='store_true',
                          help='Fetch PKs (document IDs) from source (default)')
    pk_group.add_argument('--pkfromdest', dest='from_src',
                          action='store_false',
                          help='Fetch PKs (document IDs) from destination')
    p.set_defaults(from_src=True)

    # One document per patient, or all documents in PK sequence?
    unique_group = p.add_mutually_exclusive_group(required=False)
    unique_group.add_argument(
        '--uniquepatients', dest='uniquepatients', action='store_true',
        help='Only one document per patient (the first by PK) (default)')
    unique_group.add_argument(
        '--nonuniquepatients', dest='uniquepatients', action='store_false',
        help='Documents in sequence, with potentially >1 document/patient')
    p.set_defaults(uniquepatients=True)

    opts = p.parse_args()

    # DEBUG logging when --verbose, INFO otherwise.
    configure_logger_for_colour(
        logging.getLogger(),
        logging.DEBUG if opts.verbose else logging.INFO)

    log.info("Arguments: " + str(opts))

    # Load/validate config
    log.info("Loading config...")
    config.set(filename=opts.config, load_destfields=False)
    log.info("... config loaded")

    # Do it
    test_anon(opts)
Example #2
0
def main() -> None:
    """
    Command-line entry point: alters a PCMIS database in place so that it
    is suitable for use with the CRATE anonymiser (or, with
    --drop-danger-drop, reverses those changes).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Alters a PCMIS database to be suitable for CRATE.")
    parser.add_argument("--url", required=True, help="SQLAlchemy database URL")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose")
    parser.add_argument(
        "--print",
        action="store_true",
        # BUGFIX: help text previously lacked the closing parenthesis.
        help="Print SQL but do not execute it. (You can redirect the printed "
        "output to create an SQL script.)")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")

    parser.add_argument(
        "--drop-danger-drop",
        action="store_true",
        help="REMOVES new columns and indexes, rather than creating them. "
        "(There's not very much danger; no real information is lost, but "
        "it might take a while to recalculate it.)")

    parser.add_argument(
        "--debug-skiptables",
        action="store_true",
        help="DEBUG-ONLY OPTION. Skip tables (view creation only)")

    parser.add_argument(
        "--postcodedb",
        help='Specify database (schema) name for ONS Postcode Database (as '
        'imported by CRATE) to link to addresses as a view. With SQL '
        'Server, you will have to specify the schema as well as the '
        'database; e.g. "--postcodedb ONS_PD.dbo"')
    parser.add_argument(
        "--geogcols",
        nargs="*",
        default=DEFAULT_GEOG_COLS,
        help="List of geographical information columns to link in from ONS "
        "Postcode Database. BEWARE that you do not specify anything too "
        "identifying. Default: {}".format(' '.join(DEFAULT_GEOG_COLS)))

    parser.add_argument(
        "--settings-filename",
        help="Specify filename to write draft ddgen_* settings to, for use in "
        "a CRATE anonymiser configuration file.")

    progargs = parser.parse_args()

    rootlogger = logging.getLogger()
    configure_logger_for_colour(
        rootlogger, level=logging.DEBUG if progargs.verbose else logging.INFO)

    log.info("CRATE in-place preprocessor for PCMIS databases")
    # Never log the URL itself: it may contain a password.
    safeargs = {k: v for k, v in vars(progargs).items() if k != 'url'}
    log.debug("args (except url): {}".format(repr(safeargs)))

    if progargs.postcodedb and not progargs.geogcols:
        raise ValueError(
            "If you specify postcodedb, you must specify some geogcols")

    set_print_not_execute(progargs.print)

    hack_in_mssql_xml_type()

    engine = create_engine(progargs.url, echo=progargs.echo, encoding=CHARSET)
    metadata = MetaData()
    metadata.bind = engine
    log.info("Database: {}".format(repr(engine.url)))  # ... repr hides p/w
    log.debug("Dialect: {}".format(engine.dialect.name))

    log.info("Reflecting (inspecting) database...")
    metadata.reflect(engine)
    log.info("... inspection complete")

    ddhint = DDHint()

    if progargs.drop_danger_drop:
        # Drop views (and view-induced table indexes) first
        drop_pcmis_views(engine, metadata, progargs, ddhint)
        if not progargs.debug_skiptables:
            process_all_tables(engine, metadata, progargs)
    else:
        # Tables first, then views
        if not progargs.debug_skiptables:
            process_all_tables(engine, metadata, progargs)
        create_pcmis_views(engine, metadata, progargs, ddhint)

    if progargs.settings_filename:
        with open(progargs.settings_filename, 'w') as f:
            print(get_pcmis_dd_settings(ddhint), file=f)
Example #3
0
def main() -> None:
    """
    Command-line entry point.

    Parses arguments, configures logging, handles the informational modes
    (--checkextractor, --democonfig), then hands everything else to the
    anonymiser proper.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "Database anonymiser. {version}. By Rudolf Cardinal.".format(
        version=version, )

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--democonfig",
                        action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                        "variable {})".format(CONFIG_ENV_VAR))
    parser.add_argument('--verbose',
                        '-v',
                        action="store_true",
                        help="Be verbose")
    parser.add_argument('--reportevery',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress every n rows in verbose "
                        "mode (default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--chunksize',
                        nargs="?",
                        type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when copying"
                        " PKs from one database to another"
                        " (default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process",
                        nargs="?",
                        type=int,
                        default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses",
                        nargs="?",
                        type=int,
                        default=1,
                        help="For multiprocess mode: specify "
                        "total number of processes (launched somehow, of "
                        "which this is to be one)")
    parser.add_argument("--processcluster",
                        default="",
                        help="Process cluster name")
    parser.add_argument("--draftdd",
                        action="store_true",
                        help="Print a draft data dictionary")
    parser.add_argument("--incrementaldd",
                        action="store_true",
                        help="Print an INCREMENTAL draft data dictionary")
    parser.add_argument("--debugscrubbers",
                        action="store_true",
                        help="Report sensitive scrubbing information, for "
                        "debugging")
    parser.add_argument("--savescrubbers",
                        action="store_true",
                        help="Saves sensitive scrubbing information in admin "
                        "database, for debugging")
    parser.add_argument("--count",
                        action="store_true",
                        help="Count records in source/destination databases, "
                        "then stop")
    parser.add_argument("--dropremake",
                        action="store_true",
                        help="Drop/remake destination tables, then stop")
    parser.add_argument("--optout",
                        action="store_true",
                        help="Build opt-out list, then stop")
    parser.add_argument("--nonpatienttables",
                        action="store_true",
                        help="Process non-patient tables only")
    parser.add_argument("--patienttables",
                        action="store_true",
                        help="Process patient tables only")
    parser.add_argument("--index",
                        action="store_true",
                        help="Create indexes only")
    parser.add_argument("--skip_dd_check",
                        action="store_true",
                        help="Skip data dictionary validity check")

    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "-i",
        "--incremental",
        dest="incremental",
        action="store_true",
        help="Process only new/changed information, where possible "
        "(* default)")
    mode_group.add_argument("-f",
                            "--full",
                            dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument(
        "--skipdelete",
        dest="skipdelete",
        action="store_true",
        help="For incremental updates, skip deletion of rows present in the "
        "destination but not the source")
    parser.add_argument(
        "--seed",
        help="String to use as the basis of the seed for the random number "
        "generator used for the transient integer RID (TRID). Leave "
        "blank to use the default seed (system time).")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument(
        "--checkextractor",
        nargs='*',
        # BUGFIX: help text previously lacked the closing parenthesis.
        help="File extensions to check for availability of a text extractor "
        "(use a '.' prefix, and use the special extension 'None' to "
        "check the fallback processor)")
    args = parser.parse_args()

    # -------------------------------------------------------------------------

    # Verbosity: tag log lines with the process cluster and process number,
    # so interleaved multiprocess output remains readable.
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, loglevel, extranames=mynames)

    # Check text converters
    if args.checkextractor:
        for ext in args.checkextractor:
            # The special extension 'None' (case-insensitive) means "check
            # the fallback extractor".
            if ext.lower() == 'none':
                ext = None
            available = is_text_extractor_available(ext)
            print("Text extractor for extension {} present: {}".format(
                ext, available))
        return

    if args.config:
        os.environ[CONFIG_ENV_VAR] = args.config

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # Delayed import; pass everything else on
    from crate_anon.anonymise.anonymise import anonymise  # delayed import
    try:
        anonymise(args)
    except Exception as exc:
        log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
        die(exc)
Example #4
0
    print("--- For code {}:".format(testcode))
    for r in code_regexes:
        print(r)

    n_digits = 10
    nonspec_10_digit_number_regexes = get_number_of_length_n_regex_elements(
        n_digits,
        at_word_boundaries_only=anonymise_numbers_at_word_boundaries_only)
    print("--- NONSPECIFIC: numbers of length {}:".format(n_digits))
    for r in nonspec_10_digit_number_regexes:
        print(r)

    uk_postcode_regexes = get_uk_postcode_regex_elements(
        at_word_boundaries_only=anonymise_codes_at_word_boundaries_only)
    print("--- NONSPECIFIC: UK postcodes:")
    for r in uk_postcode_regexes:
        print(r)

    testdate = datetime.date(year=2016, month=12, day=31)
    date_regexes = get_date_regex_elements(testdate)
    print("--- For date {}:".format(testdate))
    for r in date_regexes:
        print(r)


if __name__ == '__main__':
    # Script entry point: enable full DEBUG-level colour logging, then print
    # the worked regex examples (used for the paper), rather than running the
    # unit tests.
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=logging.DEBUG)
    # unittest.main()
    examples_for_paper()
Example #5
0
def main() -> None:
    """
    Command-line entry point: patches a local MedEx-UIMA installation
    (extra lexicon entries, semantic rule/frequency-rule additions, and
    source-level bug fixes), then compiles it with javac.
    """
    # -------------------------------------------------------------------------
    # Arguments
    # -------------------------------------------------------------------------
    parser = argparse.ArgumentParser(
        description="Compile MedEx-UIMA itself (in Java)")
    parser.add_argument(
        '--medexdir', default=DEFAULT_MEDEX_DIR,
        help="Root directory of MedEx installation (default: {})".format(
            DEFAULT_MEDEX_DIR))
    parser.add_argument(
        '--javac', default=DEFAULT_JAVAC,
        help="Java compiler (default: {})".format(DEFAULT_JAVAC))
    parser.add_argument(
        '--deletefirst', action='store_true',
        help="Delete existing .class files first (optional)")
    parser.add_argument('--verbose', '-v', action='store_true',
                        help="Be verbose")
    args = parser.parse_args()

    # -------------------------------------------------------------------------
    # Logging
    # -------------------------------------------------------------------------
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel)

    # -------------------------------------------------------------------------
    # Add lexicon entries
    # -------------------------------------------------------------------------
    # For each extra frequency, also add variants with the dots spaced out
    # and (unless protected) removed entirely, e.g. "b.d." -> "b. d." / "bd".
    lexfilename = os.path.join(args.medexdir, 'resources', 'lexicon.cfg')
    lexlines = [lex_route(route)
                for route in EXTRA_ROUTES]
    for frequency, _ in EXTRA_FREQUENCIES:
        lexlines.append(lex_freq(frequency))
        if '.' in frequency:
            lexlines.append(lex_freq(frequency.replace('.', '. ')))
            if frequency not in DO_NOT_REMOVE_DOTS:
                lexlines.append(lex_freq(frequency.replace('.', '')))
    # Need to add variants, e.g. "om" for "o.m."?
    add_lines_if_not_in(lexfilename, lexlines)

    # -------------------------------------------------------------------------
    # Add frequency tags to SemanticRuleEngine.java
    # -------------------------------------------------------------------------
    semengfilename = os.path.join(args.medexdir, 'src', 'org', 'apache',
                                  'medex', 'SemanticRuleEngine.java')
    semlines = [semantic_rule_engine_line(frequency,
                                          frequency not in DO_NOT_REMOVE_DOTS)
                for frequency, _ in EXTRA_FREQUENCIES]
    add_lines_after_trigger(semengfilename, SEM_ENG_TRIGGER_LINE_TRIMMED,
                            SOURCE_START_MARKER, SOURCE_END_MARKER,
                            semlines)

    # -------------------------------------------------------------------------
    # Add frequency tags to frequency_rules
    # -------------------------------------------------------------------------
    freqrulefilename = os.path.join(args.medexdir, 'resources', 'TIMEX',
                                    'rules', 'frequency_rules')
    frlines = [frequency_rules_line(frequency, timex,
                                    frequency not in DO_NOT_REMOVE_DOTS)
               for frequency, timex in EXTRA_FREQUENCIES]
    add_lines_after_trigger(freqrulefilename, FREQ_RULE_TRIGGER_LINE_TRIMMED,
                            SOURCE_START_MARKER, SOURCE_END_MARKER,
                            frlines)

    # -------------------------------------------------------------------------
    # Fix bugs! Argh.
    # -------------------------------------------------------------------------
    # Each entry: a file, plus (comment, wrong-line, replacement-line)
    # string substitutions applied verbatim to that file.
    bugfixes = [
        {
            "filename": os.path.join(args.medexdir, 'src', 'org', 'apache',
                                     'NLPTools', 'Document.java'),
            "changes": [
                {
                    "comment": """
Medex confuses & and &&, leading to

Exception in thread "main" java.lang.StringIndexOutOfBoundsException: String index out of range: 2
    at java.lang.String.charAt(Unknown Source)
    at org.apache.NLPTools.Document.<init>(Document.java:134)
    at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:256)
    at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302)
    at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128)
    at CrateMedexPipeline.main(CrateMedexPipeline.java:320)
                    """,  # noqa
                    "wrong": r"while(cur_pos<llen & (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){",  # noqa
                    "right": r"while(cur_pos<llen && (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){"  # noqa
                    # -----------------------------^
                },
            ],
        },
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        {
            "filename": os.path.join(args.medexdir, 'src', 'org', 'apache',
                                     'algorithms', 'SuffixArray.java'),
            "changes": [
                {
                    "comment": """
        
java.lang.StringIndexOutOfBoundsException: String index out of range: 1
    at java.lang.String.charAt(Unknown Source)
    at org.apache.algorithms.SuffixArray.construct_tree_word(SuffixArray.java:375)
    at org.apache.algorithms.SuffixArray.re_build(SuffixArray.java:97)
    at org.apache.algorithms.SuffixArray.<init>(SuffixArray.java:60)
    at org.apache.medex.MedTagger.medtagging(MedTagger.java:359)
    at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
    at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302)
    at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128)
    at CrateMedexPipeline.main(CrateMedexPipeline.java:320)

Offending code in SuffixArray.java:

    for (int i=0;i<this.N;i++){
        int pos=this.SA[i];
        if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){
            this.insert_SF_tree(this.SA[i], 0, 0); //# 0 denote the root in __SA;
        }
    }
    
The bug may relate to what's in SA[i]... but as a simple fix:
        
                    """,  # noqa
                    "wrong": r"if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){",  # noqa
                    "right": r"if (pos < this.otext.length() && this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){"  # noqa
                    # -------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                },
            ],
        },
    ]  # type: List[Dict[str, Union[str, List[Dict[str, str]]]]]

    _ = """

BUGS IN MEDEX-UIMA NOT YET FIXED:

java.lang.ArrayIndexOutOfBoundsException: -1
    at java.util.Vector.elementData(Unknown Source)
    at java.util.Vector.get(Unknown Source)
    at org.apache.NLPTools.SentenceBoundary.detect_boundaries(SentenceBoundary.java:329)
    at org.apache.medex.MedTagger.medtagging(MedTagger.java:354)
    at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
    at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312)
    at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138)
    at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112)
    at CrateMedexPipeline.main(CrateMedexPipeline.java:330)

java.lang.NullPointerException
    at org.apache.algorithms.SuffixArray.search(SuffixArray.java:636)
    at org.apache.medex.MedTagger.medtagging(MedTagger.java:362)
    at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264)
    at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312)
    at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138)
    at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112)
    at CrateMedexPipeline.main(CrateMedexPipeline.java:330)

... frankly, it's just badly written. That's clearly why it uses the "catch
all exceptions" strategy, but one would imagine the errors are unintentional
(certainly the &/&& one!) or else they wouldn't print a stack trace and chug
on.

    """  # noqa

    # Apply the bug-fix substitutions defined above.
    for bf in bugfixes:
        filename = bf["filename"]
        changes = []  # type: List[Tuple[str, str]]
        for change in bf["changes"]:
            changes.append((change["wrong"], change["right"]))
        replace_in_file(filename, changes)

    # -------------------------------------------------------------------------
    # Clean up first?
    # -------------------------------------------------------------------------
    if args.deletefirst:
        purge(args.medexdir, '*.class')

    # -------------------------------------------------------------------------
    # Compile
    # -------------------------------------------------------------------------
    bindir = os.path.join(args.medexdir, 'bin')
    classpath = os.pathsep.join([
        os.path.join(args.medexdir, 'src'),
        os.path.join(args.medexdir, 'lib', '*'),  # jar files
    ])
    classpath_options = ['-classpath', classpath]
    os.chdir(args.medexdir)
    cmdargs = (
        [args.javac] +
        classpath_options +
        ['src/org/apache/medex/Main.java'] +
        # ... compiling this compiles everything else necessary
        ['-d', bindir]  # put the binaries here
    )
    log.info("Executing command: {}".format(cmdargs))
    subprocess.check_call(cmdargs)
Example #6
0
def main() -> None:
    """
    Command-line entry point: launches the CRATE NLP manager as several
    parallel worker processes, waits for all of them, and reports the total
    time taken. Unrecognized arguments are passed through to the workers.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = (
        "Runs the CRATE NLP manager in parallel. {}. Note that all arguments "
        "not specified here are passed to the underlying script "
        "(see crate_nlp --help).".format(version))
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("--nlpdef",
                        required=True,
                        help="NLP processing name, from the config file")
    parser.add_argument("--nproc",
                        "-n",
                        nargs="?",
                        type=int,
                        default=CPUCOUNT,
                        help="Number of processes (default on this "
                        "machine: {})".format(CPUCOUNT))
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose")
    # Unknown arguments are collected and forwarded to the worker script.
    opts, passthrough_args = parser.parse_known_args()

    configure_logger_for_colour(
        logging.getLogger(),
        logging.DEBUG if opts.verbose else logging.INFO)

    # Options common to every subprocess we launch.
    verbosity_flags = ["-v"] if opts.verbose else []
    common_options = (['--nlpdef', opts.nlpdef] + verbosity_flags +
                      passthrough_args)

    log.debug("common_options: {}".format(common_options))

    n_workers = opts.nproc

    # -------------------------------------------------------------------------
    # Setup: note the start time for the final report.
    # -------------------------------------------------------------------------
    started_at = time.time()

    # -------------------------------------------------------------------------
    # Clean/build the tables. Only run one copy of this!
    # -------------------------------------------------------------------------
    # CALL USING "python -m my.module"; DO NOT run the script as an executable.
    # If you run a Python script as an executable, it gets added to the
    # PYTHONPATH. Then, when your script says "import regex" (meaning the
    # system module), it might import "regex.py" from the same directory (which
    # it wouldn't normally do, because Python 3 uses absolute not relative
    # imports).
    check_call_process(
        [sys.executable, '-m', NLP_MANAGER, '--dropremake',
         '--processcluster', 'STRUCTURE'] + common_options)

    # -------------------------------------------------------------------------
    # Now run lots of things simultaneously: one NLP worker per process,
    # each told its own process number and the total process count.
    # -------------------------------------------------------------------------
    worker_commands = [
        [sys.executable, '-m', NLP_MANAGER, '--nlp', '--processcluster',
         'NLP', '--nprocesses={}'.format(n_workers),
         '--process={}'.format(worker_num)] + common_options
        for worker_num in range(n_workers)
    ]  # type: List[List[str]]
    run_multiple_processes(worker_commands)  # Wait for them all to finish

    # (Indexing used to be a separate final pass; index definitions are now
    # combined with column definitions in SQLAlchemy, so no extra step.)

    # -------------------------------------------------------------------------
    # Finished.
    # -------------------------------------------------------------------------
    print("Time taken: {} s".format(time.time() - started_at))
Example #7
0
def main() -> None:
    """
    Command-line entry point: alters a RiO (or RiO CRIS Extract Program,
    RCEP) database in place so that it is suitable for use with the CRATE
    anonymiser (or, with --drop-danger-drop, reverses those changes).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=r"""
*   Alters a RiO database to be suitable for CRATE.

*   By default, this treats the source database as being a copy of a RiO
    database (slightly later than version 6.2; exact version unclear).
    Use the "--rcep" (+/- "--cpft") switch(es) to treat it as a
    Servelec RiO CRIS Extract Program (RCEP) v2 output database.
    """)  # noqa
    parser.add_argument("--url", required=True, help="SQLAlchemy database URL")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose")
    parser.add_argument(
        "--print",
        action="store_true",
        # BUGFIX: help text previously lacked the closing parenthesis.
        help="Print SQL but do not execute it. (You can redirect the printed "
        "output to create an SQL script.)")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")

    parser.add_argument(
        "--rcep",
        action="store_true",
        help="Treat the source database as the product of Servelec's RiO CRIS "
        "Extract Program v2 (instead of raw RiO)")
    parser.add_argument(
        "--drop-danger-drop",
        action="store_true",
        help="REMOVES new columns and indexes, rather than creating them. "
        "(There's not very much danger; no real information is lost, but "
        "it might take a while to recalculate it.)")
    parser.add_argument(
        "--cpft",
        action="store_true",
        # BUGFIX: typo "appicable" corrected to "applicable".
        help="Apply hacks for Cambridgeshire & Peterborough NHS Foundation "
        "Trust (CPFT) RCEP database. Only applicable with --rcep")

    parser.add_argument(
        "--debug-skiptables",
        action="store_true",
        help="DEBUG-ONLY OPTION. Skip tables (view creation only)")

    prog_curr_group = parser.add_mutually_exclusive_group()
    prog_curr_group.add_argument(
        "--prognotes-current-only",
        dest="prognotes_current_only",
        action="store_true",
        help="Progress_Notes view restricted to current versions only "
        "(* default)")
    prog_curr_group.add_argument(
        "--prognotes-all",
        dest="prognotes_current_only",
        action="store_false",
        help="Progress_Notes view shows old versions too")
    parser.set_defaults(prognotes_current_only=True)

    clindocs_curr_group = parser.add_mutually_exclusive_group()
    clindocs_curr_group.add_argument(
        "--clindocs-current-only",
        dest="clindocs_current_only",
        action="store_true",
        help="Clinical_Documents view restricted to current versions only (*)")
    clindocs_curr_group.add_argument(
        "--clindocs-all",
        dest="clindocs_current_only",
        action="store_false",
        help="Clinical_Documents view shows old versions too")
    parser.set_defaults(clindocs_current_only=True)

    allerg_curr_group = parser.add_mutually_exclusive_group()
    allerg_curr_group.add_argument(
        "--allergies-current-only",
        dest="allergies_current_only",
        action="store_true",
        help="Client_Allergies view restricted to current info only")
    allerg_curr_group.add_argument(
        "--allergies-all",
        dest="allergies_current_only",
        action="store_false",
        help="Client_Allergies view shows deleted allergies too (*)")
    parser.set_defaults(allergies_current_only=False)

    audit_group = parser.add_mutually_exclusive_group()
    audit_group.add_argument(
        "--audit-info",
        dest="audit_info",
        action="store_true",
        help="Audit information (creation/update times) added to views")
    audit_group.add_argument("--no-audit-info",
                             dest="audit_info",
                             action="store_false",
                             help="No audit information added (*)")
    parser.set_defaults(audit_info=False)

    parser.add_argument(
        "--postcodedb",
        help='Specify database (schema) name for ONS Postcode Database (as '
        'imported by CRATE) to link to addresses as a view. With SQL '
        'Server, you will have to specify the schema as well as the '
        'database; e.g. "--postcodedb ONS_PD.dbo"')
    parser.add_argument(
        "--geogcols",
        nargs="*",
        default=DEFAULT_GEOG_COLS,
        help="List of geographical information columns to link in from ONS "
        "Postcode Database. BEWARE that you do not specify anything too "
        "identifying. Default: {}".format(' '.join(DEFAULT_GEOG_COLS)))

    parser.add_argument(
        "--settings-filename",
        help="Specify filename to write draft ddgen_* settings to, for use in "
        "a CRATE anonymiser configuration file.")

    progargs = parser.parse_args()

    rootlogger = logging.getLogger()
    configure_logger_for_colour(
        rootlogger, level=logging.DEBUG if progargs.verbose else logging.INFO)

    # Derive mode-dependent settings (RiO vs RCEP, +/- CPFT hacks).
    progargs.rio = not progargs.rcep
    if progargs.rcep:
        # RCEP
        progargs.master_patient_table = RCEP_TABLE_MASTER_PATIENT
        if progargs.cpft:
            progargs.full_prognotes_table = CPFT_RCEP_TABLE_FULL_PROGRESS_NOTES
            # We (CPFT) may have a hacked-in copy of the RiO main progress
            # notes table added to the RCEP output database.
        else:
            progargs.full_prognotes_table = None
            # The RCEP does not export sufficient information to distinguish
            # current and non-current versions of progress notes.
    else:
        # RiO
        progargs.master_patient_table = RIO_TABLE_MASTER_PATIENT
        progargs.full_prognotes_table = RIO_TABLE_PROGRESS_NOTES

    log.info("CRATE in-place preprocessor for RiO or RiO CRIS Extract Program "
             "(RCEP) databases")
    # Never log the URL itself: it may contain a password.
    safeargs = {k: v for k, v in vars(progargs).items() if k != 'url'}
    log.debug("args (except url): {}".format(repr(safeargs)))
    log.info("RiO mode" if progargs.rio else "RCEP mode")

    if progargs.postcodedb and not progargs.geogcols:
        raise ValueError(
            "If you specify postcodedb, you must specify some geogcols")

    set_print_not_execute(progargs.print)

    hack_in_mssql_xml_type()

    engine = create_engine(progargs.url, echo=progargs.echo, encoding=CHARSET)
    metadata = MetaData()
    metadata.bind = engine
    log.info("Database: {}".format(repr(engine.url)))  # ... repr hides p/w
    log.debug("Dialect: {}".format(engine.dialect.name))

    log.info("Reflecting (inspecting) database...")
    metadata.reflect(engine)
    log.info("... inspection complete")

    ddhint = DDHint()

    if progargs.drop_danger_drop:
        # Drop views (and view-induced table indexes) first
        if progargs.rio:
            drop_rio_views(engine, metadata, progargs, ddhint)
        drop_view(engine, VIEW_ADDRESS_WITH_GEOGRAPHY)
        if not progargs.debug_skiptables:
            process_all_tables(engine, metadata, progargs)
    else:
        # Tables first, then views
        if not progargs.debug_skiptables:
            process_all_tables(engine, metadata, progargs)
        if progargs.postcodedb:
            add_postcode_geography_view(engine, progargs, ddhint)
        if progargs.rio:
            create_rio_views(engine, metadata, progargs, ddhint)

    if progargs.settings_filename:
        with open(progargs.settings_filename, 'w') as f:
            print(get_rio_dd_settings(ddhint), file=f)
def main() -> None:
    """
    Command-line entry point. Runs the CRATE anonymiser as multiple
    processes: the single-tasked steps first (table structure, then opt-out
    lists), then patient/non-patient table processing in parallel, and
    finally indexing in parallel. Reports timing at the end.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = (
        "Runs the CRATE anonymiser in parallel. {}. Note that all arguments "
        "not specified here are passed to the underlying script "
        "(see crate_anonymise --help).".format(version))
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--nproc", "-n",
        nargs="?",
        type=int,
        default=CPUCOUNT,
        help="Number of processes (default on this "
             "machine: {})".format(CPUCOUNT))
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
        help="Be verbose")
    args, unknownargs = parser.parse_known_args()

    # Logging: DEBUG when verbose, INFO otherwise.
    configure_logger_for_colour(
        logging.getLogger(),
        level=logging.DEBUG if args.verbose else logging.INFO)

    # Options passed through to every child process (verbosity plus anything
    # we didn't recognize ourselves).
    common_options = (["-v"] if args.verbose else []) + unknownargs
    log.debug("common_options: {}".format(common_options))

    nprocesses_patient = args.nproc
    nprocesses_nonpatient = args.nproc
    nprocesses_index = args.nproc

    def anonymiser_cmd(*extra_args):
        # Build one child-process command line.
        # CALL USING "python -m my.module"; DO NOT run the script as an
        # executable. If you run a Python script as an executable, it gets
        # added to the PYTHONPATH. Then, when your script says "import regex"
        # (meaning the system module), it might import "regex.py" from the
        # same directory (which it wouldn't normally do, because Python 3
        # uses absolute not relative imports).
        return ([sys.executable, '-m', ANONYMISER] + list(extra_args) +
                common_options)

    # -------------------------------------------------------------------------
    # Setup
    # -------------------------------------------------------------------------
    time_start = time.time()

    # -------------------------------------------------------------------------
    # Clean/build the tables. Only run one copy of this!
    # -------------------------------------------------------------------------
    check_call_process(anonymiser_cmd(
        '--dropremake', '--processcluster=STRUCTURE'))

    # -------------------------------------------------------------------------
    # Build opt-out lists. Only run one copy of this!
    # -------------------------------------------------------------------------
    check_call_process(anonymiser_cmd(
        '--optout', '--processcluster=OPTOUT', '--skip_dd_check'))

    # -------------------------------------------------------------------------
    # Now run lots of things simultaneously:
    # -------------------------------------------------------------------------
    # It'd be less confusing if we have a single numbering system across all,
    # rather than numbering separately for patient and non-patient processes.
    # However, each group divides its work by its process number, so that
    # won't fly (for n processes it wants to see processes numbered from 0 to
    # n - 1 inclusive).

    # (a) patient tables, (b) non-patient tables -- both groups at once
    args_list = []
    for procnum in range(nprocesses_patient):
        args_list.append(anonymiser_cmd(
            '--patienttables',
            '--processcluster=PATIENT',
            '--nprocesses={}'.format(nprocesses_patient),
            '--process={}'.format(procnum),
            '--skip_dd_check'))
    for procnum in range(nprocesses_nonpatient):
        args_list.append(anonymiser_cmd(
            '--nonpatienttables',
            '--processcluster=NONPATIENT',
            '--nprocesses={}'.format(nprocesses_nonpatient),
            '--process={}'.format(procnum),
            '--skip_dd_check'))
    run_multiple_processes(args_list)  # Wait for them all to finish

    time_middle = time.time()

    # -------------------------------------------------------------------------
    # Now do the indexing, if nothing else failed.
    # (Always fastest to index last.)
    # -------------------------------------------------------------------------
    run_multiple_processes([
        anonymiser_cmd(
            '--index',
            '--processcluster=INDEX',
            '--nprocesses={}'.format(nprocesses_index),
            '--process={}'.format(procnum),
            '--skip_dd_check')
        for procnum in range(nprocesses_index)
    ])

    # -------------------------------------------------------------------------
    # Finished.
    # -------------------------------------------------------------------------
    time_end = time.time()
    main_dur = time_middle - time_start
    index_dur = time_end - time_middle
    total_dur = time_end - time_start
    print("Time taken: main {} s, indexing {} s, total {} s".format(
        main_dur, index_dur, total_dur))
Example #9
0
def main() -> None:
    """
    Command-line entry point for the NLP manager.

    Parses arguments and configures logging; then either prints information
    (demo config, processor names/details, per-processor info) and exits, or
    loads the named NLP definition from the config file and runs the
    requested steps: drop/remake destination tables and/or NLP processing,
    optionally as one of several cooperating processes.

    Raises:
        ValueError: for invalid --nprocesses/--process combinations, or if
            no NLP definition is given when one is required.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                        "variable {})".format(NLP_CONFIG_ENV_VAR))
    # BUG FIX: help text previously said "use twice for extra verbosity",
    # but action='store_true' ignores repetition (the flag is simply on or
    # off), so that claim was misleading.
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose")
    parser.add_argument("--nlpdef",
                        nargs="?",
                        default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument('--report_every_fast',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                        "every n rows in verbose "
                        "mode (default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--report_every_nlp',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in verbose "
                        "mode (default "
                        "{})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument('--chunksize',
                        nargs="?",
                        type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when copying"
                        " PKs from one database to another"
                        " (default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process",
                        nargs="?",
                        type=int,
                        default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses",
                        nargs="?",
                        type=int,
                        default=1,
                        help="For multiprocess mode: specify "
                        "total number of processes (launched somehow, of "
                        "which this is to be one)")
    parser.add_argument("--processcluster",
                        default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig",
                        action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors",
                        action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors",
                        action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo",
                        required=False,
                        nargs='?',
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count",
                        action="store_true",
                        help="Count records in source/destination databases, "
                        "then stop")

    # Incremental (default) vs. full processing are mutually exclusive.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i",
                            "--incremental",
                            dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, where "
                            "possible (* default)")
    mode_group.add_argument("-f",
                            "--full",
                            dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake",
                        action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete",
                        dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                        "present in the destination but not the source")
    parser.add_argument("--nlp",
                        action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument("--timing",
                        action="store_true",
                        help="Show detailed timing breakdown")
    args = parser.parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging: tag log lines with the cluster name and (in
    # multiprocess mode) the process number, so interleaved output from
    # several processes can be told apart.
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel, extranames=mynames)

    # -------------------------------------------------------------------------

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        # BUG FIX: renamed from "parser", which shadowed the ArgumentParser
        # above with an unrelated NLP parser object.
        nlp_parser = get_nlp_parser_debug_instance(args.showinfo)
        if nlp_parser:
            print("Info for class {}:\n".format(args.showinfo))
            nlp_parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    # BUG FIX: the message previously omitted --showinfo, which also returns
    # before this point and therefore doesn't need an NLP definition.
    if args.nlpdef is None:
        raise ValueError("Must specify nlpdef parameter (unless --democonfig, "
                         "--listprocessors, --describeprocessors, or "
                         "--showinfo used)")

    # If no specific step was requested, do all of them.
    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------

    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args,
                        config,
                        incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config,
                        incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process,
                        ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    if args.timing:
        timer.report()
Example #10
0
def main() -> None:
    """
    Command-line entry point. Compiles the Java classes for CRATE's GATE
    interface, or (with --launch) runs the previously compiled pipeline in
    demonstration mode using GATE's ANNIE demo application.
    """
    parser = argparse.ArgumentParser(
        description="Compile Java classes for CRATE's interface to GATE")
    parser.add_argument(
        '--builddir', default=DEFAULT_BUILD_DIR,
        help="Output directory for compiled .class files (default: {})".format(
            DEFAULT_BUILD_DIR))
    parser.add_argument(
        '--gatedir', default=DEFAULT_GATEDIR,
        help="Root directory of GATE installation (default: {})".format(
            DEFAULT_GATEDIR))
    parser.add_argument(
        '--java', default=DEFAULT_JAVA,
        help="Java executable (default: {})".format(DEFAULT_JAVA))
    parser.add_argument(
        '--javac', default=DEFAULT_JAVAC,
        help="Java compiler (default: {})".format(DEFAULT_JAVAC))
    parser.add_argument(
        '--verbose', '-v', action='count', default=0,
        help="Be verbose (use twice for extra verbosity)")
    parser.add_argument(
        '--launch', action='store_true',
        help="Launch script in demonstration mode (having previously "
             "compiled it)")
    args = parser.parse_args()

    configure_logger_for_colour(
        logging.getLogger(),
        level=logging.DEBUG if args.verbose >= 1 else logging.INFO)

    # Classpath: GATE's main JAR plus everything in its lib directory.
    gatejar = os.path.join(args.gatedir, 'bin', 'gate.jar')
    gatelibjars = os.path.join(args.gatedir, 'lib', '*')
    classpath_options = ['-classpath', os.pathsep.join([gatejar, gatelibjars])]

    if args.launch:
        # Demonstration mode: run the pipeline with GATE's bundled ANNIE app,
        # annotating Person/Location, with END as the input/output terminator.
        appfile = os.path.join(args.gatedir, 'plugins', 'ANNIE',
                               'ANNIE_with_defaults.gapp')
        prog_args = (['-g', appfile] +
                     ['-a', 'Person', '-a', 'Location'] +
                     ['-it', 'END', '-ot', 'END'])
        if args.verbose > 0:
            prog_args += ['-v', '-v']
        if args.verbose > 1:
            prog_args += ['-wg', 'wholexml_', '-wa', 'annotxml_']
        cmdargs = ([args.java] + classpath_options +
                   [GATE_PIPELINE_CLASSNAME] + prog_args)
        log.info("Executing command: {}".format(cmdargs))
        subprocess.check_call(cmdargs)
    else:
        # Compilation mode.
        os.makedirs(args.builddir, exist_ok=True)
        cmdargs = ([args.javac, '-Xlint:unchecked'] +
                   (['-verbose'] if args.verbose > 0 else []) +
                   classpath_options + ['-d', args.builddir] + [SOURCE_FILE])
        log.info("Executing command: {}".format(cmdargs))
        subprocess.check_call(cmdargs)
        log.info("Output *.class files are in {}".format(args.builddir))
Example #11
0
def main() -> None:
    """
    Command-line entry point. Creates a test database of fake patients and
    notes (plus binary/filename document rows) for anonymisation testing.

    Steps: parse arguments; pick sizes from --size; fetch a word list via
    grep from /usr/share/dict/words; connect to the SQLAlchemy URL given;
    drop and recreate all tables; insert two hand-crafted patients with
    notes full of identifiers, then a batch of generated patients/notes;
    commit; and (MySQL only) report per-table sizes.
    """
    default_size = 0
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "url",
        help=(
            "SQLAlchemy database URL. Append ?charset=utf8, e.g. "
            "mysql+mysqldb://root:[email protected]:3306/test?charset=utf8 ."
            " WARNING: If you get the error 'MySQL has gone away', increase "
            "the max_allowed_packet parameter in my.cnf (e.g. to 32M)."))
    parser.add_argument(
        "--size",
        type=int,
        default=default_size,
        choices=[0, 1, 2, 3],
        help="Make tiny (0), small (1), medium (2), or large (3) database "
        "(default={})".format(default_size))
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument(
        "--doctest-doc",
        default=DOCTEST_DOC,
        help="Test file for .DOC (default: {})".format(DOCTEST_DOC))
    parser.add_argument(
        "--doctest-docx",
        default=DOCTEST_DOCX,
        help="Test file for .DOCX (default: {})".format(DOCTEST_DOCX))
    parser.add_argument(
        "--doctest-odt",
        default=DOCTEST_ODT,
        help="Test file for .ODT (default: {})".format(DOCTEST_ODT))
    parser.add_argument(
        "--doctest-pdf",
        default=DOCTEST_PDF,
        help="Test file for .PDF (default: {})".format(DOCTEST_PDF))
    args = parser.parse_args()

    # Map --size onto patient/note/word counts.
    nwords = 10000  # size of the dictionary word pool fetched below
    if args.size == 0:
        n_patients = 20
        notes_per_patient = 1
        words_per_note = 100
    elif args.size == 1:
        n_patients = 100
        notes_per_patient = 5
        words_per_note = 100
    elif args.size == 2:
        n_patients = 100
        notes_per_patient = 100
        words_per_note = 1000
    elif args.size == 3:
        # about 1.4 Gb
        n_patients = 1000
        notes_per_patient = 100
        words_per_note = 1000
    else:
        # Unreachable in practice: argparse restricts --size via choices.
        assert False, "Bad size parameter"
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel)

    # 0. Announce intentions

    log.info("n_patients={}, notes_per_patient={}, words_per_note={}".format(
        n_patients, notes_per_patient, words_per_note))

    # 1. Get words
    # grep -v "'s" filters out possessive forms; -m limits to nwords matches.

    log.info("Fetching words.")
    words = subprocess.check_output(
        ["grep", "-v", "'s", "-m",
         str(nwords),
         "/usr/share/dict/words"]).decode(CONSOLE_ENCODING).splitlines()

    # 2. Open database

    log.info("Opening database.")
    log.debug("URL: {}".format(args.url))  # NB URL may contain a password
    engine = create_engine(args.url, echo=args.echo, encoding=CHARSET)
    session = sessionmaker(bind=engine)()

    # 3. Create tables
    # Destructive: drops any existing tables known to the (module-level)
    # metadata before recreating them.

    log.info("Creating tables.")
    metadata.drop_all(engine, checkfirst=True)
    metadata.create_all(engine, checkfirst=True)

    # 4. Insert

    log.info("Aiming for a total of {} words in notes.".format(
        n_patients * notes_per_patient * words_per_note))

    log.info("Inserting data.")

    # Special extra patient
    # Hand-crafted records whose notes deliberately contain identifiers in
    # many formats (names, dates, phone numbers, NHS numbers, postcodes) to
    # exercise the anonymiser/scrubber.

    p1 = Patient(
        patient_id=1,
        forename="Ronald Gibbet",
        surname="MacDonald",
        dob=datetime.datetime(day=11, month=11, year=1911),
        nhsnum=123456,
        phone="(01223)-123456",
        postcode="CB2 3EB",
    )
    session.add(p1)
    n1 = Note(patient=p1,
              note="""
Ronald MacDonald lived on a farm and kept a gibbet for scaring off
small animals. He was born on 11 Nov 1911 and was very proud of this.
His cat’s name was Flitterwick. It did not like the gibbets.
Ronalds other passion was blimping.
A typo might be RonaldMacDonald.
His phone number was 0122-312-3456, or 01223-123456, or (01223) 123456,
or 01223 123 456, or 01223 123456.
His NHS number was 123.456 or possibly 12 34 56.
His postcode was CB2 3EB, or possible CB23EB, or CB2, or 3EB.

Some HTML encoding is &amp; and &lt;.
An HTML tag is <a href="http://somewhere">this link</a>.
Start aspirin 75mg od. Remains on Lipitor 40mg nocte.
For haloperidol 2mg po prn max qds.
Start amoxicillin 500 mg b.i.d. for 7 days.

Some numerical results:
His CRP is 10. His previous CRP was <13 mg/dl.
Sodium 140.
TSH 3.5; urea normal.
Height 1.82m, weight 75kg, BMI 22.6. BP 135/82.
MMSE 28/30. ACE-R 72, ACE-II 73, ACE 73.
ESR 16 (H) mm/h.
WBC 9.2; neutrophils 4.3; lymphocytes 2.6; eosinophils 0.4; monocytes 1.2;
basophils 0.6.
        """)
    session.add(n1)
    # Attach each test document to patient 1 both as a blob and by filename.
    for filename in (DOCTEST_DOC, DOCTEST_DOCX, DOCTEST_ODT, DOCTEST_PDF):
        bd = BlobDoc(patient=p1, filename=filename)
        session.add(bd)
        fd = FilenameDoc(patient=p1, filename=filename)
        session.add(fd)

    # Second hand-crafted patient: apostrophe in name, blank surname, and a
    # relation back to patient 1.
    p2 = Patient(
        patient_id=2,
        forename="Bob D'Souza",
        surname="",
        dob=datetime.datetime(day=11, month=11, year=1911),
        nhsnum=234567,
        phone="(01223)-234567",
        postcode="CB2 3EB",
        related_patient_id=1,
    )
    session.add(p2)
    n2 = Note(patient=p2,
              note="""
Bob D'Souza, also known as Bob, or Mr DSouza, or sometimes Mr D Souza,
or the D'Souza bloke down the road, or BobDSouza or BobD'Souza.
His phone number was 0122-312-3456, or 01223-123456, or (01223) 123456,
or 01223 123 456, or 01223 123456.
His NHS number was 123.456 or possibly 12 34 56 or 123456, perhaps.
His postcode was CB2 3EB, or possible CB23EB, or CB2, or 3EB.
Bob Hope visited Seattle.
Bob took venlafaxine 375 M/R od, and is due to start clozapine 75mg bd.
        """)
    session.add(n2)

    # A bunch of patients
    # Fixed seed so the generated NHS numbers are reproducible across runs.
    random.seed(1)
    prev_forename = ''
    prev_surname = ''
    for p in range(n_patients):
        if p % REPORT_EVERY == 0:
            log.info("patient {}".format(p))
        # Names are taken from the word pool, offset by patient index.
        forename = words[(p + 1) % nwords] + " " + words[(p + 10) % nwords]
        surname = words[(p + 2) % nwords]
        dob = BASE_DOB + datetime.timedelta(days=p)
        ok_date = dob + datetime.timedelta(days=1)  # non-identifying date
        nhsnum = random.randint(1, 9999999999)
        patient = Patient(
            patient_id=p + 3,  # IDs 1 and 2 are the hand-crafted patients
            forename=forename,
            surname=surname,
            dob=dob,
            nhsnum=nhsnum,
            phone="123456",
            postcode="CB2 3EB",
            related_patient_id=p + 2,  # one back from patient_id
        )
        session.add(patient)
        patient_id = patient.patient_id
        # Each note starts with labelled identifiers (dates in all DT_FORMATS,
        # names, relative's name, numbers) followed by filler words.
        dates = "DATES: " + (
            " ".join([dob.strftime(fmt) for fmt in DT_FORMATS]) +
            " ".join([ok_date.strftime(fmt) for fmt in DT_FORMATS])) + ". "
        fname = "FORENAME: " + forename + ". "
        sname = "SURNAME: " + surname + ". "
        rname = "RELATIVE: " + prev_forename + " " + prev_surname + ". "
        numbers = "NUMBERS: {}, {}, {}. ".format(patient_id, patient_id + 1,
                                                 nhsnum)
        for n in range(notes_per_patient):
            # NOTE(review): if (p + words_per_note) % nwords wraps below
            # p % nwords, this slice is empty. The size presets above keep
            # p + words_per_note < nwords, so it never triggers here, but
            # worth confirming if the presets change.
            wstr = " ".join(words[p % nwords:(p + words_per_note) % nwords])
            note = Note(
                patient=patient,
                note=fname + sname + rname + numbers + dates + wstr,
            )
            session.add(note)
        prev_forename = forename
        prev_surname = surname

    # 5. Commit

    log.info("Committing...")
    session.commit()
    log.info("Done.")

    # 6. Report size
    # MySQL only: query information_schema for per-table row/size figures.

    if engine.dialect.name == 'mysql':
        log.info("Done. Database size:")
        sql = """
            SELECT
                table_schema,
                table_name,
                table_rows,
                data_length,
                index_length,
                ROUND(((data_length + index_length) / (1024 * 1024)), 2)
                  AS "Size_MB"
            FROM
                information_schema.tables
            WHERE table_schema = DATABASE()
        """
        rows = session.execute(text(sql))
        for r in rows:
            print("schema={}, table={}, rows={}, data_length={}, "
                  "index_length={}, size_MB={}".format(*r))
Example #12
0
def main() -> None:
    """
    Command-line entry point. Compiles the Java classes for CRATE's
    MedEx-UIMA interface, or (with --launch) runs the previously compiled
    pipeline in demonstration mode using temporary input/output directories.
    """
    parser = argparse.ArgumentParser(
        description="Compile Java classes for CRATE's interface to MedEx-UIMA")
    parser.add_argument(
        '--builddir', default=DEFAULT_BUILD_DIR,
        help="Output directory for compiled .class files (default: {})".format(
            DEFAULT_BUILD_DIR))
    parser.add_argument(
        '--medexdir', default=DEFAULT_MEDEX_DIR,
        help="Root directory of MedEx installation (default: {})".format(
            DEFAULT_MEDEX_DIR))
    parser.add_argument(
        '--java', default=DEFAULT_JAVA,
        help="Java executable (default: {})".format(DEFAULT_JAVA))
    parser.add_argument(
        '--javac', default=DEFAULT_JAVAC,
        help="Java compiler (default: {})".format(DEFAULT_JAVAC))
    parser.add_argument('--verbose', '-v', action='count', default=0,
                        help="Be verbose (use twice for extra verbosity)")
    parser.add_argument(
        '--launch', action='store_true',
        help="Launch script in demonstration mode (having previously "
             "compiled it)")
    args = parser.parse_args()

    loglevel = logging.DEBUG if args.verbose >= 1 else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel)

    # Classpath: our build directory, MedEx's compiled classes, and its JARs.
    medexclasses = os.path.join(args.medexdir, 'bin')
    medexlibjars = os.path.join(args.medexdir, 'lib', '*')
    classpath = os.pathsep.join([args.builddir, medexclasses, medexlibjars])
    classpath_options = ['-classpath', classpath]

    if args.launch:
        # BUG FIX: previously the TemporaryDirectory objects were never
        # cleaned up explicitly, so deletion relied on garbage collection /
        # interpreter exit (with a ResourceWarning). Context managers remove
        # them deterministically once the (synchronous) pipeline returns.
        with tempfile.TemporaryDirectory() as inputdir, \
                tempfile.TemporaryDirectory() as outputdir:
            prog_args = [
                "-data_ready_signal", MEDEX_DATA_READY_SIGNAL,
                "-results_ready_signal", MEDEX_RESULTS_READY_SIGNAL,
                "-i", inputdir,
                "-o", outputdir,
            ]
            if args.verbose > 0:
                prog_args += ['-v', '-v']
            cmdargs = (
                [args.java] +
                classpath_options +
                [MEDEX_PIPELINE_CLASSNAME] +
                prog_args
            )
            log.info("Executing command: {}".format(cmdargs))
            subprocess.check_call(cmdargs)
    else:
        # Compilation mode.
        os.makedirs(args.builddir, exist_ok=True)
        cmdargs = (
            [args.javac, '-Xlint:unchecked'] +
            (['-verbose'] if args.verbose > 0 else []) +
            classpath_options +
            ['-d', args.builddir] +
            [SOURCE_FILE]
        )
        log.info("Executing command: {}".format(cmdargs))
        subprocess.check_call(cmdargs)
        log.info("Output *.class files are in {}".format(args.builddir))