Example #1
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 variable: str,
                 target_unit: str,
                 regex_str_for_debugging: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                NLP definition to read options from; ``None`` is accepted
                for debugging only, in which case defaults are used
            cfgsection:
                name of the config section to read options from
            variable:
                variable name; must be no longer than ``MAX_SQL_FIELD_LEN``
                characters
            target_unit:
                stored as ``self.target_unit``
            regex_str_for_debugging:
                stored as ``self.regex_str_for_debugging``
            commit:
                passed to the superclass constructor
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        self.variable = variable
        self.target_unit = target_unit
        self.regex_str_for_debugging = regex_str_for_debugging

        if nlpdef is not None:
            # Normal operation: read our settings from the config section.
            self.tablename = nlpdef.opt_str(
                cfgsection, 'desttable', required=True)
            self.assume_preferred_unit = nlpdef.opt_bool(
                cfgsection, 'assume_preferred_unit', default=True)
        else:
            # No NLP definition: debugging only, so use fixed defaults.
            self.tablename = ''
            self.assume_preferred_unit = True

        # Sanity check: the variable name must fit in an SQL field name.
        assert len(self.variable) <= MAX_SQL_FIELD_LEN, (
            "Variable name too long (max {} characters)".format(
                MAX_SQL_FIELD_LEN))
Example #2
0
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop (in full mode) and (re)create the progress and output tables, then
    tidy up progress records.

    Args:
        progargs:
            parsed command-line arguments; ``report_every_fast`` and
            ``chunksize`` are read from it when deleting in incremental mode
        nlpdef:
            the NLP definition supplying engines, processors and input
            field configs
        incremental:
            if true, existing tables are kept (only created if missing);
            if false, tables are dropped first and all progress records are
            deleted
        skipdelete:
            in incremental mode, skip the "delete where no source" step
    """
    # Not parallel.
    full = not incremental

    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progress_engine = nlpdef.get_progdb_engine()
    if full:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progress_engine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progress_engine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    tables_made = []  # type: List[str]
    for proc in nlpdef.get_processors():
        just_made = proc.make_tables(drop_first=full)
        # Warn about any table that an earlier processor already made.
        for tablename in just_made:
            if tablename in tables_made:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(
                                tablename))
        tables_made.extend(just_made)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if full:
                ifconfig.delete_all_progress_records()
            elif not skipdelete:
                delete_where_no_source(
                    nlpdef,
                    ifconfig,
                    report_every=progargs.report_every_fast,
                    chunksize=progargs.chunksize)

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
Example #3
0
def show_dest_counts(nlpdef: NlpDefinition) -> None:
    """
    Print a record count for every destination table of every processor in
    the NLP definition.
    """
    print("DESTINATION TABLE RECORD COUNTS:")
    rows = []  # type: List[Tuple[str, int]]
    for proc in nlpdef.get_processors():
        dest_session = proc.get_session()
        dest_db = proc.get_dbname()
        # One (label, count) pair per table belonging to this processor.
        rows.extend(
            ("DESTINATION: {}.{}".format(dest_db, table),
             count_star(dest_session, table))
            for table in proc.get_tablenames()
        )
    print_record_counts(rows)
Example #4
0
def show_source_counts(nlpdef: NlpDefinition) -> None:
    """
    Print a record count for the source table of every input field config in
    the NLP definition.
    """
    print("SOURCE TABLE RECORD COUNTS:")
    rows = []  # type: List[Tuple[str, int]]
    for ifc in nlpdef.get_ifconfigs():
        src_session = ifc.get_source_session()
        src_db = ifc.get_srcdb()
        src_table = ifc.get_srctable()
        label = "{}.{}".format(src_db, src_table)
        rows.append((label, count_star(src_session, src_table)))
    print_record_counts(rows)
Example #5
0
    def __init__(self, nlpdef: NlpDefinition) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`;
                it must provide a cloud config
                (``get_cloud_config_or_raise()`` is called on it)
        """
        cloudcfg = nlpdef.get_cloud_config_or_raise()

        self._nlpdef = nlpdef
        self._cloudcfg = cloudcfg
        self._nlpdef_sectionname = full_sectionname(
            NlpConfigPrefixes.NLPDEF, nlpdef.get_name())
        # (username, password) pair taken from the cloud config
        self._auth = (cloudcfg.username, cloudcfg.password)
        self._post = self._internal_post

        self.cookies = None  # type: Optional[CookieJar]
Example #6
0
 def __init__(self, nlpdef: NlpDefinition) -> None:
     """
     Args:
         nlpdef:
             a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`;
             it must provide a cloud config
             (``get_cloud_config_or_raise()`` is called on it)
     """
     self.nlpdef = nlpdef
     # Public shortcut to the cloud config, for the convenience of callers:
     self.cloudcfg = nlpdef.get_cloud_config_or_raise()
     # Both processor lists start unset; the local ones are configured by
     # the call below.
     self._remote_processors = None  # type: Optional[List[ServerProcessor]]
     self._local_processors = None  # type: Optional[List[Cloud]]
     self._configure_local_processors()
Example #7
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 regex_str_list: List[str],
                 validated_variable: str,
                 commit: bool = False) -> None:
        """
        This class operates with compiled regexes having this group format:
          - variable

        Args:
            nlpdef:
                NLP definition to read options from; ``None`` is accepted
                for debugging only
            cfgsection:
                name of the config section to read options from
            regex_str_list:
                regex source strings; each is compiled via
                ``compile_regex``
            validated_variable:
                name of the variable being validated; our own variable name
                is this plus a ``_validator`` suffix
            commit:
                passed to the superclass constructor
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        self.regex_str_list = regex_str_list  # for debugging only
        self.compiled_regex_list = list(map(compile_regex, regex_str_list))

        validator_name = "{}_validator".format(validated_variable)
        self.variable = validator_name
        self.NAME = validator_name

        # nlpdef is only None for debugging, in which case there is no table.
        self.tablename = (
            '' if nlpdef is None
            else nlpdef.opt_str(cfgsection, 'desttable', required=True)
        )
Example #8
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (``None`` is accepted for debugging only)
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         name="MedEx")

        debugging = nlpdef is None  # only None for debugging!
        self._debug_mode = debugging
        if debugging:
            self._tablename = self.classname().lower()
            self._max_external_prog_uses = 1
            self._progenvsection = ""
            self._env = {}  # type: Dict[str, str]
            progargs = ""
        else:
            section = self._sectionname
            self._tablename = nlpdef.opt_str(
                section, ProcessorConfigKeys.DESTTABLE, required=True)
            self._max_external_prog_uses = nlpdef.opt_int(
                section, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)
            self._progenvsection = nlpdef.opt_str(
                section, ProcessorConfigKeys.PROGENVSECTION)

            # Environment: from the named config section if there is one,
            # otherwise a copy of our own.
            if not self._progenvsection:
                self._env = os.environ.copy()
            else:
                env_section = full_sectionname(NlpConfigPrefixes.ENV,
                                               self._progenvsection)
                self._env = nlpdef.get_env_dict(env_section, os.environ)
            # Never leave the log tag empty, because passing a "-lt" switch
            # with no parameter will make CrateGatePipeline.java complain and
            # stop.
            self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'

            progargs = nlpdef.opt_str(
                section, ProcessorConfigKeys.PROGARGS, required=True)

        if USE_TEMP_DIRS:
            # These are autodeleted when the object goes out of scope; see
            # https://docs.python.org/3/library/tempfile.html
            # ... which manages it using weakref.finalize
            self._inputdir, self._outputdir, self._workingdir = (
                tempfile.TemporaryDirectory() for _ in range(3))
        else:
            # Fixed directories under ~/medextemp, created if necessary.
            medex_root = os.path.join(os.path.expanduser("~"), "medextemp")

            def fixed_dir(leaf: str):
                holder = PseudoTempDir(os.path.join(medex_root, leaf))
                mkdir_p(holder.name)
                return holder

            self._inputdir = fixed_dir("input")
            self._outputdir = fixed_dir("output")
            self._workingdir = fixed_dir("working")

        # Substitute environment variables into the configured arguments,
        # split them shell-style, then append the arguments we always pass.
        self._progargs = shlex.split(progargs.format(**self._env)) + [
            "-data_ready_signal", MEDEX_DATA_READY_SIGNAL,
            "-results_ready_signal", MEDEX_RESULTS_READY_SIGNAL,
            "-i", self._inputdir.name,
            "-o", self._outputdir.name,
        ]

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._file_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False
Example #9
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                NLP definition from which all options are read
            cfgsection:
                name of the config section holding this processor's options
            commit:
                passed to the superclass constructor
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        self._tablename = nlpdef.opt_str(
            cfgsection, 'desttable', required=True)
        self._max_external_prog_uses = nlpdef.opt_int(
            cfgsection, 'max_external_prog_uses', default=0)
        self._progenvsection = nlpdef.opt_str(cfgsection, 'progenvsection')

        # Environment: from the named config section if there is one,
        # otherwise a copy of our own.
        if not self._progenvsection:
            self._env = os.environ.copy()
        else:
            self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
        # Never leave the log tag empty, because passing a "-lt" switch with
        # no parameter will make CrateGatePipeline.java complain and stop.
        self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'

        if USE_TEMP_DIRS:
            # These are autodeleted when the object goes out of scope; see
            # https://docs.python.org/3/library/tempfile.html
            # ... which manages it using weakref.finalize
            self._inputdir, self._outputdir, self._workingdir = (
                tempfile.TemporaryDirectory() for _ in range(3))
        else:
            # Fixed directories under ~/medextemp, created if necessary.
            medex_root = os.path.join(os.path.expanduser("~"), "medextemp")

            def fixed_dir(leaf: str):
                holder = AttrDict(name=os.path.join(medex_root, leaf))
                mkdir_p(holder.name)
                return holder

            self._inputdir = fixed_dir("input")
            self._outputdir = fixed_dir("output")
            self._workingdir = fixed_dir("working")

        # Substitute environment variables into the configured arguments,
        # split them shell-style, then append the arguments we always pass.
        progargs = nlpdef.opt_str(cfgsection, 'progargs', required=True)
        self._progargs = shlex.split(progargs.format(**self._env)) + [
            "-data_ready_signal", MEDEX_DATA_READY_SIGNAL,
            "-results_ready_signal", MEDEX_RESULTS_READY_SIGNAL,
            "-i", self._inputdir.name,
            "-o", self._outputdir.name,
        ]

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._file_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False
Example #10
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                NLP definition from which options are read
            cfgsection:
                name of the config section holding this processor's options
            commit:
                passed to the superclass constructor

        If both ``nlpdef`` and ``cfgsection`` are falsy, fixed placeholder
        settings are used instead of reading any config (debugging only).
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        if nlpdef or cfgsection:
            self._max_external_prog_uses = nlpdef.opt_int(
                cfgsection, 'max_external_prog_uses', default=0)
            self._input_terminator = nlpdef.opt_str(
                cfgsection, 'input_terminator', required=True)
            self._output_terminator = nlpdef.opt_str(
                cfgsection, 'output_terminator', required=True)
            typepairs = nlpdef.opt_strlist(
                cfgsection, 'outputtypemap', required=True, lower=False)
            self._progenvsection = nlpdef.opt_str(cfgsection, 'progenvsection')
            progargs = nlpdef.opt_str(cfgsection, 'progargs', required=True)
            logtag = nlpdef.get_logtag() or '.'
        else:
            # Debugging only
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''

        # The output type map is a flat list of
        # (annotation type, output config section) pairs.
        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for pair in chunks(typepairs, 2):
            annottype = pair[0]
            outputsection = pair[1]
            if annottype.lower() != annottype:
                raise Exception(
                    "Section {}: annotation types in outputtypemap must be in "
                    "lower case: change {}".format(cfgsection, annottype))
            userconfig = OutputUserConfig(nlpdef.get_parser(), outputsection)
            self._outputtypemap[annottype] = userconfig
            self._type_to_tablename[annottype] = userconfig.get_tablename()

        # Environment: from the named config section if there is one,
        # otherwise a copy of our own.
        if not self._progenvsection:
            self._env = os.environ.copy()
        else:
            self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
        # The log tag is non-empty for real use, because passing a "-lt"
        # switch with no parameter will make CrateGatePipeline.java complain
        # and stop.
        self._env["NLPLOGTAG"] = logtag

        # Substitute environment variables into the configured arguments and
        # split them shell-style.
        self._progargs = shlex.split(progargs.format(**self._env))

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for tablename in self._type_to_tablename.values():
            assert len(tablename) <= MAX_SQL_FIELD_LEN, (
                "Table name too long (max {} characters)".format(
                    MAX_SQL_FIELD_LEN))
Example #11
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfg_processor_name: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        If both ``nlpdef`` and ``cfg_processor_name`` are falsy, fixed
        placeholder settings are used instead of reading any config
        (debugging only).
        """
        super().__init__(nlpdef=nlpdef,
                         cfg_processor_name=cfg_processor_name,
                         commit=commit,
                         friendly_name="GATE")

        if nlpdef or cfg_processor_name:
            cfg = self._cfgsection
            self._debug_mode = False
            self._max_external_prog_uses = cfg.opt_int_positive(
                ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES, default=0)
            self._input_terminator = cfg.opt_str(
                ProcessorConfigKeys.INPUT_TERMINATOR, required=True)
            self._output_terminator = cfg.opt_str(
                ProcessorConfigKeys.OUTPUT_TERMINATOR, required=True)
            typepairs = cfg.opt_strlist(
                ProcessorConfigKeys.OUTPUTTYPEMAP, required=True, lower=False)
            self._progenvsection = cfg.opt_str(
                ProcessorConfigKeys.PROGENVSECTION)
            progargs = cfg.opt_str(ProcessorConfigKeys.PROGARGS,
                                   required=True)
            logtag = nlpdef.logtag or '.'
        else:
            # Debugging only
            self._debug_mode = True
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''

        # The output type map is a flat list of
        # (annotation type, output config section) pairs; annotation types
        # are stored in lower case.
        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for raw_annottype, outputsection in chunks(typepairs, 2):
            annottype = raw_annottype.lower()
            userconfig = OutputUserConfig(nlpdef.parser, outputsection)
            self._outputtypemap[annottype] = userconfig
            self._type_to_tablename[annottype] = userconfig.dest_tablename

        # Environment: from the named config section if there is one,
        # otherwise a copy of our own.
        if not self._progenvsection:
            self._env = os.environ.copy()
        else:
            # noinspection PyTypeChecker
            self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
        # We have ensured that the log tag is not empty for real use, because
        # passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop. The environment variable
        # is read via the "progargs" config argument, as follows.
        self._env["NLPLOGTAG"] = logtag

        self._progargs = shlex.split(progargs.format(**self._env))

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for tablename in self._type_to_tablename.values():
            assert len(tablename) <= MAX_SQL_FIELD_LEN, (
                f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")
Example #12
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        If both ``nlpdef`` and ``cfgsection`` are falsy, fixed placeholder
        settings are used instead of reading any config (debugging only).
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                         name="GATE")

        if nlpdef or cfgsection:
            section = self._sectionname
            self._debug_mode = False
            self._max_external_prog_uses = nlpdef.opt_int(
                section, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)
            self._input_terminator = nlpdef.opt_str(
                section, ProcessorConfigKeys.INPUT_TERMINATOR, required=True)
            self._output_terminator = nlpdef.opt_str(
                section, ProcessorConfigKeys.OUTPUT_TERMINATOR, required=True)
            typepairs = nlpdef.opt_strlist(
                section, ProcessorConfigKeys.OUTPUTTYPEMAP,
                required=True, lower=False)
            self._progenvsection = nlpdef.opt_str(
                section, ProcessorConfigKeys.PROGENVSECTION)
            progargs = nlpdef.opt_str(
                section, ProcessorConfigKeys.PROGARGS, required=True)
            logtag = nlpdef.get_logtag() or '.'
        else:
            # Debugging only
            self._debug_mode = True
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''

        # The output type map is a flat list of
        # (annotation type, output config section) pairs.
        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for pair in chunks(typepairs, 2):
            # 2018-03-27: annotation types are case-insensitive, so rather
            # than rejecting anything the user did not write in lower case,
            # we simply convert to lower case here.
            annottype = pair[0].lower()
            outputsection = pair[1]
            userconfig = OutputUserConfig(nlpdef.get_parser(), outputsection)
            self._outputtypemap[annottype] = userconfig
            self._type_to_tablename[annottype] = userconfig.get_tablename()

        # Environment: from the named config section if there is one,
        # otherwise a copy of our own.
        if not self._progenvsection:
            self._env = os.environ.copy()
        else:
            env_section = full_sectionname(NlpConfigPrefixes.ENV,
                                           self._progenvsection)
            self._env = nlpdef.get_env_dict(env_section, os.environ)
        # We have ensured that the log tag is not empty for real use, because
        # passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop. The environment variable
        # is read via the "progargs" config argument, as follows.
        self._env["NLPLOGTAG"] = logtag

        self._progargs = shlex.split(progargs.format(**self._env))

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for tablename in self._type_to_tablename.values():
            assert len(tablename) <= MAX_SQL_FIELD_LEN, (
                f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")
Example #13
0
def _build_arg_parser(version: str) -> argparse.ArgumentParser:
    """
    Build the command-line argument parser for the NLP manager.

    Args:
        version:
            human-readable version string; shown by ``--version`` and
            embedded in the program description

    Returns:
        the configured :class:`argparse.ArgumentParser`
    """
    description = "NLP manager. {version}. By Rudolf Cardinal.".format(
        version=version)

    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version", version=version)
    parser.add_argument("--config",
                        help="Config file (overriding environment "
                        "variable {})".format(NLP_CONFIG_ENV_VAR))
    # This is a plain on/off flag: repeating it has no additional effect,
    # so the help text must not promise extra verbosity levels.
    parser.add_argument('--verbose',
                        '-v',
                        action='store_true',
                        help="Be verbose")
    parser.add_argument("--nlpdef",
                        nargs="?",
                        default=None,
                        help="NLP definition name (from config file)")
    parser.add_argument('--report_every_fast',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY,
                        help="Report insert progress (for fast operations) "
                        "every n rows in verbose "
                        "mode (default {})".format(DEFAULT_REPORT_EVERY))
    parser.add_argument('--report_every_nlp',
                        nargs="?",
                        type=int,
                        default=DEFAULT_REPORT_EVERY_NLP,
                        help="Report progress for NLP every n rows in verbose "
                        "mode (default "
                        "{})".format(DEFAULT_REPORT_EVERY_NLP))
    parser.add_argument('--chunksize',
                        nargs="?",
                        type=int,
                        default=DEFAULT_CHUNKSIZE,
                        help="Number of records copied in a chunk when copying"
                        " PKs from one database to another"
                        " (default {})".format(DEFAULT_CHUNKSIZE))
    parser.add_argument("--process",
                        nargs="?",
                        type=int,
                        default=0,
                        help="For multiprocess mode: specify process number")
    parser.add_argument("--nprocesses",
                        nargs="?",
                        type=int,
                        default=1,
                        help="For multiprocess mode: specify "
                        "total number of processes (launched somehow, of "
                        "which this is to be one)")
    parser.add_argument("--processcluster",
                        default="",
                        help="Process cluster name")
    parser.add_argument("--democonfig",
                        action="store_true",
                        help="Print a demo config file")
    parser.add_argument("--listprocessors",
                        action="store_true",
                        help="Show possible built-in NLP processor names")
    parser.add_argument("--describeprocessors",
                        action="store_true",
                        help="Show details of built-in NLP processors")
    parser.add_argument("--showinfo",
                        required=False,
                        nargs='?',
                        metavar="NLP_CLASS_NAME",
                        help="Show detailed information for a parser")
    parser.add_argument("--count",
                        action="store_true",
                        help="Count records in source/destination databases, "
                        "then stop")

    # -i and -f write to the same destination and are mutually exclusive;
    # incremental is the default.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument("-i",
                            "--incremental",
                            dest="incremental",
                            action="store_true",
                            help="Process only new/changed information, where "
                            "possible (* default)")
    mode_group.add_argument("-f",
                            "--full",
                            dest="incremental",
                            action="store_false",
                            help="Drop and remake everything")
    parser.set_defaults(incremental=True)

    parser.add_argument("--dropremake",
                        action="store_true",
                        help="Drop/remake destination tables only")
    parser.add_argument("--skipdelete",
                        dest="skipdelete",
                        action="store_true",
                        help="For incremental updates, skip deletion of rows "
                        "present in the destination but not the source")
    parser.add_argument("--nlp",
                        action="store_true",
                        help="Perform NLP processing only")
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument("--timing",
                        action="store_true",
                        help="Show detailed timing breakdown")
    return parser


def main() -> None:
    """
    Command-line entry point.

    Parses and validates the command-line arguments, configures logging, and
    then does one of the following:

    - handles an informational option (``--democonfig``,
      ``--listprocessors``, ``--describeprocessors``, ``--showinfo``) and
      returns;
    - with ``--count``, prints source/destination record counts and returns;
    - otherwise drops/remakes tables and/or runs NLP, as selected by
      ``--dropremake`` / ``--nlp`` (both, if neither is given).

    Raises:
        ValueError: if ``--nprocesses`` or ``--process`` is out of range, or
            if an NLP definition is required but ``--nlpdef`` was not given.
    """
    version = "Version {} ({})".format(VERSION, VERSION_DATE)
    args = _build_arg_parser(version).parse_args()

    # Validate args
    if args.nprocesses < 1:
        raise ValueError("--nprocesses must be >=1")
    if args.process < 0 or args.process >= args.nprocesses:
        raise ValueError(
            "--process argument must be from 0 to (nprocesses - 1) inclusive")
    if args.config:
        # A config file on the command line overrides the environment.
        os.environ[NLP_CONFIG_ENV_VAR] = args.config

    # Verbosity and logging
    mynames = []  # type: List[str]
    if args.processcluster:
        mynames.append(args.processcluster)
    if args.nprocesses > 1:
        mynames.append("proc{}".format(args.process))
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel, extranames=mynames)

    # -------------------------------------------------------------------------
    # Informational options that need no NLP definition
    # -------------------------------------------------------------------------

    # Demo config?
    if args.democonfig:
        print(DEMO_CONFIG)
        return

    # List or describe processors?
    if args.listprocessors:
        print("\n".join(possible_processor_names()))
        return
    if args.describeprocessors:
        print(possible_processor_table())
        return
    if args.showinfo:
        # Named distinctly so that it cannot be confused with the
        # command-line argument parser.
        nlp_parser = get_nlp_parser_debug_instance(args.showinfo)
        if nlp_parser:
            print("Info for class {}:\n".format(args.showinfo))
            nlp_parser.print_info()
        else:
            print("No such processor class: {}".format(args.showinfo))
        return

    # Otherwise, we need a valid NLP definition.
    if args.nlpdef is None:
        raise ValueError("Must specify nlpdef parameter (unless --democonfig, "
                         "--listprocessors, --describeprocessors, or "
                         "--showinfo used)")

    # If neither --dropremake nor --nlp was chosen, do both.
    everything = not any([args.dropremake, args.nlp])

    # Report args
    log.debug("arguments: {}".format(args))

    # Load/validate config
    config = NlpDefinition(args.nlpdef,
                           logtag="_".join(mynames).replace(" ", "_"))
    config.set_echo(args.echo)

    # Count only?
    if args.count:
        show_source_counts(config)
        show_dest_counts(config)
        return

    # -------------------------------------------------------------------------
    # Main work
    # -------------------------------------------------------------------------

    log.info("Starting: incremental={}".format(args.incremental))
    start = get_now_utc()
    timer.set_timing(args.timing, reset=True)

    # 1. Drop/remake tables. Single-tasking only.
    with MultiTimerContext(timer, TIMING_DROP_REMAKE):
        if args.dropremake or everything:
            drop_remake(args,
                        config,
                        incremental=args.incremental,
                        skipdelete=args.skipdelete)

    # From here, in a multiprocessing environment, trap any errors simply so
    # we can report the process number clearly.

    # 2. NLP
    if args.nlp or everything:
        try:
            process_nlp(config,
                        incremental=args.incremental,
                        report_every=args.report_every_nlp,
                        tasknum=args.process,
                        ntasks=args.nprocesses)
        except Exception as exc:
            log.critical("TERMINAL ERROR FROM THIS PROCESS")  # so we see proc#
            die(exc)

    log.info("Finished")
    end = get_now_utc()
    time_taken = end - start
    log.info("Time taken: {:.3f} seconds".format(time_taken.total_seconds()))

    if args.timing:
        timer.report()
Example #14
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 variable_name: str,  # e.g. "MMSE"
                 variable_regex_str: str,  # e.g. regex for MMSE
                 expected_denominator: int,
                 numerator_text_fieldname: str = "numerator_text",
                 numerator_fieldname: str = "numerator",
                 denominator_text_fieldname: str = "denominator_text",
                 denominator_fieldname: str = "denominator",
                 correct_numerator_fieldname: str = None,  # default below
                 take_absolute: bool = True,
                 commit: bool = False,
                 debug: bool = False) -> None:
        """
        This class operates with compiled regexes having this group format:
          - quantity_regex_str: e.g. to find "MMSE"

        Args:
            nlpdef:
                NLP definition to read options from; ``None`` is accepted
                for debugging only
            cfgsection:
                name of the config section to read options from
            variable_name:
                name of the variable, e.g. "MMSE"
            variable_regex_str:
                regex text that finds the variable; inserted into the
                overall regex as its first group
            expected_denominator:
                the denominator we expect; must be positive
            numerator_text_fieldname:
                stored as ``self.numerator_text_fieldname``
            numerator_fieldname:
                stored as ``self.numerator_fieldname``
            denominator_text_fieldname:
                stored as ``self.denominator_text_fieldname``
            denominator_fieldname:
                stored as ``self.denominator_fieldname``
            correct_numerator_fieldname:
                if not given (or empty), defaults to
                ``out_of_<expected_denominator>``
            take_absolute:
                stored as ``self.take_absolute``
            commit:
                passed to the superclass constructor
            debug:
                if true, print the assembled regex
        """
        assert(expected_denominator > 0)

        # These are set before the superclass constructor is called.
        self.variable_name = variable_name
        self.expected_denominator = expected_denominator
        self.numerator_text_fieldname = numerator_text_fieldname
        self.numerator_fieldname = numerator_fieldname
        self.denominator_text_fieldname = denominator_text_fieldname
        self.denominator_fieldname = denominator_fieldname
        if correct_numerator_fieldname:
            self.correct_numerator_fieldname = correct_numerator_fieldname
        else:
            self.correct_numerator_fieldname = "out_of_{}".format(
                expected_denominator)
        self.take_absolute = take_absolute

        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit)

        # nlpdef is only None for debugging, in which case there is no table.
        self.tablename = (
            '' if nlpdef is None
            else nlpdef.opt_str(cfgsection, 'desttable', required=True)
        )

        regex_template = r"""
            ( {variable} )                     # 1. group for variable (thing being measured)
            {OPTIONAL_RESULTS_IGNORABLES}
            {SCORE}?                           # optional "score" or similar
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {TENSE_INDICATOR} )?             # 2. optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {RELATION} )?                    # 3. optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {SIGNED_FLOAT} )                 # 4. group for numerator
            (?:                                # optional "/ denominator"
                \s* {OUT_OF_SEPARATOR} \s*
                ( {UNSIGNED_INTEGER} )         # 5. group for denominator
            )?
        """  # noqa
        substitutions = dict(
            variable=variable_regex_str,
            OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
            SCORE=SCORE,
            TENSE_INDICATOR=TENSE_INDICATOR,
            RELATION=RELATION,
            SIGNED_FLOAT=SIGNED_FLOAT,
            OUT_OF_SEPARATOR=OUT_OF_SEPARATOR,
            UNSIGNED_INTEGER=UNSIGNED_INTEGER,
        )
        regex_str = regex_template.format(**substitutions)

        if debug:
            print("Regex for {}: {}".format(type(self).__name__, regex_str))
        self.regex_str = regex_str
        self.compiled_regex = compile_regex(regex_str)
Example #15
0
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Run every configured NLP processor over every source record, and note
    each processed record in the progress database.

    For each input field config of ``nlpdef``, text is fetched via
    ``gen_text(tasknum=..., ntasks=...)``, passed to each processor's
    ``process()``, and a progress record holding the hash of the source text
    is created (or, if one was found in incremental mode, updated).

    Args:
        nlpdef:
            the NLP definition; supplies the input field configs, the
            processors, the progress database session, text hashing, and
            transaction/commit handling
        incremental:
            if true, look up the progress record for each source record and
            skip the record when its stored hash matches the current text;
            for records that are processed, each processor is first asked to
            delete its existing destination record
        report_every:
            log a progress message every this-many records seen by this
            process; a false value (e.g. 0) disables the messages
        tasknum:
            number of this task; passed to the text generator and used as
            the starting point for the "overall" record number in messages
        ntasks:
            total number of tasks; when greater than 1, messages are marked
            as approximate/overall and every transaction notification asks
            for a forced commit
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    multiple_tasks = ntasks > 1

    # Commit policy (does not vary per record, so decided once here).
    # In incremental mode, do we commit immediately, because other processes
    # may need this table promptly... ?
    # - Never forcing a commit is definitely wrong; it crashes as below.
    # - Forcing a commit whenever "incremental" is set was also considered.
    # - A single source record should not be processed by >1 CRATE process.
    #   So in theory there should be no conflicts.
    # - However, databases can lock in various ways. Can we guarantee it'll
    #   do something sensible?
    # - See also
    #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
    #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
    #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
    # - However, empirically, setting this to False gives
    #   "Transaction (Process ID xx) was deadlocked on lock resources with
    #   another process and has been chosen as the deadlock victim. Rerun
    #   the transaction." -- with a SELECT query.
    # - SQL Server uses READ COMMITTED as the default isolation level.
    # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa
    force_commit = multiple_tasks

    for ifconfig in nlpdef.get_ifconfigs():
        totalcount = ifconfig.get_count()  # total number of records in table
        text_source = ifconfig.gen_text(tasknum=tasknum, ntasks=ntasks)
        # "i" is the 1-based record count within this process.
        for i, (text, other_values) in enumerate(text_source, start=1):
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]

            if report_every and i % report_every == 0:
                # Zero-based overall record number: starts at tasknum and
                # advances by ntasks for each record this process sees.
                recnum = tasknum + (i - 1) * ntasks
                overall = "overall " if multiple_tasks else ""
                # ... string hashing means approx. distribution
                approx = "~" if pkstr and multiple_tasks else ""
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall=overall,
                        approx=approx,
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(" ({i}/~{proccount} this process)".format(
                            i=i, proccount=totalcount //
                            ntasks) if multiple_tasks else "")))

            srchash = nlpdef.hash(text)

            # In incremental mode, skip records whose text hash is unchanged.
            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is None:
                    log.debug("Record is new")
                elif progrec.srchash == srchash:
                    log.debug("Record previously processed; skipping")
                    continue
                else:
                    log.debug("Record has changed")

            # Run each processor, clearing out its old output first when
            # working incrementally.
            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig,
                                                 pkval,
                                                 pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if not progrec:
                # No existing record: create one and add it to the session.
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)
            else:
                # Existing record: refresh its timestamp and hash.
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash

            nlpdef.notify_transaction(
                session=session,
                n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()
Example #16
0
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    What this function does:

    - Collects the progress database plus each processor's destination
      database (one entry per distinct session).
    - In each of those databases, drops (if present) and creates a temporary
      table with a source-PK-value column and a source-PK-string column.
    - Copies every source PK pair from ``ifconfig.gen_src_pks()`` into every
      temporary table, in chunks of ``chunksize``; commits; then indexes the
      PK-value column.
    - Asks ``ifconfig`` (for the progress database) and every processor (for
      its destination database) to delete records whose source PK is not in
      the temporary table.
    - Drops the temporary tables and commits.

    Args:
        nlpdef: NLP definition; supplies the progress database, the
            processors, the temporary table name, and commit handling
        ifconfig: input field config whose source table is examined
        report_every: log a progress message every this-many source rows;
            a false value (e.g. 0) disables the messages
        chunksize: number of PK records to insert per batch

    Design constraints:

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.

    Known problem:

    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).

    History: an earlier approach fetched all source PKs (which, by
    definition, do exist), kept them in memory, and did a DELETE WHERE NOT IN
    based on those specified values (or, if there were no PKs in the source,
    deleted everything from the destination). With massive tables that can
    run out of memory or (much more likely) SQL parameter slots; the error
    looked like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers,
      but 2717783 parameters were supplied', 'HY000')

    The better way, noted at the time, was:

    - for each table, make a temporary table in the same database
    - populate that table with (source PK integer/hash, source PK string)
      pairs
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient would be to make one table per destination database.

    On the "delete where multiple fields don't match":

    - Single field syntax is
        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)
    - Multiple field syntax is
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )
    - In SQLAlchemy, exists():
        http://stackoverflow.com/questions/14600619
        http://docs.sqlalchemy.org/en/latest/core/selectable.html
    - Furthermore, in SQL NULL = NULL is false, and NULL <> NULL is also
      false, so we have to do an explicit null check.
      You do that with "field == None".
      See http://stackoverflow.com/questions/21668606
      We're aiming, therefore, for:
        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )
    """

    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def flush_chunk(chunk):
        """Insert one batch of PK records into every temporary table."""
        chunk_len = len(chunk)
        log.debug("... inserting {} records".format(chunk_len))
        for target in databases:
            target_session = target['session']
            target_table = target['temptable']  # type: Table
            target_session.execute(target_table.insert(), chunk)
            nlpdef.notify_transaction(target_session,
                                      n_rows=chunk_len,
                                      n_bytes=sys.getsizeof(chunk))

    def commit_everywhere():
        """Commit the session of every database we are working with."""
        for target in databases:
            nlpdef.commit(target['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.

    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # The progress database always comes first in the list...
    databases = [dict(
        session=nlpdef.get_progdb_session(),
        engine=nlpdef.get_progdb_engine(),
        metadata=nlpdef.get_progdb_metadata(),
        temptable=None,  # type: Table
    )]

    # ... followed by each processor's destination database, skipping any
    # whose session is already listed.
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        proc_session = processor.get_session()
        already_listed = any(
            known['session'] == proc_session for known in databases)
        if not already_listed:
            databases.append(dict(
                session=proc_session,
                engine=processor.get_engine(),
                metadata=processor.get_metadata(),
            ))

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        db_engine = database['engine']
        new_table = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        new_table.drop(db_engine, checkfirst=True)
        new_table.create(db_engine, checkfirst=True)
        database['temptable'] = new_table

    # Copy the source PKs into the temporary tables, one chunk at a time.
    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working in "
             "chunks of {}".format(n, chunksize))
    pending = []  # type: List[Dict[str, Any]]
    for i, (pkval, pkstr) in enumerate(ifconfig.gen_src_pks(), start=1):
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        pending.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            flush_chunk(pending)
            pending = []  # type: List[Dict[str, Any]]
    if pending:  # whatever is left after the last full chunk
        flush_chunk(pending)

    commit_everywhere()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        indexed_table = database['temptable']  # type: Table
        pk_index = Index('_temptable_idx', indexed_table.columns[FN_SRCPKVAL])
        pk_index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Progress database: it is the first entry in the list.
    ifconfig.delete_progress_records_where_srcpk_not(
        databases[0]['temptable'])

    # Destination databases: each processor uses the temporary table that
    # lives in the database matching its own session.
    for processor in nlpdef.get_processors():
        matching = [
            known for known in databases
            if known['session'] == processor.get_session()
        ]
        processor.delete_where_srcpk_not(ifconfig, matching[0]['temptable'])

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    commit_everywhere()
Example #17
0
    def __init__(self, nlpdef: NlpDefinition, cfg_input_name: str) -> None:
        """
        Read config from a configparser section, and also associate with a
        specific NLP definition.

        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`,
                the master NLP definition, referring to the master config file
                etc.
            cfg_input_name:
                config section name for the input field definition

        Raises:
            ValueError:
                if the copy-field or indexed-copy-field lists contain
                duplicates, or if an indexed copy field is not also listed
                as a copy field
        """
        self.name = cfg_input_name
        # The config section is looked up under the INPUT prefix combined
        # with the given name.
        cfg = nlpdef.get_config_section(
            full_sectionname(NlpConfigPrefixes.INPUT, cfg_input_name))

        self._nlpdef = nlpdef

        # Where the source text lives: database, table, PK field, text field.
        self._srcdb = cfg.opt_str(InputFieldConfigKeys.SRCDB)
        self._srctable = cfg.opt_str(InputFieldConfigKeys.SRCTABLE)
        self._srcpkfield = cfg.opt_str(InputFieldConfigKeys.SRCPKFIELD)
        self._srcfield = cfg.opt_str(InputFieldConfigKeys.SRCFIELD)
        # The source date/time field is explicitly not required.
        self._srcdatetimefield = cfg.opt_str(
            InputFieldConfigKeys.SRCDATETIMEFIELD, required=False)
        # ... new in v0.18.52
        # Make these case-sensitive to avoid our failure in renaming SQLA
        # Column objects to be lower-case:
        self._copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.COPYFIELDS)  # fieldnames
        self._indexed_copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.INDEXED_COPYFIELDS)
        self._debug_row_limit = cfg.opt_int(
            InputFieldConfigKeys.DEBUG_ROW_LIMIT, default=0)
        # self._fetch_sorted = opt_bool('fetch_sorted', default=True)

        # Check the table/field names with the ensure_valid_* helpers
        # (presumably these raise on an invalid name -- confirm against the
        # helper definitions). The date/time field is only checked if set.
        ensure_valid_table_name(self._srctable)
        ensure_valid_field_name(self._srcpkfield)
        ensure_valid_field_name(self._srcfield)
        if self._srcdatetimefield:
            ensure_valid_field_name(self._srcdatetimefield)

        # Reject duplicate entries: a set drops duplicates, so a length
        # mismatch means at least one name was repeated.
        if len(set(self._indexed_copyfields)) != len(self._indexed_copyfields):
            raise ValueError(
                f"Redundant indexed_copyfields: {self._indexed_copyfields}")

        if len(set(self._copyfields)) != len(self._copyfields):
            raise ValueError(f"Redundant copyfields: {self._copyfields}")

        # Every indexed copy field must also be a copy field.
        # NOTE(review): the message below says "index_copyfields", whereas
        # the attribute is "_indexed_copyfields" -- possibly a typo in the
        # message; confirm against the config key name before changing.
        indexed_not_copied = set(self._indexed_copyfields) - set(
            self._copyfields)
        if indexed_not_copied:
            raise ValueError(f"Fields in index_copyfields but not in "
                             f"copyfields: {indexed_not_copied}")

        # allfields = [self._srcpkfield, self._srcfield] + self._copyfields
        # if len(allfields) != len(set(allfields)):
        #     raise ValueError(
        #         f"Field overlap in InputFieldConfig: {section}")
        # RE-THOUGHT: OK to copy source text fields etc. if desired.
        # It's fine in SQL to say SELECT a, a FROM mytable;

        # Look up the source database object by name via the NLP definition.
        self._db = nlpdef.get_database(self._srcdb)