Example #1
0
 def __init__(self,
              nlpdef: Optional[NlpDefinition],
              cfgsection: Optional[str],
              commit: bool = False,
              name: str = "?") -> None:
     """
     Args:
         nlpdef:
             a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
         cfgsection:
             the name of a CRATE NLP config file section, TO WHICH we will
             add a "processor:" prefix (from which section we may choose to
             get extra config information)
         commit:
             force a COMMIT whenever we insert data? You should specify this
             in multiprocess mode, or you may get database deadlocks.
         name:
             friendly name for the parser
     """
     self._nlpdef = nlpdef
     self._cfgsection = cfgsection
     self._commit = commit
     self._name = name
     # Destination-database details; overwritten below when a real NLP
     # definition is supplied.
     self._destdb_name = None  # type: Optional[str]
     self._destdb = None  # type: Optional[DatabaseHolder]
     if nlpdef is not None:
         self._sectionname = full_sectionname(
             NlpConfigPrefixes.PROCESSOR, cfgsection)
         self._destdb_name = nlpdef.opt_str(
             self._sectionname, ProcessorConfigKeys.DESTDB, required=True)
         self._destdb = nlpdef.get_database(self._destdb_name)
     else:
         # Debug/placeholder mode: no config to read.
         self._sectionname = ""
         self._destdb_name = ""
         # ... self._destdb stays None (initialized above); the previous
         # redundant reassignment has been removed.
Example #2
0
    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfgsection: Optional[str],
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the config section for the processor
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef, cfgsection, commit, name="Cloud")
        self.remote_processor_info = None  # type: Optional[ServerProcessor]
        sectionname = full_sectionname(NlpConfigPrefixes.PROCESSOR,
                                       cfgsection)
        # Name/version/format identifying the remote processor to request.
        self.procname = nlpdef.opt_str(
            sectionname, ProcessorConfigKeys.PROCESSOR_NAME,
            required=True)
        self.procversion = nlpdef.opt_str(
            sectionname, ProcessorConfigKeys.PROCESSOR_VERSION,
            default=None)
        # Made format required so people are less likely to make mistakes
        self.format = nlpdef.opt_str(
            sectionname,
            ProcessorConfigKeys.PROCESSOR_FORMAT,
            required=True)
        # Schema details; filled in later (from the server, presumably).
        self.schema_type = None
        self.sql_dialect = None
        self.schema = None  # type: Optional[Dict[str, Any]]
        self.available_remotely = False  # update later if available

        # Output section - bit of repetition from the 'Gate' parser
        typepairs = nlpdef.opt_strlist(
            sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
            required=True, lower=False)
        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        self.tablename = None
        # If typepairs is empty the following block won't execute
        for pair in chunks(typepairs, 2):  # pairs: output type, section name
            output_type = pair[0].lower()  # treat types case-insensitively
            outputsection = pair[1]
            # Previously the loop variable itself was rebound to the
            # OutputUserConfig, which was confusing; use a distinct name.
            otconfig = OutputUserConfig(nlpdef.get_parser(), outputsection,
                                        schema_required=False)
            self._outputtypemap[output_type] = otconfig
            self._type_to_tablename[output_type] = otconfig.get_tablename()
            if output_type == '""':
                # NOTE(review): the two-character literal '""' appears to be a
                # sentinel for a default/unnamed output type -- verify.
                self.tablename = otconfig.get_tablename()
Example #3
0
    def __init__(self, nlpdef: NlpDefinition) -> None:
        """
        Store the NLP definition and derive cloud connection state from it.

        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        """
        self._nlpdef = nlpdef
        cloudcfg = nlpdef.get_cloud_config_or_raise()
        self._cloudcfg = cloudcfg
        # HTTP basic-auth credentials, as a (username, password) pair.
        self._auth = (cloudcfg.username, cloudcfg.password)
        self._nlpdef_sectionname = full_sectionname(
            NlpConfigPrefixes.NLPDEF, nlpdef.get_name())
        # Use the internal POST implementation by default.
        self._post = self._internal_post

        self.cookies = None  # type: Optional[CookieJar]
Example #4
0
    def __init__(self,
                 nlpdef: Optional[NlpDefinition],
                 cfg_processor_name: Optional[str],
                 commit: bool = False,
                 friendly_name: str = "?") -> None:
        r"""
        ``__init__`` function for :class:`TableMaker`.

        Args:
            nlpdef:
                An instance of
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.

            cfg_processor_name:
                The name of a CRATE NLP config file section, TO WHICH we will
                add a ``processor:`` prefix (from which section we may choose
                to get extra config information).

            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

            friendly_name:
                Friendly name for the parser.
        """
        self._nlpdef = nlpdef
        self._cfg_processor_name = cfg_processor_name
        self._commit = commit
        self._friendly_name = friendly_name
        # Destination-database details; overwritten below when a real NLP
        # definition is supplied.
        self._destdb_name = None  # type: Optional[str]
        self._destdb = None  # type: Optional[DatabaseHolder]
        if nlpdef is None:
            # Debug/placeholder mode: no config to read.
            self._sectionname = ""
            self._cfgsection = None  # type: Optional[ConfigSection]
            self._destdb_name = ""
            # ... self._destdb stays None (initialized above); the previous
            # redundant reassignment has been removed.
        else:
            self._sectionname = full_sectionname(NlpConfigPrefixes.PROCESSOR,
                                                 cfg_processor_name)
            self._cfgsection = nlpdef.get_config_section(self._sectionname)
            self._destdb_name = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTDB, required=True)
            self._destdb = nlpdef.get_database(self._destdb_name)
Example #5
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Set up a MedEx external-program parser from the NLP definition.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         name="MedEx")

        if nlpdef is None:  # only None for debugging!
            # Debug mode: fixed defaults, no config reads.
            self._debug_mode = True
            self._tablename = self.classname().lower()
            self._max_external_prog_uses = 1
            self._progenvsection = ""
            self._env = {}  # type: Dict[str, str]
            progargs = ""
        else:
            # Real use: read everything from the processor's config section.
            self._debug_mode = False
            # Destination table for results (required).
            self._tablename = nlpdef.opt_str(self._sectionname,
                                             ProcessorConfigKeys.DESTTABLE,
                                             required=True)

            # Limit on uses of the external program (default 0; presumably
            # 0 means "no limit" -- confirm against the caller).
            self._max_external_prog_uses = nlpdef.opt_int(
                self._sectionname,
                ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)

            # Optional config section supplying environment variables for the
            # external program.
            self._progenvsection = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGENVSECTION)

            if self._progenvsection:
                # Config-specified variables layered on top of the current OS
                # environment.
                self._env = nlpdef.get_env_dict(
                    full_sectionname(NlpConfigPrefixes.ENV,
                                     self._progenvsection), os.environ)
            else:
                self._env = os.environ.copy()
            self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'
            # ... because passing a "-lt" switch with no parameter will make
            # CrateGatePipeline.java complain and stop

            # Command-line argument template for the external program.
            progargs = nlpdef.opt_str(self._sectionname,
                                      ProcessorConfigKeys.PROGARGS,
                                      required=True)

        if USE_TEMP_DIRS:
            # Temporary directories for MedEx input/output/working files.
            self._inputdir = tempfile.TemporaryDirectory()
            self._outputdir = tempfile.TemporaryDirectory()
            self._workingdir = tempfile.TemporaryDirectory()
            # ... these are autodeleted when the object goes out of scope; see
            #     https://docs.python.org/3/library/tempfile.html
            # ... which manages it using weakref.finalize
        else:
            # Persistent directories under the user's home, for debugging.
            homedir = os.path.expanduser("~")
            self._inputdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "input"))
            mkdir_p(self._inputdir.name)
            self._outputdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "output"))
            mkdir_p(self._outputdir.name)
            self._workingdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "working"))
            mkdir_p(self._workingdir.name)

        # Substitute environment variables into the argument template, then
        # split it into an argv-style list.
        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)
        # Fixed arguments: the signal strings and the I/O directories.
        self._progargs.extend([
            "-data_ready_signal",
            MEDEX_DATA_READY_SIGNAL,
            "-results_ready_signal",
            MEDEX_RESULTS_READY_SIGNAL,
            "-i",
            self._inputdir.name,
            "-o",
            self._outputdir.name,
        ])

        self._n_uses = 0  # number of times the external program has been used
        self._pipe_encoding = 'utf8'
        self._file_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False
Example #6
0
    def __init__(self,
                 parser: ExtendedConfigParser,
                 section: str,
                 schema_required: bool = True) -> None:
        """
        Read config from a configparser section.

        Args:
            parser:
                :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
            section:
                config file section name -- this is the second of the pair of
                strings in the ``outputtypemap`` part of the GATE NLP app
                config section. See

                - :ref:`NLP config file <nlp_config>`
                - :class:`crate_anon.nlp_manager.parse_gate.Gate`
            schema_required:
                is it required that the user has specified a schema, i.e.
                destfields and a desttable? - Should be true for Gate, False
                for Cloud as the remote processors may have their own schema
                definition.
        """  # noqa

        sectionname = full_sectionname(NlpConfigPrefixes.OUTPUT, section)

        def opt_str(option: str, required: bool = False) -> str:
            # Fetch a single string option from our config section.
            return parser.get_str(sectionname, option, required=required)

        def opt_strlist(option: str,
                        required: bool = False,
                        as_words: bool = True) -> List[str]:
            # Fetch a string-list option. We do NOT change the case.
            return parser.get_str_list(sectionname,
                                       option,
                                       required=required,
                                       lower=False,
                                       as_words=as_words)

        if not parser.has_section(sectionname):
            raise ValueError("config missing section: " + sectionname)

        # ---------------------------------------------------------------------
        # desttable
        # ---------------------------------------------------------------------

        self._desttable = opt_str(NlpOutputConfigKeys.DESTTABLE, required=True)
        ensure_valid_table_name(self._desttable)

        # ---------------------------------------------------------------------
        # renames
        # ---------------------------------------------------------------------

        # Maps annotation names to (validated) destination field names.
        self._renames = {}  # type: Dict[str, str]
        rename_lines = opt_strlist(NlpOutputConfigKeys.RENAMES,
                                   required=False,
                                   as_words=False)
        for line in rename_lines:
            if not line.strip():
                continue
            words = shlex.split(line)
            if len(words) != 2:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config "
                    f"section {sectionname!r}; line was {line!r} but should "
                    f"have contained two things")
            annotation_name = words[0]
            field_name = words[1]
            ensure_valid_field_name(field_name)
            self._renames[annotation_name] = field_name

        # ---------------------------------------------------------------------
        # null_literals
        # ---------------------------------------------------------------------

        null_literal_lines = opt_strlist(NlpOutputConfigKeys.NULL_LITERALS,
                                         required=False,
                                         as_words=False)
        self._null_literals = []  # type: List[str]
        for line in null_literal_lines:
            self._null_literals += shlex.split(line)

        # ---------------------------------------------------------------------
        # destfields
        # ---------------------------------------------------------------------

        self._destfields = []  # type: List[str]
        self._dest_datatypes = []  # type: List[str]
        self._dest_comments = []  # type: List[str]
        dest_field_lines = opt_strlist(NlpOutputConfigKeys.DESTFIELDS,
                                       required=schema_required,
                                       as_words=False)
        # ... comments will be removed during that process.
        # If dest_field_lines is empty (as it may be for a Cloud processor)
        # the following block doesn't execute, so the 'dest' attributes remain
        # empty.
        for dfl in dest_field_lines:
            # Each line: field name, datatype, then an optional comment.
            parts = dfl.split(maxsplit=2)
            # NOTE(review): assert is stripped under "python -O"; consider
            # raising ValueError here instead.
            assert len(parts) >= 2, f"Bad field definition line: {dfl!r}"
            field = parts[0]
            datatype = parts[1].upper()
            comment = parts[2] if len(parts) > 2 else None
            ensure_valid_field_name(field)
            if not is_sqltype_valid(datatype):
                raise Exception(f"Invalid datatype for {field}: {datatype}")
            self._destfields.append(field)
            self._dest_datatypes.append(datatype)
            self._dest_comments.append(comment)

        # The core (source-tracking) columns are supplied automatically; the
        # user must not also specify them.
        src_fields = [
            c.name for c in InputFieldConfig.get_core_columns_for_dest()
        ]
        for sf in src_fields:
            if sf in self._destfields:
                raise Exception(
                    f"For section {sectionname}, destination field {sf} is "
                    f"auto-supplied; do not add it manually")

        if len(set(self._destfields)) != len(self._destfields):
            raise ValueError(f"Duplicate fields exist in destination fields: "
                             f"{self._destfields}")

        # ---------------------------------------------------------------------
        # indexdefs
        # ---------------------------------------------------------------------

        self._indexfields = []  # type: List[str]
        self._indexlengths = []  # type: List[int]
        indexdefs = opt_strlist(NlpOutputConfigKeys.INDEXDEFS)
        if indexdefs:
            for c in chunks(indexdefs, 2):  # pairs: field, length
                indexfieldname = c[0]
                lengthstr = c[1]
                if indexfieldname not in self._destfields:
                    raise ValueError(f"Index field {indexfieldname} not in "
                                     f"destination fields {self._destfields}")
                try:
                    length = ast.literal_eval(lengthstr)
                    if length is not None:
                        length = int(length)
                except (SyntaxError, TypeError, ValueError) as e:
                    # BUGFIX: ast.literal_eval() raises SyntaxError for
                    # malformed input and int() raises TypeError for
                    # non-numeric literals (e.g. a list); previously only
                    # ValueError was caught, so such config errors crashed
                    # with a raw traceback. Chain the cause for debugging.
                    raise ValueError(f"Bad index length: {lengthstr}") from e
                self._indexfields.append(indexfieldname)
                self._indexlengths.append(length)
Example #7
0
    def __init__(self, nlpdef: "NlpDefinition", name: str,
                 req_data_dir: str) -> None:
        """
        Reads the config from the NLP definition's config file.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            name:
                name for the cloud NLP configuration (to which a standard
                prefix will be added to get the config section name)
            req_data_dir:
                directory in which to store temporary request files
        """
        from crate_anon.nlp_manager.cloud_parser import Cloud  # delayed import  # noqa

        self._nlpdef = nlpdef
        self.req_data_dir = req_data_dir

        # Config section name: the CLOUD prefix plus the given name.
        cfg = nlpdef.get_config_section(
            full_sectionname(NlpConfigPrefixes.CLOUD, name))

        # Connection/transport settings.
        self.url = cfg.opt_str(CloudNlpConfigKeys.CLOUD_URL, required=True)
        self.verify_ssl = cfg.opt_bool(CloudNlpConfigKeys.VERIFY_SSL, True)
        self.compress = cfg.opt_bool(CloudNlpConfigKeys.COMPRESS, True)
        self.username = cfg.opt_str(CloudNlpConfigKeys.USERNAME, default="")
        self.password = cfg.opt_str(CloudNlpConfigKeys.PASSWORD, default="")
        # Batching and throttling limits.
        self.max_content_length = cfg.opt_int(
            CloudNlpConfigKeys.MAX_CONTENT_LENGTH,
            DEFAULT_CLOUD_MAX_CONTENT_LENGTH)
        self.limit_before_commit = cfg.opt_int(
            CloudNlpConfigKeys.LIMIT_BEFORE_COMMIT,
            DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT)
        self.max_records_per_request = cfg.opt_int(
            CloudNlpConfigKeys.MAX_RECORDS_PER_REQUEST,
            DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST)
        # Error handling and retry behaviour.
        self.stop_at_failure = cfg.opt_bool(CloudNlpConfigKeys.STOP_AT_FAILURE,
                                            True)
        self.wait_on_conn_err = cfg.opt_int(
            CloudNlpConfigKeys.WAIT_ON_CONN_ERR,
            DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S)
        self.max_tries = cfg.opt_int(CloudNlpConfigKeys.MAX_TRIES,
                                     DEFAULT_CLOUD_MAX_TRIES)
        self.rate_limit_hz = cfg.opt_int(CloudNlpConfigKeys.RATE_LIMIT_HZ,
                                         DEFAULT_CLOUD_RATE_LIMIT_HZ)
        self.test_length_function_speed = cfg.opt_bool(
            CloudNlpConfigKeys.TEST_LENGTH_FUNCTION_SPEED, True)
        # Collect the NLP definition's cloud processors, keyed by
        # (name, version).
        self.remote_processors = {}  # type: Dict[Tuple[str, str], 'Cloud']
        for processor in self._nlpdef.processors:
            if not isinstance(processor, Cloud):
                # ... only add 'Cloud' processors
                log.warning(
                    f"Skipping NLP processor of non-cloud (e.g. local) "
                    f"type: {processor.friendly_name}")
                continue
            self.remote_processors[(processor.procname,
                                    processor.procversion)] = processor
            # NOTE: KEY IS A TUPLE!
        # We need the following in order to decide whether to ask to include
        # text in reply - if a processor is GATE we need to, as it does not
        # send back the content of the nlp snippet
        self.has_gate_processors = any(
            (x.format == NlpDefValues.FORMAT_GATE)
            for x in self.remote_processors.values())
Example #8
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Set up a GATE external-program parser from the NLP definition.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                         name="GATE")

        if not nlpdef and not cfgsection:
            # Debugging only
            self._debug_mode = True
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''
        else:
            # Real use: read everything from the processor's config section.
            self._debug_mode = False
            self._max_external_prog_uses = nlpdef.opt_int(
                self._sectionname, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)
            # Terminator strings delimiting the exchange with the external
            # GATE pipeline (both required).
            self._input_terminator = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.INPUT_TERMINATOR,
                required=True)
            self._output_terminator = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.OUTPUT_TERMINATOR,
                required=True)
            # Flat list of (annotation type, output config section) pairs.
            typepairs = nlpdef.opt_strlist(
                self._sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
                required=True, lower=False)
            self._progenvsection = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGENVSECTION)
            progargs = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGARGS,
                required=True)
            logtag = nlpdef.get_logtag() or '.'

        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for c in chunks(typepairs, 2):
            annottype = c[0]
            outputsection = c[1]
            # 2018-03-27: not clear why we need to force the user to specify
            # in lower case! We just said it's case-insensitive. So ditch this:
            #
            # if annottype != annottype.lower():
            #     raise Exception(
            #         "Section {}: annotation types in outputtypemap must be in "  # noqa
            #         "lower case: change {}".format(cfgsection, annottype))
            #
            # and add this:
            annottype = annottype.lower()
            # log.critical(outputsection)
            c = OutputUserConfig(nlpdef.get_parser(), outputsection)
            self._outputtypemap[annottype] = c
            self._type_to_tablename[annottype] = c.get_tablename()

        if self._progenvsection:
            # Config-specified environment variables layered on top of the
            # current OS environment.
            self._env = nlpdef.get_env_dict(
                full_sectionname(NlpConfigPrefixes.ENV,
                                 self._progenvsection),
                os.environ)
        else:
            self._env = os.environ.copy()
        self._env["NLPLOGTAG"] = logtag
        # ... We have ensured that this is not empty for real use, because
        # passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop. The environment variable
        # is read via the "progargs" config argument, as follows.

        # Substitute environment variables into the argument template, then
        # split it into an argv-style list.
        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)

        self._n_uses = 0  # number of times the external program has been used
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for ty, tn in self._type_to_tablename.items():
            assert len(tn) <= MAX_SQL_FIELD_LEN, (
                f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")
Example #9
0
    def __init__(self, nlpdef: NlpDefinition, cfg_input_name: str) -> None:
        """
        Read config from a configparser section, and also associate with a
        specific NLP definition.

        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`,
                the master NLP definition, referring to the master config file
                etc.
            cfg_input_name:
                config section name for the input field definition
        """
        self.name = cfg_input_name
        # Config section name: the INPUT prefix plus the given name.
        cfg = nlpdef.get_config_section(
            full_sectionname(NlpConfigPrefixes.INPUT, cfg_input_name))

        self._nlpdef = nlpdef

        # Source database, table, primary-key field and text field.
        self._srcdb = cfg.opt_str(InputFieldConfigKeys.SRCDB)
        self._srctable = cfg.opt_str(InputFieldConfigKeys.SRCTABLE)
        self._srcpkfield = cfg.opt_str(InputFieldConfigKeys.SRCPKFIELD)
        self._srcfield = cfg.opt_str(InputFieldConfigKeys.SRCFIELD)
        self._srcdatetimefield = cfg.opt_str(
            InputFieldConfigKeys.SRCDATETIMEFIELD, required=False)
        # ... new in v0.18.52
        # Make these case-sensitive to avoid our failure in renaming SQLA
        # Column objects to be lower-case:
        self._copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.COPYFIELDS)  # fieldnames
        self._indexed_copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.INDEXED_COPYFIELDS)
        # Row limit for debugging (0 presumably meaning "no limit" -- confirm
        # against the consumer of this attribute).
        self._debug_row_limit = cfg.opt_int(
            InputFieldConfigKeys.DEBUG_ROW_LIMIT, default=0)
        # self._fetch_sorted = opt_bool('fetch_sorted', default=True)

        # Validate SQL identifiers.
        ensure_valid_table_name(self._srctable)
        ensure_valid_field_name(self._srcpkfield)
        ensure_valid_field_name(self._srcfield)
        if self._srcdatetimefield:
            ensure_valid_field_name(self._srcdatetimefield)

        # Duplicates within either list are configuration errors.
        if len(set(self._indexed_copyfields)) != len(self._indexed_copyfields):
            raise ValueError(
                f"Redundant indexed_copyfields: {self._indexed_copyfields}")

        if len(set(self._copyfields)) != len(self._copyfields):
            raise ValueError(f"Redundant copyfields: {self._copyfields}")

        # Every indexed copy-field must also be copied.
        indexed_not_copied = set(self._indexed_copyfields) - set(
            self._copyfields)
        if indexed_not_copied:
            raise ValueError(f"Fields in index_copyfields but not in "
                             f"copyfields: {indexed_not_copied}")

        # allfields = [self._srcpkfield, self._srcfield] + self._copyfields
        # if len(allfields) != len(set(allfields)):
        #     raise ValueError(
        #         f"Field overlap in InputFieldConfig: {section}")
        # RE-THOUGHT: OK to copy source text fields etc. if desired.
        # It's fine in SQL to say SELECT a, a FROM mytable;
        self._db = nlpdef.get_database(self._srcdb)