Ejemplo n.º 1
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef=nlpdef,
                         cfgsection=cfgsection,
                         commit=commit,
                         name="MedEx")

        if nlpdef is None:  # only None for debugging!
            self._debug_mode = True
            self._tablename = self.classname().lower()
            self._max_external_prog_uses = 1
            self._progenvsection = ""
            self._env = {}  # type: Dict[str, str]
            progargs = ""
        else:
            self._debug_mode = False
            self._tablename = nlpdef.opt_str(self._sectionname,
                                             ProcessorConfigKeys.DESTTABLE,
                                             required=True)

            self._max_external_prog_uses = nlpdef.opt_int(
                self._sectionname,
                ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)

            self._progenvsection = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGENVSECTION)

            if self._progenvsection:
                self._env = nlpdef.get_env_dict(
                    full_sectionname(NlpConfigPrefixes.ENV,
                                     self._progenvsection), os.environ)
            else:
                self._env = os.environ.copy()
            self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'
            # ... because passing a "-lt" switch with no parameter will make
            # CrateGatePipeline.java complain and stop

            progargs = nlpdef.opt_str(self._sectionname,
                                      ProcessorConfigKeys.PROGARGS,
                                      required=True)

        if USE_TEMP_DIRS:
            self._inputdir = tempfile.TemporaryDirectory()
            self._outputdir = tempfile.TemporaryDirectory()
            self._workingdir = tempfile.TemporaryDirectory()
            # ... these are autodeleted when the object goes out of scope; see
            #     https://docs.python.org/3/library/tempfile.html
            # ... which manages it using weakref.finalize
        else:
            homedir = os.path.expanduser("~")
            self._inputdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "input"))
            mkdir_p(self._inputdir.name)
            self._outputdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "output"))
            mkdir_p(self._outputdir.name)
            self._workingdir = PseudoTempDir(
                os.path.join(homedir, "medextemp", "working"))
            mkdir_p(self._workingdir.name)

        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)
        self._progargs.extend([
            "-data_ready_signal",
            MEDEX_DATA_READY_SIGNAL,
            "-results_ready_signal",
            MEDEX_RESULTS_READY_SIGNAL,
            "-i",
            self._inputdir.name,
            "-o",
            self._outputdir.name,
        ])

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._file_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False
Ejemplo n.º 2
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        if not nlpdef and not cfgsection:
            # Debugging only
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''
        else:
            self._max_external_prog_uses = nlpdef.opt_int(
                cfgsection, 'max_external_prog_uses', default=0)
            self._input_terminator = nlpdef.opt_str(cfgsection,
                                                    'input_terminator',
                                                    required=True)
            self._output_terminator = nlpdef.opt_str(cfgsection,
                                                     'output_terminator',
                                                     required=True)
            typepairs = nlpdef.opt_strlist(cfgsection,
                                           'outputtypemap',
                                           required=True,
                                           lower=False)
            self._progenvsection = nlpdef.opt_str(cfgsection, 'progenvsection')
            progargs = nlpdef.opt_str(cfgsection, 'progargs', required=True)
            logtag = nlpdef.get_logtag() or '.'

        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for c in chunks(typepairs, 2):
            annottype = c[0]
            outputsection = c[1]
            if annottype != annottype.lower():
                raise Exception(
                    "Section {}: annotation types in outputtypemap must be in "
                    "lower case: change {}".format(cfgsection, annottype))
            # log.critical(outputsection)
            c = OutputUserConfig(nlpdef.get_parser(), outputsection)
            self._outputtypemap[annottype] = c
            self._type_to_tablename[annottype] = c.get_tablename()

        if self._progenvsection:
            self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
        else:
            self._env = os.environ.copy()
        self._env["NLPLOGTAG"] = logtag
        # ... because passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop

        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for ty, tn in self._type_to_tablename.items():
            assert len(tn) <= MAX_SQL_FIELD_LEN, (
                "Table name too long (max {} characters)".format(
                    MAX_SQL_FIELD_LEN))
Ejemplo n.º 3
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

        self._tablename = nlpdef.opt_str(cfgsection,
                                         'desttable',
                                         required=True)

        self._max_external_prog_uses = nlpdef.opt_int(cfgsection,
                                                      'max_external_prog_uses',
                                                      default=0)

        self._progenvsection = nlpdef.opt_str(cfgsection, 'progenvsection')
        if self._progenvsection:
            self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
        else:
            self._env = os.environ.copy()
        self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'
        # ... because passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop

        if USE_TEMP_DIRS:
            self._inputdir = tempfile.TemporaryDirectory()
            self._outputdir = tempfile.TemporaryDirectory()
            self._workingdir = tempfile.TemporaryDirectory()
            # ... these are autodeleted when the object goes out of scope; see
            #     https://docs.python.org/3/library/tempfile.html
            # ... which manages it using weakref.finalize
        else:
            homedir = os.path.expanduser("~")
            self._inputdir = AttrDict(
                name=os.path.join(homedir, "medextemp", "input"))
            mkdir_p(self._inputdir.name)
            self._outputdir = AttrDict(
                name=os.path.join(homedir, "medextemp", "output"))
            mkdir_p(self._outputdir.name)
            self._workingdir = AttrDict(
                name=os.path.join(homedir, "medextemp", "working"))
            mkdir_p(self._workingdir.name)

        progargs = nlpdef.opt_str(cfgsection, 'progargs', required=True)
        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)
        self._progargs.extend([
            "-data_ready_signal",
            MEDEX_DATA_READY_SIGNAL,
            "-results_ready_signal",
            MEDEX_RESULTS_READY_SIGNAL,
            "-i",
            self._inputdir.name,
            "-o",
            self._outputdir.name,
        ])

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._file_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False
Ejemplo n.º 4
0
    def __init__(self,
                 nlpdef: NlpDefinition,
                 cfgsection: str,
                 commit: bool = False) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfgsection:
                the name of a CRATE NLP config file section (from which we may
                choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                         name="GATE")

        if not nlpdef and not cfgsection:
            # Debugging only
            self._debug_mode = True
            self._max_external_prog_uses = 0
            self._input_terminator = 'input_terminator'
            self._output_terminator = 'output_terminator'
            typepairs = []  # type: List[str]
            self._progenvsection = ''
            progargs = ''
            logtag = ''
        else:
            self._debug_mode = False
            self._max_external_prog_uses = nlpdef.opt_int(
                self._sectionname, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
                default=0)
            self._input_terminator = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.INPUT_TERMINATOR,
                required=True)
            self._output_terminator = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.OUTPUT_TERMINATOR,
                required=True)
            typepairs = nlpdef.opt_strlist(
                self._sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
                required=True, lower=False)
            self._progenvsection = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGENVSECTION)
            progargs = nlpdef.opt_str(
                self._sectionname, ProcessorConfigKeys.PROGARGS,
                required=True)
            logtag = nlpdef.get_logtag() or '.'

        self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
        self._type_to_tablename = {}  # type: Dict[str, str]
        for c in chunks(typepairs, 2):
            annottype = c[0]
            outputsection = c[1]
            # 2018-03-27: not clear why we need to force the user to specify
            # in lower case! We just said it's case-insensitive. So ditch this:
            #
            # if annottype != annottype.lower():
            #     raise Exception(
            #         "Section {}: annotation types in outputtypemap must be in "  # noqa
            #         "lower case: change {}".format(cfgsection, annottype))
            #
            # and add this:
            annottype = annottype.lower()
            # log.critical(outputsection)
            c = OutputUserConfig(nlpdef.get_parser(), outputsection)
            self._outputtypemap[annottype] = c
            self._type_to_tablename[annottype] = c.get_tablename()

        if self._progenvsection:
            self._env = nlpdef.get_env_dict(
                full_sectionname(NlpConfigPrefixes.ENV,
                                 self._progenvsection),
                os.environ)
        else:
            self._env = os.environ.copy()
        self._env["NLPLOGTAG"] = logtag
        # ... We have ensured that this is not empty for real use, because
        # passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop. The environment variable
        # is read via the "progargs" config argument, as follows.

        formatted_progargs = progargs.format(**self._env)
        self._progargs = shlex.split(formatted_progargs)

        self._n_uses = 0
        self._pipe_encoding = 'utf8'
        self._p = None  # the subprocess
        self._started = False

        # Sanity checks
        for ty, tn in self._type_to_tablename.items():
            assert len(tn) <= MAX_SQL_FIELD_LEN, (
                f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")