def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             commit: bool = False) -> None:
    """
    Read this processor's settings from the config and prepare (but do not
    launch) the external program.

    Args:
        nlpdef:
            the NLP definition; supplies config values, the config parser,
            the environment dictionary and the log tag
        cfgsection:
            name of the config section from which settings are read
        commit:
            passed straight through to the superclass constructor

    If both ``nlpdef`` and ``cfgsection`` are falsy, no config is read and
    fixed placeholder values are used instead (debugging only).

    Raises:
        ValueError:
            if ``outputtypemap`` does not hold an even number of items (it
            is read as annotation-type/output-section pairs)
        Exception:
            if an annotation type in ``outputtypemap`` is not lower case
        AssertionError:
            if an output table name exceeds ``MAX_SQL_FIELD_LEN``
    """
    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

    if not nlpdef and not cfgsection:
        # Debugging only
        self._max_external_prog_uses = 0
        self._input_terminator = 'input_terminator'
        self._output_terminator = 'output_terminator'
        typepairs = []  # type: List[str]
        self._progenvsection = ''
        progargs = ''
        logtag = ''
    else:
        self._max_external_prog_uses = nlpdef.opt_int(
            cfgsection, 'max_external_prog_uses', default=0)
        self._input_terminator = nlpdef.opt_str(
            cfgsection, 'input_terminator', required=True)
        self._output_terminator = nlpdef.opt_str(
            cfgsection, 'output_terminator', required=True)
        typepairs = nlpdef.opt_strlist(
            cfgsection, 'outputtypemap', required=True, lower=False)
        self._progenvsection = nlpdef.opt_str(cfgsection, 'progenvsection')
        progargs = nlpdef.opt_str(cfgsection, 'progargs', required=True)
        # Fall back to '.' so that the log tag is never empty for real use.
        logtag = nlpdef.get_logtag() or '.'

    # The map is a flat list consumed two items at a time. Check the pairing
    # up front so that a stray item gives a clear configuration error
    # instead of failing (or being lost) inside the loop below.
    if len(typepairs) % 2 != 0:
        raise ValueError(
            "Section {}: outputtypemap must contain pairs of (annotation "
            "type, output section), but has {} item(s)".format(
                cfgsection, len(typepairs)))

    self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
    self._type_to_tablename = {}  # type: Dict[str, str]
    for pair in chunks(typepairs, 2):
        annottype = pair[0]
        outputsection = pair[1]
        if annottype != annottype.lower():
            raise Exception(
                "Section {}: annotation types in outputtypemap must be in "
                "lower case: change {}".format(cfgsection, annottype))
        output_config = OutputUserConfig(nlpdef.get_parser(), outputsection)
        self._outputtypemap[annottype] = output_config
        self._type_to_tablename[annottype] = output_config.get_tablename()

    if self._progenvsection:
        self._env = nlpdef.get_env_dict(self._progenvsection, os.environ)
    else:
        self._env = os.environ.copy()
    self._env["NLPLOGTAG"] = logtag
    # ... which is never empty for real use (see the '.' fallback above),
    # because passing a "-lt" switch with no parameter will make
    # CrateGatePipeline.java complain and stop.

    # Placeholders in progargs are filled in from the environment dictionary
    # before the string is split into an argument list.
    formatted_progargs = progargs.format(**self._env)
    self._progargs = shlex.split(formatted_progargs)

    self._n_uses = 0
    self._pipe_encoding = 'utf8'
    self._p = None  # the subprocess
    self._started = False

    # Sanity checks. Raised explicitly rather than via "assert" so that the
    # check still runs under "python -O"; the exception type is unchanged.
    for tablename in self._type_to_tablename.values():
        if len(tablename) > MAX_SQL_FIELD_LEN:
            raise AssertionError(
                "Table name too long (max {} characters)".format(
                    MAX_SQL_FIELD_LEN))
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             commit: bool = False) -> None:
    """
    Read this processor's settings from the config and prepare (but do not
    launch) the external program.

    Args:
        nlpdef:
            a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        cfgsection:
            the name of a CRATE NLP config file section (from which we may
            choose to get extra config information)
        commit:
            force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.

    If both ``nlpdef`` and ``cfgsection`` are falsy, no config is read and
    fixed placeholder values are used instead (debugging only);
    ``self._debug_mode`` records which of the two paths was taken.

    Raises:
        ValueError:
            if the output type map does not hold an even number of items
            (it is read as annotation-type/output-section pairs)
        AssertionError:
            if an output table name exceeds ``MAX_SQL_FIELD_LEN``
    """
    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                     name="GATE")

    if not nlpdef and not cfgsection:
        # Debugging only
        self._debug_mode = True
        self._max_external_prog_uses = 0
        self._input_terminator = 'input_terminator'
        self._output_terminator = 'output_terminator'
        typepairs = []  # type: List[str]
        self._progenvsection = ''
        progargs = ''
        logtag = ''
    else:
        self._debug_mode = False
        self._max_external_prog_uses = nlpdef.opt_int(
            self._sectionname, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
            default=0)
        self._input_terminator = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.INPUT_TERMINATOR,
            required=True)
        self._output_terminator = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.OUTPUT_TERMINATOR,
            required=True)
        typepairs = nlpdef.opt_strlist(
            self._sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
            required=True, lower=False)
        self._progenvsection = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGENVSECTION)
        progargs = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGARGS,
            required=True)
        # Fall back to '.' so that the log tag is never empty for real use.
        logtag = nlpdef.get_logtag() or '.'

    # The map is a flat list consumed two items at a time. Check the pairing
    # up front so that a stray item gives a clear configuration error
    # instead of failing (or being lost) inside the loop below.
    if len(typepairs) % 2 != 0:
        raise ValueError(
            f"Section {self._sectionname}: the output type map must "
            f"contain pairs of (annotation type, output section), but has "
            f"{len(typepairs)} item(s)")

    self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
    self._type_to_tablename = {}  # type: Dict[str, str]
    for pair in chunks(typepairs, 2):
        # 2018-03-27: annotation types are treated as case-insensitive, so
        # convert to lower case here rather than rejecting anything the
        # user did not write in lower case.
        annottype = pair[0].lower()
        outputsection = pair[1]
        output_config = OutputUserConfig(nlpdef.get_parser(), outputsection)
        self._outputtypemap[annottype] = output_config
        self._type_to_tablename[annottype] = output_config.get_tablename()

    if self._progenvsection:
        self._env = nlpdef.get_env_dict(
            full_sectionname(NlpConfigPrefixes.ENV, self._progenvsection),
            os.environ)
    else:
        self._env = os.environ.copy()
    self._env["NLPLOGTAG"] = logtag
    # ... We have ensured that this is not empty for real use, because
    # passing a "-lt" switch with no parameter will make
    # CrateGatePipeline.java complain and stop. The environment variable
    # is read via the "progargs" config argument, as follows.
    formatted_progargs = progargs.format(**self._env)
    self._progargs = shlex.split(formatted_progargs)

    self._n_uses = 0
    self._pipe_encoding = 'utf8'
    self._p = None  # the subprocess
    self._started = False

    # Sanity checks. Raised explicitly rather than via "assert" so that the
    # check still runs under "python -O"; the exception type is unchanged.
    for tablename in self._type_to_tablename.values():
        if len(tablename) > MAX_SQL_FIELD_LEN:
            raise AssertionError(
                f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")