def __init__(
    self,
    url=None,
    auth_token=None,
    lang="en",
    ann_type="Mention",
    task="tag",  # or spot
    out_annset="",
    min_delay_ms=501,
    tweet=False,
    include_all_spots=False,
    long_text=None,
    epsilon=None,
    link_pattern="https://{0}.wikipedia.org/wiki/{1}",
):
    """
    Create a TagMeAnnotator.

    Args:
        url: the annotation service endpoint. If None, the default endpoint for the
            chosen task ("tag" or "spot") is used.
        auth_token: the authentication token needed to use the service
        lang: the language of the text, one of 'de', 'en' (default), 'it'
        ann_type: the annotation type for the new annotations, default is "Mention"
        task: one of "spot" (only find mentions) or "tag" (find mentions and link),
            default is "tag". Only consulted when url is None.
        out_annset: the annotation set to put the new annotations in
        min_delay_ms: minimum time in ms to wait between requests to the server
        tweet: if True, TagMe expects a Tweet (default is False)
        include_all_spots: if True, include spots that cannot be linked (default is False)
        long_text: if not None, the context length to use, must be an int (default: None)
        epsilon: if not None, the epsilon value to use, must be a float (default: None)
        link_pattern: the URL pattern to use to turn the "title" returned from TagMe into
            an actual link. The default is "https://{0}.wikipedia.org/wiki/{1}" where
            {0} gets replaced with the language code and {1} gets replaced with the title.

    Raises:
        Exception: if url is None and task is neither "tag" nor "spot"
        AssertionError: if lang is not supported, long_text is not an int or
            epsilon is not a float
    """
    if url is None:
        # No explicit endpoint given: fall back to the default endpoint of the task.
        if not (task == "tag" or task == "spot"):
            raise Exception("task must be 'tag' or 'spot'")
        if task == "tag":
            url = "https://tagme.d4science.org/tagme/tag"
        else:
            url = "https://tagme.d4science.org/tagme/spot"

    # Parameter sanity checks, performed in the same order as the values are documented.
    assert lang in ("en", "de", "it")
    assert long_text is None or isinstance(long_text, int)
    assert epsilon is None or isinstance(epsilon, float)

    # Service/connection settings
    self.url = url
    self.auth_token = auth_token
    self.lang = lang
    self.min_delay_s = min_delay_ms / 1000.0  # kept in seconds internally

    # Request options
    self.tweet = tweet
    self.include_all_spots = include_all_spots
    self.long_text = long_text
    self.epsilon = epsilon

    # Output settings
    self.out_annset = out_annset
    self.ann_type = ann_type
    self.link_pattern = link_pattern

    self.logger = init_logger()
    # self.logger.setLevel(logging.DEBUG)
    self._last_call_time = 0
def __init__(
    self,
    api_key=None,
    api_password=None,
    url=None,
    ann_types=None,
    map_types=None,
    out_annset="",
    min_delay_ms=501,
):
    """
    Create a GateCloudAnnotator.

    Args:
        api_key: API key needed to authenticate. Some services can be used in a limited
            way without authentication.
        api_password: API password needed to authenticate.
        url: the URL of the annotation service endpoint, shown on the GATE Cloud page
            for the service
        ann_types: this can be used to let the service annotate fewer or more than the
            default list of annotation types. The default list and all possible
            annotations are shown on the GATE Cloud page for the service.
            Either a string with comma separated annotation types preceded by a colon
            (e.g. ":Person,:Location") or a python list or tuple with those type names
            (e.g. [":Person", ":Location"]). If the list contains type names without a
            leading colon, the colon is added. A string is used as is. Any falsy value
            (None, "", empty list) means "use the service default".
        map_types: a dict which maps the annotation types from the service to arbitrary
            new annotation types, any type name not in the map will remain unchanged.
        out_annset: the annotation set in which to store the annotations
        min_delay_ms: minimum time in milliseconds between two subsequent requests
            to the server

    Raises:
        Exception: if ann_types is neither a string nor a list/tuple of strings
    """
    self.api_key = api_key
    self.api_password = api_password
    self.url = url
    self.map_types = map_types
    self.min_delay_s = min_delay_ms / 1000.0  # kept in seconds internally
    self.out_annset = out_annset
    if not ann_types:
        self.ann_types = None
    elif isinstance(ann_types, str):
        # a string is expected to already be in the ":A,:B" form and is passed on unchanged
        self.ann_types = ann_types
    elif isinstance(ann_types, (list, tuple)):
        # normalize every name to carry the leading colon
        self.ann_types = ",".join(
            at if at.startswith(":") else ":" + at for at in ann_types
        )
    else:
        raise Exception(
            "ann_types must be a string of types like ':Person,:Location' or a list of types"
        )
    self.logger = init_logger()
    self.logger.setLevel(logging.DEBUG)
    self._last_call_time = 0
def __init__(
    self,
    url=None,  # use default
    auth_token=None,
    lang=None,  # if None/not specified, TextRazor auto-detects
    extractors=None,
    out_annset="",
    min_delay_ms=501,
):
    """
    Create a TextRazorTextAnnotator.

    Args:
        url: the annotation service endpoint, if None, the default endpoint
            https://api.textrazor.com is used
        auth_token: the authentication token needed to use the service
        lang: if specified, override the auto-detected language of the text
        extractors: a list of extractor names or a string with comma-separated extractor
            names to add to the minimum extractors (words, sentences). Whitespace around
            names is ignored, empty names are dropped and duplicates are removed while
            keeping the given order. If None uses words, sentences, entities.
            NOTE: currently only words, sentences, entities is supported!
        out_annset: the annotation set to put the new annotations in
        min_delay_ms: minimum time in ms to wait between requests to the server

    Raises:
        Exception: if extractors is neither None, a string nor a list
    """
    if url is None:
        url = "https://api.textrazor.com"
    self.url = url
    self.lang = lang
    self.out_annset = out_annset
    self.auth_token = auth_token
    self.min_delay_s = min_delay_ms / 1000.0  # kept in seconds internally
    self.logger = init_logger()
    self.logger.setLevel(logging.DEBUG)
    self._last_call_time = 0
    if extractors is None:
        self.extractors = "words,sentences,entities"
    else:
        if isinstance(extractors, str):
            extractors = extractors.split(",")
        if not isinstance(extractors, list):
            raise Exception("Odd extractors, must be list of strings or string")
        # A dict is used as an ordered set so the resulting request parameter is
        # deterministic (a plain set would give a different order from run to run).
        cleaned = (name.strip() for name in extractors)
        ordered = dict.fromkeys(name for name in cleaned if name)
        # the minimum extractors are always requested
        ordered.update(dict.fromkeys(["words", "sentences"]))
        self.extractors = ",".join(ordered)
def __init__(
    self, ignorefunc=None, mapfunc=None, matcherdata=None, defaultdata=None
):
    """
    Create a TokenMatcher.

    NOTE: the TokenMatcher is not implemented yet, the constructor always raises.

    :param ignorefunc: a predicate that returns True for any token that should be ignored.
    :param mapfunc: a function that returns the string to use for each token.
    :param matcherdata: data to add to all matches in the matcherdata field
    :param defaultdata: data to add to matches when the entry data is None
    :raises NotImplementedError: always (a subclass of Exception, so existing
        ``except Exception`` handlers keep working)
    """
    # TODO: need to figure out how to handle word boundaries
    # TODO: need to figure out how to handle matching spaces vs. different spaces / no spaces!
    # self.nodes = defaultdict(Node)
    self.ignorefunc = ignorefunc
    self.mapfunc = mapfunc
    self.defaultdata = defaultdata
    self.matcherdata = matcherdata
    self._root = _Node()
    self.logger = init_logger(__name__)
    raise NotImplementedError("Not yet implemented")
def __init__(self, *annotators, **kwargs):
    """
    Creates a pipeline annotator. Individual annotators can be added at a later time
    to the front or back using the add method.

    Note: each annotator can be assigned a name in a pipeline, either when using the
    add method or by passing a tuple (annotator, name) instead of just the annotator.
    Annotators without an explicit name get the current number of annotators in the
    pipeline (as a string) as their name.

    Args:
        annotators: each parameter can be an annotator, a callable, a tuple where the
            first item is an annotator or callable and the second a string (name),
            or a list of these things. An annotator can be given as an instance or
            class, if it is a class, the kwargs are used to construct an instance.
            If no annotators are specified at construction, they can still be added
            later and incrementally using the `add` method.
        **kwargs: these arguments are passed to the constructor of any class in the
            annotators list

    Raises:
        Exception: if the same name is used for more than one annotator
    """
    self.annotators = []
    self.names = []
    self.names2annotators = dict()
    self.logger = init_logger("Pipeline")
    for ann in annotators:
        # A list argument contributes each of its items, anything else is a single entry.
        anns = ann if isinstance(ann, list) else [ann]
        for a in anns:
            if isinstance(a, tuple) and len(a) == 2:
                a, name = a
            else:
                name = f"{len(self.annotators)}"
            # NOTE(review): kwargs are documented as being used to construct annotator
            # classes but are not forwarded here - confirm against the signature of
            # _check_and_ret_callable before changing this.
            a = _check_and_ret_callable(a)
            if name in self.names2annotators:
                raise Exception(f"Duplicate name: {name}")
            self.names2annotators[name] = a
            self.annotators.append(a)
            self.names.append(name)
    if not self.annotators:
        self.logger.warning("Pipeline is a do-nothing pipeline: no annotators")
CLASS_RE_PATTERN = _tmp_re_pattern.__class__ try: import regex _tmp_regex_pattern = regex.compile("x") CLASS_REGEX_PATTERN = _tmp_regex_pattern.__class__ except: # if the regex module is not available, make our code still work by introducing a dummy type class RegexPattern: pass CLASS_REGEX_PATTERN = RegexPattern from gatenlp.utils import init_logger logger = init_logger(debug=True) __pdoc__ = { "FeatureMatcher.__call__": True, "FeatureEqMatcher.__call__": True, "AnnMatcher.__call__": True, } class FeatureMatcher: """ Callable that matches the given dictionary against features. This creates a callable that can be used to easily check if features match the features and feature constraint defined by the matcher. When a matcher is created, the argument names are used as feature names and the argument values are either
def __init__(
    self,
    port=25333,
    start=True,
    java="java",
    host="127.0.0.1",
    gatehome=None,
    platform=None,
    auth_token=None,
    use_auth_token=True,
    log_actions=False,
    keep=False,
    debug=False,
):
    """
    Create an instance of the GateSlave and either start our own Java GATE process for it
    to use (start=True) or connect to an existing one (start=False).

    After the GateSlave instance has been created successfully, it is possible to:

    * Use one of the methods of the instance to perform operations on the Java side or
      exchange data
    * use GateSlave.slave to invoke methods from the PythonSlave class on the Java side
    * use GateSlave.jvm to directly construct objects or call instance or static methods

    NOTE: the GATE process must not output anything important/big to stderr because
    everything from stderr gets captured and used for communication between the Java and
    Python processes. At least part of the output to stderr may only be passed on after
    the GATE process has ended.

    Example:

    ```python
    gs = GateSlave()
    pipeline = gs.slave.loadPipelineFromFile("thePipeline.xgapp")
    doc = gs.slave.createDocument("Some document text")
    gs.slave.run4doc(pipeline,doc)
    pdoc = gs.gdoc2pdoc(doc)
    gs.slave.deleteResource(doc)
    # process the document pdoc ...
    ```

    Args:
        port: port to use
        start: if True, try to start our own GATE process, otherwise expect an already
            started process at the host/port address
        java: path to the java binary to run or the java command to use from the PATH
            (for start=True)
        host: for start=False, the host an existing Java GATE process is running on;
            for start=True the value is passed on to the started Java process.
        gatehome: where GATE is installed (only relevant if start=True). If None, expects
            environment variable GATE_HOME to be set.
        platform: system platform we run on, one of Windows, Linux (also for MacOs) or Java
        auth_token: if None or "" and use_auth_token is True, generate a random token which
            is then accessible via the auth_token attribute, otherwise use the given
            auth token.
        use_auth_token: if False, do not use an auth token, otherwise either use the one
            specified via auth_token or generate a random one.
        log_actions: if the gate slave should log the actions it is doing
        keep: normally if gs.close() is called and we are not connected to the
            PythonSlaveLr, the slave will be shut down. If this is True, the gs.close()
            method does not shut down the slave.
        debug: show debug messages (default: False)

    Raises:
        Exception: if start is True and GATE cannot be located (no gatehome and no
            GATE_HOME), the slave jar is missing, or the Java process reports that the
            server could not be started.
    """
    import logging

    self.logger = init_logger(__name__)
    if debug:
        # honour the debug flag the same way start_gate_slave does
        self.logger.setLevel(logging.DEBUG)
    from py4j.java_gateway import JavaGateway, GatewayParameters

    self.gatehome = gatehome
    self.port = port
    self.host = host
    self.start = start
    self.platform = platform
    self.gateprocess = None
    self.gateway = None
    self.slave = None
    self.closed = False
    self.keep = keep
    self.debug = debug
    self.log_actions = log_actions
    if use_auth_token:
        if not auth_token:
            self.auth_token = secrets.token_urlsafe(20)
        else:
            self.auth_token = auth_token
    else:
        self.auth_token = ""
    if gatehome is None and start:
        gatehome = os.environ.get("GATE_HOME")
        if gatehome is None:
            raise Exception(
                "Parameter gatehome is None and environment var GATE_HOME not set"
            )
        self.gatehome = gatehome
    if start:
        # make sure we find the jar we need
        # logger.info("DEBUG: file location: {}".format(__file__))
        jarloc = os.path.join(
            os.path.dirname(__file__),
            "_jars",
            f"gatetools-gatenlpslave-{JARVERSION}.jar",
        )
        if not os.path.exists(jarloc):
            raise Exception("Could not find jar, {} does not exist".format(jarloc))
        cmdandparms = [java, "-cp"]
        cpsep = classpath_sep(platform=platform)
        cmdandparms.append(
            jarloc + cpsep + gate_classpath(self.gatehome, platform=platform)
        )
        cmdandparms.append("gate.tools.gatenlpslave.GatenlpSlave")
        cmdandparms.append(str(port))
        cmdandparms.append(host)
        cmdandparms.append("1" if log_actions else "0")
        cmdandparms.append("1" if keep else "0")
        # the token is handed to the child process through its environment
        os.environ["GATENLP_SLAVE_TOKEN_" + str(self.port)] = self.auth_token
        cmd = " ".join(cmdandparms)
        self.logger.debug(f"Running command: {cmd}")
        subproc = subprocess.Popen(
            cmdandparms, stderr=subprocess.PIPE, bufsize=0, encoding="utf-8"
        )
        self.gateprocess = subproc
        # Relay the child's stderr until it reports whether the server came up.
        # NOTE(review): an empty read (EOF) also ends the loop without a start
        # confirmation; the connection attempt below is then expected to fail.
        while True:
            line = subproc.stderr.readline()
            if line == "":
                break
            line = line.rstrip("\n\r")
            if line == "PythonSlaveRunner.java: server start OK":
                break
            if line == "PythonSlaveRunner.java: server start NOT OK":
                raise Exception("Could not start server, giving up")
            print(line, file=sys.stderr)
        atexit.register(self.close)
    gateway_args = dict(port=port, auth_token=self.auth_token)
    if not start:
        # connect to the already running process on the requested host instead of
        # always using the py4j default address
        gateway_args["address"] = host
    self.gateway = JavaGateway(
        gateway_parameters=GatewayParameters(**gateway_args)
    )
    self.jvm = self.gateway.jvm
    self.slave = self.gateway.entry_point
    self.gate_version = self.jvm.gate.Main.version
    self.gate_build = self.jvm.gate.Main.build
    self.slave_version = self.slave.pluginVersion()
    self.slave_build = self.slave.pluginBuild()
def start_gate_slave(
    port=25333,
    host="127.0.0.1",
    auth_token=None,
    use_auth_token=True,
    java="java",
    platform=None,
    gatehome=None,
    log_actions=False,
    keep=False,
    debug=False,
):
    """
    Run the gate slave program. This starts the Java program included with gatenlp to
    run GATE and execute the gate slave within GATE so that Python can connect to it.
    The function blocks until the Java process ends or a keyboard interrupt is received.

    Args:
        port: (Default value = 25333) Port number to use
        host: (Default value = "127.0.0.1") Host address to bind to
        auth_token: (Default value = None) Authorization token to use. If None, creates
            a random token.
        use_auth_token: (Default value = True) If False, do not use an authorization token
            at all. This allows anyone who can connect to the host address to connect and
            use the gate slave process.
        java: (Default value = "java") Java command (if on the binary path) or full path
            to the binary to use for running the gate slave program.
        platform: (Default value = None) "win"/"windows" for Windows, anything else for
            non-Windows. If None, tries to determine automatically.
        gatehome: (Default value = None) The path to where GATE is installed. If None,
            the environment variable "GATE_HOME" is used.
        log_actions: (Default value = False) If True, the GATE Slave process will log
            everything it is ordered to do.
        keep: (Default value = False) passed on to the gate slave process and tells the
            process if it should report to the using Python process that it can be
            closed or not.
        debug: (Default value = False) Show debug messages.

    Raises:
        Exception: if GATE cannot be located, the slave jar does not exist, or the Java
            process reports that the server could not be started.
    """
    logger = init_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)

    if gatehome is None:
        gatehome = os.environ.get("GATE_HOME")
        if gatehome is None:
            raise Exception(
                "Parameter gatehome is None and environment var GATE_HOME not set"
            )

    if not use_auth_token:
        auth_token = ""
    elif not auth_token:
        auth_token = secrets.token_urlsafe(20)

    # the Java program expects its boolean options as "1"/"0" command line arguments
    log_actions = "1" if log_actions else "0"
    keep = "1" if keep else "0"
    logger.debug(
        f"Starting gate slave, gatehome={gatehome}, auth_token={auth_token}, log_actions={log_actions}, keep={keep}"
    )

    jarloc = os.path.join(
        os.path.dirname(__file__), "_jars", f"gatetools-gatenlpslave-{JARVERSION}.jar"
    )
    if not os.path.exists(jarloc):
        raise Exception("Could not find jar, {} does not exist".format(jarloc))
    logger.debug(f"Using JAR: {jarloc}")

    cpsep = classpath_sep(platform=platform)
    classpath = jarloc + cpsep + gate_classpath(gatehome, platform=platform)
    cmdandparms = [
        java,
        "-cp",
        classpath,
        "gate.tools.gatenlpslave.GatenlpSlave",
        str(port),
        host,
        log_actions,
        keep,
    ]
    # the token is handed to the child process through its environment
    os.environ["GATENLP_SLAVE_TOKEN_" + str(port)] = auth_token
    cmd = " ".join(cmdandparms)
    logger.debug(f"Running command: {cmd}")
    subproc = subprocess.Popen(
        cmdandparms, stderr=subprocess.PIPE, bufsize=0, encoding="utf-8"
    )

    def shutdown():
        """
        Exit handler: send SIGINT to the gate slave process and pass on whatever it
        still writes to its stderr.
        """
        subproc.send_signal(signal.SIGINT)
        for remaining in subproc.stderr:
            print(remaining, file=sys.stderr, end="")

    atexit.register(shutdown)

    # Relay the child's stderr until it reports the outcome of the server start
    # (an empty read means the stream has ended).
    for line in iter(subproc.stderr.readline, ""):
        line = line.rstrip("\n\r")
        if line == "PythonSlaveRunner.java: server start OK":
            break
        if line == "PythonSlaveRunner.java: server start NOT OK":
            raise Exception("Could not start server, giving up")
        print(line, file=sys.stderr)

    try:
        subproc.wait()
    except KeyboardInterrupt:
        print("Received keyboard interrupt, shutting down server...")
        shutdown()
def __init__(
    self,
    source,
    fmt="gate-def",
    source_sep="\t",
    source_encoding="UTF-8",
    cache_source=None,
    tokenizer=None,
    all=False,
    skip=True,
    outset="",
    outtype="Lookup",
    annset="",
    tokentype="Token",
    feature=None,
    septype=None,
    splittype=None,
    withintype=None,
    mapfunc=None,
    ignorefunc=None,
    getterfunc=None,
    listfeatures=None,
    listtype=None,
):
    """
    Create a token gazetteer and load its initial entries from source.

    Args:
        source: where to load the gazetteer from. What is actually expected here depends
            on the fmt parameter.
        fmt: defines what is expected as the format and/or content of the source
            parameter. One of:

            * "gate-def" (default): the path to a GATE-style "def" file.
              See https://gate.ac.uk/userguide/chap:gazetteers
            * "gazlist": a list of tuples or lists where the first element of the
              tuple/list is a list of strings and the second element is a dictionary
              containing the features to assign. All entries in the list belong to the
              first gazetteer list which has list features as specified with the
              listfeatures parameter and a list type as specified with the listtype
              parameter.
        source_sep: the field separator to use for some source formats
            (default: tab character). NOTE(review): not used by the constructor itself.
        source_encoding: the encoding to use for some source formats (default: UTF-8).
            NOTE(review): not used by the constructor itself.
        cache_source: accepted but not used by the constructor.
        tokenizer: stored as the `tokenizer` attribute.
        all: return all matches, if False only return longest matches
        skip: skip forward over longest match (do not return contained/overlapping matches)
        outset: the set where the new annotations are added
        outtype: the annotation type of the annotations to create, unless a type is given
            for the gazetteer entry or for the gazetteer list.
        annset: the set where the tokens to match should come from
        tokentype: the annotation type of the token annotations
        feature: the feature name to use to get the string for each token. If the feature
            does not exist, is None or is the empty string, the Token is completely
            ignored. If the feature name is None, use the document string covered by
            the token.
        septype: the annotation type of separator annotations (NOT YET USED/IMPLEMENTED!)
        splittype: the annotation type of any split annotations which will end any
            ongoing match
        withintype: only matches fully within annotations of this type will be made
        mapfunc: a function that maps the original string extracted for each token to the
            actual string to use.
        ignorefunc: a function which given the mapped token string decides if the token
            should be ignored (not added to the gazetteer list, not considered in the
            document when matching)
        getterfunc: a function which, given a token annotation, retrieves the string.
            If there is mapfunc, the retrieved string is then still run through the
            mapfunc. The getterfunc must accept the token and an optional document as
            parameters. If not given, tokentext_getter is used.
        listfeatures: a dictionary of features common to the whole list or None. If what
            gets loaded specifies its own list features, this is getting ignored.
        listtype: the output annotation type to use for the list, ignored if the input
            format specifies this on its own. If the input does not specify this on its
            own and this is not None, then it takes precedence over outtype for the data
            loaded from source.
    """
    # Matching data structures, filled by append() below
    self.nodes = defaultdict(TokenGazetteerNode)
    self.listfeatures = []
    self.listtypes = []

    # Where tokens come from and how their strings are obtained
    self.annset = annset
    self.tokentype = tokentype
    self.feature = feature
    self.tokenizer = tokenizer
    self.getterfunc = getterfunc if getterfunc else tokentext_getter
    self.mapfunc = mapfunc
    self.ignorefunc = ignorefunc

    # Annotation types that influence matching
    self.septype = septype
    self.splittype = splittype
    self.withintype = withintype

    # Matching strategy and output
    self.all = all
    self.skip = skip
    self.outset = outset
    self.outtype = outtype

    self.logger = init_logger(__name__)
    self.logger.setLevel(logging.DEBUG)

    # Finally load the initial gazetteer entries.
    self.append(source, fmt=fmt, listfeatures=listfeatures, listtype=listtype)
parser.add_argument("--infmt", type=str, default="bdocjs", help="Format / extension of initial files") parser.add_argument("--fmt", type=str, default="bdocjs", help="Format / extension of benchmark files") args = parser.parse_args(args) return args if __name__ == "__main__": args = process_args() logger = init_logger("loadsave") run_start(logger, "loadsave") if not os.path.exists(args.indir): raise Exception("Does not exist: {}".format(args.indir)) if not os.path.exists(args.outdir): raise Exception("Does not exist: {}".format(args.outdir)) gen = Path(args.indir).rglob("*.bdocjs") total_readorig = 0 total_save = 0 total_read = 0 newfiles = [] for f in gen: relpath = str(f)
def interact(args=None, annotator=None):
    """
    Starts and handles the interaction with a GATE python plugin process.

    This will get started by the GATE plugin if the interaction uses pipes, but can also
    be started separately for http/websockets. This MUST be called in the user's python
    file! The python file should also have one class or function decorated with the
    @gatenlp.PR decorator to identify it as the processing resource to the system.

    Depending on args.mode this does one of the following:

    * "check": return immediately after the setup
    * "pipe": read one JSON request per line from `instream`, run the requested command
      ("execute", "start", "finish", "reduce" or "stop") on the processing resource and
      write one JSON response per request to `ostream`. Errors raised while running a
      command are reported back in the response instead of ending the loop.
    * "file" / "dir": run the processing resource on the document at args.path or on all
      matching documents in the directory args.path, saving to args.out if given,
      otherwise back to the original location
    * "http" / "websockets": not implemented, raises an exception

    Args:
        args: (Default value = None) the parsed arguments, if None they are obtained
            by calling get_arguments()
        annotator: (Default value = None) the processing resource to use. If None, the
            one registered in gatenlp.gate_python_plugin_pr is used, and if there is
            none either, a default do-nothing processing resource.

    Raises:
        Exception: for an invalid log level, an unsupported or invalid mode, an
            unsupported format in pipe mode, or a missing/invalid path in file/dir mode.
            A request line that cannot be parsed as JSON re-raises the parse error.
    """
    logger = init_logger(__name__)
    loglvls = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    # before we do anything we need to check if a PR has actually
    # been defined. If not, use our own default debugging PR
    if gatenlp.gate_python_plugin_pr is None and annotator is None:
        logger.warning(
            "No processing resource defined with @GateNlpPr decorator or passed to interact, using default do-nothing"
        )
        _pr_decorator(DefaultPr)
    if annotator is not None:
        pr = _pr_decorator(annotator)
    else:
        pr = gatenlp.gate_python_plugin_pr

    if args is None:
        args = get_arguments()
    if args.d:
        logger.setLevel(logging.DEBUG)
    if args.log_lvl:
        if args.log_lvl not in loglvls:
            raise Exception("Not a valid log level: {}".format(args.log_lvl))
        logger.setLevel(loglvls[args.log_lvl])

    if args.mode == "check":
        return

    logger.info("Using gatenlp version {}\n".format(gatenlp.__version__))
    logger.debug("Starting interaction args={}".format(args))
    if args.mode == "pipe":
        if args.format != "json":
            raise Exception(
                "For interaction mode pipe, only format=json is supported")
        for line in instream:
            try:
                request = json.loads(line)
            except Exception:
                logger.error("Unable to load from JSON:\n{}".format(line))
                raise
            logger.debug("Got request object: {}".format(request))
            cmd = request.get("command", None)
            stop_requested = False
            ret = None
            try:
                if cmd == "execute":
                    doc = Document.from_dict(request.get("data"))
                    om = doc.to_offset_type(OFFSET_TYPE_PYTHON)
                    doc.changelog = ChangeLog()
                    pr.execute(doc)
                    # NOTE: for now we just discard what the method returns and always return
                    # the changelog instead!
                    chlog = doc.changelog
                    # if we got an offset mapper earlier, we had to convert, so we convert back to JAVA
                    if om:
                        # replace True is faster, and we do not need the ChangeLog any more!
                        chlog.fixup_changes(offset_mapper=om,
                                            offset_type=OFFSET_TYPE_JAVA,
                                            replace=True)
                    ret = doc.changelog.to_dict()
                    logger.debug("Returning CHANGELOG: {}".format(ret))
                elif cmd == "start":
                    parms = request.get("data")
                    pr.start(parms)
                elif cmd == "finish":
                    ret = pr.finish()
                elif cmd == "reduce":
                    results = request.get("data")
                    ret = pr.reduce(results)
                elif cmd == "stop":
                    stop_requested = True
                else:
                    raise Exception("Odd command received: {}".format(cmd))
                response = {
                    "data": ret,
                    "status": "ok",
                }
            except Exception as ex:
                error = repr(ex)
                # Positional arguments: the "etype" keyword was removed in Python 3.10,
                # the positional form works on all versions.
                tb_str = traceback.format_exception(type(ex), ex, ex.__traceback__)
                print("ERROR when running python code:", file=sys.stderr)
                for tb_line in tb_str:
                    # what we get from traceback already has new lines
                    print(tb_line, file=sys.stderr, end="")
                info = "".join(tb_str)
                # in case we want the actual stacktrace data as well:
                st = [(f.filename, f.lineno, f.name, f.line)
                      for f in traceback.extract_tb(ex.__traceback__)]
                response = {
                    "data": None,
                    "status": "error",
                    "error": error,
                    "info": info,
                    "stacktrace": st,
                }
            logger.debug("Sending back response: {}".format(response))
            print(json.dumps(response), file=ostream)
            ostream.flush()
            if stop_requested:
                break
        # TODO: do any cleanup/restoring needed
        logger.debug("Finishing interaction")
    elif args.mode == "http":
        raise Exception("Mode http not implemented yet")
    elif args.mode == "websockets":
        raise Exception("Mode websockets not implemented yet")
    elif args.mode in ["file", "dir"]:
        if not args.path:
            raise Exception("Mode file or dir but no --path specified")
        fileext = ".bdoc" + args.format
        if args.mode == "file" and not os.path.isfile(args.path):
            raise Exception("Mode file but path is not a file: {}".format(
                args.path))
        elif args.mode == "dir" and not os.path.isdir(args.path):
            raise Exception("Mode dir but path is not a directory: {}".format(
                args.path))
        if args.mode == "file":
            pr.start({})
            logger.info(f"Loading file {args.path}")
            doc = Document.load(args.path)
            pr.execute(doc)
            pr.finish()
            if args.out:
                logger.info(f"Saving file to {args.out}")
                doc.save(args.out)
            else:
                logger.info(f"Saving file to {args.path}")
                doc.save(args.path)
        else:
            import glob
            pr.start({})
            files = glob.glob(args.path + os.path.sep + "*" + fileext)
            for file in files:
                logger.info("Loading file {}".format(file))
                doc = Document.load(file)
                pr.execute(doc)
                if args.out:
                    tofile = os.path.join(args.out, os.path.basename(file))
                    logger.info("Saving to {}".format(tofile))
                    doc.save(tofile)
                else:
                    logger.info("Saving to {}".format(file))
                    doc.save(file)
            pr.finish()
    else:
        raise Exception("Not a valid mode: {}".format(args.mode))
import glob pr.start({}) files = glob.glob(args.path + os.path.sep + "*" + fileext) for file in files: logger.info("Loading file {}".format(file)) doc = Document.load(file) pr.execute(doc) if args.out: tofile = os.path.join(args.out, os.path.basename(file)) logger.info("Saving to {}".format(tofile)) doc.save(tofile) else: logger.info("Saving to {}".format(file)) doc.save(file) pr.finish() else: raise Exception("Not a valid mode: {}".format(args.mode)) if __name__ == "__main__": # we run this from the command line so we need to also first load the PR code from the python file args = get_arguments(from_main=True) logger = init_logger(__name__) import importlib.util spec = importlib.util.spec_from_file_location("gateapp", args.pythonfile) foo = importlib.util.module_from_spec(spec) spec.loader.exec_module(foo) interact(args=args)
import sortedcontainers except Exception as ex: import sys print("ERROR: required package sortedcontainers cannot be imported!", file=sys.stderr) print( "Please install it, using e.g. 'pip install -U sortedcontainers'", file=sys.stderr, ) sys.exit(1) # TODO: check version of sortedcontainers (we have 2.1.0) from gatenlp.utils import init_logger logger = init_logger("gatenlp") # this attribute globally holds the processing resource last defined # so it can be used for interacting with the GATE python plugin from gatenlp.gate_interaction import _pr_decorator as GateNlpPr from gatenlp.gate_interaction import interact from gatenlp.annotation import Annotation from gatenlp.document import Document from gatenlp.annotation_set import AnnotationSet from gatenlp.changelog import ChangeLog from gatenlp.gateslave import GateSlave from gatenlp.span import Span def init_notebook(): from gatenlp.serialization.default import HtmlAnnViewerSerializer
def __init__(
    self,
    url=None,
    service=None,
    auth=None,
    success_code=None,
    access_token=None,
    refresh_access=False,
    out_annset="",
    min_delay_ms=501,
    anntypes_map=None,
):
    """
    Create an ElgTextAnnotator.

    NOTE: error handling is not properly implemented yet since we do not know yet how
    exactly the various error conditions are represented in the result returned from the
    ELG services. For now, any error will throw an exception when `__call__` is invoked.

    NOTE: initialization can fail with an exception if success_code is specified and
    retrieving the authentication information fails.

    Args:
        url: the annotation service URL to use. If not specified, the service parameter
            must be specified.
        service: the ELG service number or a tuple (servicenumber, domain). This requires
            the elg package. This may raise an exception. If successful, the url and
            service_meta attributes are set.
        auth: a pre-initialized ELG Authentication object. Requires the elg package. If
            not specified, the success_code or access_token parameter must be specified.
        success_code: the success code returned from the ELG web page for one of the URLs
            to obtain success codes. This will try to obtain the authentication
            information and store it in the `auth` attribute. Requires the elg package.
            To obtain a success code, go to the ELG_SC_LIVE_URL_OPENID or
            ELG_SC_LIVE_URL_OFFLINE url and log in with your ELG user id, this will show
            the success code that can be copy-pasted.
        access_token: the access token for the ELG service. Only used if auth or
            success_code are not specified. The access token is probably only valid for
            a limited amount of time. No refresh will be done and once the access token
            is invalid, calling `__call__` will fail with an exception. The access token
            can be obtained using the elg package or copied from the "Code samples" tab
            on the web page for a service after logging in.
        refresh_access: if True, will try to refresh the access token if auth or
            success_code was specified and refreshing is possible. Ignored if only
            access_token was specified
        out_annset: the name of the annotation set where to create the annotations
            (default: "")
        min_delay_ms: the minimum delay time between requests in milliseconds
            (default: 501 ms)
        anntypes_map: a map for renaming the annotation type names from the service to
            the ones to use in the annotated document.

    Raises:
        Exception: if not exactly one of url/service or not exactly one of
            auth/success_code/access_token is given, or if the elg package is needed
            but cannot be imported.
    """
    n_endpoints = sum(1 for x in (url, service) if x is not None)
    if n_endpoints != 1:
        raise Exception("Exactly one of service or url must be specified")
    n_credentials = sum(
        1 for x in (auth, success_code, access_token) if x is not None
    )
    if n_credentials != 1:
        raise Exception(
            "Exactly one of auth, success_code, or access_token must be specified"
        )

    self.access_token = access_token
    self.success_code = success_code
    self.auth = auth
    self.url = url
    self.service = service
    self.service_meta = None
    self.refresh_access = refresh_access
    if access_token:
        # a bare access token cannot be refreshed
        self.refresh_access = False

    # The elg package is only needed for service lookup and for Authentication objects.
    uses_service = service is not None
    uses_auth_object = bool(auth or success_code)
    if uses_service or uses_auth_object:
        try:
            from elg import Authentication
            from elg.utils import get_domain, get_metadatarecord
        except Exception as ex:
            raise Exception(
                "For this gatenlp must be installed with extra elg or extra all, e.g. gatenlp[elg]",
                ex)

    if uses_service:
        # update this to use the new method:
        # https://gitlab.com/european-language-grid/platform/python-client/-/issues/9
        if isinstance(service, tuple):
            service_id, domain = service
        else:
            service_id, domain = service, get_domain("live")
        self.service_meta = get_metadatarecord(service_id, domain)
        # NOTE: there is also elg_execution_location for async requests!
        service_info = self.service_meta["service_info"]
        self.url = service_info["elg_execution_location_sync"]

    if success_code is not None:
        self.auth = Authentication.from_success_code(success_code, domain="live")
    if self.auth:
        # whichever way the Authentication object was obtained, it provides the token
        self.access_token = self.auth.access_token

    self.min_delay_s = min_delay_ms / 1000.0  # kept in seconds internally
    self.anntypes_map = anntypes_map
    self.out_annset = out_annset
    self.logger = init_logger(__name__)
    # self.logger.setLevel(logging.DEBUG)
    self._last_call_time = 0