Example #1
    def __init__(
        self,
        url=None,
        auth_token=None,
        lang="en",
        ann_type="Mention",
        task="tag",  # or spot
        out_annset="",
        min_delay_ms=501,
        tweet=False,
        include_all_spots=False,
        long_text=None,
        epsilon=None,
        link_pattern="https://{0}.wikipedia.org/wiki/{1}",
    ):
        """
        Create a TagMeAnnotator.

        Args:
            lang: the language of the text, one of 'de', 'en' (default), 'it'
            ann_type: the annotation type for the new annotations, default is "Mention"
            auth_token: the authentication token needed to use the service
            url: the annotation service endpoint; if None, the default endpoint for the task (spot or tag) is used
            task: one of "spot" (only find mentions) or "tag" (find mentions and link), default is "tag"
            out_annset: the annotation set to put the new annotations in
            min_delay_ms: minimum time in ms to wait between requests to the server
            tweet: if True, TagMe expects a Tweet (default is False)
            include_all_spots: if True, include spots that cannot be linked (default is False)
            long_text: if not None, the context length to use (default: None)
            epsilon: if not None, the epsilon value (float) to use (default: None)
            link_pattern: the URL pattern to use to turn the "title" returned from TagMe into an actual link. The
               default is "https://{0}.wikipedia.org/wiki/{1}" where {0} gets replaced with the language code and
               {1} gets replaced with the title.
        """
        if url is None:
            if task == "tag":
                url = "https://tagme.d4science.org/tagme/tag"
            elif task == "spot":
                url = "https://tagme.d4science.org/tagme/spot"
            else:
                raise Exception("task must be 'tag' or 'spot'")
        assert lang in ["en", "de", "it"]
        if long_text is not None:
            assert isinstance(long_text, int)
        if epsilon is not None:
            assert isinstance(epsilon, float)
        self.long_text = long_text
        self.epsilon = epsilon
        self.lang = lang
        self.auth_token = auth_token
        self.url = url
        self.tweet = tweet
        self.include_all_spots = include_all_spots
        self.out_annset = out_annset
        self.min_delay_s = min_delay_ms / 1000.0
        self.logger = init_logger()
        # self.logger.setLevel(logging.DEBUG)
        self._last_call_time = 0
        self.ann_type = ann_type
        self.link_pattern = link_pattern
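
# Usage sketch, not part of the original source: assuming TagMeAnnotator is importable from
# gatenlp.processing.client (import path assumed) and, like other gatenlp annotators, is
# applied to a Document as a callable. The auth token below is a placeholder.
from gatenlp import Document
from gatenlp.processing.client import TagMeAnnotator

tagme = TagMeAnnotator(
    auth_token="YOUR-TAGME-TOKEN",  # placeholder, obtained from the TagMe / D4Science service
    lang="en",
    task="tag",                     # find and link mentions; use "spot" to only find them
    out_annset="TagMe",
)
doc = Document("Barack Obama visited Paris.")
doc = tagme(doc)                    # adds "Mention" annotations to the "TagMe" annotation set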
Example #2
    def __init__(
        self,
        api_key=None,
        api_password=None,
        url=None,
        ann_types=None,
        map_types=None,
        out_annset="",
        min_delay_ms=501,
    ):
        """
        Create a GateCloudAnnotator.

        Args:
            api_key: API key needed to authenticate. Some services can be used in a limited way without
               authentication.
            api_password: API password needed to authenticate.
            url:  the URL of the annotation service endpoint, shown on the GATE Cloud page for the service
            ann_types: this can be used to let the service annotate fewer or more than the default list of annotation
               types. The default list and all possible annotations are shown on the GATE Cloud page for the service.
               Either a string with comma separated annotation types preceded by a colon (e.g. ":Person,:Location")
               or a python list with those type names (e.g. [":Person", ":Location"]). If the list contains type names
               without a leading colon, the colon is added.
            map_types: a dict which maps the annotation types from the service to arbitrary new annotation types,
               any type name not in the map will remain unchanged.
            out_annset: the annotation set in which to store the annotations
            min_delay_ms: minimum time in milliseconds between two subsequent requests to the server
        """
        self.api_key = api_key
        self.api_password = api_password
        self.url = url
        self.map_types = map_types
        self.min_delay_s = min_delay_ms / 1000.0
        self.out_annset = out_annset
        if ann_types:
            if isinstance(ann_types, str):
                self.ann_types = ann_types
            elif isinstance(ann_types, list):
                self.ann_types = ",".join([
                    at if at.startswith(":") else ":" + at for at in ann_types
                ])
            else:
                raise Exception(
                    "ann_types mist be a string of types like ':Person,:Location' or a list of types"
                )
        else:
            self.ann_types = None
        self.logger = init_logger()
        self.logger.setLevel(logging.DEBUG)
        self._last_call_time = 0
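
# Usage sketch, not part of the original source: assuming GateCloudAnnotator is importable from
# gatenlp.processing.client (import path assumed). URL and credentials are placeholders;
# ann_types may be a list or a comma-separated string of ":Type" names.
from gatenlp.processing.client import GateCloudAnnotator

annotator = GateCloudAnnotator(
    api_key="MY-API-KEY",              # placeholder credentials from the GATE Cloud service page
    api_password="MY-API-PASSWORD",
    url="https://cloud-api.gate.ac.uk/process/annie",   # placeholder endpoint URL
    ann_types=["Person", "Location"],  # leading colons are added automatically -> ":Person,:Location"
    out_annset="ANNIE",
)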
Example #3
    def __init__(
        self,
        url=None,  # use default
        auth_token=None,
        lang=None,  # if None/not specified, TextRazor auto-detects
        extractors=None,
        out_annset="",
        min_delay_ms=501,
    ):
        """
        Create a TextRazorTextAnnotator.

        Args:
            lang: if specified, override the auto-detected language of the text
            auth_token: the authentication token needed to use the service
            url: the annotation service endpoint; if None, the default endpoint https://api.textrazor.com is used
            extractors: a list of extractor names or a string with comma-separated extractor names to add to the
               minimum extractors (words, sentences). If None uses words, sentences, entities.
               NOTE: currently only words, sentences, and entities are supported.
            out_annset: the annotation set to put the new annotations in
            min_delay_ms: minimum time in ms to wait between requests to the server
        """
        if url is None:
            url = "https://api.textrazor.com"
        self.url = url
        self.lang = lang
        self.out_annset = out_annset
        self.auth_token = auth_token
        self.min_delay_s = min_delay_ms / 1000.0
        self.logger = init_logger()
        self.logger.setLevel(logging.DEBUG)
        self._last_call_time = 0
        if extractors is not None:
            if isinstance(extractors, str):
                extractors = extractors.split(",")
            if isinstance(extractors, list):
                allextrs = set()
                allextrs.update(extractors)
                allextrs.update(["words", "sentences"])
                self.extractors = ",".join(list(allextrs))
            else:
                raise Exception(
                    "Odd extractors, must be list of strings or string")
        else:
            self.extractors = "words,sentences,entities"
    def __init__(
        self, ignorefunc=None, mapfunc=None, matcherdata=None, defaultdata=None
    ):
        """
        Create a TokenMatcher.
        :param ignorefunc: a predicate that returns True for any token that should be ignored.
        :param mapfunc: a function that returns the string to use for each token.
        :param matcherdata: data to add to all matches in the matcherdata field
        :param defaultdata: data to add to matches when the entry data is None
        """
        # TODO: need to figure out how to handle word boundaries
        # TODO: need to figure out how to handle matching spaces vs. different spaces / no spaces!
        # self.nodes = defaultdict(Node)
        self.ignorefunc = ignorefunc
        self.mapfunc = mapfunc
        self.defaultdata = defaultdata
        self.matcherdata = matcherdata
        self._root = _Node()
        self.logger = init_logger(__name__)
        raise Exception("Not yet implemented")
    def __init__(self, *annotators, **kwargs):
        """
        Creates a pipeline annotator. Individual annotators can be added at a later time to the front or back
        using the add method.

        Note: each annotator can be assigned a name in a pipeline, either when using the add method or
        by passing a tuple (annotator, name) instead of just the annotator.

        Args:
            annotators: each parameter can be an annotator, a callable, a tuple where the first item is
                an annotator or callable and the second a string(name), or a list of these things.
                An annotator can be given as an instance or class, if it is a class, the kwargs are used
                to construct an instance. If no annotators are specified at construction, they can still
                be added later and incrementally using the `add` method.
            **kwargs: these arguments are passed to the constructor of any class in the annotators list
        """
        self.annotators = []
        self.names = []
        self.names2annotators = dict()
        self.logger = init_logger("Pipeline")
        for ann in annotators:
            if not isinstance(ann, list):
                anns = [ann]
            else:
                anns = ann
            for a in anns:
                if isinstance(a, tuple) and len(a) == 2:
                    a, name = a
                else:
                    name = f"{len(self.annotators)}"
                a = _check_and_ret_callable(a)
                if name in self.names2annotators:
                    raise Exception(f"Duplicate name: {name}")
                self.names2annotators[name] = a
                self.annotators.append(a)
                self.names.append(name)
        if len(self.annotators) == 0:
            self.logger.warning("Pipeline is a do-nothing pipeline: no annotators")
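
# Usage sketch, not part of the original source: assuming Pipeline is importable from
# gatenlp.processing.pipeline (import path assumed). Steps can be plain callables,
# (annotator, name) tuples, or classes that get instantiated with the extra kwargs.
from gatenlp.processing.pipeline import Pipeline

def add_length_feature(doc):
    # a plain callable acting as an annotator: store the document length as a feature
    doc.features["length"] = len(doc.text)
    return doc

pipe = Pipeline(add_length_feature, (add_length_feature, "length_again"))
# doc = pipe(doc)   # would run the steps in order on a gatenlp Document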
CLASS_RE_PATTERN = _tmp_re_pattern.__class__
try:
    import regex

    _tmp_regex_pattern = regex.compile("x")
    CLASS_REGEX_PATTERN = _tmp_regex_pattern.__class__
except ImportError:
    # if the regex module is not available, make our code still work by introducing a dummy type
    class RegexPattern:
        pass

    CLASS_REGEX_PATTERN = RegexPattern
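
# Illustration only, not part of the original source: the point of the dummy RegexPattern type
# is that isinstance checks against CLASS_REGEX_PATTERN remain valid even when the optional
# regex package is missing (they simply never match in that case).
def _is_compiled_pattern(obj):
    return isinstance(obj, (CLASS_RE_PATTERN, CLASS_REGEX_PATTERN))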

from gatenlp.utils import init_logger

logger = init_logger(debug=True)

__pdoc__ = {
    "FeatureMatcher.__call__": True,
    "FeatureEqMatcher.__call__": True,
    "AnnMatcher.__call__": True,
}


class FeatureMatcher:
    """
    Callable that matches the given dictionary against features.

    This creates a callable that can be used to easily check if features match the
    features and feature constraint defined by the matcher. When a matcher is created,
    the argument names are used as feature names and the argument values are either
    def __init__(
        self,
        port=25333,
        start=True,
        java="java",
        host="127.0.0.1",
        gatehome=None,
        platform=None,
        auth_token=None,
        use_auth_token=True,
        log_actions=False,
        keep=False,
        debug=False,
    ):
        """
        Create an instance of the GateSlave and either start our own Java GATE process for it to use
        (start=True) or connect to an existing one (start=False).

        After the GateSlave instance has been created successfully, it is possible to:

        * Use one of the methods of the instance to perform operations on the Java side or exchange data

        * use GateSlave.slave to invoke methods from the PythonSlave class on the Java side

        * use GateSlave.jvm to directly construct objects or call instance or static methods

        NOTE: the GATE process must not output anything important/big to stderr because everything from
        stderr gets captured and used for communication between the Java and Python processes. At least
        part of the output to stderr may only be passed on after the GATE process has ended.

        Example:

            ```python
            gs = GateSlave()
            pipeline = gs.slave.loadPipelineFromFile("thePipeline.xgapp")
            doc = gs.slave.createDocument("Some document text")
            gs.slave.run4doc(pipeline,doc)
            pdoc = gs.gdoc2pdoc(doc)
            gs.slave.deleteResource(doc)
            # process the document pdoc ...
            ```

        port: port to use
        start: if True, try to start our own GATE process, otherwise expect an already started
           process at the host/port address
        java: path to the java binary to run or the java command to use from the PATH (for start=True)
        host: host an existing Java GATE process is running on (only relevant for start=False)
        gatehome: where GATE is installed (only relevant if start=True). If None, expects
               environment variable GATE_HOME to be set.
        platform: system platform we run on, one of Windows, Linux (also for MacOS) or Java
        auth_token: if None or "" and use_auth_token is True, generate a random token which
               is then accessible via the auth_token attribute, otherwise use the given auth token.
        use_auth_token: if False, do not use an auth token, otherwise either use the one specified
               via auth_token or generate a random one.
        log_actions: if the gate slave should log the actions it is doing
        keep: normally if gs.close() is called and we are not connected to the PythonSlaveRunner,
               the slave will be shut down. If this is True, the gs.close() method does not shut down
               the slave.
        debug: show debug messages (default: False)
        """
        self.logger = init_logger(__name__)

        from py4j.java_gateway import JavaGateway, GatewayParameters

        self.gatehome = gatehome
        self.port = port
        self.host = host
        self.start = start
        self.gatehome = gatehome
        self.platform = platform
        self.gateprocess = None
        self.gateway = None
        self.slave = None
        self.closed = False
        self.keep = keep
        self.debug = debug
        self.log_actions = log_actions
        if use_auth_token:
            if not auth_token:
                self.auth_token = secrets.token_urlsafe(20)
            else:
                self.auth_token = auth_token
        else:
            self.auth_token = ""
        if gatehome is None and start:
            gatehome = os.environ.get("GATE_HOME")
            if gatehome is None:
                raise Exception(
                    "Parameter gatehome is None and environment var GATE_HOME not set"
                )
            self.gatehome = gatehome
        if start:
            # make sure we find the jar we need
            # logger.info("DEBUG: file location: {}".format(__file__))
            jarloc = os.path.join(
                os.path.dirname(__file__),
                "_jars",
                f"gatetools-gatenlpslave-{JARVERSION}.jar",
            )
            if not os.path.exists(jarloc):
                raise Exception("Could not find jar, {} does not exist".format(jarloc))
            cmdandparms = [java, "-cp"]
            cpsep = classpath_sep(platform=platform)
            cmdandparms.append(
                jarloc + cpsep + gate_classpath(self.gatehome, platform=platform)
            )
            cmdandparms.append("gate.tools.gatenlpslave.GatenlpSlave")
            cmdandparms.append(str(port))
            cmdandparms.append(host)
            if log_actions:
                cmdandparms.append("1")
            else:
                cmdandparms.append("0")
            if keep:
                cmdandparms.append("1")
            else:
                cmdandparms.append("0")
            os.environ["GATENLP_SLAVE_TOKEN_" + str(self.port)] = self.auth_token
            cmd = " ".join(cmdandparms)
            self.logger.debug(f"Running command: {cmd}")
            subproc = subprocess.Popen(
                cmdandparms, stderr=subprocess.PIPE, bufsize=0, encoding="utf-8"
            )
            self.gateprocess = subproc
            while True:
                line = subproc.stderr.readline()
                if line == "":
                    break
                line = line.rstrip("\n\r")
                if line == "PythonSlaveRunner.java: server start OK":
                    break
                if line == "PythonSlaveRunner.java: server start NOT OK":
                    raise Exception("Could not start server, giving up")
                print(line, file=sys.stderr)
            atexit.register(self.close)
        self.gateway = JavaGateway(
            gateway_parameters=GatewayParameters(port=port, auth_token=self.auth_token)
        )
        self.jvm = self.gateway.jvm
        self.slave = self.gateway.entry_point
        self.gate_version = self.jvm.gate.Main.version
        self.gate_build = self.jvm.gate.Main.build
        self.slave_version = self.slave.pluginVersion()
        self.slave_build = self.slave.pluginBuild()
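
# Usage sketch, not part of the original source: connecting to an already running Java GATE
# slave process instead of starting one (start=False); host, port and auth token are
# placeholders and must match what the running process was started with.
gs = GateSlave(start=False, host="127.0.0.1", port=25333, auth_token="THE-SHARED-TOKEN")
print(gs.gate_version)                 # version of GATE on the Java side
gdoc = gs.slave.createDocument("Some document text")
pdoc = gs.gdoc2pdoc(gdoc)              # convert the Java GATE document to a gatenlp Document
gs.slave.deleteResource(gdoc)
gs.close()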
def start_gate_slave(
    port=25333,
    host="127.0.0.1",
    auth_token=None,
    use_auth_token=True,
    java="java",
    platform=None,
    gatehome=None,
    log_actions=False,
    keep=False,
    debug=False,
):
    """
    Run the gate slave program. This starts the Java program included with gatenlp to
    run GATE and execute the gate slave within GATE so that Python can connect to it.

    Args:
      port:  (Default value = 25333) Port number to use
      host:  (Default value = "127.0.0.1") Host address to bind to
      auth_token:  (Default value = None)  Authorization token to use. If None, creates a random token.
      use_auth_token:  (Default value = True) If False, do not use an authorization token at all.
         This allows anyone who can connect to the host address to connect and use the gate slave process.
      java:  (Default value = "java") Java command (if on the binary path) or full path to the binary
         to use for running the gate slave program.
      platform:  (Default value = None) "win"/"windows" for Windows, anything else for non-Windows.
         If None, tries to determine automatically.
      gatehome:  (Default value = None) The path to where GATE is installed. If None, the environment
         variable "GATE_HOME" is used.
      log_actions:  (Default value = False) If True, the GATE Slave process will log everything it is
         ordered to do.
      keep:  (Default value = False) passed on to the gate slave process and tells the process if it should
         report to the Python process using it whether it can be closed or not.
      debug:  (Default value = False) Show debug messages.
    """
    logger = init_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)

    if gatehome is None:
        gatehome = os.environ.get("GATE_HOME")
        if gatehome is None:
            raise Exception(
                "Parameter gatehome is None and environment var GATE_HOME not set"
            )
    if use_auth_token:
        if not auth_token:
            auth_token = secrets.token_urlsafe(20)
        else:
            auth_token = auth_token
    else:
        auth_token = ""
    if log_actions:
        log_actions = "1"
    else:
        log_actions = "0"
    if keep:
        keep = "1"
    else:
        keep = "0"
    logger.debug(
        f"Starting gate slave, gatehome={gatehome}, auth_token={auth_token}, log_actions={log_actions}, keep={keep}"
    )
    jarloc = os.path.join(
        os.path.dirname(__file__), "_jars", f"gatetools-gatenlpslave-{JARVERSION}.jar"
    )
    if not os.path.exists(jarloc):
        raise Exception("Could not find jar, {} does not exist".format(jarloc))
    logger.debug(f"Using JAR: {jarloc}")
    cmdandparms = [java, "-cp"]
    cpsep = classpath_sep(platform=platform)
    cmdandparms.append(jarloc + cpsep + gate_classpath(gatehome, platform=platform))
    cmdandparms.append("gate.tools.gatenlpslave.GatenlpSlave")
    cmdandparms.append(str(port))
    cmdandparms.append(host)
    cmdandparms.append(log_actions)
    cmdandparms.append(keep)
    os.environ["GATENLP_SLAVE_TOKEN_" + str(port)] = auth_token
    cmd = " ".join(cmdandparms)
    logger.debug(f"Running command: {cmd}")
    subproc = subprocess.Popen(
        cmdandparms, stderr=subprocess.PIPE, bufsize=0, encoding="utf-8"
    )

    def shutdown():
        """
        Handler that gets invoked when the calling Python program exits.
        This terminates the gate slave by sending the SIGINT signal to it.
        """
        subproc.send_signal(signal.SIGINT)
        for line in subproc.stderr:
            print(line, file=sys.stderr, end="")

    atexit.register(shutdown)
    while True:
        line = subproc.stderr.readline()
        if line == "":
            break
        line = line.rstrip("\n\r")
        if line == "PythonSlaveRunner.java: server start OK":
            break
        if line == "PythonSlaveRunner.java: server start NOT OK":
            raise Exception("Could not start server, giving up")
        print(line, file=sys.stderr)
    try:
        subproc.wait()
    except KeyboardInterrupt:
        print("Received keyboard interrupt, shutting down server...")
        shutdown()
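
# Usage sketch, not part of the original source: start_gate_slave blocks until the Java process
# ends (or Ctrl-C is pressed), so it is typically run from its own small script or terminal.
# GATE_HOME must be set in the environment unless gatehome= is passed explicitly.
if __name__ == "__main__":
    start_gate_slave(port=25333, host="127.0.0.1", log_actions=True)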
    def __init__(
        self,
        source,
        fmt="gate-def",
        source_sep="\t",
        source_encoding="UTF-8",
        cache_source=None,
        tokenizer=None,
        all=False,
        skip=True,
        outset="",
        outtype="Lookup",
        annset="",
        tokentype="Token",
        feature=None,
        septype=None,
        splittype=None,
        withintype=None,
        mapfunc=None,
        ignorefunc=None,
        getterfunc=None,
        listfeatures=None,
        listtype=None,
    ):
        """

        Args:
            source: where to load the gazetteer from. What is actually expected here depends on the fmt
              parameter.
            fmt: defines what is expected as the format and/or content of the source parameter. One of:
               *  "gate-def" (default): the path to a GATE-style "def" file.
                  See https://gate.ac.uk/userguide/chap:gazetteers
               * "gazlist": a list of tuples or lists where the first element of the tuple/list
                  is a list of strings and the second element is a dictionary containing the features to assign.
                  All entries in the list belong to the first gazetteer list which has list features as
                  specified with the listfeatures parameter and a list type as specified with the listtype parameter.
            source_sep: the field separator to use for some source formats (default: tab character)
            source_encoding: the encoding to use for some source formats (default: UTF-8)
            feature: the feature name to use to get the string for each token. If the feature does not exist, is None
              or is the empty string, the Token is completely ignored. If the feature name is None, use the document
              string covered by the token.
            all: return all matches, if False only return longest matches
            skip: skip forward over longest match (do not return contained/overlapping matches)
            annset: the set where the tokens to match should come from
            outset: the set where the new annotations are added
            outtype: the annotation type of the annotations to create, unless a type is given for the gazetteer
               entry or for the gazetteer list.
            tokentype: the annotation type of the token annotations
            septype: the annotation type of separator annotations (NOT YET USED/IMPLEMENTED!)
            splittype: the annotation type of any split annotations which will end any ongoing match
            withintype: only matches fully within annotations of this type will be made
            mapfunc: a function that maps the original string extracted for each token to the actual string to use.
            ignorefunc: a function which given the mapped token string decides if the token should be ignored
              (not added to the gazetteer list, not considered in the document when matching)
            getterfunc: a function which, given a token annotation, retrieves the string. If there is mapfunc, the
              retrieved string is then still run through the mapfunc. The getterfunc must accept the token and
              an optional document as parameters.
            listfeatures: a dictionary of features common to the whole list or None. If what gets loaded specifies
              its own list features, this parameter is ignored.
            listtype: the output annotation type to use for the list, ignored if the input format specifies this
              on its own. If the input does not specify this on its own and this is not None, then it takes
              precedence over outtype for the data loaded from source.

        """
        self.nodes = defaultdict(TokenGazetteerNode)
        self.mapfunc = mapfunc
        self.ignorefunc = ignorefunc
        self.feature = feature
        self.annset = annset
        self.tokentype = tokentype
        self.septype = septype
        self.splittype = splittype
        self.withintype = withintype
        self.outset = outset
        self.outtype = outtype
        self.all = all
        self.skip = skip
        self.tokenizer = tokenizer
        if getterfunc:
            self.getterfunc = getterfunc
        else:
            self.getterfunc = tokentext_getter
        self.listfeatures = []
        self.listtypes = []
        self.logger = init_logger(__name__)
        self.logger.setLevel(logging.DEBUG)
        self.append(source, fmt=fmt, listfeatures=listfeatures, listtype=listtype)
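
# Usage sketch, not part of the original source: assuming this constructor belongs to gatenlp's
# TokenGazetteer (class name and import path assumed). With fmt="gazlist" the source is a list
# of (token-string-list, feature-dict) entries, as described in the docstring above.
from gatenlp.processing.gazetteer import TokenGazetteer

entries = [
    (["New", "York"], {"entity": "city"}),
    (["United", "Nations"], {"entity": "org"}),
]
gaz = TokenGazetteer(
    entries,
    fmt="gazlist",
    annset="",              # set that contains the Token annotations to match against
    tokentype="Token",
    outset="Gazetteer",
    outtype="Lookup",
    listfeatures={"source": "demo-list"},
)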
    parser.add_argument("--infmt",
                        type=str,
                        default="bdocjs",
                        help="Format / extension of initial files")
    parser.add_argument("--fmt",
                        type=str,
                        default="bdocjs",
                        help="Format / extension of benchmark files")
    args = parser.parse_args(args)
    return args


if __name__ == "__main__":

    args = process_args()
    logger = init_logger("loadsave")
    run_start(logger, "loadsave")

    if not os.path.exists(args.indir):
        raise Exception("Does not exist: {}".format(args.indir))
    if not os.path.exists(args.outdir):
        raise Exception("Does not exist: {}".format(args.outdir))

    gen = Path(args.indir).rglob("*.bdocjs")

    total_readorig = 0
    total_save = 0
    total_read = 0
    newfiles = []
    for f in gen:
        relpath = str(f)
Example #11
def interact(args=None, annotator=None):
    """Starts and handles the interaction with a GATE python plugin process.
    This will get started by the GATE plugin if the interaction uses
    pipes, but can also be started separately for http/websockets.

    This MUST be called in the user's python file!
    The python file should also have one class or function decorated
    with the @GateNlpPr decorator to identify it as the
    processing resource to the system.

    Args:
      args:  (Default value = None) the parsed command line arguments; if None, they are
         obtained via get_arguments().
      annotator:  (Default value = None) if not None, use this annotator instead of the one
         registered with the @GateNlpPr decorator.
    """
    logger = init_logger(__name__)
    loglvls = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    # before we do anything we need to check if a PR has actually
    # been defined. If not, use our own default debugging PR
    if gatenlp.gate_python_plugin_pr is None and annotator is None:
        logger.warning(
            "No processing resource defined with @GateNlpPr decorator or passed to interact, using default do-nothing"
        )
        _pr_decorator(DefaultPr)
    if annotator is not None:
        pr = _pr_decorator(annotator)
    else:
        pr = gatenlp.gate_python_plugin_pr

    if args is None:
        args = get_arguments()
    if args.d:
        logger.setLevel(logging.DEBUG)
    if args.log_lvl:
        if args.log_lvl not in loglvls:
            raise Exception("Not a valid log level: {}".format(args.log_lvl))
        logger.setLevel(loglvls[args.log_lvl])

    if args.mode == "check":
        return

    logger.info("Using gatenlp version {}\n".format(gatenlp.__version__))

    logger.debug("Starting interaction args={}".format(args))
    if args.mode == "pipe":
        if args.format != "json":
            raise Exception(
                "For interaction mode pipe, only format=json is supported")
        for line in instream:
            try:
                request = json.loads(line)
            except Exception as ex:
                logger.error("Unable to load from JSON:\n{}".format(line))
                raise ex
            logger.debug("Got request object: {}".format(request))
            cmd = request.get("command", None)
            stop_requested = False
            ret = None
            try:
                if cmd == "execute":
                    doc = Document.from_dict(request.get("data"))
                    om = doc.to_offset_type(OFFSET_TYPE_PYTHON)
                    doc.changelog = ChangeLog()
                    pr.execute(doc)
                    # NOTE: for now we just discard what the method returns and always return
                    # the changelog instead!
                    chlog = doc.changelog
                    # if we got an offset mapper earlier, we had to convert, so we convert back to JAVA
                    if om:
                        # replace True is faster, and we do not need the ChangeLog any more!
                        chlog.fixup_changes(offset_mapper=om,
                                            offset_type=OFFSET_TYPE_JAVA,
                                            replace=True)
                    ret = doc.changelog.to_dict()
                    logger.debug("Returning CHANGELOG: {}".format(ret))
                elif cmd == "start":
                    parms = request.get("data")
                    pr.start(parms)
                elif cmd == "finish":
                    ret = pr.finish()
                elif cmd == "reduce":
                    results = request.get("data")
                    ret = pr.reduce(results)
                elif cmd == "stop":
                    stop_requested = True
                else:
                    raise Exception("Odd command received: {}".format(cmd))
                response = {
                    "data": ret,
                    "status": "ok",
                }
            except Exception as ex:
                error = repr(ex)
                tb_str = traceback.format_exception(etype=type(ex),
                                                    value=ex,
                                                    tb=ex.__traceback__)
                print("ERROR when running python code:", file=sys.stderr)
                for line in tb_str:
                    print(line, file=sys.stderr, end=""
                          )  # what we get from traceback already has new lines
                info = "".join(tb_str)
                # in case we want the actual stacktrace data as well:
                st = [(f.filename, f.lineno, f.name, f.line)
                      for f in traceback.extract_tb(ex.__traceback__)]
                response = {
                    "data": None,
                    "status": "error",
                    "error": error,
                    "info": info,
                    "stacktrace": st,
                }
            logger.debug("Sending back response: {}".format(response))
            print(json.dumps(response), file=ostream)

            ostream.flush()
            if stop_requested:
                break
        # TODO: do any cleanup/restoring needed
        logger.debug("Finishing interaction")
    elif args.mode == "http":
        raise Exception("Mode http not implemented yet")
    elif args.mode == "websockets":
        raise Exception("Mode websockets not implemented yet")
    elif args.mode in ["file", "dir"]:
        if not args.path:
            raise Exception("Mode file or dir but no --path specified")
        fileext = ".bdoc" + args.format
        if args.mode == "file" and not os.path.isfile(args.path):
            raise Exception("Mode file but path is not a file: {}".format(
                args.path))
        elif args.mode == "dir" and not os.path.isdir(args.path):
            raise Exception("Mode dir but path is not a directory: {}".format(
                args.path))
        if args.mode == "file":
            pr.start({})
            logger.info(f"Loading file {args.path}")
            doc = Document.load(args.path)
            pr.execute(doc)
            pr.finish()
            if args.out:
                logger.info(f"Saving file to {args.out}")
                doc.save(args.out)
            else:
                logger.info(f"Saving file to {args.path}")
                doc.save(args.path)
        else:
            import glob

            pr.start({})
            files = glob.glob(args.path + os.path.sep + "*" + fileext)
            for file in files:
                logger.info("Loading file {}".format(file))
                doc = Document.load(file)
                pr.execute(doc)
                if args.out:
                    tofile = os.path.join(args.out, os.path.basename(file))
                    logger.info("Saving to {}".format(tofile))
                    doc.save(tofile)
                else:
                    logger.info("Saving to {}".format(file))
                    doc.save(file)
            pr.finish()
    else:
        raise Exception("Not a valid mode: {}".format(args.mode))


if __name__ == "__main__":
    # we run this from the command line so we need to also first load the PR code from the python file
    args = get_arguments(from_main=True)
    logger = init_logger(__name__)
    import importlib.util

    spec = importlib.util.spec_from_file_location("gateapp", args.pythonfile)
    foo = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(foo)
    interact(args=args)
try:
    import sortedcontainers
except Exception as ex:
    import sys

    print("ERROR: required package sortedcontainers cannot be imported!",
          file=sys.stderr)
    print(
        "Please install it, using e.g. 'pip install -U sortedcontainers'",
        file=sys.stderr,
    )
    sys.exit(1)
# TODO: check version of sortedcontainers (we have 2.1.0)

from gatenlp.utils import init_logger

logger = init_logger("gatenlp")

# this attribute globally holds the processing resource last defined
# so it can be used for interacting with the GATE python plugin
from gatenlp.gate_interaction import _pr_decorator as GateNlpPr
from gatenlp.gate_interaction import interact
from gatenlp.annotation import Annotation
from gatenlp.document import Document
from gatenlp.annotation_set import AnnotationSet
from gatenlp.changelog import ChangeLog
from gatenlp.gateslave import GateSlave
from gatenlp.span import Span


def init_notebook():
    from gatenlp.serialization.default import HtmlAnnViewerSerializer
Example #14
    def __init__(
        self,
        url=None,
        service=None,
        auth=None,
        success_code=None,
        access_token=None,
        refresh_access=False,
        out_annset="",
        min_delay_ms=501,
        anntypes_map=None,
    ):
        """
        Create an ElgTextAnnotator.

        NOTE: error handling is not properly implemented yet since we do not know yet how exactly the various
        error conditions are represented in the result returned from the ELG services. For now, any error will
        throw an exception when `__call__` is invoked.

        NOTE: initialization can fail with an exception if success_code is specified and retrieving the
        authentication information fails.

        Args:
            url:  the annotation service URL to use. If not specified, the service parameter must be specified.
            service: the ELG service number or a tuple (servicenumber, domain). This requires the elg package.
                This may raise an exception. If successful, the url and service_meta attributes are set.
            auth: a pre-initialized ELG Authentication object. Requires the elg package. If not specified, the
                success_code or access_token parameter must be specified.
            success_code: the success code returned from the ELG web page for one of the URLs to obtain
                success codes. This will try to obtain the authentication information and store it in the
                `auth` attribute.  Requires the elg package.
                To obtain a success code, go to the ELG_SC_LIVE_URL_OPENID or ELG_SC_LIVE_URL_OFFLINE url
                and log in with your ELG user id; this will show the success code that can be copy-pasted.
            access_token: the access token for the ELG service. Only used if auth or success_code are not
                specified. The access token is probably only valid for a limited amount of time. No refresh
                will be done and once the access token is invalid, calling `__call__` will fail with an exception.
                The access token can be obtained using the elg package or copied from the "Code samples" tab
                on the web page for a service after logging in.
            refresh_access: if True, will try to refresh the access token if auth or success_code was specified and
                refreshing is possible. Ignored if only access_token was specified
            out_annset: the name of the annotation set where to create the annotations (default: "")
            min_delay_ms: the minimum delay time between requests in milliseconds (default: 501 ms)
            anntypes_map: a map for renaming the annotation type names from the service to the ones to use in
               the annotated document.
        """
        if [x is not None for x in [url, service]].count(True) != 1:
            raise Exception("Exactly one of service or url must be specified")
        if [x is not None
                for x in [auth, success_code, access_token]].count(True) != 1:
            raise Exception(
                "Exactly one of auth, success_code, or access_token must be specified"
            )
        self.access_token = access_token
        self.success_code = success_code
        self.auth = auth
        self.url = url
        self.service = service
        self.service_meta = None
        self.refresh_access = refresh_access
        # first check if we need to import the elg package
        import_elg = False
        if access_token:
            self.refresh_access = False
        if service is not None:
            import_elg = True
        if auth or success_code:
            import_elg = True
        if import_elg:
            try:
                from elg import Authentication
                from elg.utils import get_domain, get_metadatarecord
            except Exception as ex:
                raise Exception(
                    "For this gatenlp must be installed with extra elg or extra all, e.g. gatenlp[elg]",
                    ex)
        if service is not None:
            # update this to use the new method:
            # https://gitlab.com/european-language-grid/platform/python-client/-/issues/9
            if isinstance(service, tuple):
                service_id, domain = service
            else:
                service_id = service
                domain = get_domain("live")
            self.service_meta = get_metadatarecord(service_id, domain)
            # NOTE: there is also elg_execution_location for async requests!
            self.url = self.service_meta["service_info"][
                "elg_execution_location_sync"]
        if success_code is not None:
            self.auth = Authentication.from_success_code(success_code,
                                                         domain="live")
        if self.auth:
            self.access_token = self.auth.access_token
        self.min_delay_s = min_delay_ms / 1000.0
        self.anntypes_map = anntypes_map
        self.out_annset = out_annset
        self.logger = init_logger(__name__)
        # self.logger.setLevel(logging.DEBUG)
        self._last_call_time = 0
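
# Usage sketch, not part of the original source: assuming ElgTextAnnotator is importable from
# gatenlp.processing.client (import path assumed). This uses the simplest variant, a service URL
# plus a copy-pasted access token (no refresh); both values are placeholders.
from gatenlp.processing.client import ElgTextAnnotator

elg = ElgTextAnnotator(
    url="https://example.elg-service/process",  # placeholder synchronous execution URL
    access_token="COPIED-ACCESS-TOKEN",         # placeholder, only valid for a limited time
    out_annset="ELG",
    anntypes_map={"PER": "Person"},             # optional renaming of service annotation types
)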