Ejemplo n.º 1
0
    def load(clazz, from_ext=None, ignore_unknown_types=False):
        """Load a GATE XML document and convert it into a Document.

        ``from_ext`` is passed to ``is_url``; if it is reported to be a URL the
        XML is fetched with ``get_str_from_url``, otherwise it is parsed from
        the location returned by ``is_url``. The document features, the text
        (reassembled from the ``TextWithNodes`` element) and all annotation
        sets are collected into a dict which is handed to
        ``Document.from_dict``.

        Args:
          clazz: not used by this implementation.
          from_ext: URL or file location of the GATE XML document
            (Default value = None)
          ignore_unknown_types: if True, a feature value with an unsupported
            serialization type is skipped and a warning is printed to stderr;
            if False an Exception is raised instead (Default value = False)

        Returns:
          The object returned by ``Document.from_dict`` for the parsed data.

        Raises:
          AssertionError: if the root element is not ``GateDocument`` or its
            attributes are not exactly ``{"version": "3"}``.
          Exception: if a feature name is not serialized as a
            ``java.lang.String``, or a feature value has an unsupported
            serialization type and ``ignore_unknown_types`` is False.
        """
        # TODO: the code below is just an outline and needs work!
        # TODO: make use of the test document created in repo project-python-gatenlp
        import xml.etree.ElementTree as ET

        isurl, extstr = is_url(from_ext)
        if isurl:
            xmlstring = get_str_from_url(extstr, encoding="utf-8")
            root = ET.fromstring(xmlstring)
        else:
            root = ET.parse(extstr).getroot()

        # Check that we do have a GATE document. Explicit checks are used
        # instead of ``assert`` statements so that the validation is not
        # stripped when Python runs with -O; AssertionError is kept so that
        # the exception type seen by callers does not change.
        if root.tag != "GateDocument":
            raise AssertionError(
                f"Not a GATE XML document, root element is: {root.tag}")
        if root.attrib != {"version": "3"}:
            raise AssertionError(
                f"Unsupported GATE XML root attributes: {root.attrib}")

        # Maps the Java class name of a serialized feature value to the
        # function that converts the element text into a Python value.
        # NOTE: gate.corpora.ObjectWrapper values are not supported (yet).
        converters = {
            "java.lang.String": lambda txt: txt,
            "java.lang.Integer": int,
            "java.lang.Long": int,
            "java.math.BigDecimal": float,
            # bool("false") would be True, so compare against the literal
            # text instead of relying on string truthiness.
            "java.lang.Boolean":
                lambda txt: (txt or "").strip().lower() == "true",
        }

        def parsefeatures(feats):
            """Convert ``Feature`` elements into a dict of name -> value.

            Args:
              feats: iterable of ``Feature`` elements, each with ``Name``
                and ``Value`` child elements.

            Returns:
              A dict with one entry per feature for which both the name and
              the converted value are not None.
            """
            features = {}
            for feat in feats:
                name = None
                value = None
                for el in feat:
                    cls_name = el.get("className")
                    if el.tag == "Name":
                        if cls_name != "java.lang.String":
                            # f-string: a missing className attribute must
                            # not turn this into a TypeError
                            raise Exception(
                                f"Odd Feature Name type: {cls_name}")
                        name = el.text
                    elif el.tag == "Value":
                        converter = converters.get(cls_name)
                        if converter is not None:
                            value = converter(el.text)
                        elif ignore_unknown_types:
                            print(
                                f"Warning: ignoring feature with serialization type: {cls_name}",
                                file=sys.stderr,
                            )
                        else:
                            raise Exception(
                                f"Unsupported serialization type: {cls_name}")
                if name is not None and value is not None:
                    features[name] = value
            return features

        # get the document features
        docfeatures = parsefeatures(
            root.findall("./GateDocumentFeatures/Feature"))

        # Rebuild the text from the TextWithNodes element and remember, for
        # every Node element, the offset in the text at which it occurs.
        chunks = []
        node2offset = {}
        curoff = 0
        for item in root.findall("./TextWithNodes"):
            if item.text:
                # TODO HTML unescape item text
                chunks.append(item.text)
                curoff += len(item.text)
            for node in item:
                node2offset[node.get("id")] = curoff
                if node.tail:
                    # TODO: unescape item.text?
                    chunks.append(node.tail)
                    curoff += len(node.tail)
        text = "".join(chunks)

        annotation_sets = {}  # map name - set
        for annset_el in root.findall("./AnnotationSet"):
            # a missing or empty Name attribute is stored under ""
            setname = annset_el.get("Name") or ""
            annotations = []
            maxannid = 0
            for ann in annset_el.findall("./Annotation"):
                annid = int(ann.attrib["Id"])
                maxannid = max(maxannid, annid)
                anntype = ann.attrib["Type"]
                startnode = ann.attrib["StartNode"]
                endnode = ann.attrib["EndNode"]
                startoff = node2offset[startnode]
                endoff = node2offset[endnode]
                features = parsefeatures(ann.findall("./Feature"))
                annotations.append({
                    "id": annid,
                    "type": anntype,
                    "start": startoff,
                    "end": endoff,
                    # an annotation without features gets None, not {}
                    "features": features or None,
                })
            annotation_sets[setname] = {
                "name": setname,
                "annotations": annotations,
                "next_annid": maxannid + 1,
            }

        docmap = {
            "text": text,
            "features": docfeatures,
            "offset_type": "p",
            "annotation_sets": annotation_sets,
        }
        return Document.from_dict(docmap)
Ejemplo n.º 2
0
def interact(args=None, annotator=None):
    """Starts and handles the interaction with a GATE python plugin process.
    This will get started by the GATE plugin if the interaction uses
    pipes, but can also be started separately for http/websockets.

    This MUST be called in the user's python file!
    The python file should also have one class or function decorated
    with the @gatenlp.PR  decorator to identify it as the
    processing resource to the system.

    Depending on ``args.mode`` this either returns immediately ("check"),
    answers JSON requests read line by line from ``instream`` and writes one
    JSON response per request to ``ostream`` ("pipe"), or loads, processes
    and saves a single document ("file") or all matching documents of a
    directory ("dir").

    Args:
      args: the parsed arguments; if None they are obtained from
        ``get_arguments()``. The attributes read here are ``d``,
        ``log_lvl``, ``mode``, ``format``, ``path`` and ``out``.
        (Default value = None)
      annotator: if not None, this is passed to ``_pr_decorator`` and the
        result is used as the processing resource instead of
        ``gatenlp.gate_python_plugin_pr``. (Default value = None)

    Returns:
      None

    Raises:
      Exception: for an invalid log level, an invalid or not yet
        implemented mode, a format other than json in pipe mode, or a
        missing/mismatching path in file or dir mode. A request line that
        cannot be parsed as JSON re-raises the parser's exception.
    """
    logger = init_logger(__name__)
    loglvls = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    # before we do anything we need to check if a PR has actually
    # been defined. If not, use our own default debugging PR
    if gatenlp.gate_python_plugin_pr is None and annotator is None:
        logger.warning(
            "No processing resource defined with @GateNlpPr decorator or passed to interact, using default do-nothing"
        )
        _pr_decorator(DefaultPr)
    if annotator is not None:
        pr = _pr_decorator(annotator)
    else:
        pr = gatenlp.gate_python_plugin_pr

    if args is None:
        args = get_arguments()
    if args.d:
        logger.setLevel(logging.DEBUG)
    # an explicit log level is applied last and so overrides the debug flag
    if args.log_lvl:
        if args.log_lvl not in loglvls:
            raise Exception("Not a valid log level: {}".format(args.log_lvl))
        logger.setLevel(loglvls[args.log_lvl])

    if args.mode == "check":
        return

    logger.info("Using gatenlp version {}\n".format(gatenlp.__version__))

    logger.debug("Starting interaction args=%s", args)
    if args.mode == "pipe":
        if args.format != "json":
            raise Exception(
                "For interaction mode pipe, only format=json is supported")
        for line in instream:
            try:
                request = json.loads(line)
            except Exception:
                logger.error("Unable to load from JSON:\n{}".format(line))
                raise
            # lazy %-formatting: requests/responses can be large, so only
            # build the message when debug logging is actually enabled
            logger.debug("Got request object: %s", request)
            cmd = request.get("command", None)
            stop_requested = False
            ret = None
            try:
                if cmd == "execute":
                    doc = Document.from_dict(request.get("data"))
                    om = doc.to_offset_type(OFFSET_TYPE_PYTHON)
                    doc.changelog = ChangeLog()
                    pr.execute(doc)
                    # NOTE: for now we just discard what the method returns and always return
                    # the changelog instead!
                    chlog = doc.changelog
                    # if we got an offset mapper earlier, we had to convert, so we convert back to JAVA
                    if om:
                        # replace True is faster, and we do not need the ChangeLog any more!
                        chlog.fixup_changes(offset_mapper=om,
                                            offset_type=OFFSET_TYPE_JAVA,
                                            replace=True)
                    ret = chlog.to_dict()
                    logger.debug("Returning CHANGELOG: %s", ret)
                elif cmd == "start":
                    pr.start(request.get("data"))
                elif cmd == "finish":
                    ret = pr.finish()
                elif cmd == "reduce":
                    ret = pr.reduce(request.get("data"))
                elif cmd == "stop":
                    stop_requested = True
                else:
                    raise Exception("Odd command received: {}".format(cmd))
                response = {
                    "data": ret,
                    "status": "ok",
                }
            except Exception as ex:
                # Any error while handling a request is reported back to the
                # other side as an error response instead of ending the loop.
                error = repr(ex)
                # Positional arguments: the ``etype`` keyword was removed in
                # Python 3.10, the positional form works on all versions.
                tb_lines = traceback.format_exception(type(ex), ex,
                                                      ex.__traceback__)
                print("ERROR when running python code:", file=sys.stderr)
                for tb_line in tb_lines:
                    # what we get from traceback already has new lines
                    print(tb_line, file=sys.stderr, end="")
                info = "".join(tb_lines)
                # in case we want the actual stacktrace data as well:
                st = [(f.filename, f.lineno, f.name, f.line)
                      for f in traceback.extract_tb(ex.__traceback__)]
                response = {
                    "data": None,
                    "status": "error",
                    "error": error,
                    "info": info,
                    "stacktrace": st,
                }
            logger.debug("Sending back response: %s", response)
            print(json.dumps(response), file=ostream)

            ostream.flush()
            if stop_requested:
                break
        # TODO: do any cleanup/restoring needed
        logger.debug("Finishing interaction")
    elif args.mode == "http":
        raise Exception("Mode http not implemented yet")
    elif args.mode == "websockets":
        raise Exception("Mode websockets not implemented yet")
    elif args.mode in ["file", "dir"]:
        if not args.path:
            raise Exception("Mode file or dir but no --path specified")
        fileext = ".bdoc" + args.format
        if args.mode == "file" and not os.path.isfile(args.path):
            raise Exception("Mode file but path is not a file: {}".format(
                args.path))
        elif args.mode == "dir" and not os.path.isdir(args.path):
            raise Exception("Mode dir but path is not a directory: {}".format(
                args.path))
        if args.mode == "file":
            pr.start({})
            logger.info(f"Loading file {args.path}")
            doc = Document.load(args.path)
            pr.execute(doc)
            pr.finish()
            # without --out the input file is overwritten
            target = args.out if args.out else args.path
            logger.info(f"Saving file to {target}")
            doc.save(target)
        else:
            import glob

            pr.start({})
            # only files in the directory itself with the expected extension
            for file in glob.glob(os.path.join(args.path, "*" + fileext)):
                logger.info("Loading file {}".format(file))
                doc = Document.load(file)
                pr.execute(doc)
                # without --out each input file is overwritten in place
                if args.out:
                    tofile = os.path.join(args.out, os.path.basename(file))
                else:
                    tofile = file
                logger.info("Saving to {}".format(tofile))
                doc.save(tofile)
            pr.finish()
    else:
        raise Exception("Not a valid mode: {}".format(args.mode))