Ejemplo n.º 1
0
        def do_POST(self):
            """
            Handle an annotate request.

            A POST to ``/annotate`` (a trailing slash is optional) must carry a
            delimited protobuf ``Document`` in its body. The document is handed to
            ``self.annotator.annotate`` and the same object is then serialized
            back to the client as ``application/x-protobuf``.

            Status codes sent:
              * 200 (OK) with the serialized document on success
              * 411 (Length Required) when the Content-Length header is absent
              * 400 (Bad Request) when Content-Length is not a non-negative
                integer, or when the path is anything other than ``/annotate/``
            """
            # Normalise the path so "/annotate" and "/annotate/" are treated alike.
            if not self.path.endswith("/"):
                self.path += "/"

            if self.path != "/annotate/":
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return

            # Validate Content-Length before touching the body: a missing header
            # would otherwise crash on int(None), and a negative value would make
            # rfile.read() read until EOF.
            raw_length = self.headers.get('content-length')
            if raw_length is None:
                self.send_response(HTTPStatus.LENGTH_REQUIRED)
                self.end_headers()
                return
            try:
                length = int(raw_length)
            except (TypeError, ValueError):
                length = -1
            if length < 0:
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return

            # Read message
            msg = self.rfile.read(length)

            # Do the annotation; the return value of annotate() is not used,
            # the document object passed in is what gets serialized below.
            doc = Document()
            parseFromDelimitedString(doc, msg)
            self.annotator.annotate(doc)

            with io.BytesIO() as stream:
                writeToDelimitedString(doc, stream)
                msg = stream.getvalue()

            # Write message
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-Type", "application/x-protobuf")
            self.send_header("Content-Length", str(len(msg)))
            self.end_headers()
            self.wfile.write(msg)
Ejemplo n.º 2
0
    def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """
        Annotate one sentence with CoreNLP, reusing a cached result when available.

        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result
        """
        # The annotation depends on the request properties as well as on the text,
        # so both take part in the cache key.
        cache_key = get_dict_hash(
            {
                "sentence": sentence,
                "properties": properties
            }, shorten=False)

        if cache_key in self.cache:
            # Cache hit: the cache holds the delimited byte string, from which
            # the protobuf document is rebuilt.
            cached_bytes = self.cache[cache_key]
            parsed = Document()
            parseFromDelimitedString(parsed, cached_bytes)
            return parsed

        # Cache miss. We want to work with the protobuf-based document, but that
        # object is not pickle-able for the cache, so it is converted back into a
        # byte string and the bytes are what gets stored.
        request_props = {"outputFormat": "serialized"}
        if properties is not None:
            request_props.update(properties)
        parsed = self.client.annotate(sentence, properties=request_props)

        out = writeToDelimitedString(parsed)
        serialized = out.getvalue()
        out.close()
        self.cache[cache_key] = serialized

        return parsed
Ejemplo n.º 3
0
def test_write_protobuf(doc_pb):
    """Serialize the fixture document, parse the bytes back, and expect equality."""
    out = writeToDelimitedString(doc_pb)
    serialized = out.getvalue()
    out.close()

    # Rebuild a fresh document from the serialized bytes.
    restored = Document()
    parseFromDelimitedString(restored, serialized)
    assert doc_pb == restored
Ejemplo n.º 4
0
def doc_pb():
    """Return a Document parsed from ``data/test.dat`` located next to this file."""
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
    with open(data_path, 'rb') as handle:
        raw = handle.read()
    document = Document()
    parseFromDelimitedString(document, raw)
    return document
Ejemplo n.º 5
0
    def annotate(self,
                 text,
                 annotators=None,
                 output_format=None,
                 properties=None):
        """Send a request to the CoreNLP server.

        Neither ``self.default_properties`` nor a caller-supplied ``properties``
        dict is modified; the request is built from a copy.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use, or an
            already comma-separated string; falls back to
            ``self.default_annotators`` when empty. Ignored when ``properties``
            already contains an ``annotators`` entry.
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (dict) properties: properties that the server expects. When
            omitted, ``self.default_properties`` is used together with the
            text input format, ``self.default_output_format`` and the protobuf
            serializer.
        :return: request result: a ``Document`` for "serialized", the decoded
            JSON for "json", the response text for "text"/"conllu"/"conll"/"xml",
            and the raw response object otherwise (including when no
            ``outputFormat`` ended up in the request properties)
        """
        # set properties for server call, always working on a private copy
        use_defaults = properties is None
        request_properties = dict(
            self.default_properties if use_defaults else properties)
        if use_defaults or "annotators" not in request_properties:
            chosen = annotators or self.default_annotators
            # a plain string is taken as-is; joining it would split it into
            # single characters
            request_properties['annotators'] = (
                chosen if isinstance(chosen, str) else ','.join(chosen))
        if use_defaults:
            request_properties.update({
                'inputFormat':
                'text',
                'outputFormat':
                self.default_output_format,
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        # if an output_format is specified, use that to override
        if output_format is not None:
            request_properties["outputFormat"] = output_format
        # make the request
        r = self._request(text.encode('utf-8'), request_properties)
        # customize what is returned based on outputFormat
        result_format = request_properties.get("outputFormat")
        if result_format == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        if result_format == "json":
            return r.json()
        if result_format in ["text", "conllu", "conll", "xml"]:
            return r.text
        return r
Ejemplo n.º 6
0
    def update(self, doc, annotators=None, properties=None):
        """Send an already-parsed document to the server and return the result.

        ``self.default_properties`` is not modified; the request is built from
        a copy.

        :param doc: document to serialize with ``writeToDelimitedString`` and send
        :param (list | string) annotators: list of annotators to use, or an
            already comma-separated string; falls back to
            ``self.default_annotators`` when empty. Only used when
            ``properties`` is omitted.
        :param (dict) properties: properties that the server expects. When
            given, it is passed to the request unchanged. When omitted,
            ``self.default_properties`` is used with serialized input/output
            and the protobuf serializer.
        :return: a new ``Document`` parsed from the response content
        """
        if properties is None:
            chosen = annotators or self.default_annotators
            # copy so the shared defaults are not polluted by request fields
            properties = dict(self.default_properties)
            properties.update({
                'annotators':
                # a plain string is taken as-is; joining it would split it
                # into single characters
                chosen if isinstance(chosen, str) else ','.join(chosen),
                'inputFormat':
                'serialized',
                'outputFormat':
                'serialized',
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
Ejemplo n.º 7
0
    def update(self, doc, annotators=None, properties=None):
        """Send an already-parsed document to the server and return the result.

        A caller-supplied ``properties`` dict is not modified; the request is
        built from a copy.

        :param doc: document to serialize with ``writeToDelimitedString`` and send
        :param (list | string) annotators: list of annotators to use, or an
            already comma-separated string. When non-empty it overrides any
            ``annotators`` entry in ``properties``.
        :param (dict) properties: request properties. When omitted, serialized
            input/output and the protobuf serializer are requested.
        :return: a new ``Document`` parsed from the response content
        """
        if properties is None:
            properties = {
                'inputFormat':
                'serialized',
                'outputFormat':
                'serialized',
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            }
        else:
            # copy so that adding 'annotators' below does not leak into the
            # caller's dict
            properties = dict(properties)
        if annotators:
            properties['annotators'] = ",".join(annotators) if isinstance(
                annotators, list) else annotators
        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
Ejemplo n.º 8
0
    def annotate(self,
                 text,
                 annotators=None,
                 output_format=None,
                 properties_key=None,
                 properties=None,
                 **kwargs):
        """
        Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param kwargs: extra keyword arguments, forwarded unchanged to ``self._request``

        The properties for a request are written in this order:

        1. Server default properties (server side)
        2. Properties from client's properties_cache corresponding to properties_key (client side)
           If the properties_key is the name of a Stanford CoreNLP supported language:
           [Arabic, Chinese, English, French, German, Spanish], the Stanford CoreNLP defaults will be used (server side)
        3. Additional properties corresponding to properties (client side)
        4. Special case specific properties: annotators, output_format (client side)

        The entry stored in the client's properties_cache is copied before the
        per-request values are layered on top, so it is never modified.

        :return: request result: the decoded JSON for "json", a ``Document``
            for "serialized", the response text for "text"/"conllu"/"conll"/"xml",
            and the raw response object for any other output format
        """
        # set properties for server call
        # first look for a cached default properties set
        # if a Stanford CoreNLP supported language is specified, just pass {pipelineLanguage="french"}
        if properties_key is not None:
            if properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES:
                request_properties = {
                    'pipelineLanguage': properties_key.lower()
                }
            else:
                # copy: the updates below must not leak into the cached entry
                request_properties = dict(
                    self.properties_cache.get(properties_key, {}))
        else:
            request_properties = {}
        # add on custom properties for this request
        if properties is None:
            properties = {}
        request_properties.update(properties)
        # if annotators list is specified, override with that
        if annotators is not None:
            request_properties['annotators'] = ",".join(
                annotators) if isinstance(annotators, list) else annotators
        # always send an output format with request
        # in some scenarios the server's default output format is unknown, so default to serialized
        if output_format is not None:
            request_properties['outputFormat'] = output_format
        if request_properties.get('outputFormat') is None:
            if self.server_start_info.get('props', {}).get('outputFormat'):
                request_properties['outputFormat'] = self.server_start_info[
                    'props']['outputFormat']
            else:
                request_properties[
                    'outputFormat'] = CoreNLPClient.DEFAULT_OUTPUT_FORMAT
        # make the request
        r = self._request(text.encode('utf-8'), request_properties, **kwargs)
        if request_properties["outputFormat"] == "json":
            return r.json()
        elif request_properties["outputFormat"] == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        elif request_properties["outputFormat"] in [
                "text", "conllu", "conll", "xml"
        ]:
            return r.text
        else:
            return r