Exemple #1
0
        def do_POST(self):
            """
            Handle an annotate request.

            Only ``/annotate`` (with or without a trailing slash) is served:
            the request body is parsed as a delimited protobuf ``Document``,
            handed to ``self.annotator.annotate`` and the resulting document
            is written back as ``application/x-protobuf``.

            Any other path, and any request whose ``Content-Length`` header is
            missing, malformed or negative, is answered with 400 Bad Request.
            """
            if not self.path.endswith("/"):
                self.path += "/"
            if self.path != "/annotate/":
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return

            # Read message. The body size is only known from the header, so a
            # missing or malformed Content-Length is rejected here instead of
            # crashing in int(None); a negative value would make read() block
            # until the peer closes the connection.
            try:
                length = int(self.headers.get('content-length'))
            except (TypeError, ValueError):
                length = -1
            if length < 0:
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return
            msg = self.rfile.read(length)

            # Do the annotation
            doc = Document()
            parseFromDelimitedString(doc, msg)
            self.annotator.annotate(doc)

            with io.BytesIO() as stream:
                writeToDelimitedString(doc, stream)
                msg = stream.getvalue()

            # write message
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-Type", "application/x-protobuf")
            self.send_header("Content-Length", str(len(msg)))
            self.end_headers()
            self.wfile.write(msg)
Exemple #2
0
    def annotate(self, text, annotators=None, properties=None):
        """Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list) annotators: annotator names to request; only consulted
            when ``properties`` is None, falling back to
            ``self.default_annotators``
        :param (dict) properties: properties that the server expects; when
            None they are built from a copy of ``self.default_properties``
        :return: the ``Document`` parsed from the response body
        """
        if properties is None:
            # Work on a copy so the per-call settings below never leak into
            # the defaults shared by every request of this client.
            properties = dict(self.default_properties)
            properties.update({
                'annotators':
                ','.join(annotators or self.default_annotators),
                'inputFormat':
                'text',
                'outputFormat':
                'serialized',
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })

        # If the text is already a UTF-8 byte string, encoding it again
        # raises an error, so only non-bytes text is encoded. Checking
        # against ``bytes`` (an alias of ``str`` on Python 2) keeps this
        # working on both Python 2 and Python 3.
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        r = self._request(text, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
        def do_POST(self):
            """
            Handle an annotate request.

            Only ``/annotate`` (with or without a trailing slash) is served:
            the request body is parsed as a delimited protobuf ``Document``,
            handed to ``self.annotator.annotate`` and the resulting document
            is written back as ``application/x-protobuf``.

            Any other path, and any request whose ``Content-Length`` header is
            missing, malformed or negative, is answered with 400 Bad Request.
            """
            if not self.path.endswith("/"):
                self.path += "/"
            if self.path != "/annotate/":
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return

            # Read message. The body size is only known from the header, so a
            # missing or malformed Content-Length is rejected here instead of
            # crashing in int(None); a negative value would make read() block
            # until the peer closes the connection.
            try:
                length = int(self.headers.get('content-length'))
            except (TypeError, ValueError):
                length = -1
            if length < 0:
                self.send_response(HTTPStatus.BAD_REQUEST)
                self.end_headers()
                return
            msg = self.rfile.read(length)

            # Do the annotation
            doc = Document()
            parseFromDelimitedString(doc, msg)
            self.annotator.annotate(doc)

            with io.BytesIO() as stream:
                writeToDelimitedString(doc, stream)
                msg = stream.getvalue()

            # write message
            self.send_response(HTTPStatus.OK)
            self.send_header("Content-Type", "application/x-protobuf")
            self.send_header("Content-Length", str(len(msg)))
            self.end_headers()
            self.wfile.write(msg)
def test_write_protobuf(doc_pb):
    """Check that a document survives a write/parse round trip unchanged."""
    out = writeToDelimitedString(doc_pb)
    serialized = out.getvalue()
    out.close()

    # Parse the serialized bytes into a fresh document and compare.
    round_tripped = Document()
    parseFromDelimitedString(round_tripped, serialized)
    assert doc_pb == round_tripped
def doc_pb():
    """Load ``data/test.dat`` (next to this file) and parse it into a Document."""
    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, 'data', 'test.dat'), 'rb') as handle:
        payload = handle.read()
    parsed = Document()
    parseFromDelimitedString(parsed, payload)
    return parsed
Exemple #6
0
    def annotate(self,
                 text,
                 annotators=None,
                 output_format=None,
                 properties=None,
                 date=None):
        """Send a request to the CoreNLP server.

        Neither ``self.default_properties`` nor a caller-supplied
        ``properties`` dict is modified; the request is built from a copy.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use, or an
            already comma-joined string; falls back to
            ``self.default_annotators``
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (dict) properties: properties that the server expects
        :param date: forwarded unchanged to ``self._request``
        :return: a ``Document`` for "serialized", the decoded JSON for "json",
            the response text for "text"/"conllu"/"conll"/"xml", and the raw
            response object for any other (or absent) output format
        """
        # A string is passed through as-is: joining it would insert a comma
        # between every character.
        annotator_names = annotators or self.default_annotators
        if not isinstance(annotator_names, str):
            annotator_names = ','.join(annotator_names)

        # set properties for server call (always on a private copy)
        if properties is None:
            properties = dict(self.default_properties)
            properties.update({
                'annotators':
                annotator_names,
                'inputFormat':
                'text',
                'outputFormat':
                self.default_output_format,
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        else:
            properties = dict(properties)
            if "annotators" not in properties:
                properties['annotators'] = annotator_names
        # if an output_format is specified, use that to override
        if output_format is not None:
            properties["outputFormat"] = output_format
        # make the request
        r = self._request(text.encode('utf-8'), properties, date)
        # customize what is returned based on outputFormat; caller-supplied
        # properties may not name one, in which case the raw response is
        # returned rather than raising KeyError.
        chosen_format = properties.get("outputFormat")
        if chosen_format == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        if chosen_format == "json":
            return r.json()
        if chosen_format in ("text", "conllu", "conll", "xml"):
            return r.text
        return r
    def update(self, doc, annotators=None, properties=None):
        """Send a serialized document to the server and return the parsed reply.

        :param doc: document written to the request body with
            ``writeToDelimitedString``
        :param (list) annotators: annotator names to request; only consulted
            when ``properties`` is None, falling back to
            ``self.default_annotators``
        :param (dict) properties: properties that the server expects; when
            None they are built from a copy of ``self.default_properties``
        :return: the ``Document`` parsed from the response body
        """
        if properties is None:
            # Copy first: updating self.default_properties in place would
            # leave these per-call settings in the shared defaults.
            properties = dict(self.default_properties)
            properties.update({
                'annotators': ','.join(annotators or self.default_annotators),
                'inputFormat': 'serialized',
                'outputFormat': 'serialized',
                'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
    def annotate(self, text, annotators=None, properties=None):
        """Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list) annotators: annotator names to request; only consulted
            when ``properties`` is None, falling back to
            ``self.default_annotators``
        :param (dict) properties: properties that the server expects; when
            None they are built from a copy of ``self.default_properties``
        :return: the ``Document`` parsed from the response body
        """
        if properties is None:
            # Copy first: updating self.default_properties in place would
            # leave these per-call settings in the shared defaults.
            properties = dict(self.default_properties)
            properties.update({
                'annotators': ','.join(annotators or self.default_annotators),
                'inputFormat': 'text',
                'outputFormat': 'serialized',
                'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        r = self._request(text.encode('utf-8'), properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
Exemple #9
0
    def update(self, doc, annotators=None, properties=None):
        """Send a serialized document to the server and return the parsed reply.

        :param doc: document written to the request body with
            ``writeToDelimitedString``
        :param (list) annotators: annotator names to request; only consulted
            when ``properties`` is None, falling back to
            ``self.default_annotators``
        :param (dict) properties: properties that the server expects; when
            None they are built from a copy of ``self.default_properties``
        :return: the ``Document`` parsed from the response body
        """
        if properties is None:
            # Copy first: updating self.default_properties in place would
            # leave these per-call settings in the shared defaults.
            properties = dict(self.default_properties)
            properties.update({
                'annotators':
                ','.join(annotators or self.default_annotators),
                'inputFormat':
                'serialized',
                'outputFormat':
                'serialized',
                'serializer':
                'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
Exemple #10
0
def run_corenlp(text, corenlp_ips):
    """Annotate ``text`` with a CoreNLP server reached over its REST API.

    One URL is picked at random from ``corenlp_ips`` and the UTF-8 encoded
    text is POSTed to it. On a 200 response the protobuf payload is decoded
    and handed to ``process_text`` together with the original text.

    :param text: text to annotate
    :param corenlp_ips: sequence of server URLs to choose from
    :return: the ``(annotated_text, chars)`` pair from ``process_text``, or
        ``(None, None)`` on a non-200 status or when any exception is raised
    """
    payload = text.encode('utf-8')

    try:
        query = dict(
            properties='{"annotators":"tokenize,ssplit,pos,lemma,ner,parse,coref", "tokenize.whitespace": "true"}',
            outputFormat='serialized',
            serializer="edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer",
        )

        server = random.choice(corenlp_ips)
        response = requests.post(
            server,
            params=query,
            data=payload,
            headers={'Content-type': 'text/plain'},
        )
        print('\tdone.')

        # Anything other than 200 is reported and treated as a failure.
        if response.status_code != 200:
            print("CoreNLP Error, status code:{}".format(response.status_code))
            return None, None

        # Decode protobuf object
        parsed = Document()
        parseFromDelimitedString(parsed, response.content)

        print('Processing annotations...')
        annotated_text, chars = process_text(parsed, text)
        print('\tdone.')

        return annotated_text, chars

    except Exception:
        # Best effort: report the failure and signal it with (None, None).
        print(traceback.format_exc())
        print('Server not working')
        return None, None
Exemple #11
0
def from_bytes(bytes_):
    """Return a new ``Document`` parsed from the delimited bytes ``bytes_``."""
    parsed = Document()
    parseFromDelimitedString(parsed, bytes_)
    return parsed
Exemple #12
0
 def _protobuf(response):
     """Return a ``corenlp_protobuf.Document`` parsed from ``response.content``."""
     parsed = corenlp_protobuf.Document()
     corenlp_protobuf.parseFromDelimitedString(parsed, response.content)
     return parsed
Exemple #13
0
def from_bytes(protobuf):
    """Return a new ``core.Document`` parsed from the delimited bytes ``protobuf``."""
    parsed = core.Document()
    core.parseFromDelimitedString(parsed, protobuf)
    return parsed