def do_POST(self):
    """Handle an annotate request.

    Only ``/annotate/`` (with or without the trailing slash) is served.
    The request body is parsed into a ``Document`` with
    ``parseFromDelimitedString``, passed to ``self.annotator.annotate``,
    re-serialized with ``writeToDelimitedString`` and written back with
    content type ``application/x-protobuf``.

    Responses:
        200 -- annotation succeeded; body is the serialized document.
        400 -- unknown path, or a Content-Length that is not a
               non-negative integer.
        411 -- the Content-Length header is missing.
    """
    # Normalise the path so "/annotate" and "/annotate/" are equivalent.
    if not self.path.endswith("/"):
        self.path += "/"

    if self.path != "/annotate/":
        self.send_response(HTTPStatus.BAD_REQUEST)
        self.end_headers()
        return

    # Validate the declared body size before reading: a missing header
    # would make int() raise, and a negative size would make read()
    # consume the stream until EOF.
    raw_length = self.headers.get('content-length')
    if raw_length is None:
        self.send_response(HTTPStatus.LENGTH_REQUIRED)
        self.end_headers()
        return
    try:
        length = int(raw_length)
    except ValueError:
        length = -1
    if length < 0:
        self.send_response(HTTPStatus.BAD_REQUEST)
        self.end_headers()
        return

    # Read message
    msg = self.rfile.read(length)

    # Do the annotation
    doc = Document()
    parseFromDelimitedString(doc, msg)
    self.annotator.annotate(doc)

    with io.BytesIO() as stream:
        writeToDelimitedString(doc, stream)
        msg = stream.getvalue()

    # Write message
    self.send_response(HTTPStatus.OK)
    self.send_header("Content-Type", "application/x-protobuf")
    self.send_header("Content-Length", str(len(msg)))
    self.end_headers()
    self.wfile.write(msg)
def annotate(self, text, annotators=None, properties=None):
    """Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list) annotators: annotator names to request; falls back to
        ``self.default_annotators`` when empty. Only consulted when
        ``properties`` is not given.
    :param (dict) properties: properties that the server expects; when
        omitted, a copy of ``self.default_properties`` is extended with
        the annotators and the text-in / serialized-out settings.
    :return: a ``Document`` parsed from the response content
    """
    if properties is None:
        # Work on a copy so the per-request keys do not leak into the
        # shared defaults held on the instance.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
        })

    # If text is already UTF-8 encoded bytes, encoding it again would raise
    # an error. Checking against ``bytes`` works on Python 2 (where
    # ``bytes is str``) and on Python 3 (where ``unicode`` does not exist).
    if not isinstance(text, bytes):
        text = text.encode('utf-8')

    r = self._request(text, properties)
    doc = Document()
    parseFromDelimitedString(doc, r.content)
    return doc
def test_write_protobuf(doc_pb):
    """Check that a document survives a write/parse round trip unchanged."""
    # Serialize: the writer hands back a stream we drain and then close.
    out = writeToDelimitedString(doc_pb)
    serialized = out.getvalue()
    out.close()

    # Deserialize into a fresh document and compare with the input.
    restored = Document()
    parseFromDelimitedString(restored, serialized)
    assert doc_pb == restored
def doc_pb():
    """Load ``data/test.dat`` (next to this file) and parse it into a Document."""
    here = os.path.abspath(__file__)
    data_path = os.path.join(os.path.dirname(here), 'data', 'test.dat')

    with open(data_path, 'rb') as handle:
        raw = handle.read()

    parsed = Document()
    parseFromDelimitedString(parsed, raw)
    return parsed
def annotate(self, text, annotators=None, output_format=None, properties=None, date=None):
    """Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list | string) annotators: annotators to use, either as a list
        of names or as one comma-separated string; falls back to
        ``self.default_annotators`` when empty
    :param (str) output_format: output type from server: serialized, json,
        text, conll, conllu, or xml; overrides any ``outputFormat`` already
        present in the properties
    :param (dict) properties: properties that the server expects; the dict
        passed in is not modified
    :param date: forwarded unchanged to ``self._request``
    :return: a ``Document`` for "serialized", the decoded JSON for "json",
        the response text for "text"/"conllu"/"conll"/"xml", and the raw
        response object for any other (or missing) output format
    """
    caller_supplied = properties is not None

    # Always work on a copy: neither the instance defaults nor the caller's
    # dict should pick up the per-request keys set below.
    properties = dict(properties if caller_supplied else self.default_properties)

    if not caller_supplied:
        properties.update({
            'inputFormat': 'text',
            'outputFormat': self.default_output_format,
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
        })

    if not caller_supplied or "annotators" not in properties:
        chosen = annotators or self.default_annotators
        # A string is already the comma-separated form; joining it would
        # split it into single characters.
        properties['annotators'] = chosen if isinstance(chosen, str) else ','.join(chosen)

    # if an output_format is specified, use that to override
    if output_format is not None:
        properties["outputFormat"] = output_format

    # make the request
    r = self._request(text.encode('utf-8'), properties, date)

    # customize what is returned based on outputFormat; .get() so that
    # caller-supplied properties without the key yield the raw response
    # instead of a KeyError after the request has already been made
    effective_format = properties.get("outputFormat")
    if effective_format == "serialized":
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
    if effective_format == "json":
        return r.json()
    if effective_format in ("text", "conllu", "conll", "xml"):
        return r.text
    return r
def update(self, doc, annotators=None, properties=None):
    """Send an already-serializable document to the server and parse the reply.

    :param doc: document written to the request body with
        ``writeToDelimitedString``
    :param (list) annotators: annotator names to request; falls back to
        ``self.default_annotators`` when empty. Only consulted when
        ``properties`` is not given.
    :param (dict) properties: properties sent with the request; when
        omitted, a copy of ``self.default_properties`` is extended with the
        annotators and the serialized-in / serialized-out settings.
    :return: a new ``Document`` parsed from the response content
    """
    if properties is None:
        # Copy first so the per-request keys (notably the 'serialized'
        # input format) do not leak into the shared instance defaults.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'serialized',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
        })

    # Serialize the incoming document into the request payload.
    with io.BytesIO() as stream:
        writeToDelimitedString(doc, stream)
        msg = stream.getvalue()

    r = self._request(msg, properties)
    doc = Document()
    parseFromDelimitedString(doc, r.content)
    return doc
def run_corenlp(text, corenlp_ips):
    """Run CoreNLP through server/REST API.

    Posts the UTF-8 encoded ``text`` to a server picked at random from
    ``corenlp_ips``. On an HTTP 200 reply the protobuf body is parsed and
    passed, with the original text, to ``process_text``.

    Returns the ``(annotated_text, chars)`` pair from ``process_text``, or
    ``(None, None)`` on a non-200 status or when any exception is raised
    after the text has been encoded (the traceback is printed).
    """
    payload = text.encode('utf-8')
    try:
        query = {
            'properties': '{"annotators":"tokenize,ssplit,pos,lemma,ner,parse,coref", "tokenize.whitespace": "true"}',
            'outputFormat': 'serialized',
            "serializer": "edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer"
        }
        server = random.choice(corenlp_ips)
        response = requests.post(
            server,
            params=query,
            data=payload,
            headers={'Content-type': 'text/plain'},
        )
        print('\tdone.')

        # Bail out early on anything other than success.
        if response.status_code != 200:
            print("CoreNLP Error, status code:{}".format(response.status_code))
            return None, None

        # Decode protobuf object
        doc = Document()
        parseFromDelimitedString(doc, response.content)
        print('Processing annotations...')
        annotated_text, chars = process_text(doc, text)
        print('\tdone.')
        return annotated_text, chars
    except Exception:
        # Best effort: report the failure and signal it with a (None, None) pair.
        print(traceback.format_exc())
        print('Server not working')
        return None, None
def from_bytes(bytes_):
    """Build a new Document by parsing the delimited buffer ``bytes_`` into it."""
    parsed = Document()
    parseFromDelimitedString(parsed, bytes_)
    return parsed