def test_write_protobuf(doc_pb):
    """Check that a document survives a write/parse round trip unchanged."""
    # Serialize the document and grab the raw bytes before releasing the stream.
    out_stream = writeToDelimitedString(doc_pb)
    serialized = out_stream.getvalue()
    out_stream.close()

    # Parse the bytes back into a fresh Document and compare with the input.
    restored = Document()
    parseFromDelimitedString(restored, serialized)
    assert doc_pb == restored
def doc_pb():
    """
    Load the serialized document stored in ``data/test.dat`` next to this file.

    :return: a Document populated from the file's contents
    """
    # Resolve the data file relative to this module, not the working directory.
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat'
    )
    with open(data_path, 'rb') as handle:
        raw = handle.read()

    parsed = Document()
    parseFromDelimitedString(parsed, raw)
    return parsed
def update(self, doc, annotators=None, properties=None):
    """
    Send an already-built document back to the server and return the result.

    The document is serialized with ``writeToDelimitedString`` and sent via
    ``self._request``; the response content is parsed into a new Document.

    :param doc: document to serialize and send
    :param (list | string) annotators: annotators to request; a list is joined with commas
    :param (dict) properties: additional request properties; this dict is NOT modified
    :return: a new Document parsed from the response content
    """
    # Work on a copy so the forced settings below do not leak into the
    # caller's dict (the dict may be reused for later requests).
    request_properties = {} if properties is None else dict(properties)

    # Serialized input/output is mandatory here, so these override any
    # caller-provided values for the same keys.
    request_properties.update({
        'inputFormat': 'serialized',
        'outputFormat': 'serialized',
        'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
    })
    if annotators:
        request_properties['annotators'] = (
            annotators if isinstance(annotators, str) else ",".join(annotators)
        )

    with io.BytesIO() as stream:
        writeToDelimitedString(doc, stream)
        msg = stream.getvalue()

    r = self._request(msg, request_properties)
    doc = Document()
    parseFromDelimitedString(doc, r.content)
    return doc
def annotate(self, text, annotators=None, output_format=None, properties=None, reset_default=None, **kwargs):
    """
    Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list | string) annotators: list of annotators to use
    :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
    :param (dict | str) properties: additional request properties (written on top of defaults),
        or a language name accepted by ``is_corenlp_lang``, which is sent as ``pipelineLanguage``
    :param (bool) reset_default: don't use server defaults

    Precedence for settings:

    1. annotators and output_format args
    2. Values from properties dict
    3. Client defaults self.annotators and self.output_format (set during client construction)
    4. Server defaults

    Additional request parameters (apart from CoreNLP pipeline properties) such as 'username' and 'password'
    can be specified with the kwargs.

    :return: parsed JSON for "json", a Document for "serialized", the response text for
        "text", "conllu", "conll" and "xml", and the raw response object otherwise
        (including when no output format was resolved on the client side)
    :raises ValueError: if properties is a string that is not a recognized language
    """
    # validate request properties
    validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)

    # set request properties, starting with the client defaults
    request_properties = {}
    if self.annotators is not None:
        request_properties['annotators'] = self.annotators
    if self.output_format is not None:
        request_properties['outputFormat'] = self.output_format

    # a string is treated as a language shortcut and converted to a properties dict
    if isinstance(properties, str):
        if is_corenlp_lang(properties):
            properties = {'pipelineLanguage': properties.lower()}
            # language shortcut implies resetting defaults unless the caller chose otherwise
            if reset_default is None:
                reset_default = True
        else:
            raise ValueError(f"Unrecognized properties keyword {properties}")

    # add values from properties arg on top of the client defaults
    if isinstance(properties, dict):
        request_properties.update(properties)

    # if annotators are specified, they override everything set so far
    if isinstance(annotators, (str, list)):
        request_properties['annotators'] = annotators if isinstance(annotators, str) else ",".join(annotators)

    # if output format is specified, override with that
    if isinstance(output_format, str):
        request_properties['outputFormat'] = output_format

    # reset_default is still None here unless the caller set it explicitly or the
    # language shortcut above forced it to True; fall back to False
    if reset_default is None:
        reset_default = False

    # make the request
    r = self._request(text.encode('utf-8'), request_properties, reset_default, **kwargs)

    # Use .get(): when neither the args, the properties nor the client default
    # supplied an output format the key is absent, and indexing would raise
    # KeyError after the request has already completed.
    resolved_format = request_properties.get("outputFormat")
    if resolved_format == "json":
        return r.json()
    elif resolved_format == "serialized":
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
    elif resolved_format in ["text", "conllu", "conll", "xml"]:
        return r.text
    else:
        return r
def annotate(self, text, annotators=None, output_format=None, properties_key=None, properties=None, **kwargs):
    """
    Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list | string) annotators: list of annotators to use
    :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
    :param (str) properties_key: key into properties cache for the client
    :param (dict) properties: additional request properties (written on top of defaults)

    Request properties are layered in this order, later layers winning:

    1. Server default properties (server side)
    2. Properties selected by properties_key (client side): the English defaults
       for 'en'/'english', just ``pipelineLanguage`` for any other name in
       ``CoreNLPClient.PIPELINE_LANGUAGES`` (the server then applies its own
       defaults for that language), otherwise the matching entry of the client's
       properties_cache (empty if there is none)
    3. Additional properties from the properties argument (client side)
    4. The annotators and output_format arguments (client side)

    :return: parsed JSON for "json", a Document for "serialized", the response text for
        "text", "conllu", "conll" and "xml", and the raw response object otherwise
    """
    # layer 2: base properties selected through properties_key
    if properties_key is None:
        request_properties = {}
    else:
        lowered_key = properties_key.lower()
        if lowered_key in ['en', 'english']:
            request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES)
        elif lowered_key in CoreNLPClient.PIPELINE_LANGUAGES:
            request_properties = {'pipelineLanguage': lowered_key}
        else:
            # the cache is keyed by the caller's original spelling
            request_properties = dict(self.properties_cache.get(properties_key, {}))

    # layer 3: per-request custom properties
    if properties is not None:
        request_properties.update(properties)

    # layer 4: explicit annotators win over anything set so far
    if annotators is not None:
        if isinstance(annotators, list):
            request_properties['annotators'] = ",".join(annotators)
        else:
            request_properties['annotators'] = annotators

    # An output format is always sent with the request: prefer the explicit
    # argument, then whatever the properties supplied, then the format the
    # server was started with, and finally the client-wide default.
    if output_format is not None:
        request_properties['outputFormat'] = output_format
    if request_properties.get('outputFormat') is None:
        server_format = self.server_start_info.get('props', {}).get('outputFormat')
        if server_format:
            request_properties['outputFormat'] = server_format
        else:
            request_properties['outputFormat'] = CoreNLPClient.DEFAULT_OUTPUT_FORMAT

    # make the request
    response = self._request(text.encode('utf-8'), request_properties, **kwargs)

    # decode the response according to the format that was requested
    chosen_format = request_properties["outputFormat"]
    if chosen_format == "json":
        return response.json()
    if chosen_format == "serialized":
        doc = Document()
        parseFromDelimitedString(doc, response.content)
        return doc
    if chosen_format in ("text", "conllu", "conll", "xml"):
        return response.text
    return response