class NLPclient:
    """Context-manager wrapper around a Stanford CoreNLP client.

    Streams per-sentence (lexs, deps, ies) tuples extracted from text via
    the module-level helpers ``lexs_of``, ``deps_of``, ``ies_of`` and
    ``clean_text`` (defined elsewhere in this file).
    """

    # Annotator pipeline shared by the client and every annotate() call.
    ANNOTATORS = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref']

    def __init__(self, core_nlp_version='2018-10-05'):
        # NOTE(review): core_nlp_version is accepted for backward
        # compatibility but never used here — presumably the CoreNLP install
        # is located via CORENLP_HOME; confirm with callers.
        from stanza.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=self.ANNOTATORS)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Shutdown is deferred to __del__ so the client survives re-entry.
        pass

    def __del__(self):
        # Stop the background CoreNLP Java server when the wrapper dies.
        self.client.stop()

    def step(self, text):
        """Annotate *text* and yield one (lexs, deps, ies) tuple per sentence.

        BUG FIX: the original passed ``annotators=annotators`` where
        ``annotators`` was an undefined name, raising NameError on every
        call. The configured annotator list is used instead.
        """
        core_nlp_output = self.client.annotate(
            text=text, annotators=self.ANNOTATORS, output_format='json')
        for sentence in core_nlp_output['sentences']:
            lexs = tuple(lexs_of(sentence))
            deps = deps_of(sentence)
            ies = tuple(ies_of(sentence))
            yield lexs, deps, ies

    def extract(self, text):
        """Clean *text* and stream annotations, chunking the input.

        Text is processed in 8 KiB slices to keep individual server
        requests small.
        """
        tail = clean_text(text)
        chunk = 2 ** 13  # 8192 characters per request
        while tail:
            head, tail = tail[:chunk], tail[chunk:]
            yield from self.step(head)
class CoreNLPBinaryParser:
    """Constituency parser that returns Chomsky-normal-form binary trees.

    Spawns a local CoreNLP server (one sentence per line) and converts its
    Penn-style parse into a binarized ``( left right )`` string.
    """

    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        # Random server id so several parsers can coexist on one machine.
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(
            endpoint='http://localhost:{0}'.format(port),
            annotators=['parse'],
            output_format='json',
            # eolonly: treat each input line as exactly one sentence.
            properties={'ssplit.eolonly': 'true'},
            timeout=300000,
            memory='8G',
            threads=threads,
            server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        """Recursively render *tree* as a parenthesized binary string."""
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(unicode_repr(child))
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        """Convert a Penn treebank string to a binarized parse string."""
        t = Tree.fromstring(tree)
        # Chomsky normal form: collapse unary chains, then binarize.
        Tree.collapse_unary(t, collapsePOS=True, collapseRoot=True)
        Tree.chomsky_normal_form(t)
        return cls._format(t)

    def parse(self, text):
        """Parse *text* and return the binarized parse of its first sentence."""
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        """Shut down the CoreNLP server (idempotent).

        BUG FIX: if __init__ raised before ``self.run`` was assigned (e.g.
        the server failed to start), __del__ crashed with AttributeError.
        ``getattr`` makes shutdown safe in that case.
        """
        if getattr(self, 'run', False):
            self.corenlp.stop()
            self.run = False
def coreference_resolution(text):
    """Run CoreNLP multipass-sieve coreference ('dcoref') over *text* and
    print the surface form of every coreference mention.

    BUG FIXES vs. the original:
    - the ``text`` parameter was overwritten with a hard-coded example
      sentence, so the argument was silently ignored;
    - ``client.stop()`` was skipped whenever ``annotate`` raised — the
      shutdown now runs in a ``finally`` block;
    - in CoreNLP JSON output ``ann['corefs']`` maps chain ids to lists of
      mentions, so iterating the dict yields key strings; we iterate the
      chains' mention dicts instead.
    """
    import os
    from stanza.server import CoreNLPClient

    # NOTE(review): hard-coded local install path — consider reading this
    # from the environment instead of overwriting it here.
    os.environ["CORENLP_HOME"] = "/home/soheil/Downloads/corenlp"

    # 'dcoref' (multipass sieve) requires the full upstream pipeline:
    # tokenize, ssplit, pos, lemma, ner, parse.
    client = CoreNLPClient(
        annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref'
        ],
        memory='2G',
        timeout=5000,
        output_format='json'
    )

    # Start the background Java server (inspect with: ps -o pid,cmd | grep java)
    client.start()
    try:
        ann = client.annotate(text)
    finally:
        # Always shut down the background CoreNLP server.
        client.stop()

    for chain in ann['corefs'].values():
        for mention in chain:
            print(mention['text'])
class CoreNLPProcessor(AbstractNLPProcessor):
    """NLP processor backed by a local Stanford CoreNLP server.

    Provides NER extraction and POS-based phrase chunking on top of the
    AbstractNLPProcessor interface.
    """

    def grammar(self):
        """Return the chunking grammar (RegexpParser syntax) used by
        ``_extract_phrase`` for NP/VP/PNP extraction."""
        ADP = '<RB|RBR|RP|TO|IN|PREP>'
        NP = '<JJ|ADJ>*<NN|VBG|RBS|FW|NNS|PRP|PRP$>+<POS>?<CD>?'
        return """
        NP: {{({NP})+({ADP}?<DT>?{NP})*}}
        VP: {{<VB*>+{ADP}?}}
        PNP: {{<NNP|NNPS>+}}
        """.format(NP=NP, ADP=ADP)

    def __init__(self):
        super().__init__()
        # The CoreNLP distribution is expected to live next to the CWD.
        os.environ["CORENLP_HOME"] = os.path.join(
            os.getcwd(), 'stanford-corenlp-full-2018-10-05')
        self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                    timeout=30000, memory='4G')

    def __del__(self):
        # Shut down the background CoreNLP server.
        self.tagger.stop()

    def _extract_ner(self, token):
        """Annotate *token* and return [(mention_text, entity_type), ...]
        for the first sentence."""
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        return [(n.entityMentionText, n.entityType) for n in sentence.mentions]

    def extract_named_entities(self, token):
        """Return de-duplicated entity surface strings.

        BUG FIX: the original used ``list(set(...))``, whose ordering is
        nondeterministic across runs; ``dict.fromkeys`` dedups while
        preserving first-seen order.
        """
        return list(dict.fromkeys(text for text, _ in self._extract_ner(token)))

    def get_named_entity_types(self, token):
        """Return the entity type of every mention (duplicates preserved)."""
        return [entity[1] for entity in self._extract_ner(token)]

    def extract_phrase_by_type(self, token, type):
        """POS-tag *token* and delegate phrase extraction of the given chunk
        type to the base class.

        NOTE(review): the parameter name ``type`` shadows the builtin; it is
        kept unchanged so keyword callers keep working.
        """
        ann = self.tagger.annotate(token)
        sentence = ann.sentence[0]
        tagged = [(tok.word, tok.pos) for tok in sentence.token]
        return self._extract_phrase(tagged, type)
class DocumentProcessor(object):
    """This class represents the Document Processor class that processes
    the whole input document.

    Use as a context manager: ``__enter__`` downloads/initializes the Stanza
    pipeline and starts a CoreNLP client; ``__exit__`` stops the client.
    """

    def __init__(self, config_path, lang):
        """Load the YAML configuration and remember the target language.

        BUG FIX: the original used ``yaml.load(open(config_path, "r"))``,
        which leaks the file handle and calls ``yaml.load`` without an
        explicit Loader (deprecated and unsafe on untrusted input).
        """
        with open(config_path, "r") as config_file:
            self.config = yaml.safe_load(config_file)
        self.client = None
        self.lang = lang

    def __enter__(self):
        if environ.get("CORENLP_HOME") is None:
            raise EnvPathException(
                "The CORENLP_HOME path was not found. Please export it pointing to the directory that contains the CoreNLP resources"
            )
        my_path = os.path.abspath(os.path.dirname(__file__))
        settings.init()
        settings.LANGUAGE = self.lang
        # Fetch the Stanza models for this language, then build the pipeline.
        stanza.download(self.lang, dir=self.config["stanza"]["dir"])
        self.nlp = stanza.Pipeline(**self.config["stanza"], lang=self.lang)
        language_properties_fp = os.path.join(my_path, "language_resources",
                                              self.lang + "_properties.txt")
        self.client = CoreNLPClient(properties=language_properties_fp,
                                    **self.config["corenlp"])
        self.client.start()
        return self

    def break_json_into_chunks(self, doc_json):
        """Convert an input json to a list of sentences

        Args:
            doc_json (dict): The input json representing the input document

        Returns:
            list : The list of sentences with raw text
            list: The list of sentences as jsons
        """
        raw_sentences = []
        sentence_jsons = []
        try:
            for sent_json in doc_json:
                sentence_jsons.append(sent_json)
                # Rebuild the raw sentence text from its word tokens.
                sent_text = " ".join(
                    [word["word"] for word in sent_json["words"]])
                raw_sentences.append(sent_text)
        except Exception as e:
            # Chain the original error so the root cause stays visible.
            raise InavlidJSONFileException(
                "The input JSON file you provided could not be analysed. Please check the example format provided"
            ) from e
        return raw_sentences, sentence_jsons

    def break_text_into_sentences(self, text):
        """Break the input raw text string into sentences using Stanza

        Args:
            text (str): The raw input document text

        Returns:
            list : The list of sentences with raw text
        """
        stanza_doc = self.nlp(text)
        return [sentence.text for sentence in stanza_doc.sentences]

    def analyze(self, doc, input_format):
        """Method to analyze the input as either a json or a string and
        return back a Document object

        Args:
            doc (json / string): The input that needs to be analyzed using Stanza
            input_format (str): Either 'string' or 'json'

        Returns:
            Document: The Document object

        Raises:
            InavlidFormatException: if *input_format* is not 'string'/'json'.
        """
        if input_format.lower() not in ["string", "json"]:
            raise InavlidFormatException(
                "Please provide the format as either 'string' or 'json'")
        settings.INPUT_FORMAT = input_format.lower()
        doc_obj = Document(self.lang, self.nlp, self.client)
        if settings.INPUT_FORMAT == "json":
            # the input format here is json
            doc = json.loads(doc)
            raw_sentences, sentence_jsons = self.break_json_into_chunks(doc)
            for raw_sent, sent_json in zip(raw_sentences, sentence_jsons):
                sentence = Sentence(self.lang, self.nlp, self.client,
                                    raw_sent, sent_json)
                sentence.json = sent_json
                doc_obj.sentence_objs.append(sentence)
        else:
            # the input format here is string
            raw_sentences = self.break_text_into_sentences(doc)
            for raw_sent in raw_sentences:
                sentence = Sentence(self.lang, self.nlp, self.client, raw_sent)
                doc_obj.sentence_objs.append(sentence)
        return doc_obj

    def __exit__(self, exc_type, exc_value, tb):
        """ Method to stop the CoreNLP client"""
        if self.client is not None:
            self.client.stop()