from stanfordnlp.server import CoreNLPClient


class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self, start_server=True, endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        # The default (protobuf) output format is required here: resolve_coreferences
        # reads corefChain/sentence attributes, which the JSON format does not expose.
        self.__client = CoreNLPClient(
            start_server=start_server,
            endpoint=endpoint,
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'])
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)

        # Collect the character span of every mention in every coreference chain.
        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                token_end = sentence.token[mention.endIndex - 1]  # endIndex is exclusive
                mention_indices.append((token_start.beginChar, token_end.endChar))
            entity_mention_indices.append(mention_indices)

        # Group entities by the chain whose mention span contains them; entities
        # that fall outside every chain become singleton sets.
        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
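# A minimal usage sketch (hypothetical, not part of the class above): it assumes
# a local CoreNLP server can be started, and that entities expose the
# start_offset/end_offset character offsets which resolve_coreferences expects.
if __name__ == '__main__':
    from collections import namedtuple
    Entity = namedtuple('Entity', ['text', 'start_offset', 'end_offset'])  # stand-in type

    resolver = StanfordCoreferenceResolver()
    text = 'Barack Obama was born in Hawaii. He was elected president in 2008.'
    entities = [Entity('Barack Obama', 0, 12), Entity('He', 33, 35)]
    for entity_set in resolver.resolve_coreferences(text, entities):
        print([e.text for e in entity_set])  # entities in one list corefer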
import os
from pathlib import Path
from typing import Dict, List, Set
from zipfile import ZipFile

import wget

# Triple and Span are lightweight project-level containers: a Span holds
# text/start/end, a Triple holds subject/relation/object Spans.


class StanfordOpenIE:
    def __init__(self, core_nlp_version: str = '2018-10-05', threads: int = 5,
                 close_after_finish: bool = True):
        self.remote_url = ('http://nlp.stanford.edu/software/'
                           'stanford-corenlp-full-{}.zip'.format(core_nlp_version))
        self.install_dir = Path(os.environ['STANFORD_HOME']).expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir /
                Path('stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading to %s.' % self.install_dir)
            output_filename = wget.download(self.remote_url, out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()
        os.environ['CORENLP_HOME'] = str(
            self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.close_after_finish = close_after_finish
        self.client = CoreNLPClient(annotators=['openie'], memory='8G', threads=threads)

    def get_openie_with_boundary(self, annotation: Dict,
                                 remove_dup: bool = False) -> List[Triple]:
        triples: List[Triple] = []
        dup: Set[str] = set()  # '<subj span>\t<rel span>\t<obj span>' keys already seen
        for sentence in annotation['sentences']:
            tokens = sentence['tokens']
            for triple in sentence['openie']:
                new_triple = {}
                for field in ['subject', 'relation', 'object']:
                    text = triple[field]
                    s, e = triple[field + 'Span']  # token-level span, end exclusive
                    s = tokens[s]['characterOffsetBegin']
                    e = tokens[e - 1]['characterOffsetEnd']
                    new_triple[field] = Span(text=text, start=s, end=e)
                key = '\t'.join(
                    '{}-{}'.format(new_triple[field].start, new_triple[field].end)
                    for field in ['subject', 'relation', 'object'])
                if remove_dup and key in dup:
                    continue
                triples.append(Triple(**new_triple))
                dup.add(key)
        return triples

    def annotate(self, text: str, properties_key: str = None, properties: dict = None,
                 simple_format: bool = True, remove_dup: bool = False,
                 max_len: int = 15000):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full CoreNLP format or a simple dict
        :param (bool) remove_dup: whether to drop triples whose spans were already seen
        :param (int) max_len: texts at or above this length are skipped entirely
        :return: depending on simple_format: full or simpler format of triples <subject, relation, object>
        """
        if len(text) >= max_len:
            return []
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text, annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            return self.get_openie_with_boundary(core_nlp_output, remove_dup=remove_dup)
        return core_nlp_output

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        if self.close_after_finish:
            self.client.stop()
        del os.environ['CORENLP_HOME']
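# Hedged usage sketch for the span-aware wrapper above. It assumes Triple and
# Span are simple containers with the fields used in get_openie_with_boundary,
# and that STANFORD_HOME names a writable directory for the CoreNLP download.
if __name__ == '__main__':
    oie = StanfordOpenIE(close_after_finish=True)
    for t in oie.annotate('Barack Obama was born in Hawaii.', remove_dup=True):
        # each field carries its character offsets back into the input text
        print(t.subject.text, t.subject.start, t.subject.end, '->', t.relation.text)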
import os
import tempfile
from pathlib import Path
from subprocess import Popen
from sys import stderr
from zipfile import ZipFile

import wget


class StanfordOpenIE:
    def __init__(self, core_nlp_version: str = '2018-10-05'):
        self.remote_url = ('https://nlp.stanford.edu/software/'
                           'stanford-corenlp-full-{}.zip'.format(core_nlp_version))
        self.install_dir = Path('~/.stanfordnlp_resources/').expanduser()
        self.install_dir.mkdir(exist_ok=True)
        if not (self.install_dir /
                Path('stanford-corenlp-full-{}'.format(core_nlp_version))).exists():
            print('Downloading from %s.' % self.remote_url)
            output_filename = wget.download(self.remote_url, out=str(self.install_dir))
            print('\nExtracting to %s.' % self.install_dir)
            zf = ZipFile(output_filename)
            zf.extractall(path=self.install_dir)
            zf.close()
        os.environ['CORENLP_HOME'] = str(
            self.install_dir / 'stanford-corenlp-full-{}'.format(core_nlp_version))
        from stanfordnlp.server import CoreNLPClient
        self.client = CoreNLPClient(annotators=['openie'], memory='8G')

    def annotate(self, text: str, properties_key: str = None, properties: dict = None,
                 simple_format: bool = True):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) simple_format: whether to return the full CoreNLP format or a simple dict
        :return: depending on simple_format: full or simpler format of triples <subject, relation, object>
        """
        # https://stanfordnlp.github.io/CoreNLP/openie.html
        core_nlp_output = self.client.annotate(text=text, annotators=['openie'],
                                               output_format='json',
                                               properties_key=properties_key,
                                               properties=properties)
        if simple_format:
            triples = []
            for sentence in core_nlp_output['sentences']:
                for triple in sentence['openie']:
                    triples.append({
                        'subject': triple['subject'],
                        'relation': triple['relation'],
                        'object': triple['object']
                    })
            return triples
        return core_nlp_output

    def generate_graphviz_graph(self, text: str, png_filename: str = './out/graph.png'):
        """
        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (str) png_filename: path of the PNG file to write
        """
        entity_relations = self.annotate(text, simple_format=True)
        # Emit a DOT digraph, e.g.:
        #   digraph { "a" -> "b" [ label="a to b" ]; }
        graph = ['digraph {']
        for er in entity_relations:
            graph.append('"{}" -> "{}" [ label="{}" ];'.format(
                er['subject'], er['object'], er['relation']))
        graph.append('}')

        output_dir = os.path.join('.', os.path.dirname(png_filename))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        out_dot = os.path.join(tempfile.gettempdir(), 'graph.dot')
        with open(out_dot, 'w') as output_file:
            output_file.write('\n'.join(graph))

        command = 'dot -Tpng {} -o {}'.format(out_dot, png_filename)
        dot_process = Popen(command, stdout=stderr, shell=True)
        dot_process.wait()
        assert not dot_process.returncode, \
            'ERROR: Call to dot exited with a non-zero code status.'

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()
        del os.environ['CORENLP_HOME']
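# Usage sketch following the class's own context-manager protocol; Graphviz's
# `dot` binary must be on PATH for generate_graphviz_graph to succeed.
if __name__ == '__main__':
    with StanfordOpenIE() as client:
        text = 'Barack Obama was born in Hawaii. He is a politician.'
        for triple in client.annotate(text):
            print('|-', triple)
        client.generate_graphviz_graph(text, png_filename='./out/graph.png')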
os.environ["CORENLP_HOME"] = corenlp_path corenlpclient_UD1 = CoreNLPClient( properties={'ssplit.isOneSentence': True}, annotators=[ 'tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats' ], memory='2G', be_quiet=False, max_char_length=100000, output_format='conllu') _UD1_Auto = corenlpclient_UD1.annotate(text1) # annotators = ['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'] # _UD1_Auto = _UD1_Auto['sentences'][1]['basicDependencies'] # extract only basic dependencies print(_UD1_Auto) corenlpclient_UD1.stop() print( convert_const2dep( LANG, dataset, filename='', readpath='/02_modelbuilding/02_output/input_temp.parser', writepath='/02_modelbuilding/02_output/output_temp.parser', format_='UD1', usage='experiments')) annotated = _parse_segmenttokenize_en( 'What is the star of the moon? Where is the sea of the trees?', usage='production') print('Annotated', annotated)
def __populate_Parses(lang, parsejson, new_parsedict):
    """Populate new_parsedict with one ParsePDTB object per sentence in parsejson."""
    # start a CoreNLP server for UD 1.0 dependency parses
    from stanfordnlp.server import CoreNLPClient
    cwd = os.getcwd()
    version = 'stanford-corenlp-full-2018-10-05'
    corenlp_path = re.findall(r'\S*/marta-v2', cwd)[0] + '/04_utils/' + version
    os.environ["CORENLP_HOME"] = corenlp_path
    if lang == 'en':
        lang = {}  # i.e. CoreNLP defaults to the English model
        # the parse annotator is necessary to obtain udfeats (for postags)
        corenlpclient_UD1 = CoreNLPClient(
            properties={'ssplit.isOneSentence': True, 'tokenize.whitespace': True},
            annotators=['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'],
            memory='2G',
            be_quiet=True,
            max_char_length=100000,
            output_format='conllu')
    elif lang == 'fr':
        lang = 'french'
        # note that udfeats (for postags) currently works for English only:
        # https://stanfordnlp.github.io/CoreNLP/udfeats.html
        corenlpclient_UD1 = CoreNLPClient(
            properties=lang,
            annotators=['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'],
            memory='2G',
            be_quiet=True,
            max_char_length=100000,
            output_format='conllu')
    elif lang == 'zh':
        lang = 'chinese'
        # udfeats is English-only, as above
        corenlpclient_UD1 = CoreNLPClient(
            properties=lang,
            annotators=['tokenize', 'ssplit', 'pos', 'parse', 'depparse', 'udfeats'],
            memory='2G',
            be_quiet=True,
            max_char_length=100000,
            output_format='conllu')

    def __conllu2tuple(conllu_doc):
        """Helper to convert CoNLL-U output into the 3-tuple format used by the
        CoNLL 2016 organisers to store dependency parses."""
        to_list = conllu_doc.split('\n')
        # convert each CoNLL-U line to <wordform>-<token num>
        tokenlist = [i.split('\t')[1] + '-' + i.split('\t')[0]
                     for i in to_list if i != '']
        tokenlist.insert(0, 'ROOT-0')  # add a root token to the start
        # convert to the CoNLL 2016 dependencies format:
        # [<deprel>, <head wordform>-<head num>, <dependent wordform>-<dependent num>]
        deptree_gold = [[i.split('\t')[7],
                         tokenlist[int(i.split('\t')[6])],
                         i.split('\t')[1] + '-' + i.split('\t')[0]]
                        for i in to_list if i != '']
        return deptree_gold

    # begin processing
    for DocID in parsejson:
        print('Now processing: ', dataset, DocID)
        sentence_offset = 0  # this is the 4th element in a TokenList
        # obtain the gold constituency parses for the document
        ConstTrees = __obtain_ConstTrees_Gold(
            DocID, readpath='./03_data/{}/{}tbRoot/{}/', lang=LANG)
        for sentence in parsejson[DocID]['sentences']:
            # 1. create a ParsePDTB object
            __parsepdtb = ParsePDTB(
                lang=LANG,
                docid=DocID,
                sentid=sentence_offset,
                gold_consttree=ConstTrees[sentence_offset],
                pdtb_version=PDTB_VERSION)

            # 2. populate .RawText and .Words
            __parsepdtb.RawText = " ".join(word[0] for word in sentence['words'])
            __parsepdtb.Words = sentence['words']

            # 3. populate .ConstTree_Auto, generating a parse if missing, then
            #    write the gold tree to a temp file for conversion to SD/UD1
            if sentence['parsetree'] == '(())\n':
                _parse = a2_parsers._parse_rawtext2consttree(
                    LANG, __parsepdtb.RawText, tokenized=True)
                __parsepdtb.ConstTree_Auto = _parse
            else:
                __parsepdtb.ConstTree_Auto = sentence['parsetree']
            with open('./02_modelbuilding/02_output/input_temp.parser', 'w+') as f:
                f.write(__parsepdtb.ConstTree_Gold)

            # 4. convert the constituency parse to gold UD 1.0 and add to DepTree_UD1_Gold
            a2_parsers.convert_const2dep(
                LANG,
                dataset,
                filename='',
                readpath='/02_modelbuilding/02_output/input_temp.parser',
                writepath='/02_modelbuilding/02_output/output_temp.parser',
                format_='UD1',
                usage='experiments')
            with open('./02_modelbuilding/02_output/output_temp.parser', 'r') as f:
                UD1_Gold_conllu = f.read()
            __parsepdtb.DepTree_UD1_Gold = __conllu2tuple(UD1_Gold_conllu)

            # 5. automatically generate a UD 1.0 dependency parse from raw text,
            #    in the same 3-tuple format as the CoNLL 2016 Shared Task, and
            #    add it to DepTree_UD1_Auto
            UD1_Auto_conllu = corenlpclient_UD1.annotate(__parsepdtb.RawText)
            __parsepdtb.DepTree_UD1_Auto = __conllu2tuple(UD1_Auto_conllu)

            # 6. add PTB-style and UD pos tags to .Words. Each variable below holds
            #    a list of 2-tuples (<wordform>, <part of speech>). Gold PTB parses
            #    contain traces, which misalign with the surface form; we drop them
            #    since parsers don't predict traces (Johannsen & Søgaard, 2013).
            globals()['pos_PTBGold'] = [
                i for i in ParentedTree.fromstring(__parsepdtb.ConstTree_Gold).pos()
                if i[-1] != '-NONE-']
            globals()['pos_PTBAuto'] = ParentedTree.fromstring(
                __parsepdtb.ConstTree_Auto).pos()
            globals()['pos_UDGold'] = [(i.split('\t')[1], i.split('\t')[3])
                                       for i in UD1_Gold_conllu.split('\n') if i != '']
            globals()['pos_UDAuto'] = [(i.split('\t')[1], i.split('\t')[3])
                                       for i in UD1_Auto_conllu.split('\n') if i != '']
            for postagset in ['PTBGold', 'PTBAuto', 'UDGold', 'UDAuto']:
                try:
                    _tagset = globals()['pos_' + postagset]
                    assert len(_tagset) == len(__parsepdtb.Words)
                    # add the part of speech as a new key in each token's dict in .Words
                    for idx in range(len(__parsepdtb.Words)):
                        __parsepdtb.Words[idx][1].update(
                            {'PartOfSpeech_' + postagset: _tagset[idx][1]})
                except AssertionError as e:
                    e.args += (postagset.upper() + " is not of the same size as the "
                               ".Words attribute for this sentence.",)
                    print(e)
                    print("Continuing to attempt alignment of tokens.")
                    _words = [i[0] for i in __parsepdtb.Words]
                    _words_maxidx = len(_words) - 1
                    # 'drop' the additional tokens in _tagset
                    _tagset = [i for i in _tagset if i[0] in _words]
                    _words_curridx = -1  # start with -1
                    for idx in range(len(_tagset)):
                        _words_curridx += 1
                        while (__parsepdtb.Words[_words_curridx][0] != _tagset[idx][0]
                               and _words_curridx < _words_maxidx):
                            # mark the missing pos tag as an error from parsing
                            __parsepdtb.Words[_words_curridx][1].update(
                                {'PartOfSpeech_' + postagset: 'ParserError'})
                            _words_curridx += 1
                        __parsepdtb.Words[_words_curridx][1].update(
                            {'PartOfSpeech_' + postagset: _tagset[idx][1]})
                    continue

            sentence_offset += 1  # increase sentence offset before the next sentence
            try:
                new_parsedict[DocID].append(__parsepdtb)
            except KeyError:
                new_parsedict[DocID] = [__parsepdtb]

    # shut down the CoreNLP server
    corenlpclient_UD1.stop()
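# Standalone sketch of the __conllu2tuple conversion above, on a hand-written
# two-token CoNLL-U parse (hypothetical input, not taken from the PDTB data):
demo_conllu = ('1\tIt\tit\tPRON\tPRP\t_\t2\tnsubj\t_\t_\n'
               '2\tworks\twork\tVERB\tVBZ\t_\t0\troot\t_\t_\n')
rows = [line.split('\t') for line in demo_conllu.split('\n') if line]
tokens = ['ROOT-0'] + ['{}-{}'.format(r[1], r[0]) for r in rows]
deps = [[r[7], tokens[int(r[6])], '{}-{}'.format(r[1], r[0])] for r in rows]
print(deps)  # [['nsubj', 'works-2', 'It-1'], ['root', 'ROOT-0', 'works-2']]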
import time

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
import networkx as nx
import numpy as np

# relabel the numeric nodes with the character names
dic = {}
for x in range(len(listofchar)):
    dic[x] = listofchar[x]
G = nx.from_numpy_matrix(np.array(Graph))  # build the graph from the adjacency matrix
H = nx.relabel_nodes(G, dic)

# map each edge to a distinct colour from the 'jet' colormap
edgenum = len(H.edges)
values = range(edgenum)
jet = plt.get_cmap('jet')
cNorm = colors.Normalize(vmin=0, vmax=values[-1])
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)
colorList = [scalarMap.to_rgba(values[i]) for i in range(edgenum)]

nx.draw(H, edge_color=colorList, with_labels=True)
plt.show()
# nx.draw(H, pos, node_color='b', edgelist=edges, edge_color=weights, width=10.0, edge_cmap=plt.cm.Blues)
# nx.draw(H, with_labels=True)

# Shut down the background CoreNLP server
client.stop()
time.sleep(10)
!ps -o pid,cmd | grep java
import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import StanfordNERTagger
from stanfordnlp.server import CoreNLPClient

os.environ['CORENLP_HOME'] = './stanford-corenlp-full-2018-10-05'

# chain the three pretrained CRF classifiers (3-class, 7-class, 4-class)
properties = {
    'ner.model': './stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz,'
                 './stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz,'
                 './stanford-ner-2018-10-16/classifiers/english.conll.4class.distsim.crf.ser.gz'
}
client = CoreNLPClient(annotators=['tokenize', 'pos', 'lemma', 'ner'],
                       properties=properties,  # without this, the custom models are never used
                       memory='8g',
                       endpoint='http://localhost:9001')
# `text` is assumed to be defined earlier (e.g. in a previous notebook cell)
doc = client.annotate(text)
for sent in doc.sentence:
    for m in sent.mentions:
        print(m.entityMentionText, '\t\t\t', m.entityType)
client.stop()  # do not forget to stop the client

# nltk
nltk.download('punkt')  # the punkt tokenizer models are required by word_tokenize
st = StanfordNERTagger(
    'stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16/stanford-ner.jar',
    encoding='utf-8')
rt = 'this is a test, to see the result of nltk.'
tokenized_text = word_tokenize(rt)
classified_text = st.tag(tokenized_text)
print(classified_text)
import os
from pathlib import Path
from typing import Dict, Optional

from overrides import overrides
from stanfordnlp.protobuf import Document, parseFromDelimitedString, writeToDelimitedString
from stanfordnlp.server import CoreNLPClient

# ComponentBase and get_dict_hash are provided by the surrounding experiment framework.


class CoreNlp(ComponentBase):
    def __init__(self, config, config_global, logger):
        super(CoreNlp, self).__init__(config, config_global, logger)
        self.cache = self._provide_cache("stanfordnlp_cache", human_readable=False)

        corenlp_home = config.get("corenlp_home", None)
        if corenlp_home:
            # resolve corenlp_home against the shell's working dir
            os.environ["CORENLP_HOME"] = str(Path.cwd() / Path(corenlp_home))

        self._kwargs = config.pop("corenlp_kwargs", {"annotators": "depparse"})
        self._client = None  # type: Optional[CoreNLPClient]

    def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """Run CoreNLP over a sentence.

        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result
        """
        # The same input sentence can result in different annotations depending on
        # the CoreNLP properties specified. We therefore use a cache identifier for
        # the sentence which includes the annotation properties.
        sent_cache_identifier = get_dict_hash(
            {"sentence": sentence, "properties": properties}, shorten=False)

        if sent_cache_identifier not in self.cache:
            # Kludge ahead: We want to cache the parsed sentence provided by CoreNLP,
            # but also want to work with it in a convenient format. A convenient
            # format is the default format (protobuf-based), but that's not
            # pickle-able for the cache. We therefore convert the protobuf format
            # back into a bytestring and cache that. When reading from the cache,
            # we reassemble the protobuf object.
            req_properties = {"outputFormat": "serialized"}
            if properties is not None:
                req_properties.update(properties)
            doc = self.client.annotate(sentence, properties=req_properties)

            stream = writeToDelimitedString(doc)
            buf = stream.getvalue()
            stream.close()
            self.cache[sent_cache_identifier] = buf
        else:
            buf = self.cache[sent_cache_identifier]

        doc = Document()
        parseFromDelimitedString(doc, buf)
        return doc

    @property
    def client(self):
        if self._client is None:
            self._client = CoreNLPClient(**self._kwargs)
            self._client.start()
        return self._client

    @overrides
    def clean_up(self):
        if self._client is not None:
            self._client.stop()
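# Hypothetical usage sketch: this component only runs inside the framework that
# supplies ComponentBase, _provide_cache and get_dict_hash, so the constructor
# arguments below merely mirror the config keys __init__ reads; the shapes of
# config_global and logger are assumptions.
component = CoreNlp(
    config={"corenlp_home": "stanford-corenlp-full-2018-10-05",
            "corenlp_kwargs": {"annotators": "tokenize,ssplit,pos,depparse"}},
    config_global={},
    logger=None)
doc = component.parse_sentence("The quick brown fox jumps over the lazy dog.")
print(doc.sentence[0].basicDependencies)  # protobuf Document returned from the cache path
component.clean_up()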