def __init__(self, host=None):
    """
    This preprocessor connects to a CoreNLP server to perform sentence splitting,
    tokenization, syntactic parsing, named entity recognition and coreference
    resolution on passed documents.

    :param host: the CoreNLP host
    """
    self.log = logging.getLogger('GiveMe5W')

    # connect to CoreNLP server
    host = "http://localhost:9000" if host is None else host
    self.cnlp = CoreNLPClient(endpoint=host, start_server=StartServer.DONT_START)

    # define basic base_config and desired processing pipeline
    self.base_config = {
        'timeout': 500000,
        'annotators': 'tokenize,ssplit,pos,lemma,parse,ner,depparse,mention,coref',
        'tokenize.language': 'English',
        # 'coref.algorithm': 'neural', see https://github.com/smilli/py-corenlp/issues/18
        # CoreNLP's charniak-wrapper has some problems ...
        # 'parse.type': 'charniak',
        # 'parse.executable': '/home/ubuntu/bllip-parser/',
        # 'parse.verbose': 'true',
        # 'parse.model': './parse-50best.sh',  # '~/.local/share/bllipparser/WSJ+Gigaword-v2',
        'outputFormat': 'json'
    }
    self._token_index = None
def __init__(self):
    super().__init__()
    os.environ["CORENLP_HOME"] = os.path.join(
        os.getcwd(), 'stanford-corenlp-full-2018-10-05')
    self.tagger = CoreNLPClient(annotators=['tokenize', 'pos', 'ner'],
                                timeout=30000,
                                memory='4G')
class NLPclient:
    def __init__(self, core_nlp_version='2018-10-05'):
        from stanza.server import CoreNLPClient
        self.annotators = ['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref']
        self.client = CoreNLPClient(annotators=self.annotators)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def __del__(self):
        self.client.stop()

    def step(self, text):
        core_nlp_output = self.client.annotate(text=text,
                                               annotators=self.annotators,
                                               output_format='json')
        for sentence in core_nlp_output['sentences']:
            lexs = tuple(lexs_of(sentence))
            deps = deps_of(sentence)
            ies = tuple(ies_of(sentence))
            yield lexs, deps, ies

    def extract(self, text):
        tail = clean_text(text)
        while tail:
            chunk = 2 ** 13
            head = tail[0:chunk]
            tail = tail[chunk:]
            # print('EXTRACTING FROM', len(head), 'chars.')
            yield from self.step(head)
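A minimal usage sketch for the NLPclient wrapper above (not part of the original source); it assumes the helper functions clean_text, lexs_of, deps_of and ies_of are defined in the same module and that a CoreNLP 2018-10-05 installation is reachable by the stanza client (e.g. via CORENLP_HOME).

# Hypothetical usage of NLPclient; the sample sentence is illustrative only.
with NLPclient() as nlp:
    for lexs, deps, ies in nlp.extract("Barack Obama was born in Hawaii."):
        print(lexs, deps, ies)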
def __init__(self, port=9000):
    utils.get_corenlp()
    while is_port_in_use(port):
        port += 1
    self._core_nlp_client = CoreNLPClient(
        annotators=['parse'],
        timeout=600000,
        memory='16G',
        be_quiet=True,
        endpoint="http://localhost:%d" % port)
def __init__(self, port=9001):
    self.nlp = stanza.Pipeline('en')  # initialize English neural pipeline
    self.client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
        timeout=60000,
        memory='4G',
        endpoint=f'http://localhost:{port}')
def __init__(self):
    self.client = CoreNLPClient(
        annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'
        ],
        timeout=30000,
        memory='16G',
        threads=1)
def __init__(self, threads=1, port=None):
    sid = random.randint(0, 65535)
    if port is None:
        port = self.DEFAULT_PORT
    self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port),
                                 annotators=['parse'],
                                 output_format='json',
                                 properties={'ssplit.eolonly': 'true'},
                                 timeout=300000,
                                 memory='8G',
                                 threads=threads,
                                 server_id='clinicgen{0}'.format(sid))
    self.corenlp.start()
    self.run = True
def __init__(self):
    self.client = CoreNLPClient(
        annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'parse'],
        timeout=30000,
        properties="zh",
        output_format="json",
        memory='5g')
def __init__(self,
             dir_corenlp: str,
             annotators: str = "tokenize,ssplit,pos,lemma,jmwe",
             jmwe_detector_type: str = "Consecutive",
             output_format: str = "serialized",
             threads: int = 1,
             kwargs_properties: Optional[Dict[str, str]] = None,
             kwargs_corenlp_client: Optional[Dict[str, str]] = None):
    """
    Stanford CoreNLP client wrapper class.

    @param dir_corenlp: directory containing the CoreNLP jar files
    @param annotators: comma-separated CoreNLP annotator pipeline
    @param jmwe_detector_type: detector class used by the custom jmwe annotator
    @param output_format: CoreNLP output format (e.g. "serialized")
    @param threads: number of parallel CoreNLP clients to create
    @param kwargs_properties: extra CoreNLP properties merged into the defaults
    @param kwargs_corenlp_client: extra keyword arguments passed to CoreNLPClient
    """
    _props = copy.deepcopy(self._default_corenlp_properties)
    _props["annotators"] = annotators
    _props["customAnnotatorClass.jmwe.detector"] = jmwe_detector_type
    if isinstance(kwargs_properties, dict):
        _props.update(kwargs_properties)

    # instantiate corenlp client
    _args = {
        "properties": _props,
        "output_format": output_format,
        "classpath": ":".join(glob2.glob(os.path.join(dir_corenlp, "*.jar"))),
        "start_server": StartServer.TRY_START
    }
    if isinstance(kwargs_corenlp_client, dict):
        _args.update(kwargs_corenlp_client)

    self._corenlp = {}
    if threads == 1:
        self._corenlp[0] = CoreNLPClient(**_args)
    else:
        self._pool = multiprocessing.Pool(threads)
        for index in range(threads):
            _args_i = copy.deepcopy(_args)
            _args_i["start_server"] = StartServer.TRY_START
            _args_i["endpoint"] = f"http://localhost:{9000 + index}"
            _args_i["output_format"] = "serialized"
            _args_i["threads"] = 1
            self._corenlp[index] = CoreNLPClient(**_args_i)

    self._corenlp_properties = _props
    self._corenlp_client_args = _args
    self._threads = threads
class CoreNLPBinaryParser:
    DEFAULT_PORT = 9003

    def __init__(self, threads=1, port=None):
        sid = random.randint(0, 65535)
        if port is None:
            port = self.DEFAULT_PORT
        self.corenlp = CoreNLPClient(endpoint='http://localhost:{0}'.format(port),
                                     annotators=['parse'],
                                     output_format='json',
                                     properties={'ssplit.eolonly': 'true'},
                                     timeout=300000,
                                     memory='8G',
                                     threads=threads,
                                     server_id='clinicgen{0}'.format(sid))
        self.corenlp.start()
        self.run = True

    def __del__(self):
        self.stop()

    @classmethod
    def _format(cls, tree):
        childstrs = []
        for child in tree:
            if isinstance(child, Tree):
                childstrs.append(cls._format(child))
            elif isinstance(child, tuple):
                childstrs.append("/".join(child))
            elif isinstance(child, string_types):
                childstrs.append('%s' % child)
            else:
                childstrs.append(unicode_repr(child))
        if len(childstrs) > 1:
            return '( %s )' % ' '.join(childstrs)
        else:
            return childstrs[0]

    @classmethod
    def binarize(cls, tree):
        # collapse
        t = Tree.fromstring(tree)
        # chomsky normal form transformation
        Tree.collapse_unary(t, collapsePOS=True, collapseRoot=True)
        Tree.chomsky_normal_form(t)
        s = cls._format(t)
        return s

    def parse(self, text):
        ann = self.corenlp.annotate(text)
        return self.binarize(ann['sentences'][0]['parse'])

    def stop(self):
        if self.run:
            self.corenlp.stop()
            self.run = False
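A minimal usage sketch for CoreNLPBinaryParser above (not in the original source), assuming CORENLP_HOME points at a local CoreNLP installation and that nltk's Tree and the string helpers are imported as in the original module.

# Hypothetical usage; parse() returns a binarized bracketed parse of the first sentence.
parser = CoreNLPBinaryParser()
try:
    print(parser.parse("The cat sat on the mat."))
finally:
    parser.stop()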
def __enter__(self):
    if environ.get("CORENLP_HOME") is None:
        raise EnvPathException(
            "The CORENLP_HOME path was not found. Please export it pointing to the directory that contains the CoreNLP resources."
        )
    my_path = os.path.abspath(os.path.dirname(__file__))
    settings.init()
    settings.LANGUAGE = self.lang
    stanza.download(self.lang, dir=self.config["stanza"]["dir"])
    self.nlp = stanza.Pipeline(**self.config["stanza"], lang=self.lang)
    language_properties_fp = os.path.join(my_path, "language_resources",
                                          self.lang + "_properties.txt")
    self.client = CoreNLPClient(properties=language_properties_fp,
                                **self.config["corenlp"])
    return self
def run_conversion(qas, corenlp_home):
    os.environ['CORENLP_HOME'] = corenlp_home
    ret = list()
    with CoreNLPClient(
            annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse'],
            timeout=30000,
            memory='16G',
            properties={
                'ssplit.eolonly': True,
                'ssplit.newlineIsSentenceBreak': 'always',
                'outputFormat': 'json'
            },
            endpoint='http://localhost:9001') as client:
        for question, answer in tqdm(qas):
            parse = client.annotate(question)['sentences'][0]
            tokens = parse['tokens']
            const_parse = read_const_parse(parse['parse'])
            for rule in CONVERSION_RULES:
                sent = rule.convert(question, answer, tokens, const_parse)
                if sent:
                    ret.append([question, answer, sent])
                    break
            else:
                ret.append([question, answer, None])
    return ret
def process_one_headline(self, headline):
    with self.lock:
        with CoreNLPClient(annotators=self.annotators,
                           timeout=self.timeout,
                           memory=self.memory,
                           classpath=self.core_nlp_folder) as client:
            return self.collect_data(client.annotate(headline))
def main(data_dir, to_annotate, affinity_cap, output_name, graph, graph_out_loc):
    properties = {'openie.affinity_probability_cap': affinity_cap}
    dygiepp_jsonl = []
    with CoreNLPClient(annotators=["openie"], output_format="json") as client:
        for doc in to_annotate:
            # Get the doc_key
            doc_key = splitext(basename(doc))[0]
            # Read in the text
            with open(doc) as f:
                text = " ".join(f.read().split('\n'))
            # Perform OpenIE
            ann = client.annotate(text)
            # Convert output to dygiepp format
            dygiepp_jsonl.append(openie_to_dygiepp(ann, doc_key))
            # Graph annotations if requested
            if graph:
                graph_annotations(text, properties, doc_key, graph_out_loc)
    # Write out dygiepp-formatted output
    with jsonlines.open(output_name, 'w') as writer:
        writer.write_all(dygiepp_jsonl)
def POSTag(text, sent_split=True, tolist=True):
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if sent_split:
            annotators = ['tokenize', 'ssplit', 'pos']
        else:
            annotators = ['tokenize', 'pos']
        ##########
        if (lang == "zh-cn") or (lang == "en"):
            if lang == "zh-cn":
                with CoreNLPClient(annotators=annotators,
                                   properties=StanfordCoreNLP_chinese_properties,
                                   timeout=15000) as client:
                    ann = client.annotate(text)
            elif lang == "en":
                with CoreNLPClient(annotators=annotators, timeout=15000) as client:
                    ann = client.annotate(text)
            #########
            if sent_split:
                words = [[(token.word, token.pos) for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [
                    ' '.join(['#'.join(posted) for posted in wordlist])
                    for wordlist in words
                ]
                segmented = '\n'.join(segmented_list)
            else:
                words = [(token.word, token.pos)
                         for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  # list
    else:
        return segmented  # string
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def process_q_batch(batch: List[str], tagger_client: CoreNLPClient) -> List[List[str]]:
    n_questions = len(batch)
    assert n_questions > 0
    text = " ".join(batch)
    assert len(text) <= tagger_client.DEFAULT_MAX_CHAR_LENGTH
    ann = tagger_client.annotate(text)
    assert len(ann.sentence) == n_questions
    return [process_tagged(s) for s in ann.sentence]
def Parse(text, lang='zh-cn', annotators=None):
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    if annotators is None:
        annotators = [
            'tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse',
            'regexner', 'coref'
        ]
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
    if lang == 'zh-cn':
        with CoreNLPClient(annotators=annotators,
                           properties=StanfordCoreNLP_chinese_properties,
                           timeout=15000) as client:
            ann = client.annotate(text)
    elif lang == 'en':
        with CoreNLPClient(annotators=annotators, timeout=15000) as client:
            ann = client.annotate(text)
    return ann
def start_server() -> CoreNLPClient:
    """Starts a CoreNLP server through Stanza and returns the client."""
    stanza.install_corenlp(dir="./stanza_corenlp")
    return CoreNLPClient(
        annotators=[
            'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse',
            'coref', 'kbp', 'natlog', 'openie'
        ],
        timeout=30000,
        memory='16G')
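A minimal usage sketch for start_server() above (an assumption, not part of the original source); it relies on the standard stanza CoreNLPClient interface, where annotate() returns a protobuf Document by default and stop() shuts the server down, and assumes the jars installed into ./stanza_corenlp are discoverable (e.g. via CORENLP_HOME).

# Hypothetical usage; the sample sentence is illustrative only.
client = start_server()
try:
    ann = client.annotate("Chris Manning teaches at Stanford University.")
    print(len(ann.sentence))  # number of sentences in the annotated document
finally:
    client.stop()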
def annotate_do(txt):
    if not txt:
        return {}
    from stanza.server import CoreNLPClient, StartServer
    with CoreNLPClient(start_server=StartServer.DONT_START,
                       output_format='json') as client:
        try:
            return client.annotate(txt)
        except:
            return
def __init__(self, url, compound_map_file):
    self.nlp_properties = {
        'annotators': "tokenize,ssplit,pos,lemma,ner",
        "tokenize.options": "splitHyphenated=true,normalizeParentheses=false",
        "tokenize.whitespace": False,
        'ssplit.isOneSentence': True,
        'outputFormat': 'json'
    }
    os.environ["CORENLP_HOME"] = "/content/stanford-corenlp-full-2018-10-05"
    client = CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
                           properties=self.nlp_properties,
                           memory='10G',
                           endpoint=url)
    client.start()
    print(client)
    time.sleep(10)
    self.nlp = client
    self.compound_map = self.load_compound_map(compound_map_file)
def main():
    assert config['config_target'] == 'conll16_discourse'

    # start CoreNLP server manually:
    # java -Xmx16G -cp "/homes/lee2226/scratch2/stanford-corenlp-full-2020-04-20/*" \
    #     edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 60000 \
    #     -threads 5 -maxCharLength 100000 \
    #     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref,kbp -outputFormat json

    # use the tokenization from the given parse file and re-parse it with our CoreNLP
    split_dir = config['{}_dir'.format(args.split)]
    parse_fpath = os.path.join(split_dir, 'parses.json')
    logger.info('loading {}...'.format(parse_fpath))
    old_parses = json.load(open(parse_fpath, 'r'))

    fw = open(args.output_file, 'w')
    properties = {
        'tokenize.whitespace': True,
        'tokenize.keepeol': True,
        'ssplit.eolonly': True,
        'ner.useSUTime': False
    }
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'
    ]
    with CoreNLPClient(annotators=annotators,
                       properties=properties,
                       timeout=300000,
                       endpoint='http://localhost:{}'.format(args.nlp_server_port),
                       start_server=False) as client:
        for doc_id, parse in tqdm(old_parses.items()):
            sents = []
            for i_sent, sent in enumerate(parse['sentences']):
                words = [w[0] for w in sent['words']]
                sent_text = ' '.join(words)
                sents.append(sent_text)
            all_text = '\n'.join(sents)
            try:
                ann = client.annotate(all_text,
                                      annotators=annotators,
                                      properties=properties,
                                      output_format='json')
                ann['doc_id'] = doc_id

                # verify lengths
                assert len(ann['sentences']) == len(parse['sentences']), 'ssplit mismatch'
                for i_sent in range(len(parse['sentences'])):
                    n_words = len(parse['sentences'][i_sent]['words'])
                    assert len(ann['sentences'][i_sent]['tokens']) == n_words

                out = json.dumps(ann)
                fw.write(out + '\n')
            except:
                logger.warning('failed parsing {}'.format(doc_id))
    fw.close()
def run_parsing(gen, prefix):
    logger.info('start parsing {}'.format(prefix))
    fpath = os.path.join(args.output_dir, '{}_parses.json'.format(prefix))
    fw = open(fpath, 'w')

    # java -Xmx16G -cp "/homes/lee2226/scratch2/stanford-corenlp-full-2020-04-20/*" \
    #     edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 300000 \
    #     -threads 5 -maxCharLength 100000 \
    #     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -outputFormat json

    cnt = 0
    failed_sids = []
    properties = {
        # 'tokenize.whitespace': True,
        'tokenize.keepeol': True,
        'ssplit.eolonly': True,
        # 'coref.algorithm': 'statistical',
        'ner.useSUTime': False
    }
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'depparse', 'coref'
    ]
    with CoreNLPClient(annotators=annotators,
                       properties=properties,
                       timeout=1200000,
                       endpoint='http://localhost:{}'.format(args.nlp_server_port),
                       start_server=False) as client:
        t1 = time.time()
        for sid, doc in tqdm(gen()):
            logger.info('last processing time: {} s'.format(time.time() - t1))
            t1 = time.time()
            text = [doc['lines'][str(i)]['text'] for i in range(1, 6)]
            text = '\n'.join(text)

            # parsing
            try:
                ann = client.annotate(text,
                                      annotators=annotators,
                                      properties=properties,
                                      output_format='json')
            except:
                logger.warning('failed parsing {}'.format(sid))
                failed_sids.append(sid)
                continue
            if len(ann['sentences']) != 5:
                logger.warning('failed sentence length {}'.format(sid))
                failed_sids.append(sid)
                continue
            ann['sid'] = sid
            line = json.dumps(ann)
            fw.write(line + '\n')
            cnt += 1
    fw.close()
    logger.info('failed sids={}'.format(failed_sids))
    logger.info('done: {} files, {} s'.format(cnt, time.time() - t1))
def process_multiple_headlines(self, headlines):
    data = []
    with self.lock:
        with CoreNLPClient(annotators=self.annotators,
                           timeout=self.timeout,
                           memory=self.memory,
                           classpath=self.core_nlp_folder) as client:
            for headline in headlines:
                data.append(self.collect_data(client.annotate(headline)))
    return data
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = CoreNLPClient(endpoint="http://localhost:9001",
                               annotators=['ssplit', 'tokenize'],
                               start_server=False)
    words, gloss, after = [], [], []
    for s in client.annotate(sentence):
        for t in s:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def annotate(sentence, lower=True):
    global client
    if client is None:
        # import pdb; pdb.set_trace()
        client = CoreNLPClient(default_annotators='ssplit,tokenize'.split(','),
                               be_quiet=True)
    words, gloss, after = [], [], []
    sent_annotated = client.annotate(sentence).sentence[0]
    for t in sent_annotated.token:
        words.append(t.word)
        gloss.append(t.originalText)
        after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
def Segment_Chinese_only(text, sent_split=True, tolist=True):
    # Grabs a Chinese string and returns a list of words nested in a list of sentences.
    # sent_split=True if we want to split the text into sentences, and then parse each sentence individually.
    # tolist=True if we want to receive a list of words, False if we want a sentence split by spaces.
    StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if lang == "zh-cn":  # If text is Chinese, segment it, else leave it
            #########
            if sent_split:
                annotators = ['tokenize', 'ssplit']
                with CoreNLPClient(annotators=annotators,
                                   properties=StanfordCoreNLP_chinese_properties,
                                   timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[token.word for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [' '.join(wordlist) for wordlist in words]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize']
                with CoreNLPClient(annotators=annotators,
                                   properties=StanfordCoreNLP_chinese_properties,
                                   timeout=15000) as client:
                    ann = client.annotate(text)
                words = [token.word for token in ann.sentencelessToken]
                segmented = ' '.join(words)
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  # list
    else:
        return segmented  # string
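A minimal usage sketch for the POSTag and Segment_Chinese_only helpers above (not in the original source), assuming get_StanfordCoreNLP_chinese_properties() and the Chinese CoreNLP models are available; exact segmentation depends on the loaded models.

# Hypothetical usage; return shapes follow the sent_split/tolist flags described above.
sentences = Segment_Chinese_only("我喜欢自然语言处理。今天天气很好。")  # list of sentences, each a list of words
tagged = POSTag("我喜欢自然语言处理。", sent_split=False)  # list of (word, pos) tuples
print(sentences)
print(tagged)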
def main(coref_path, out_dir, gum_file_lists=None):
    train_list = []
    dev_list = []
    test_list = []
    for filename in os.listdir(gum_file_lists):
        file_path = gum_file_lists + os.sep + filename
        if "train" in filename:
            train_list = find_list(file_path)
        elif "dev" in filename:
            dev_list = find_list(file_path)
        else:
            test_list = find_list(file_path)

    genres = [
        "academic", "bio", "fiction", "interview", "news", "voyage", "whow",
        "reddit", "conversation", "speech", "textbook", "vlog"
    ]
    for genre in genres:
        for filename in os.listdir(coref_path + os.sep + "conll"):
            if genre in filename:
                tsv_file = coref_path + os.sep + "tsv" + os.sep + filename.split(".")[0] + ".tsv"
                text = build_text(tsv_file)
                with CoreNLPClient(properties={
                        'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,dcoref',
                        'ssplit.eolonly': True,
                        'tokenize.whitespace': True,
                },
                                   output_format='xml',
                                   timeout=60000,
                                   memory='8G') as client:
                    xml_out = client.annotate(text)
                if filename.split(".")[0] in train_list:
                    write_file(out_dir + os.sep + 'train' + os.sep + filename.split(".")[0] + '.xml',
                               xml_out)
                elif filename.split(".")[0] in dev_list:
                    write_file(out_dir + os.sep + 'dev' + os.sep + filename.split(".")[0] + '.xml',
                               xml_out)
                elif filename.split(".")[0] in test_list:
                    write_file(out_dir + os.sep + 'test' + os.sep + filename.split(".")[0] + '.xml',
                               xml_out)
                else:
                    sys.stderr.write(f"ERROR: file {filename} not in list.\n")
    print("Done!")
def corenlp_coref_resolution(self, memory, timeout, properties):
    """
    Perform coreference resolution on given text using Stanford CoreNLP.

    :param:
        - memory: str
        - timeout: int
        - properties: dict
    :return:
        - texts: list, list of sentences resolved and unresolved by the coreference resolution operation.
    """
    # Start CoreNLP Server with required properties
    with CoreNLPClient(pipeline='StanfordCoreNLP',
                       timeout=timeout,
                       memory=memory,
                       properties=properties) as client:
        texts = self.input_data()
        index = 0
        time.sleep(10)
        for text in texts:
            doc = self.nlp(text)
            modified_text = [sentence.string.strip() for sentence in doc.sents]

            # submit the request to the server
            ann = client.annotate(text)

            # In each chain, replace the anaphora with the correct representative
            for coref in ann.corefChain:
                mts = [mention for mention in coref.mention]
                representative = coref.representative
                phrase_rep = self.create_phrase(mts[coref.representative], ann)
                antecedent = ' '.join(word for word in phrase_rep)
                check_rep = 0
                for mention in coref.mention:
                    if check_rep == representative:
                        check_rep += 1
                        continue
                    phrase = self.create_phrase(mts[check_rep], ann)
                    anaphor = ' '.join(word for word in phrase)
                    anaphor = anaphor + ' '
                    antecedent = antecedent + ' '
                    modified_text[mention.sentenceIndex] = modified_text[
                        mention.sentenceIndex].replace(anaphor, antecedent)
                    check_rep += 1
            modified_text = ' '.join(modified_text)
            texts[index] = modified_text
            index += 1
        if self.coref_output is True:
            self.coref_output_file(texts)
        return texts
def _annotate_parse(client: CoreNLPClient, doc: str):
    """
    Helper function for parallel processing.

    @param client: a CoreNLP client instance
    @param doc: the document to process; it should be made as large as possible
    @return: an iterator over the sentences of the processed document
    """
    obj_doc = client.annotate(doc)
    iter_sentences = parse_serialized_document(obj_doc)
    return iter_sentences