def __init__(self, src, src_folder="../data/", corenlp_path="../stanford-corenlp/",
             regexp=None, verbose=False):
    # initialization
    self.src = src
    self.filename = re.search(
        r"([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json", self.src).group(1)
    self.src_candidates = os.path.join(src_folder, "lexicon/candidates.json")
    self.dst_starred = os.path.join(src_folder, "starred_reviews/")
    self.corenlp_path = os.path.normpath(corenlp_path)
    self.regexp = regexp
    self.verbose = verbose
    self.entity = {}
    self.entity_name = ""
    self.entity_regexp = []
    self.ratings = []
    self.sentiment_words = []
    self.adv_adj_combinations = {}
    self.clean_reviews = []
    self.starred_reviews = []
    self.stopwords = set(stopwords.words("english"))
    self.stopwords.remove("not")
    self.stemmer = SnowballStemmer("english")
    # need to set the CORENLP_HOME path before creating the client
    os.environ["CORENLP_HOME"] = self.corenlp_path
    self.corenlp = corenlp.CoreNLPClient(
        annotators="tokenize ssplit dcoref".split(), timeout=50000)
def get_openIE_triples(document):
    # CoreNLP OpenIE
    """
    Input: document: str
    Output: tripleSet: set of ((subject, relation, object), confidence) pairs
    """
    tripleSet = set()
    try:
        with corenlp.CoreNLPClient(
                annotators="tokenize,ssplit,pos,lemma,depparse,natlog,openie".split(",")) as client:
            ann = client.annotate(document)  # <class 'doc.CoreNLP_pb2.Document'>
            for sent in ann.sentence:
                if len(sent.openieTriple) > 0:  # if there are any triples...
                    # <class 'google.protobuf.pyext._message.RepeatedCompositeContainer'>
                    triples = sent.openieTriple
                    for t in triples:
                        triple_tuple = (t.subject, t.relation, t.object)
                        c = t.confidence
                        out = (triple_tuple, c)
                        tripleSet.add(out)
    except Exception:
        # The CoreNLP server probably timed out because the document was too big.
        # TODO: grab a smaller section and annotate that.
        pass
    return tripleSet
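# A minimal usage sketch for get_openIE_triples above. The sample text and the print
# format are assumptions, not part of the original module; it only relies on the
# `corenlp` import and CORENLP_HOME pointing at a local Stanford CoreNLP install.
if __name__ == "__main__":
    sample = "Barack Obama was born in Hawaii. He served as the 44th president."
    for (subject, relation, obj), confidence in sorted(get_openIE_triples(sample)):
        print(f"{confidence:.2f}\t{subject} | {relation} | {obj}")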
def test_tokensregex():
    with corenlp.CoreNLPClient(
            annotators='tokenize ssplit ner depparse'.split(),
            timeout=60000) as client:
        # Example pattern from: https://nlp.stanford.edu/software/tokensregex.shtml
        pattern = '([ner: PERSON]+) /wrote/ /an?/ []{0,3} /sentence|article/'
        matches = client.tokensregex(TEXT, pattern)
        assert len(matches["sentences"]) == 1
        assert matches["sentences"][0]["length"] == 1
        assert matches == {
            "sentences": [
                {
                    "0": {
                        "text": "Chris wrote a simple sentence",
                        "begin": 0,
                        "end": 5,
                        "1": {
                            "text": "Chris",
                            "begin": 0,
                            "end": 1
                        }
                    },
                    "length": 1
                },
            ]
        }
def find_all_pos_words(pos):
    results = []
    verbs = []
    with corenlp.CoreNLPClient(annotators="tokenize ssplit pos".split()) as client:
        for idx in tqdm(range(tokenizer.vocab_size)):
            word = tokenizer._convert_id_to_token(idx)
            if check_word_pos(word, pos):
                ann = client.annotate(word)
                pos_all = [[token.pos for token in sent.token] for sent in ann.sentence]
                pos_list = []
                for pos_sent in pos_all:
                    pos_list.extend(pos_sent)
                if len(pos_list) == 1 and pos_list[0] not in \
                        ['VVD', 'VVG', 'VVN', 'VVP', 'VVZ',
                         'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                         'VHD', 'VHG', 'VHN', 'VHP', 'VHZ', 'MD']:
                    results.append(idx)
                    verbs.append(word)
    # for idx in tqdm(range(tokenizer.vocab_size)):
    #     word = tokenizer._convert_id_to_token(idx)
    #     if check_word_pos(word, pos):
    #         results.append(idx)
    #         verbs.append(word)
    # print(results)
    print(len(results))
    return results, verbs
def tokenize_text(text):
    """
    Tokenizes and POS-tags the text for the embedding class
    :param text: The text to tokenize
    """
    # Filter out external links
    text = ' '.join([
        word for word in text.split()
        if "https://" not in word and "http://" not in word
    ])
    # The CoreNLP server must be started separately (start_server=False)
    with corenlp.CoreNLPClient(
            start_server=False,
            timeout=10000,
            annotators="tokenize ssplit lemma pos".split()) as client:
        ann = client.annotate(text)
    # Filter out stop words
    stop_words = set(stopwords.words('english')) | {
        word.capitalize() for word in stopwords.words('english')
    }
    # Remove punctuation from each word
    punctuation = set(string.punctuation) | {"\"\""} | {'\'\''} | {'``'}
    punctuation_and_stop_words = stop_words | punctuation
    # Couple each word with its POS tag if the word is neither a stop word nor punctuation
    text = [
        f"{token.lemma}_{TREEBANK_TO_UNIVERSAL[token.pos]}"
        for sentence in ann.sentence for token in sentence.token
        if token.lemma not in punctuation_and_stop_words
    ]
    return text
def annotate_question(question, table_id, dir_in, dir_out, split):
    if not os.path.isdir(dir_out):
        os.makedirs(dir_out)
    ftable = os.path.join(dir_in, split) + '.tables.jsonl'
    fout = os.path.join(dir_out, split) + datetime.datetime.now().strftime("%Y%m%d%H%M%S") + '.jsonl'
    fbase = os.path.join(dir_in, split) + '_base.jsonl'
    copyfile(fbase, fout)
    with open(ftable) as ft, open(fout, 'a') as fo, corenlp.CoreNLPClient(
            annotators="tokenize ssplit".split()) as client:
        tables = {}
        for line in ft:
            d = json.loads(line)
            tables[d['id']] = d  # to get table headers
        raw_data = {
            "phase": 1,
            "table_id": table_id,
            "question": question,
            "sql": {
                "sel": 0,
                "conds": [[0, 0, 0]],
                "agg": 0
            }
        }
        a = annotate_example(client, raw_data, tables[table_id])
        # if not is_valid_example(a):
        #     raise Exception(str(a))
        nlp = spacy.load('en_core_web_sm')
        doc = nlp(question)
        a["question"]["ent"] = [token.tag_ for token in doc]
        fo.write('\n' + json.dumps(a) + '\n')
    return fout, tables[table_id]['header']
def main(args):
    num_sentences = 0
    with open(args.input, encoding='utf-8') as f, \
            open(args.output, mode='w', encoding='utf-8') as out:
        with corenlp.CoreNLPClient(annotators="tokenize ner".split(),
                                   endpoint="http://localhost:5000") as client:
            for line in f.readlines():
                mrp_json = json.loads(line)
                tok = []
                ner = []
                ann = client.annotate(mrp_json['input'], output_format='json')
                for sentence in ann['sentences']:
                    for tokens in sentence['tokens']:
                        tok.append(tokens['word'])
                        ner.append(tokens['ner'])
                if len(ner) != len(mrp_json['nodes']):
                    print(mrp_json['id'], " error!")
                mrp_json['tok'] = tok
                mrp_json['ner'] = ner
                # for ner, nodes in zip(ner, mrp_json['nodes']):
                #     nodes['properties'].append('ner')
                #     nodes['values'].append(ner)
                out.write(json.dumps(mrp_json) + '\n')
                num_sentences += 1
                if num_sentences % 1000 == 0:
                    print(f"Processed {num_sentences} sentences!")
def test_tokenizer():
    cases = [
        (u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
         u"rt @ #happyfuncoding : this is a typical twitter tweet :-)".split()),
        (u"HTML entities & other Web oddities can be an ácute <em class='grumpy'>pain</em> >:(",
         u"html entities and other web oddities can be an ácute".split() +
         [u"<em class='grumpy'>", u"pain", u"</em>", u">:("]),
        (u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
         u"it's perhaps noteworthy that phone numbers like".split() +
         [u"+1 (800) 123-4567", u",", u"(800) 123-4567", u",", u"and", u"123-4567"] +
         u"are treated as words despite their whitespace .".split())
    ]
    annotator = HappyFunTokenizer()
    annotator.start()
    try:
        with corenlp.CoreNLPClient(
                properties=annotator.properties,
                annotators="happyfun ssplit pos".split()) as client:
            for text, tokens in cases:
                ann = client.annotate(text)
                tokens_ = [t.word for t in ann.sentence[0].token]
                assert tokens == tokens_
    finally:
        annotator.terminate()
        annotator.join()
def corenlp_start():
    os.environ['CORENLP_HOME'] = './corenlp'
    corenlp_client = corenlp.CoreNLPClient(
        endpoint="http://localhost:9000",
        memory='16G',
        annotators="tokenize ssplit pos lemma depparse natlog openie".split())
    # Annotating a dummy sentence forces the Java server to start and load all annotators.
    corenlp_client.annotate('This is a dummy input sentence to initialize all CoreNLP components.')
    print('(Re)initialized coreNLP server...')
    return corenlp_client
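# Hedged usage sketch for corenlp_start above; the follow-up sentence is illustrative,
# and client.stop() (which shuts down the spawned Java server when finished) is
# assumed to be available on the CoreNLPClient wrapper.
if __name__ == "__main__":
    client = corenlp_start()
    ann = client.annotate("Stanford University is located in California.")
    print(len(ann.sentence), "sentence(s) annotated")
    client.stop()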
def qgnet_main(args_dict):
    # first, run shell script, if necessary, in qgnet to create model
    subprocess.call([
        '../{}/download_QG-Net.sh'.format(settings.qgnet_dir),
        args_dict['qgnet_path']
    ])
    # second, pre-process the pdfs
    jsonObj, allDocs = load_data('{}/da_embeddings.txt'.format(settings.models_dir))
    abstracts = []
    for value in jsonObj.values():
        if "summary" in value['metadata']:
            abstracts.append(value['metadata']["summary"])
        elif "abstract" in value['metadata']:
            abstracts.append(value['metadata']["abstract"])
    nlp = corenlp.CoreNLPClient(output_format='json', properties={'timeout': '50000'})
    features, tfidf = create_tf_idf(abstracts, False)
    for i, abstract in enumerate(abstracts):
        preprocess_pdf(abstract, features[i, :].toarray(), tfidf, nlp)
    # third, generate qg-net questions
    subprocess.call([
        '../{}/qg_reproduce_LS.sh'.format(settings.qgnet_dir),
        args_dict['qgnet_path'],
        settings.models_dir
    ])
def __init__(self, ontology_name):
    self.causal_headers = ["Source_File", "Query", "Score", "Span", "Relation Index",
                           "Relation", "Relation_Type", "Indicator", "Cause Index",
                           "Cause", "Effect Index", "Effect", "Sentence"]
    self.event_headers = ["Source_File", "Query", "Score", "Event Index", "Span",
                          "Sentence Span", "Relation", "Event_Type", "FrameNet_Frame",
                          "Indicator", "Location", "Time", "Agent Index", "Agent",
                          "Patient Index", "Patient", "Sentence"]
    self.entity_headers = ["Source_File", "Query", "Score", "Entity Index", "Span",
                           "Sentence Span", "Entity", "Entity_Type", "FrameNet_Frame",
                           "Indicator", "Qualifier", "Sentence"]
    self.variable_headers = ["Source_File", "Sentence", "Indicator", "Scoring", "Index"]
    self.entity_index = 0
    self.event_index = 0
    self.variable_index = 0
    self.causal_index = 0
    self.ontology_name = ontology_name
    if os.getenv('CORENLP_HOME') is not None and os.getenv('CORENLP_HOME') != '':
        print(f'using Stanford CoreNLP Server @ {os.getenv("CORENLP_HOME")}')
        self.CoreNLPclient = corenlp.CoreNLPClient(
            start_server=True,
            be_quiet=True,
            timeout=100000,
            annotators=['tokenize', 'ssplit', 'pos', 'parse', 'lemma', 'ner', 'depparse'])
        # warm up the CoreNLP client and start the java server
        self.CoreNLPclient.annotate("hello world")
    else:
        raise ValueError('the "CORENLP_HOME" environment variable is not set, '
                         'cannot run Stanford CoreNLP Server')
def __init__(self, corenlp_path):
    self.keyPhrases = {}
    self.rels = ['IsA', 'UsedFor', 'PartOf', 'HasA', 'CreatedBy', 'MadeOf']
    os.environ['CORENLP_HOME'] = corenlp_path
    self.CoreNLPclient = corenlp.CoreNLPClient(annotators=[
        'tokenize', 'ssplit', 'pos', 'depparse', 'lemma', 'parse'
    ])
def __init__(self, corenlp_path, previous_def_file):
    self.nounTags = ['NN', 'NNP', 'NNS', 'NNPS', 'VBG']
    # os.environ['CORENLP_HOME'] = '/Users/evangeliaspiliopoulou/Desktop/stanfordCoreNLP'
    os.environ['CORENLP_HOME'] = corenlp_path
    self.CoreNLPclient = corenlp.CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma'])
    if previous_def_file is not None:
        self.previous_def = json.load(open(previous_def_file))
    self.exceptionTerms = set()
def start():
    timer = Timer()
    timer.start('Start CoreNLP server')
    Corenlp.client = corenlp.CoreNLPClient(
        annotators=Corenlp.DEFAULT_ANNOTATORS,
        properties=Corenlp.DEFAULT_PROPERTIES)
    timer.stop()
def Parse(text, annotators=None):
    if annotators is None:
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse', 'regexner', 'coref']
        annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
    with corenlp.CoreNLPClient(annotators=annotators,
                               properties=StanfordCoreNLP_chinese_properties,
                               timeout=15000) as client:
        ann = client.annotate(text)
    return ann
def POSTag(text, sent_split=True, tolist=True):
    words = []
    if text != '':
        try:
            lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            lang = "undetermined"
        if lang == "zh-cn":
            # If the text is Chinese, segment and tag it; otherwise leave it as is.
            if sent_split:
                annotators = ['tokenize', 'ssplit', 'pos']
                with corenlp.CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [[(token.word, token.pos) for token in sent.token]
                         for sent in ann.sentence]
                segmented_list = [
                    ' '.join(['#'.join(posted) for posted in wordlist])
                    for wordlist in words
                ]
                segmented = '\n'.join(segmented_list)
            else:
                annotators = ['tokenize', 'pos']
                with corenlp.CoreNLPClient(
                        annotators=annotators,
                        properties=StanfordCoreNLP_chinese_properties,
                        timeout=15000) as client:
                    ann = client.annotate(text)
                words = [(token.word, token.pos) for token in ann.sentencelessToken]
                segmented = ' '.join(['#'.join(posted) for posted in words])
        else:
            segmented = text
            words = segmented.split()
    else:
        segmented = text
    if tolist:
        return words  # list
    else:
        return segmented  # string
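# Illustrative call of the POSTag helper above, reusing the sample sentence from the
# Chinese_CoreNLP_test snippet; StanfordCoreNLP_chinese_properties and langdetect are
# assumed to be available in the same module. With tolist=True, each sentence comes
# back as a list of (word, POS) tuples; with tolist=False, a "word#POS" string instead.
if __name__ == "__main__":
    sample = "国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。"
    print(POSTag(sample, sent_split=True, tolist=True))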
def __init__(self):
    # if not os.environ.get('CORENLP_HOME'):
    os.environ['CORENLP_HOME'] = os.path.abspath(
        os.path.join(os.path.dirname(__file__),
                     '../../third_party/stanford-corenlp-full-2018-10-05'))
    if not os.path.exists(os.environ['CORENLP_HOME']):
        raise Exception(
            f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}.
            Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
            Landing page: https://stanfordnlp.github.io/CoreNLP/''')
    self.client = corenlp.CoreNLPClient()
def Chinese_CoreNLP_test(text=None, annotators=None):
    if text is None:
        # "The State Council recently issued an urgent notice requiring all regions to
        #  earnestly implement the policies guaranteeing market supply and to keep
        #  non-staple food prices stable."
        text = "国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。"
    if annotators is None:
        annotators = ['tokenize', 'ssplit', 'pos']
        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
    with corenlp.CoreNLPClient(annotators=annotators,
                               properties=StanfordCoreNLP_chinese_properties,
                               timeout=15000) as client:
        ann = client.annotate(text)
    # sent_list = [token.word for token in ann.sentence[0].token]
    # ['国务院', '日前', '发出', '紧急', '通知', ',', '要求', '各地', '切实', '落实', '保证',
    #  '市场', '供应', '的', '各', '项', '政策', ',', '维护', '副食品', '价格', '稳定', '。']
    return ann
def __init__(self): if not os.environ.get("CORENLP_HOME"): os.environ["CORENLP_HOME"] = os.path.abspath( os.path.join( os.path.dirname(__file__), "/corenlp/stanford-corenlp-full-2018-10-05", )) if not os.path.exists(os.environ["CORENLP_HOME"]): raise Exception( """Please install Stanford CoreNLP and put it at {}. Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip Landing page: https://stanfordnlp.github.io/CoreNLP/""".format( os.environ["CORENLP_HOME"])) self.client = corenlp.CoreNLPClient()
def get_examples_from_forums(forums, guest_client):
    """Convert reviews from a list of forums into classification examples.

    Args:
        forums: A list of forum ids
        guest_client: An openreview.Client used to fetch forum content

    Returns:
        A list of openreview_lib.ClassificationExamples
    """
    sid_map, pairs = orl.get_review_rebuttal_pairs(forums, guest_client)
    with corenlp.CoreNLPClient(annotators=CORENLP_ANNOTATORS,
                               output_format='conll') as corenlp_client:
        return orl.get_classification_examples(pairs, "review", sid_map,
                                                corenlp_client)
def testPart():
    TEST_SENTENCE = "Which prize did Frederick Buechner create?"
    with corenlp.CoreNLPClient(
            annotators='tokenize ssplit parse lemma pos ner'.split()) as testclient:
        c = testclient.annotate(TEST_SENTENCE)
    TEST_PARSE_TREE = c.sentence[0].parseTree
    TEST_Q = queue.Queue(maxsize=MAX_SIZE)
    TEST_Q.put(TEST_PARSE_TREE)
    print(TEST_PARSE_TREE)
    tokens_list = parse_tree_WHNP(TEST_Q)
    print(tokens_list)
    spanList = generate_candidateAnswerSpans(tokens_list, c.sentence[0].token)
    sizeList = len(spanList)
    for s in spanList:
        print(str(s.tokens) + str(s.type))
    return 0
def __init__(self):
    directory = os.getcwd() + r'\\corenlp'
    os.environ['CORENLP_HOME'] = directory
    prop = {'pos.model': directory + r'\gate-EN-twitter.model'}
    self.client = corenlp.CoreNLPClient(
        annotators="tokenize pos lemma".split(),
        properties=prop,
        output_format='json')
    # Retry to work around connection errors that occur on the initial attempt
    for _ in range(3):
        try:
            self.client.annotate('Hello, World')
            break
        except Exception:
            continue
def annotate(sentence, lower=True):
    global client
    if client is None:
        client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))
    words, gloss, after = [], [], []
    for s in client.annotate(sentence).sentence:
        for t in s.token:
            words.append(t.word)
            gloss.append(t.originalText)
            after.append(t.after)
    if lower:
        words = [w.lower() for w in words]
    return {
        'gloss': gloss,
        'words': words,
        'after': after,
    }
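# Minimal sketch of how the annotate helper above might be called; the sentence is an
# assumption, and the module-level `client = None` global that the helper lazily
# initializes is assumed to exist.
if __name__ == "__main__":
    result = annotate("The quick brown fox jumps over the lazy dog.")
    print(result['words'])   # lowercased tokens
    print(result['gloss'])   # original surface forms
    print(result['after'])   # trailing whitespace, useful for detokenization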
def separate_sentences(keywords, filepath, mode=0):
    global out_file
    # Build an alternation pattern like "(word1)|(word2)|..." from the keywords.
    pattern = ''
    for word in keywords:
        pattern += '(' + word + ')|'
    pattern = pattern[:-1]
    # print(pattern)
    # return
    # pattern += ']'
    # print(pattern)
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        with open(filepath) as file:
            i = 0
            text = ''
            headline = False
            for line in file:
                if i < 2:
                    if i == 0 and re.search(pattern, line, re.IGNORECASE):
                        headline = True
                        out_file.write(line)
                    i += 1
                    continue
                if line == '\n':
                    file.readline()
                    # print(text)
                    ann = client.annotate(text)
                    # Use a separate index so the header counter i is not clobbered.
                    for j in range(len(ann.sentence)):
                        sentence = corenlp.to_text(ann.sentence[j])
                        if headline and mode == 0:
                            out_file.write(sentence + '\n')
                        elif re.search(pattern, sentence, re.IGNORECASE):
                            out_file.write(sentence + '\n')
                    out_file.write('\n\n')
                    text = ''
                    headline = False
                    i = 0
                    continue
                text += line
def corenlp_client(self):
    if self._corenlp_client is None:
        if not os.environ.get('CORENLP_HOME'):
            os.environ['CORENLP_HOME'] = os.path.abspath(
                os.path.join(
                    os.path.dirname(__file__),
                    '../../third_party/stanford-corenlp-full-2018-10-05'))
        if not os.path.exists(os.environ['CORENLP_HOME']):
            raise Exception(
                '''Please install Stanford CoreNLP and put it at {}.
                Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
                Landing page: https://stanfordnlp.github.io/CoreNLP/'''.format(
                    os.environ['CORENLP_HOME']))
        self._corenlp_client = corenlp.CoreNLPClient(
            annotators="tokenize ssplit")
    return self._corenlp_client
def tokenize_lemmatize(data):
    import corenlp
    os.environ['CORENLP_HOME'] = 'lib/stanford-corenlp-full'
    with corenlp.CoreNLPClient(annotators='tokenize ssplit pos lemma'.split()) as client:
        for sample in data:
            ann = client.annotate(replace_email(replace_phone(replace_url(sample['text']))))
            tokens = []
            lemmas = []
            pos = []
            for sent in ann.sentence:
                tokens += [format_token(token.word) for token in sent.token]
                lemmas += [format_token(token.lemma) for token in sent.token]
                pos += [token.pos for token in sent.token]
            sample['tokens'] = tokens
            sample['lemmas'] = lemmas
            sample['pos'] = pos
    return data
def __init__(self, model="stanford-corenlp-full-2018-10-05", lemmatize=False): if not os.environ.get('CORENLP_HOME'): os.environ['CORENLP_HOME'] = os.path.abspath( os.path.join(os.path.dirname(__file__), f'../../third_party/{model}')) if not os.path.exists(os.environ['CORENLP_HOME']): raise Exception( f'''Please install Stanford CoreNLP and put it at {os.environ['CORENLP_HOME']}. Direct URL: http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip command: `curl https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip | jar xv` Landing page: https://stanfordnlp.github.io/CoreNLP/''') self.client = corenlp.CoreNLPClient() self.corenlp_annotators = ['tokenize', 'ssplit'] if lemmatize: self.corenlp_annotators.append('lemma')
def main():
    args = parser.parse_args()
    conn = ordb.create_connection(args.dbfile)
    if conn is not None:
        ordb.create_table(conn, ordb.CREATE_COMMENTS_TABLE)
    else:
        print("Error! cannot create the database connection.")
    conn = ordb.create_connection(args.dbfile)
    with corenlp.CoreNLPClient(annotators=ANNOTATORS,
                               output_format='conll') as corenlp_client:
        orl.get_datasets(args.inputfile, corenlp_client, conn, debug=args.debug)
    metadata = create_metadata_json(conn)
    relevant_text = create_text_json(conn)
def do_annotate(args):
    args.props = dict(args.props) if args.props else {}
    if args.sentence_mode:
        args.props["ssplit.isOneSentence"] = True
    with corenlp.CoreNLPClient(annotators=args.annotators,
                               properties=args.props,
                               be_quiet=not args.verbose_server) as client:
        for line in args.input:
            if line.startswith("#"):
                continue
            ann = client.annotate(line.strip(), output_format=args.format)
            if args.format == "json":
                if args.sentence_mode:
                    ann = ann["sentences"][0]
                args.output.write(json.dumps(ann))
                args.output.write("\n")
def main():
    guest_client = openreview.Client(baseurl='https://api.openreview.net')
    conference = orl.Conference.iclr18
    notes = list(openreview.tools.iterget_notes(
        guest_client, invitation=orl.INVITATION_MAP[conference]))
    obj = {conference: []}
    with corenlp.CoreNLPClient(
            annotators=orl.CORENLP_ANNOTATORS,
            output_format='conll') as corenlp_client:
        for note in notes:
            p = orl.get_tokenized_chunks(corenlp_client, note.content["abstract"])
            obj[conference].append(p)
    with open("tokenized_abstracts.json", 'w') as f:
        json.dump(obj, f)