def parsing():
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    raw_text_directory = "sample_raw_text/"
    parsed = batch_parse(raw_text_directory, corenlp_dir)
    arr = []
    result = parsed.next()
    corefs = result['coref']
    sentences = result['sentences']
    ##### Saving all pairs #####
    for outer_itr in corefs:
        for inner_itr in outer_itr:
            arr.append(inner_itr)
    for itr in arr:
        new_hash = {}
        to_be_replaced = -1
        to_be_replaced_from = -1
        ### Matching word
        if text_match(itr[0][0]) and text_match(itr[0][1]):
            break
        elif text_match(itr[0][0]):
            to_be_replaced = 0
            to_be_replaced_from = 1
        elif text_match(itr[0][1]):
            to_be_replaced = 1
            to_be_replaced_from = 0
        if to_be_replaced != -1 and to_be_replaced_from != -1:
            to_be_replaced = itr[0][to_be_replaced]
            to_be_replaced_from = itr[0][to_be_replaced_from]
            sentences[to_be_replaced_from[1]]['text'][to_be_replaced_from[2]] = \
                sentences[to_be_replaced[1]]['text'][to_be_replaced[2]]
            print to_be_replaced[0]
            print to_be_replaced_from[1]
            print "#####################"
def parse_directory(fpath, sentiment=[], count=0):
    parsed = batch_parse(fpath, corenlp_dir)
    last_file_name = ''
    for obj in wrapper(parsed):
        if not pd.isnull(obj):
            last_file_name = obj['file_name']
        # the wrapper will return np.nan when it dies from an error.
        if pd.isnull(obj):
            sentiment.append(np.nan)
            return (last_file_name, count, sentiment)
        # otherwise do the normal thing.
        count += 1
        if count % 500 == 0:
            print "analyzed", count, "speechacts."
            temp_pickle_name = "corenlp_sentiment" + str(count) + "_tmp.p"
            print "analyzed", count, "speechacts. Saving temporary pickle as", temp_pickle_name
            pickle.dump(sentiment, open("pickles/" + temp_pickle_name, 'wb'))
        # if count % 5001 == 0:
        #     print "did 5k, stopping for now..."
        #     break
        speechact_sent = {}
        sentences = obj['sentences']
        for sentence in sentences:
            # key is the sentence, value is its (sentiment, sentimentValue) pair
            speechact_sent[sentence['text']] = (sentence['sentiment'], sentence['sentimentValue'])
        sentiment.append(speechact_sent)
    return sentiment
def create_plot():
    corenlp_dir = "stanford-corenlp-full-2014-08-27/"
    raw_text_directory = "sample_raw_text/"
    parsed = batch_parse(raw_text_directory, corenlp_dir)
    # out = len(parsed)
    # for itr in out:
    while True:
        try:
            itr = parsed.next()
            result = parsing(itr)
            file_name = itr['file_name']
            # result = parsing()
            new_result = []
            for itr in result:
                new_result.append(itr.encode('utf8'))
            plot = " ".join(new_result)
            plot = plot.replace(" 's", "'s")
            plot = plot.replace(" ,", ",")
            plot = plot.replace(" n't", "n't")
            plot = plot.replace("'ll", "will")
            plot = plot.replace(" '", "'")
            plot = plot.replace("' ", "'")
            # plot = plot.replace(", ", ",")
            plot = plot.replace(" .", ".")
            create_file(plot, file_name)
        except:
            break
def batch_parse(text_dir, stanford_dir):
    """Function to parse multi-sentence input using StanfordNLP in batch
    mode. The function parses the input and performs pronoun coreferencing
    where appropriate. Coreferences are linked across sentences.

    Parameters
    ----------
    text_dir: String.
        Directory of text files to parse using StanfordNLP.

    stanford_dir: String.
        Directory that contains the StanfordNLP files.

    Returns
    -------
    output_dict : Dictionary.
        Parsed and formatted output for each input event or news story.
        This dictionary contains the info that should pass to the event
        coder and other postprocessing utilities.

        The output dictionary has the following form. The main level has
        story IDs, e.g., story1.txt, as keys with dictionaries as values.
        At this stage, the value dictionary has one key, `sent_info`,
        which has another dictionary as the value. Within the `sent_info`
        dictionary are keys `sents` and `coref_info`, each with a
        dictionary as its value.

        The `sents` dictionary has integers as keys, which represent the
        different sentences within a text input. Each individual sentence
        dictionary contains the keys `parse_tree` (nltk.tree),
        `dependencies` (list), `np_words` (list), `word_info` (list),
        `verb_phrases` (list), `vp_words` (list), and `noun_phrases`
        (list).

        The `coref_info` dictionary has a similar structure, with each
        sentence having its own individual dictionary with keys `shift`
        (integer) and `corefs` (list).

        Given this, the final structure of the output resembles:

        {'event_id': {'sent_info': {'sents': {0: {'parse_tree': tree,
                                                  'dependencies': list},
                                              1: {...}},
                                    'coref_info': {0: {'shift': 0,
                                                       'corefs': []}}}}}
    """
    output_dict = dict()
    results = corenlp.batch_parse(text_dir, stanford_dir)
    for index in xrange(len(results)):
        parsed = results[index]
        name = parsed['file_name']
        output = parse_sents(name, parsed)
        output_dict.update(output)
    for article in output_dict:
        utilities.coref_replace(output_dict, article)
    return output_dict
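# A minimal usage sketch for the wrapper above, walking the output
# structure described in its docstring. The directory names are taken
# from other snippets in this file and are purely illustrative; the key
# navigation follows the docstring, not code confirmed elsewhere.
output = batch_parse("sample_raw_text/", "stanford-corenlp-full-2014-08-27/")
for story_id, story in output.items():
    sents = story['sent_info']['sents']
    coref_info = story['sent_info']['coref_info']
    for idx in sorted(sents):
        # each sentence dict carries a parse tree plus phrase/word lists
        print story_id, idx, sents[idx]['noun_phrases'], coref_info[idx]['corefs']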
def parse(self, stimtextdir):
    self.stimtextdir = stimtextdir
    parse = batch_parse(self.stimtextdir, corenlp_dir)
    parse = [el for el in parse if '.DS_Store' != el['file_name']]
    if len(parse) > 1:
        print "warning: multiple files. using only first temptext file."
    parse = parse[0]
    print "parse completed"
    summary = summarizeparse(parse)
    return parse, summary
def tokenize():
    corenlp_dir = "./lib/stanford-corenlp-full-2015-01-29/"
    # parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    parsed = corenlp.batch_parse('tmp/', corenlp_path=corenlp_dir, raw_output=True)
    for p in parsed:
        text = xmltodict.unparse(p, pretty=True)
        print(text)
def parse(self, inputfile):
    self.copyfile(inputfile)
    parse = batch_parse(self.innerdir, corenlp_dir)
    parse = [el for el in parse if 'temptext' in el['file_name']]
    if len(parse) > 1:
        print "warning: multiple files. using only first temptext file."
    parse = parse[0]
    self.deletefile()
    print "parse completed"
    summary = summarizeparse(parse)
    return parse, summary
def batch_process(file_dict, dbpath, memory):
    """Parses, resolves corefs, and extracts triplets from files in a directory."""
    from threading import Thread
    try:
        # Parse files with progress bar
        t = Thread(target=monitor_progress, kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()
        print "Starting corenlp. Wait a few moments."
        this_dir = os.path.dirname(os.path.realpath(__file__))
        corenlp_path = os.path.join(this_dir, "stanford-corenlp-full-2013-11-12")
        log_path = os.path.join(TEMP, 'corenlp_log.txt')
        parses = corenlp.batch_parse(TEMP, log_path, memory=memory, corenlp_path=corenlp_path)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]
            # add article to db
            database.save_article(article_dict, dbpath)
            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)
            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']
                    database.save_triplet(triplet, dbpath)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:
        # remove temp files
        for root, dirs, fnames in os.walk(TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
def _parse(self, text, dest_file):
    '''Computes the parse by calling the Stanford NLP API.'''
    import tempfile
    from corenlp import batch_parse
    dirname = tempfile.mkdtemp()
    with tempfile.NamedTemporaryFile('w', dir=dirname, delete=False) as f:
        filename = f.name
    with codecs.open(filename, 'w', encoding='utf8') as tmp:
        tmp.write(text)
        tmp.flush()
        result = batch_parse(os.path.dirname(tmp.name), self.folder)
        result = list(result)[0]
    cPickle.dump(result, open(dest_file, 'w'))
    return result
def batch_process(directory):
    """Parses, resolves corefs, and extracts triplets from files in a directory."""
    from threading import Thread
    try:
        file_dict = preprocess_dir(directory)
        # Parse files with progress bar
        t = Thread(target=monitor_progress, kwargs={'num_files': len(file_dict)})
        t.daemon = True
        t.start()
        print "Starting corenlp. Wait a few moments."
        parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]
            # add article to db
            database.save_article(article_dict)
            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)
            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']
                    database.save_triplet(triplet)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:
        # remove temp files
        for root, dirs, fnames in os.walk(config.TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
def extract_NER():
    print('Enter the name of the directory containing the text to analyze')
    filename = input('>>> ')
    print('Starting named-entity extraction')
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/"
    # corenlp_dir = "stanford-corenlp-full-2017-06-09/"
    raw_text_directory = "output/sf"
    properties_file = "user.properties"
    parsed = batch_parse(raw_text_directory, corenlp_dir)
    NERlist = []
    preTag = ""
    currentNER = ""
    tempTag = ""
    count = 0
    for value in parsed:
        count += 1
        valuePh = value['sentences']
        for value2 in valuePh:
            value2Ph = value2['words']
            for value3 in value2Ph:
                value4 = value3[1]
                tempTag = value4['NamedEntityTag']
                if not tempTag == preTag and not currentNER == "":
                    NERlist.append(currentNER)
                    currentNER = ""
                if (value4['NamedEntityTag'] == 'PERSON'
                        or value4['NamedEntityTag'] == 'ORGANIZATION'
                        or value4['NamedEntityTag'] == 'LOCATION'):
                    if currentNER == "":
                        currentNER = currentNER + value3[0]
                    else:
                        currentNER = currentNER + ' ' + value3[0]
                preTag = value4['NamedEntityTag']
                print(value3[0])
                print(value4['NamedEntityTag'])
    NERlist.append(currentNER)
    NERlist_uniq = []
    for x in NERlist:
        if x not in NERlist_uniq:
            NERlist_uniq.append(x)
    fp = open('output/' + filename + '/NERlist.txt', 'a')
    for value in NERlist_uniq:
        fp.write(value + "\n")
    fp.close()
    print('The named-entity list was written to ' + filename + '/NERlist.txt')
def corenlp_batch_parse(self, rawtextdir=rawtextdir):
    """
    Perform the batch parse on a directory full of text files, each
    containing one "body" per line. Saves the parse results as JSON and
    returns the list of parse dicts, one per file.
    """
    print "\n\ninitiating batch parse..."
    parsed = batch_parse(rawtextdir, corenlpdir)
    parse_tree = [x for x in parsed]
    fpath = "data/processed/" + self.fpath + "_parse_tree.json"
    print "\n\nsaving parse tree to %s..." % fpath
    with open(fpath, 'wb') as parsetreefile:
        json.dump(parse_tree, parsetreefile)
    print "\n\ndone."
    return parse_tree
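# The original docstring mentioned mapping ids to mean sentiments; below is
# a minimal sketch of that post-processing step over the returned list. It
# assumes CoreNLP ran with the sentiment annotator, so each sentence dict
# carries a 'sentimentValue' key (as in other snippets in this file); the
# helper name and the use of 'file_name' as the id are illustrative
# assumptions, not part of the original code.
def mean_sentiments(parse_tree):
    means = {}
    for parsed_file in parse_tree:
        values = [float(s['sentimentValue']) for s in parsed_file['sentences']
                  if 'sentimentValue' in s]
        if values:
            means[parsed_file['file_name']] = sum(values) / len(values)
    return means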
def parse_corefs(entities):
    from tempfile import NamedTemporaryFile
    import os
    # Sort the list by string length.
    entities.sort(key=len, reverse=True)
    # Put all entities in a txt file.
    entity_str = '. '.join(entities)
    temp = NamedTemporaryFile(dir=config.TEMP, delete=False)
    temp.write(entity_str)
    temp.close()  # flush to disk so CoreNLP sees the text
    # And send it to StanfordCoreNLP to resolve corefs.
    parses = corenlp.batch_parse(config.TEMP, memory=config.memory)
    # Clean out temp dir
    for root, dirs, fnames in os.walk(config.TEMP):
        for fname in fnames:
            p = os.path.join(root, fname)
            os.remove(p)
    return parses
from corenlp import batch_parse

x = batch_parse("../sample_raw_text", "../../stanford-corenlp-full-2014-01-04")
print x
for t in x:
    print t
try:
    from xml.etree.ElementTree import *
    tree = parse("nlp.txt.xml")
    elem = tree.getroot()
    for w in elem.findall(".//word"):
        print(w.text)  # solution for 53
except FileNotFoundError:
    import corenlp
    import xmltodict
    # from subprocess import call
    # corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2016-10-31"
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2014-08-27"
    raw_text_directory = "sample_raw_text"
    # command = corenlp.init_corenlp_command(corenlp_path=corenlp_dir, memory="3g", properties='default.properties') + ' -filelist + "./nlp.txt" -outputDirectory +"./"'
    # call(command, shell=True)
    parsed = corenlp.batch_parse(raw_text_directory, corenlp_dir, raw_output=True)
    parsed_list = []
    while True:
        try:
            parsed_list.append(parsed.__next__())
        except StopIteration:
            break
    # each raw_output item is already a dict, so unparse them one at a time
    for parsed_doc in parsed_list:
        parsed_xml = xmltodict.unparse(parsed_doc)
        print(parsed_xml)
def get_batch_parse(directory):
    return batch_parse(directory, corenlp_dir)
import requests
import pandas as pd
from corenlp import StanfordCoreNLP, batch_parse
from collections import Counter
from bs4 import BeautifulSoup

# parse using corenlp
corenlp_dir = "stanford-corenlp-full-2014-08-27"
parse = batch_parse('scripts', corenlp_dir, raw_output=True)
parsedEpisodes = []
for p in parse:
    try:
        parsedEpisodes.append(p)
    except:
        parsedEpisodes.append('')
del(parsedEpisodes[0:2])  # remove hidden files

# extract sentiment from corenlp results
# allSentiments = []
allValues = []
for e in range(0, len(parsedEpisodes)):
    try:
        sentences = parsedEpisodes[e]['root']['document']['sentences']['sentence']
        sentimentValues = []
        sentiments = []
        for s in sentences:
            try:
                sentimentValues.append(int(s['@sentimentValue']) - 2)
                # sentiments.append(s['@sentiment'])
            except (KeyError, TypeError, ValueError):
                # the original snippet is truncated here; skip sentences
                # without sentiment annotations so the loop still runs
                pass
    except (KeyError, TypeError):
        pass
from corenlp import batch_parse

corenlp_dir = "../tools/corenlp-python/corenlp/stanford-corenlp-full-2014-01-04"
raw_text_directory = "../dataset/books_txt/small_sample"
parsed = batch_parse(raw_text_directory, corenlp_dir, raw_output=True)
for books in parsed:
    print books['sentences']