Example #1
def parsing():
	corenlp_dir = "stanford-corenlp-full-2014-08-27/"
	raw_text_directory = "sample_raw_text/"
	parsed = batch_parse(raw_text_directory, corenlp_dir)
	arr = []
	result = parsed.next()
	corefs = result['coref'] 
	sentences = result['sentences']
	##### Saving all pairs #####
	for outer_itr in corefs:
		for inner_itr in outer_itr:
			arr.append(inner_itr)
	for itr in arr:
		new_hash = {}
		to_be_replaced = -1
		to_be_replaced_from = -1
		###Matching word
		if (text_match(itr[0][0]) and text_match(itr[0][1])):
			break
		elif text_match(itr[0][0]): 
			to_be_replaced = 0
			to_be_replaced_from = 1
		elif text_match(itr[0][1]):
			to_be_replaced = 1
			to_be_replaced_from = 0
		if (to_be_replaced != -1 and to_be_replaced_from != -1):
			to_be_replaced = itr[0][to_be_replaced]
			to_be_replaced_from = itr[0][to_be_replaced_from]
			sentences[to_be_replaced_from[1]]['text'][to_be_replaced_from[2]] = sentences[to_be_replaced[1]]['text'][to_be_replaced[2]]
			print to_be_replaced[0]
			print to_be_replaced_from[1]
			print "#####################"
def parse_directory(fpath, sentiment=[], count=0):
    parsed = batch_parse(fpath, corenlp_dir)
    last_file_name = ''
    for obj in wrapper(parsed):

        if not pd.isnull(obj):
            last_file_name = obj['file_name']

        # the wrapper will return np.nan when it dies from an error.
        if pd.isnull(obj):
            sentiment.append(np.nan)
            return (last_file_name, count, sentiment)

        # otherwise do the normal thing.
        count += 1
        if count % 500 == 0:
            print "analyzed", count, "speechacts."
            temp_pickle_name = "corenlp_sentiment" + str(count) + "_tmp.p"
            print "analyzed", count, "speechacts. Saving temporary pickle as", temp_pickle_name
            pickle.dump(sentiment, open("pickles/" + temp_pickle_name, 'wb'))
        # if count % 5001 == 0:
        #     print "did 5k, stopping for now..."
        #     break
        speechact_sent = {}
        sentences = obj['sentences']
        for sentence in sentences:
            # key: sentence text, value: (sentiment label, sentiment value)
            speechact_sent[sentence['text']] = (sentence['sentiment'], sentence['sentimentValue'])
        sentiment.append(speechact_sent)
    return sentiment
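The `wrapper` helper used by `parse_directory` is not shown in this example. Below is a minimal sketch of one possible implementation, inferred from the comment in the loop ("the wrapper will return np.nan when it dies from an error"); the name and behavior are assumptions, not taken from the source:

import numpy as np

def wrapper(generator):
    # Hypothetical helper: re-yield parsed documents, but yield a single np.nan
    # and stop if the underlying CoreNLP generator raises an error.
    try:
        for item in generator:
            yield item
    except Exception:
        yield np.nan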
Example #3
def create_plot():
	corenlp_dir = "stanford-corenlp-full-2014-08-27/"
	raw_text_directory = "sample_raw_text/"
	parsed = batch_parse(raw_text_directory, corenlp_dir)
	# out = len(parsed)
	# for itr in out:
	while True:
		try:
			itr = parsed.next()
			result = parsing(itr)
			file_name = itr['file_name']
			# result = parsing()
			new_result = []
			for token in result:
				new_result.append(token.encode('utf8'))
			plot = " ".join(new_result)
			plot = plot.replace(" 's","'s")
			plot = plot.replace(" ,",",")
			plot = plot.replace(" n't","n't")
			plot = plot.replace("'ll","will")
			plot = plot.replace(" '","'")
			plot = plot.replace("' ","'")
			# plot = plot.replace(", ",",")
			plot = plot.replace(" .",".")
			create_file(plot,file_name)
		except StopIteration:
			# the generator is exhausted
			break
Example #4
def batch_parse(text_dir, stanford_dir):
    """Function to parse multi-sentence input using StanfordNLP in batch mode.
    The function parses the input, and performs pronoun coreferencing where
    appropriate. Coreferences are linked across sentences.

    Parameters
    ----------

    text_dir: String.
                Directory of text files to parse using StanfordNLP.

    stanford_dir: String.
                    Directory that contains the StanfordNLP files.

    Returns
    --------

    output_dict : Dictionary.
                    Parsed and formatted output for each input event or news
                    story. This dictionary contains the information that is
                    passed to the event coder and other postprocessing
                    utilities. The output dictionary has the following form.
                    The top level has story IDs, e.g., story1.txt, as keys,
                    with dictionaries as values. At this stage, the
                    value dictionary has one key, `sent_info`, which has
                    another dictionary as the value. Within the `sent_info`
                    dictionary are keys `sents` and `coref_info`. Each has
                    dictionaries as their values. The `sents` dictionary has
                    integers as keys, which represent the different sentences
                    within a text input. Each individual sentence dictionary
                    contains the keys `parse_tree` (nltk.tree),
                    `dependencies` (list), `np_words` (list),
                    `word_info` (list), `verb_phrases` (list),
                    `vp_words` (list), and `noun_phrases` (list).
                    The `coref_info` dictionary has a
                    similar structure, with each sentence having its own
                    individual dictionary with keys `shift` (integer) and
                    `corefs` (list). Given this, the final structure of the
                    output resembles:
                    {'event_id': {'sent_info': {'sents': {0: {'parse_tree': tree
                                                              'dependencies': list}
                                                          1: {...}}
                                                'coref_info': {0: {'shift': 0
                                                               'corefs': []}}
                                                }}}

    """
    output_dict = dict()
    results = corenlp.batch_parse(text_dir, stanford_dir)
    for parsed in results:
        name = parsed['file_name']
        output = parse_sents(name, parsed)
        output_dict.update(output)
    for article in output_dict:
        utilities.coref_replace(output_dict, article)

    return output_dict
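A minimal sketch (not part of the source) of walking the structure documented in the docstring above; the directory arguments are placeholders:

# Illustrative traversal of the documented output; paths are placeholders.
output = batch_parse("sample_raw_text/", "stanford-corenlp-full-2014-08-27/")
for story_id, story in output.items():
    sents = story['sent_info']['sents']
    coref_info = story['sent_info']['coref_info']
    for i in sorted(sents):
        print story_id, i, sents[i]['np_words']      # noun-phrase words of sentence i
        print story_id, i, coref_info[i]['corefs']   # coreference links for sentence i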
Example #5
 def parse(self, stimtextdir):
     self.stimtextdir=stimtextdir
     parse=batch_parse(self.stimtextdir, corenlp_dir)
     parse=[el for el in parse if '.DS_Store' != el['file_name']]
     if len(parse)>1:
         print "warning: multiple files. using only first temptext file."
     parse=parse[0]
     print "parse completed"
     summary=summarizeparse(parse)
     return parse, summary
Example #6
def tokenize():
    corenlp_dir = "./lib/stanford-corenlp-full-2015-01-29/"
    #parser = corenlp.StanfordCoreNLP(corenlp_path=corenlp_dir)
    parsed = corenlp.batch_parse('tmp/',
                                 corenlp_path=corenlp_dir,
                                 raw_output=True)

    for p in parsed:
        text = xmltodict.unparse(p, pretty=True)
        print(text)
Example #7
 def parse(self, stimtextdir):
     self.stimtextdir = stimtextdir
     parse = batch_parse(self.stimtextdir, corenlp_dir)
     parse = [el for el in parse if '.DS_Store' != el['file_name']]
     if len(parse) > 1:
         print "warning: multiple files. using only first temptext file."
     parse = parse[0]
     print "parse completed"
     summary = summarizeparse(parse)
     return parse, summary
Example #8
 def parse(self, inputfile):
     self.copyfile(inputfile)
     parse=batch_parse(self.innerdir, corenlp_dir)
     parse=[el for el in parse if 'temptext' in el['file_name']]
     if len(parse)>1:
         print "warning: multiple files. using only first temptext file."
     parse=parse[0]
     self.deletefile()
     print "parse completed"
     summary=summarizeparse(parse)
     return parse, summary
Example #9
 def parse(self, inputfile):
     self.copyfile(inputfile)
     parse = batch_parse(self.innerdir, corenlp_dir)
     parse = [el for el in parse if 'temptext' in el['file_name']]
     if len(parse) > 1:
         print "warning: multiple files. using only first temptext file."
     parse = parse[0]
     self.deletefile()
     print "parse completed"
     summary = summarizeparse(parse)
     return parse, summary
Example #10
def batch_process(file_dict, dbpath, memory):
    """Parses, resolves corefs, and extracts triplets from file in a
    directory.
    """
    from threading import Thread
    try:

        # Parse files with progress bar
        t = Thread(target=monitor_progress, kwargs={
            'num_files':len(file_dict)
            })
        t.daemon = True
        t.start()

        print "Starting corenlp. Wait a few moments."
        this_dir = os.path.dirname(os.path.realpath(__file__))
        corenlp_path = os.path.join(this_dir,
                "stanford-corenlp-full-2013-11-12")
        log_path = os.path.join(TEMP, 'corenlp_log.txt')
        parses = corenlp.batch_parse(TEMP, log_path, memory=memory,
                corenlp_path=corenlp_path)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]

            # add article to db
            database.save_article(article_dict, dbpath)

            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)

            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']

                    database.save_triplet(triplet, dbpath)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:  # remove temp files
        for root, dirs, fnames in os.walk(TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
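A hypothetical call, for illustration only; the file name, database path, and memory setting below are assumptions, and file_dict maps parsed file names to article dicts carrying the 'path' and 'pub_date' keys used above:

# Illustrative usage of batch_process; all values are placeholders.
file_dict = {
    'article_001.txt': {'path': 'articles/article_001.txt', 'pub_date': '2014-01-04'},
}
batch_process(file_dict, dbpath='triplets.db', memory='3g')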
Example #11
    def _parse(self, text, dest_file):
        '''Computes the parsing calling Stanford NLP api.

        '''
        import tempfile
        from corenlp import batch_parse
        dirname = tempfile.mkdtemp()
        with tempfile.NamedTemporaryFile('w', dir=dirname, delete=False) as f:
            filename = f.name
        with codecs.open(filename, 'w', encoding='utf8') as tmp:
            tmp.write(text)
            tmp.flush()
            result = batch_parse(os.path.dirname(tmp.name), self.folder)
            result = list(result)[0]
        cPickle.dump(result, open(dest_file, 'w'))
        return result
Example #12
    def _parse(self, text, dest_file):
        '''Computes the parsing calling Stanford NLP api.

        '''
        import tempfile
        from corenlp import batch_parse
        dirname = tempfile.mkdtemp()
        with tempfile.NamedTemporaryFile('w', dir=dirname, delete=False) as f:
            filename = f.name
        with codecs.open(filename, 'w', encoding='utf8') as tmp:
            tmp.write(text)
            tmp.flush()
            result = batch_parse(os.path.dirname(tmp.name), self.folder)
            result = list(result)[0]
        cPickle.dump(result, open(dest_file, 'w'))
        return result
Example #13
def batch_process(directory):
    """Parses, resolves corefs, and extracts triplets from file in a
    directory.
    """
    from threading import Thread
    try:
        file_dict = preprocess_dir(directory)

        # Parse files with progress bar
        t = Thread(target=monitor_progress, kwargs={
            'num_files':len(file_dict)
            })
        t.daemon = True
        t.start()
        print "Starting corenlp. Wait a few moments."
        parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

        # Extract triplets and save to db
        pbar = ProgressBar(len(file_dict))
        file_name = ''
        for parse_dict in parses:
            if not pbar.has_started():
                print "Extracting triplets..."
                pbar.start()
            article_dict = file_dict[parse_dict['file_name']]

            # add article to db
            database.save_article(article_dict)

            # resolve corefs and extract triplets
            triplets = process_parsed(parse_dict)

            # save triplet to db
            if len(triplets) > 0:
                for triplet in triplets:
                    triplet['article_path'] = article_dict['path']
                    triplet['pub_date'] = article_dict['pub_date']

                    database.save_triplet(triplet)
            if parse_dict['file_name'] != file_name:
                file_name = parse_dict['file_name']
                pbar.tick()
    finally:  # remove temp files
        for root, dirs, fnames in os.walk(config.TEMP):
            for fname in fnames:
                p = os.path.join(root, fname)
                os.remove(p)
Example #14
def extract_NER():
    print('Enter the name of the directory whose text should be analyzed')
    filename = input('>>>  ')
    print('Starting extraction of named entities')
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2017-06-09/"
    # corenlp_dir = "stanford-corenlp-full-2017-06-09/"
    raw_text_directory = "output/sf"
    properties_file = "user.properties"
    parsed = batch_parse(raw_text_directory, corenlp_dir)
    NERlist = []
    preTag = ""
    currentNER = ""
    TempTag = ""
    count = 0
    for value in parsed:
        count += 1
        valuePh = value['sentences']
        for value2 in valuePh:
            value2Ph = value2['words']
            for value3 in value2Ph:
                value4 = value3[1]
                tempTag = value4['NamedEntityTag']
                if not tempTag == preTag and not currentNER == "":
                    NERlist.append(currentNER)
                    currentNER = ""
                if value4['NamedEntityTag'] == 'PERSON' or value4[
                        'NamedEntityTag'] == 'ORGANIZATION' or value4[
                            'NamedEntityTag'] == 'LOCATION':
                    if currentNER == "":
                        currentNER = currentNER + value3[0]
                    else:
                        currentNER = currentNER + ' ' + value3[0]
                preTag = value4['NamedEntityTag']

                print(value3[0])
                print(value4['NamedEntityTag'])
        NERlist.append(currentNER)
    NERlist_uniq = []
    for x in NERlist:
        if x not in NERlist_uniq:
            NERlist_uniq.append(x)
    fp = open('output/' + filename + '/NERlist.txt', 'a')
    for value in NERlist_uniq:
        fp.write(value + "\n")
    fp.close()
    print('The named entity list was written to ' + filename + '/NERlist.txt')
Example #15
    def corenlp_batch_parse(self, rawtextdir=rawtextdir):
        """
        perform the batch parse on a directory full of text files, containing one "body" per line.
        return a dict mapping unique ids to mean sentiments.
        """    
        
        print "\n\ninitiating batch parse..."
        parsed      = batch_parse(rawtextdir, corenlpdir)
        parse_tree  = [x for x in parsed]
        fpath       = "data/processed/" + self.fpath + "_parse_tree.json"
        
        print "\n\nsaving parse tree to %s..." % fpath
        with open(fpath, 'wb') as parsetreefile:
            json.dump(parse_tree, parsetreefile)
        print "\n\ndone."

        return parse_tree
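A hedged sketch (not from the source) of deriving a per-file mean sentiment from the returned parse tree, assuming each parsed document carries the 'file_name', 'sentences', and per-sentence 'sentimentValue' fields seen in the other examples:

def mean_sentiments(parse_tree):
    # Field names are assumptions based on the sentiment examples above.
    means = {}
    for doc in parse_tree:
        values = [float(s['sentimentValue']) for s in doc['sentences'] if 'sentimentValue' in s]
        if values:
            means[doc['file_name']] = sum(values) / len(values)
    return means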
Example #16
def parse_corefs(entities):
    from tempfile import NamedTemporaryFile
    import os

    # Sort the list by string length.
    entities.sort(key=len, reverse=True)

    # Put all entities in a txt file.
    entity_str = '. '.join(entities)

    temp = NamedTemporaryFile(dir=config.TEMP, delete=False)
    temp.write(entity_str)
    temp.close()  # make sure the contents are flushed to disk before parsing

    # And send it to StanfordCoreNLP to resolve corefs.
    parses = corenlp.batch_parse(config.TEMP, memory=config.memory)

    # Clean out temp dir
    for root, dirs, fnames in os.walk(config.TEMP):
        for fname in fnames:
            p = os.path.join(root, fname)
            os.remove(p)

    return parses
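Example #17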
from corenlp import batch_parse

x = batch_parse("../sample_raw_text", "../../stanford-corenlp-full-2014-01-04")

print x
for t in x: print t
Example #18
from corenlp import batch_parse

x = batch_parse("../sample_raw_text", "../../stanford-corenlp-full-2014-01-04")

print x
for t in x:
    print t
Example #19
try:
    from xml.etree.ElementTree import *
    tree = parse("nlp.txt.xml")
    elem = tree.getroot()
    for w in elem.findall(".//word"):
        print(w.text)  # solution for 53
except FileNotFoundError:
    import corenlp
    import xmltodict
    #from subprocess import call
    # corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2016-10-31"
    corenlp_dir = "/usr/local/lib/stanford-corenlp-full-2014-08-27"
    raw_text_directory = "sample_raw_text"
    # command = corenlp.init_corenlp_command(corenlp_path=corenlp_dir, memory="3g",  properties='default.properties') + ' -filelist + "./nlp.txt" -outputDirectory +"./"'
    # call(command,shell=True)
    parsed = corenlp.batch_parse(raw_text_directory,
                                 corenlp_dir,
                                 raw_output=True)
    parsed_list = list(parsed)  # drain the generator; the bare while loop would crash on StopIteration
    for parsed_doc in parsed_list:
        parsed_xml = xmltodict.unparse(parsed_doc)
        print(parsed_xml)
Example #20
def get_batch_parse(directory):
  return batch_parse(directory, corenlp_dir)
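Example #21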
import requests
import pandas as pd
from corenlp import StanfordCoreNLP,batch_parse
from collections import Counter
from bs4 import BeautifulSoup

#parse using corenlp
corenlp_dir = "stanford-corenlp-full-2014-08-27"
parse = batch_parse('scripts',corenlp_dir, raw_output=True)
parsedEpisodes = []
while True:
	try:
		# exceptions are raised by the generator, so call next() inside the try
		parsedEpisodes.append(next(parse))
	except StopIteration:
		break
	except Exception:
		parsedEpisodes.append('')

del(parsedEpisodes[0:2]) #remove hidden files

#extract sentiment from corenlp results
# allSentiments = []
allValues = []

for e in range(0,len(parsedEpisodes)):
	try:
		sentences = parsedEpisodes[e]['root']['document']['sentences']['sentence']
		sentimentValues = []
		sentiments = []
		for s in sentences:
			try:
				sentimentValues.append(int(s['@sentimentValue'])-2)
				# sentiments.append(s['@sentiment'])
Example #22
from corenlp import batch_parse
corenlp_dir = "../tools/corenlp-python/corenlp/stanford-corenlp-full-2014-01-04"
raw_text_directory = "../dataset/books_txt/small_sample"
parsed = batch_parse(raw_text_directory, corenlp_dir,raw_output=True)
for books in parsed:
    print books['sentences']