def read_dictionaries(validation=False):
    """
    Load every PETRARCH dictionary named in PETRglobals.

    Each file name is resolved under 'data/dictionaries' through
    utilities._get_data and handed to the matching PETRreader loader, in the
    order verb, actor(s), agent, discard and (only when a name is configured)
    issues. The name of each dictionary is echoed to stdout before loading.

    `validation` is accepted but not used by this function.
    """
    dictionary_dir = 'data/dictionaries'

    def locate(filename):
        # Resolve a dictionary file name to its packaged data path.
        return utilities._get_data(dictionary_dir, filename)

    print('Verb dictionary:', PETRglobals.VerbFileName)
    PETRreader.read_verb_dictionary(locate(PETRglobals.VerbFileName))

    print('Actor dictionaries:', PETRglobals.ActorFileList)
    for actor_file in PETRglobals.ActorFileList:
        PETRreader.read_actor_dictionary(locate(actor_file))

    print('Agent dictionary:', PETRglobals.AgentFileName)
    PETRreader.read_agent_dictionary(locate(PETRglobals.AgentFileName))

    print('Discard dictionary:', PETRglobals.DiscardFileName)
    PETRreader.read_discard_list(locate(PETRglobals.DiscardFileName))

    # The issues list is optional: an empty file name means "not configured".
    if PETRglobals.IssueFileName == "":
        return
    print('Issues dictionary:', PETRglobals.IssueFileName)
    PETRreader.read_issue_list(locate(PETRglobals.IssueFileName))
def __init__(self, petrGlobal=None, config_folder='data/config/',
             config_file='PETR_config.ini'):
    """
    Initialise the coder either from configuration files or from a saved map.

    Parameters
    ----------
    petrGlobal : dict or None
        Previously saved global state. When empty or None, the logger is
        initialised, the config file is parsed and the dictionaries are
        read from disk; otherwise the state is passed to ``self.load``.
    config_folder : str
        Folder handed to ``utilities._get_data`` to locate the config file.
    config_file : str
        Name of the config file inside ``config_folder``.
    """
    # NOTE: the default used to be a shared mutable ``{}``; ``None`` is
    # equivalent here because only truthiness is tested.
    if not petrGlobal:
        utilities.init_logger('PETRARCH.log')
        logger = logging.getLogger('petr_log')
        PETRglobals.RunTimeString = time.asctime()
        logger.info('Using Config file: ' + config_file)
        PETRreader.parse_Config(
            utilities._get_data(config_folder, config_file))
        petrarch2.read_dictionaries()
        print("SUCCESSFULL ON LOADING DICTIONARIES")
    else:
        print("LOADING FROM MAP")
        self.load(petrGlobal)
def test_date_check():
    """
    The coded meaning of the 'Carl XVI Gustaf' noun phrase depends on the
    sentence date: the same parse is coded with three different dates.
    """
    tree_text = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"
    raw_text = "Carl XVI Gustaf"
    expectations = (
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    )
    for date_string, expected_codes in expectations:
        sentence = ptree.Sentence(tree_text, raw_text,
                                  PETRreader.dstr_to_ordate(date_string))
        noun_phrase = sentence.tree.children[0]
        assert noun_phrase.get_meaning() == expected_codes
def test_personal1():
    """
    The personal pronoun 'he' in the embedded clause is expected to take the
    meaning ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    print('This is a test')
    ordinal_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(parse, "Obama said he was tired", ordinal_date)

    # Walk S -> VP -> SBAR -> S -> NP to reach the pronoun's noun phrase.
    node = sentence.tree
    for child_index in (1, 1, 0, 0):
        node = node.children[child_index]
    assert node.get_meaning() == ["USAGOV"]
def test_reflexive():
    """
    The reflexive pronoun 'himself' directly under the main verb phrase is
    expected to take the meaning ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD asked ) (NP (PRP himself ) ) (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse = bracketed.upper()
    ordinal_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(parse,
                              "Obama asked himself why Biden was tired",
                              ordinal_date)

    # Walk S -> VP -> NP to reach the reflexive's noun phrase.
    verb_phrase = sentence.tree.children[1]
    reflexive_np = verb_phrase.children[1]
    assert reflexive_np.get_meaning() == ["USAGOV"]
def test_reflexive2():
    """
    The reflexive pronoun 'himself' inside the embedded clause is expected to
    take the meaning ["RUSGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    # NOTE(review): the plain-text sentence below names Biden/"him" while the
    # parse tree names Putin/"himself" -- confirm whether that is intended.
    plain_text = "Obama knew that Biden liked him"
    ordinal_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(parse, plain_text, ordinal_date)

    # Walk S -> VP -> SBAR -> S -> VP -> NP to reach the reflexive.
    node = sentence.tree
    for child_index in (1, 1, 1, 1, 1):
        node = node.children[child_index]
    assert node.get_meaning() == ["RUSGOV"]
def test_personal1():
    """
    The personal pronoun 'he' in the embedded clause is expected to take the
    meaning ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    print('This is a test')
    ordinal_date = PETRreader.dstr_to_ordate("20150813")
    sentence = ptree.Sentence(parse, "Obama said he was tired", ordinal_date)

    # Walk S -> VP -> SBAR -> S -> NP to reach the pronoun's noun phrase.
    main_vp = sentence.tree.children[1]
    embedded_clause = main_vp.children[1].children[0]
    pronoun_np = embedded_clause.children[0]
    assert pronoun_np.get_meaning() == ["USAGOV"]
def run(filepaths, out_file, s_parsed):
    """
    Read XML input, code it and write the result; called from main().

    `filepaths` and `s_parsed` are forwarded to PETRreader.read_xml_input.
    When `s_parsed` is false the events are first passed through
    utilities.stanford_parse. The writer and the output-name prefix depend on
    the PETRglobals.NullVerbs / PETRglobals.NullActors flags.
    """
    raw_events = PETRreader.read_xml_input(filepaths, s_parsed)
    if s_parsed:
        parsed_events = raw_events
    else:
        parsed_events = utilities.stanford_parse(raw_events)

    coded = do_coding(parsed_events)

    # Pick the writer and file-name prefix for the active coding mode.
    if PETRglobals.NullVerbs:
        writer, prefix = PETRwriter.write_nullverbs, 'nullverbs.'
    elif PETRglobals.NullActors:
        writer, prefix = PETRwriter.write_nullactors, 'nullactors.'
    else:
        writer, prefix = PETRwriter.write_events, 'evts.'
    writer(coded, prefix + out_file)
def run_pipeline(data, out_file=None, config=None, write_output=True,
                 parsed=False):
    """
    Code events for data supplied by an external caller.

    Parameters
    ----------
    data
        Input forwarded unchanged to PETRreader.read_pipeline_input.
    out_file : str or None
        Destination passed to PETRwriter.write_events; required when
        `write_output` is true.
    config : str or None
        Path of a user config file. When falsy, the packaged
        'data/config/PETR_config.ini' is used.
    write_output : bool
        When false, nothing is written and the value of
        PETRwriter.pipe_output is returned instead.
    parsed : bool
        When false, the events are run through utilities.stanford_parse
        before coding.

    Returns
    -------
    The result of PETRwriter.pipe_output when `write_output` is false,
    otherwise None.

    Raises
    ------
    SystemExit
        When `write_output` is true but no `out_file` was given.
    """
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    if config:
        print('Using user-specified config: {}'.format(config))
        logger.info('Using user-specified config: {}'.format(config))
        PETRreader.parse_Config(config)
    else:
        logger.info('Using default config file.')
        default_config = utilities._get_data('data/config/',
                                             'PETR_config.ini')
        logger.info('Config path: {}'.format(default_config))
        PETRreader.parse_Config(default_config)

    read_dictionaries()

    logger.info('Hitting read events...')
    events = PETRreader.read_pipeline_input(data)
    if parsed:
        logger.info('Hitting do_coding')
    else:
        events = utilities.stanford_parse(events)
    updated_events = do_coding(events)

    if not write_output:
        return PETRwriter.pipe_output(updated_events)

    if not out_file:
        print('Please specify an output file...')
        # The backslash is escaped explicitly: a bare "\_" is an invalid
        # escape sequence. The logged text is unchanged.
        logger.warning('Need an output file. ¯\\_(ツ)_/¯')
        sys.exit()

    PETRwriter.write_events(updated_events, out_file)
def main():
    """
    Command-line entry point.

    Parses the CLI arguments, initialises logging, loads the configuration
    (user-specified or the packaged default), sets the null-verbs or
    null-actors mode when requested, reads the dictionaries, resolves the
    input paths and calls run(). Prints the elapsed coding time when done.

    Raises
    ------
    SystemExit
        When the `inputs` argument is neither an existing directory nor an
        existing file.
    """
    cli_args = parse_cli_args()
    utilities.init_logger('PETRARCH.log')
    logger = logging.getLogger('petr_log')

    PETRglobals.RunTimeString = time.asctime()

    print(cli_args)
    if cli_args.config:
        print('Using user-specified config: {}'.format(cli_args.config))
        logger.info(
            'Using user-specified config: {}'.format(cli_args.config))
        PETRreader.parse_Config(cli_args.config)
    else:
        logger.info('Using default config file.')
        PETRreader.parse_Config(
            utilities._get_data('data/config/', 'PETR_config.ini'))

    if cli_args.nullverbs:
        print('Coding in null verbs mode; no events will be generated')
        logger.info('Coding in null verbs mode; no events will be generated')
        # Only get verb phrases that are not in the dictionary but are
        # associated with coded noun phrases
        PETRglobals.NullVerbs = True
    elif cli_args.nullactors:
        print('Coding in null actors mode; no events will be generated')
        # The log line previously said "null verbs mode" here.
        logger.info(
            'Coding in null actors mode; no events will be generated')
        # Only get actor phrases that are not in the dictionary but
        # associated with coded verb phrases
        PETRglobals.NullActors = True
        PETRglobals.NewActorLength = int(cli_args.nullactors)

    read_dictionaries()
    start_time = time.time()
    print('\n\n')

    # Command-line inputs override the file list from the config.
    paths = PETRglobals.TextFileList
    if cli_args.inputs:
        if os.path.isdir(cli_args.inputs):
            # os.path.join copes with or without a trailing separator.
            paths = glob.glob(os.path.join(cli_args.inputs, '*.xml'))
        elif os.path.isfile(cli_args.inputs):
            paths = [cli_args.inputs]
        else:
            print(
                '\nFatal runtime error:\n"' + cli_args.inputs +
                '" could not be located\nPlease enter a valid directory or file of source texts.'
            )
            sys.exit()

    out = cli_args.outputs if cli_args.outputs else ""

    if cli_args.command_name == 'parse':
        run(paths, out, cli_args.parsed)
    else:
        # Every other command treats the input as already parsed.
        run(paths, out, True)

    print("Coding time:", time.time() - start_time)
    print("Finished")
def do_coding(event_dict):
    """
    Main coding loop: code every parsed sentence of every story in place.

    Stories are visited in sorted key order. For each sentence that carries a
    'parsed' entry, any per-sentence config options are applied, the discard
    list is checked, a PETRtree.Sentence is built and its events are stored
    back into `event_dict`. Sentences without a 'parsed' entry are logged and
    skipped. A summary is printed at the end.

    Note that entering any character other than 'Enter' at the
    PauseBySentence prompt will stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story

    Parameters
    ----------
    event_dict : dict
        Maps story keys to dicts with at least ['meta']['date'] and a 'sents'
        dict keyed by sentence id; each sentence needs 'content' and, to be
        coded, 'parsed'. Optional per-sentence keys are 'date' and 'config'.

    Returns
    -------
    dict
        The same `event_dict` object, updated in place.

    Raises
    ------
    SystemExit
        When PETRglobals.PauseBySentence is set and the user types anything
        at the prompt.
    """
    NStory = 0
    NSent = 0
    NEvents = 0
    NEmpty = 0
    NDiscardSent = 0
    NDiscardStory = 0

    logger = logging.getLogger('petr_log')
    times = 0
    sents = 0
    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']
        for sent in val['sents']:
            NSent += 1
            sent_rec = val['sents'][sent]
            # Built before the parse check so the "no parse" log line always
            # names the current sentence (it used to be undefined or stale).
            SentenceID = '{}_{}'.format(key, sent)

            if 'parsed' not in sent_rec:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_rec:
                for config in sent_rec['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_rec['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_rec['date'] if 'date' in sent_rec else StoryDate
            Date = PETRreader.dstr_to_ordate(SentenceDate)

            print("\n", SentenceID)
            parsed = sent_rec['parsed']

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                SkipStory = True
                NDiscardStory += 1
                break

            t1 = time.time()
            sentence = PETRtree.Sentence(parsed, SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1

            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                # A separate `elif PETRglobals.NullActors` branch used to
                # follow; it could never run because this test already
                # covers NullActors, so it has been removed.
                val['meta'] = meta
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- here on the story and
                # below on the sentence -- and this is potentially confusing,
                # so it probably would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_rec['events'] = coded_events
                sent_rec['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_rec, SentenceID)
                    if text_dict:
                        sent_meta = sent_rec['meta']
                        sent_meta['actortext'] = {}
                        sent_meta['eventtext'] = {}
                        sent_meta['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_meta['actortext'][evt] = phrases[:2]
                                sent_meta['eventtext'][evt] = phrases[2]
                                sent_meta['actorroot'][evt] = phrases[3:5]

                if PETRglobals.IssueFileName != "":
                    event_issues = get_issues(SentenceText)
                    if event_issues:
                        sent_rec['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events)
            if len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, " Sentences coded:", NSent,
          " Events generated:", NEvents)
    print("Discards: Sentence", NDiscardSent, " Story", NDiscardStory,
          " Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def test_reflexive2():
    """
    The reflexive pronoun 'himself' inside the embedded clause is expected to
    take the meaning ["RUSGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    # NOTE(review): the plain-text sentence below names Biden/"him" while the
    # parse tree names Putin/"himself" -- confirm whether that is intended.
    plain_text = "Obama knew that Biden liked him"
    sentence = ptree.Sentence(parse, plain_text,
                              PETRreader.dstr_to_ordate("20150813"))

    # Descend five levels, always taking the second child, to the reflexive.
    node = sentence.tree
    depth = 0
    while depth < 5:
        node = node.children[1]
        depth += 1
    assert node.get_meaning() == ["RUSGOV"]
def test_reflexive():
    """
    The reflexive pronoun 'himself' directly under the main verb phrase is
    expected to take the meaning ["USAGOV"].
    """
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD asked ) (NP (PRP himself ) ) (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse = bracketed.upper()
    plain_text = "Obama asked himself why Biden was tired"
    sentence = ptree.Sentence(parse, plain_text,
                              PETRreader.dstr_to_ordate("20150813"))

    # Descend S -> VP -> NP to reach the reflexive's noun phrase.
    node = sentence.tree
    for child_index in (1, 1):
        node = node.children[child_index]
    assert node.get_meaning() == ["USAGOV"]
from petrarch2 import petrarch2, PETRglobals, PETRreader, PETRtree, utilities from ConfigParser import ConfigParser from flask import jsonify, make_response from flask.ext.httpauth import HTTPBasicAuth from flask.ext.restful import Resource, reqparse from flask.ext.restful.representations.json import output_json import os config = "/app/resources/PETR_config.ini" PETRreader.parse_Config(config) petrarch2.read_dictionaries() class PhraseExtractAPI(Resource): def __init__(self): self.reqparse = reqparse.RequestParser() self.reqparse.add_argument('text', type=unicode, location='json') self.reqparse.add_argument('parse', type=unicode, location='json') super(PhraseExtractAPI, self).__init__() def get(self): return """ This service expects a POST in the form '{"text":""Airstrikes and artillery...", "parse" : "(ROOT (S (S (NP (NP (NNP Airstrikes)) (CC and) (NP (NN artillery)))..."}' It will return a list of nouns and verbs...TBD""" def post(self): args = self.reqparse.parse_args() print args text = args['text'] parse = args['parse'] output = self.get_phrases(text, parse) return output