def test_date_check():
    """Check the date-dependent code returned for the NP "CARL XVI GUSTAF".

    The same parse is coded on three dates; the expected meaning is
    ["SWEGOV"] for 20150813 and ["SWEELI"] for 19720813 and 19010813.
    """
    parse = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"
    expectations = (
        ("20150813", ["SWEGOV"]),
        ("19720813", ["SWEELI"]),
        ("19010813", ["SWEELI"]),
    )
    for datestring, expected in expectations:
        test = ptree.Sentence(parse, "Carl XVI Gustaf",
                              PETRreader.dstr_to_ordate(datestring))
        # The noun phrase is the first child of the sentence root.
        phrase = test.tree.children[0]
        assert phrase.get_meaning() == expected
def check_date(self, match):
    """
    Method for resolving date restrictions on actor codes.

    Each entry of `match` is indexed as entry[0] (the code) and entry[1]
    (a sequence of date strings, optionally prefixed with '<' or '>').
    The date strings are converted with PETRreader.dstr_to_ordate and
    compared against self.date:

      * no dates          -> the code always applies
      * one date, '<'     -> applies when self.date is before it
      * one date, other   -> applies when self.date is on or after it
      * two or more dates -> applies when date[0] <= self.date < date[1]

    Parameters
    -----------
    match: list
           Dates and codes from the dictionary

    Returns
    -------
    code: string
          The first non-empty code whose restriction holds. If nothing
          applies this is "" (or None when `match` is empty). Any exception
          raised while evaluating the entries is swallowed and the value
          `code` had at that moment is returned.
    """
    code = None
    try:
        for entry in match:
            restrictions = entry[1]
            # Reset before any conversion so that a failure below yields "".
            code = ""
            bounds = [
                (r[0] + str(PETRreader.dstr_to_ordate(r[1:])))
                if r[0] in "<>"
                else str(PETRreader.dstr_to_ordate(r))
                for r in restrictions
            ]
            today = self.date

            if not bounds:
                code = entry[0]
            elif len(bounds) == 1:
                only = bounds[0]
                # The first character is dropped in both cases, as in the
                # prefixed form "<ordinal>" / ">ordinal".
                limit = int(only[1:])
                if only[0] == "<":
                    applies = today < limit
                else:
                    applies = today >= limit
                if applies:
                    code = entry[0]
            elif today < int(bounds[1]) and today >= int(bounds[0]):
                code = entry[0]

            if code:
                return code
    except Exception:
        # Deliberate best-effort: report whatever was resolved so far.
        return code
    return code
def test_reflexive():
    """Check that the NP holding the reflexive pronoun "himself" is coded ["USAGOV"]."""
    text = "Obama asked himself why Biden was tired"
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD asked ) (NP (PRP himself ) ) (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse = bracketed.upper()
    test = ptree.Sentence(parse, text, PETRreader.dstr_to_ordate("20150813"))
    # Root -> VP (second child) -> NP "himself" (second child of the VP).
    verb_phrase = test.tree.children[1]
    phrase = verb_phrase.children[1]
    assert phrase.get_meaning() == ["USAGOV"]
def test_personal1():
    """Check that the NP holding the personal pronoun "he" is coded ["USAGOV"]."""
    text = "Obama said he was tired"
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    test = ptree.Sentence(parse, text, PETRreader.dstr_to_ordate("20150813"))
    # Walk Root -> VP -> SBAR -> S -> NP "he".
    node = test.tree
    for index in (1, 1, 0, 0):
        node = node.children[index]
    assert node.get_meaning() == ["USAGOV"]
def test_date_check():
    """Check the date-dependent code returned for the NP "CARL XVI GUSTAF".

    Expected meanings: ["SWEGOV"] on 20150813, ["SWEELI"] on 19720813 and
    on 19010813.
    """
    parse = "(S (NP (NNP CARL ) (NN XVI ) (NNP GUSTAF ) ) )"

    def meaning_on(datestring):
        # Code the same parse for the given date and read the first child (the NP).
        test = ptree.Sentence(parse, "Carl XVI Gustaf",
                              PETRreader.dstr_to_ordate(datestring))
        phrase = test.tree.children[0]
        return phrase.get_meaning()

    assert meaning_on("20150813") == ["SWEGOV"]
    assert meaning_on("19720813") == ["SWEELI"]
    assert meaning_on("19010813") == ["SWEELI"]
def test_reflexive2():
    """Check that "himself" inside the embedded clause is coded ["RUSGOV"].

    The parse is "OBAMA KNEW THAT PUTIN LIKED HIMSELF"; the asserted phrase is
    the NP holding "himself" in the embedded clause whose subject is Putin.
    """
    parse = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) ) ) ".upper()
    # The sentence text previously read "Obama knew that Biden liked him",
    # which did not correspond to the parse above; it now matches the parse.
    test = ptree.Sentence(parse, "Obama knew that Putin liked himself",
                          PETRreader.dstr_to_ordate("20150813"))
    # Walk Root -> VP -> SBAR -> S -> VP -> NP "himself".
    phrase = test.tree
    for _ in range(5):
        phrase = phrase.children[1]
    assert phrase.get_meaning() == ["RUSGOV"]
def do_coding(event_dict, out_file):
    """
    Main coding loop.

    Stories are visited in sorted key order. For every sentence that has a
    'parsed' entry, any per-sentence 'config' overrides are applied, the
    sentence is checked with check_discards(), coded through
    PETRtree.Sentence(...).get_events(), and the results are written back
    into event_dict (story-level ['meta']['verbs'], sentence-level 'events'
    and 'issues'). A story discard sets the story's 'sents' to None.
    A summary is printed at the end.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}.
                Modified in place.
    out_file: string
              When truthy it is handed to open_tex(); each coded sentence is
              then written with Sentence.print_to_file() and the handle is
              passed to close_tex() when the loop ends, including when it
              ends through an exception or sys.exit().

    Returns
    -------
    event_dict: dict
                The same object that was passed in.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story
    """
    NStory = NSent = NEvents = NEmpty = 0
    NDiscardSent = NDiscardStory = 0
    times = 0
    sents = 0
    logger = logging.getLogger('petr_log')

    tex_file = open_tex(out_file) if out_file else None
    try:
        for key, val in sorted(event_dict.items()):
            NStory += 1
            SkipStory = False
            print('\n\nProcessing {}'.format(key))
            StoryDate = val['meta']['date']

            for sent in val['sents']:
                NSent += 1
                sent_data = val['sents'][sent]
                # Built before the parse check so the "no parse" message
                # always names the sentence actually being skipped.
                SentenceID = '{}_{}'.format(key, sent)
                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(SentenceID))
                    continue

                if 'config' in sent_data:
                    for config in sent_data['config'].values():
                        change_Config_Options(config)

                SentenceText = sent_data['content']
                # A sentence-level date overrides the story date.
                SentenceDate = sent_data.get('date', StoryDate)
                Date = PETRreader.dstr_to_ordate(SentenceDate)
                print("\t\t", SentenceID)

                disc = check_discards(SentenceText)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        NDiscardSent += 1
                        continue
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

                t1 = time.time()
                sentence = PETRtree.Sentence(sent_data['parsed'], SentenceText, Date)
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)
                del sentence

                times += code_time
                sents += 1
                print('\t\t', code_time)

                if coded_events:
                    sent_data['events'] = coded_events
                if coded_events and PETRglobals.IssueFileName != "":
                    event_issues = get_issues(SentenceText)
                    if event_issues:
                        sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    # Any input other than a bare Enter stops the program.
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                n_coded = len(coded_events)
                NEvents += n_coded
                if n_coded == 0:
                    NEmpty += 1

            if SkipStory:
                val['sents'] = None
    finally:
        # Hand the handle back even if coding raised or the user aborted.
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def do_coding(event_dict):
    """
    Main coding loop (variant that attaches release/report time information).

    Before coding, get_releasetime() and get_reporttime() are called and their
    per-article results are stored in each story's 'meta' as 'realiseTime' and
    'reportTime' (the report time falls back to the release time when the
    article has no entry). Stories whose date is 'NULL' are skipped. Each
    parsed sentence is then checked with check_discards(), built with
    PETRtree.Sentence, annotated through set_nt_textList() and
    set_sentenceTimeByReport(), and coded with get_events(). Results are
    written back into event_dict. The summary statistics of the original loop
    are disabled in this variant; only the "Summary:" heading is printed.

    A sentence whose construction or coding raises is logged with
    logging.exception() and skipped.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}; keys are
                split on "-" into an article id and a paragraph id.
                Modified in place.

    Returns
    -------
    event_dict: dict
                The same object that was passed in.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story
    """
    logger = logging.getLogger('petr_log')

    # Release (publication) time per article.
    realiseTimeDic = get_releasetime(event_dict)
    if not realiseTimeDic:
        print(
            "realiseTimeDic have no timeinfo ,please check “get_releasetime” method"
        )
    # Report time per article.
    reporttimeDic = get_reporttime(event_dict, realiseTimeDic)

    for key, val in sorted(event_dict.items()):
        SkipStory = False
        print('\n\nProcessing paragraph {}'.format(key))
        StoryDate = val['meta']['date']
        if StoryDate == 'NULL':
            continue

        key_parts = key.split("-")
        articleId = key_parts[0]
        paraghId = key_parts[1]  # currently unused; kept so malformed keys still fail here

        # Set release and report time; a missing report time falls back to
        # the release time.
        releaseTime = realiseTimeDic[articleId]
        val["meta"]["realiseTime"] = releaseTime
        val["meta"]["reportTime"] = reporttimeDic.get(articleId, releaseTime)

        for sent in sorted(val['sents']):
            print('\n\nProcessing sentence {}'.format(sent))
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" message always
            # names the sentence actually being skipped.
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data.get('date', StoryDate)
            # Keep only the part before the first space and drop the dashes.
            Date = PETRreader.dstr_to_ordate(
                SentenceDate.split(' ')[0].replace('-', ''))

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                # NOTE(review): other versions of this loop set SkipStory to
                # True here; with False the story's sentences are never
                # cleared below. Left unchanged -- confirm this is intended.
                SkipStory = False
                break

            try:
                sentence = PETRtree.Sentence(sent_data['parsed'], SentenceText, Date)
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                continue

            set_nt_textList(sentence)
            set_sentenceTimeByReport(sentence, val["meta"]["reportTime"],
                                     val['sents'], sent)
            timeText = sentence.ntTextList
            sentenceTime = sentence.sentenceTime

            try:
                coded_events, meta = sentence.get_events()
            except Exception as e:
                message = "ERROR IN PETRARCH2 DO_CODING:" + SentenceID + "\n" + SentenceText + str(
                    e) + "\n"
                logging.exception(message)
                # Skip the sentence: without this the code below would run
                # with the previous sentence's events/meta (or fail on
                # undefined names for the first sentence).
                continue

            # NOTE(review): the elif below can never run because its condition
            # is already covered by the first test; kept as found.
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- see the code about ten
                # lines below -- and this is potentially confusing, so it
                # probably would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_data, SentenceID)
                    print('DC-td1:', text_dict)  # --
                    if text_dict:
                        for field in ('actortext', 'eventtext', 'actorroot',
                                      'eventroot', 'Source', 'Target'):
                            sent_data['meta'][field] = {}
                        sent_data['meta']['timeText'] = timeText
                        # NOTE(review): this stores a one-element set, not the
                        # value itself -- confirm that is what consumers expect.
                        sent_data['meta']['sentenceTime'] = {sentenceTime}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_data['meta']['actortext'][evt] = phrases[:2]
                                sent_data['meta']['eventtext'][evt] = phrases[2]
                                sent_data['meta']['actorroot'][evt] = phrases[3:5]
                                sent_data['meta']['eventroot'][evt] = phrases[5]
                                sent_data['meta']['Source'][evt] = phrases[0]
                                sent_data['meta']['Target'][evt] = phrases[1]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                # Any input other than a bare Enter stops the program.
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

        if SkipStory:
            val['sents'] = None

    # Only the heading is printed; the statistics lines are disabled.
    print("\nSummary:")
    return event_dict
def do_coding(event_dict, out_file):
    """
    Main coding loop.

    Stories are visited in sorted key order. Every sentence that has a
    'parsed' entry is checked with check_discards(), coded through
    PETRtree.Sentence(...).get_events(), and the results are written back
    into event_dict: story-level ['meta']['verbs'], sentence-level 'events',
    'meta' and 'issues', plus 'actortext' / 'eventtext' / 'actorroot' inside
    the sentence 'meta' when one of the PETRglobals.Write* flags is set.
    A story discard sets the story's 'sents' to None. Per-sentence 'config'
    overrides are not applied in this version. A summary is printed at the end.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}.
                Modified in place.
    out_file: string
              When truthy it is handed to open_tex(); each coded sentence is
              then written with Sentence.print_to_file() and the handle is
              passed to close_tex() when the loop ends, including when it
              ends through an exception or sys.exit().

    Returns
    -------
    event_dict: dict
                The same object that was passed in.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story
    """
    NStory = NSent = NEvents = NEmpty = 0
    NDiscardSent = NDiscardStory = 0
    times = 0
    sents = 0
    logger = logging.getLogger('petr_log')

    tex_file = open_tex(out_file) if out_file else None
    try:
        for key, val in sorted(event_dict.items()):
            NStory += 1
            SkipStory = False
            print('\n\nProcessing story {}'.format(key))
            StoryDate = val['meta']['date']

            for sent in val['sents']:
                NSent += 1
                sent_data = val['sents'][sent]
                # Built before the parse check so the "no parse" message
                # always names the sentence actually being skipped.
                SentenceID = '{}_{}'.format(key, sent)
                if 'parsed' not in sent_data:
                    logger.info(
                        '{} has no parse information. Passing.'.format(SentenceID))
                    continue

                SentenceText = sent_data['content']
                # A sentence-level date overrides the story date.
                SentenceDate = sent_data.get('date', StoryDate)
                Date = PETRreader.dstr_to_ordate(SentenceDate)
                print("\n", SentenceID)

                disc = check_discards(SentenceText)
                if disc[0] > 0:
                    if disc[0] == 1:
                        print("Discard sentence:", disc[1])
                        logger.info('\tSentence discard. {}'.format(disc[1]))
                        NDiscardSent += 1
                        continue
                    print("Discard story:", disc[1])
                    logger.info('\tStory discard. {}'.format(disc[1]))
                    SkipStory = True
                    NDiscardStory += 1
                    break

                t1 = time.time()
                sentence = PETRtree.Sentence(sent_data['parsed'], SentenceText, Date)
                print(sentence.actor)
                print(sentence.agent)
                # this is the entry point into the processing in PETRtree
                coded_events, meta = sentence.get_events()
                code_time = time.time() - t1

                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- see the code about ten
                # lines below -- and this is potentially confusing, so it
                # probably would be useful to change one of those
                val['meta']['verbs'] = meta

                if out_file:
                    sentence.print_to_file(sentence.tree, file=tex_file)
                del sentence

                times += code_time
                sents += 1

                if coded_events:
                    sent_data['events'] = coded_events
                    sent_data['meta'] = meta
                    if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                            or PETRglobals.WriteActorRoot):
                        text_dict = utilities.extract_phrases(sent_data, SentenceID)
                        if text_dict:
                            sent_data['meta']['actortext'] = {}
                            sent_data['meta']['eventtext'] = {}
                            sent_data['meta']['actorroot'] = {}
                            for evt in coded_events:
                                # 16.04.30 pas bypasses problems with
                                # expansion of compounds
                                if evt in text_dict:
                                    phrases = text_dict[evt]
                                    sent_data['meta']['actortext'][evt] = phrases[:2]
                                    sent_data['meta']['eventtext'][evt] = phrases[2]
                                    sent_data['meta']['actorroot'][evt] = phrases[3:5]

                if coded_events and PETRglobals.IssueFileName != "":
                    event_issues = get_issues(SentenceText)
                    if event_issues:
                        sent_data['issues'] = event_issues

                if PETRglobals.PauseBySentence:
                    # Any input other than a bare Enter stops the program.
                    if len(input("Press Enter to continue...")) > 0:
                        sys.exit()

                n_coded = len(coded_events)
                NEvents += n_coded
                if n_coded == 0:
                    NEmpty += 1

            if SkipStory:
                val['sents'] = None
    finally:
        # Hand the handle back even if coding raised or the user aborted.
        if out_file:
            close_tex(tex_file)

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def do_coding(event_dict):
    """
    Main coding loop (PETRgraph variant).

    Stories are visited in sorted key order. For every sentence that has a
    'parsed' entry, any per-sentence 'config' overrides are applied, the
    sentence is checked with check_discards() and coded through
    PETRgraph.Sentence(...).get_events(). The sentence object's `events`,
    `verbs`, `nouns` and `triplets` attributes are stored under the same
    names in the sentence entry, and 'issues' is added when get_issues()
    returns something. A story discard sets the story's 'sents' to None.
    The NullVerbs / NullActors bookkeeping is disabled in this variant.
    A summary is printed at the end.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}.
                Modified in place.

    Returns
    -------
    event_dict: dict
                The same object that was passed in.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story
    """
    NStory = NSent = NEvents = NEmpty = 0
    NDiscardSent = NDiscardStory = 0
    times = 0
    sents = 0
    logger = logging.getLogger('petr_log')

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']

        for sent in val['sents']:
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" message always
            # names the sentence actually being skipped.
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data.get('date', StoryDate)
            Date = PETRreader.dstr_to_ordate(SentenceDate)
            print("\n", SentenceID)

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                SkipStory = True
                NDiscardStory += 1
                break

            t1 = time.time()
            sentence = PETRgraph.Sentence(sent_data['parsed'], SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing
            coded_events = sentence.get_events()

            sent_data['events'] = sentence.events
            sent_data['verbs'] = sentence.verbs
            sent_data['nouns'] = sentence.nouns
            sent_data['triplets'] = sentence.triplets

            logger.debug("check events:")
            for eventID, event in sent_data['events'].items():
                logger.debug("event:" + eventID)
                logger.debug(event)

            code_time = time.time() - t1
            del sentence
            times += code_time
            sents += 1

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                # Any input other than a bare Enter stops the program.
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            NEvents += len(coded_events.values())
            if len(coded_events) == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print("Stories read:", NStory, "   Sentences coded:", NSent,
          "  Events generated:", NEvents)
    print("Discards:  Sentence", NDiscardSent, "  Story", NDiscardStory,
          "  Sentences without events:", NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def check_date(self, match):
    """
    Method for resolving date restrictions on actor codes.

    Each entry of `match` is indexed as entry[0] (the code) and entry[1]
    (a sequence of date strings, optionally prefixed with '<' or '>').
    The date strings are converted with PETRreader.dstr_to_ordate and
    compared against self.date:

      * no dates          -> the code always applies
      * one date, '<'     -> applies when self.date is before it
      * one date, other   -> applies when self.date is on or after it
      * two or more dates -> applies when date[0] <= self.date < date[1]

    Parameters
    -----------
    match: list
           Dates and codes from the dictionary

    Returns
    -------
    code: string
          The first non-empty code whose restriction holds; "" when no entry
          applies, None when `match` is empty. Exceptions are not trapped
          in this version and propagate to the caller.

    Note <16.06.10 pas>
    -------------------
    In a very small set of cases involving a reflexive PRP inside a PP, the
    system can get into an infinite recursion where it first backs up a couple
    levels from the (PP, then this call to child.get_meaning() drops back down
    to the same point via the two child invocations in
    NounPhrase.get_meaning()

                elif child.label == "PP":
                    m = self.resolve_codes(child.get_meaning())

    and in PrepPhrase.get_meaning()

                self.meaning = self.children[1].get_meaning() if isinstance(self.children[1],NounPhrase) else ""

    which takes one back to the same point at one deeper level of recursion.
    These structures occurred about five times in a 20M sentence corpus, and
    I couldn't find any fix that didn't break something else, so I just
    trapped it here.

    There are a bunch of commented-out debugging prints remaining from this
    futile pursuit that could presumably be removed at some point.

    The full record for one of the offending cases is:

    <Sentence date = "20150824" id ="e35ef55a-fa30-4c34-baae-965dea33d8d8_3" source = "ANOTHER INFINITE RECURSION" sentence = "True">
    <Text>
    He started out at the bottom of the Hollywood rung, directed his own movie and managed to get noticed by Steven
    Spielberg himself to nab a tiny role in 1998s Saving Private Ryan .
    </Text>
    <Parse>
    (ROOT (S (S (NP (PRP He)) (VP (VBD started) (PRT (RP out)) (PP (IN at) (NP (NP (DT the) (NN bottom))
    (PP (IN of) (NP (DT the) (NNP Hollywood) )))))) (VP (VBD rung)) (, ,) (S (VP (VP (VBD directed)
    (NP (PRP$ his) (JJ own) (NN movie))) (CC and) (VP (VBD managed) (S (VP (TO to) (VP (VB get)
    (VP (VBN noticed) (PP (IN by) (NP (NNP Steven) (NNP Spielberg) (PRP himself)) )
    (S (VP (TO to) (VP (VB nab) (NP (NP (DT a) (JJ tiny) (NN role)) (PP (IN in) (NP (NP (NNS 1998s))
    (VP (VBG Saving) (NP (JJ Private) (NNP Ryan)) )))))))))))))) (. .)))
    </Parse>
    </Sentence>
    """
    code = None
    for entry in match:
        restrictions = entry[1]
        code = ""
        bounds = []
        for raw in restrictions:
            prefixed = raw[0] in '<>'
            ordinal = PETRreader.dstr_to_ordate(raw[1:] if prefixed else raw)
            bounds.append((raw[0] if prefixed else "") + str(ordinal))
        today = self.date

        if len(bounds) == 0:
            code = entry[0]
        elif len(bounds) == 1:
            only = bounds[0]
            # The first character is dropped in both cases, as in the
            # prefixed form "<ordinal>" / ">ordinal".
            limit = int(only[1:])
            before = only[0] == '<'
            if (before and today < limit) or (not before and today >= limit):
                code = entry[0]
        elif today < int(bounds[1]) and today >= int(bounds[0]):
            code = entry[0]

        if code:
            return code
    return code
def do_coding(event_dict, out_file):
    """
    Main coding loop.

    Stories are visited in sorted key order. For every sentence that has a
    'parsed' entry, any per-sentence 'config' overrides are applied, the
    sentence is checked with check_discards(), coded through
    PETRtree.Sentence(...).get_events(), and the results are written back
    into event_dict: story-level 'meta' / 'text' (or ['meta']['verbs']),
    sentence-level 'events', 'meta' and 'issues', plus 'actortext' /
    'eventtext' / 'actorroot' inside the sentence 'meta' when one of the
    PETRglobals.Write* flags is set. A story discard sets the story's 'sents'
    to None. A summary is printed at the end.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}.
                Modified in place.
    out_file: string
              Not used: the TeX output (open_tex / print_to_file / close_tex)
              has been disabled since <16.06.18 pas>. The parameter is kept
              so existing callers keep working.

    Returns
    -------
    event_dict: dict
                The same object that was passed in.

    Note that entering any character other than 'Enter' at the prompt will
    stop the program: this is deliberate.

    <14.02.28>: Bug: PETRglobals.PauseByStory actually pauses after the first
                sentence of the *next* story
    """
    NStory = NSent = NEvents = NEmpty = 0
    NDiscardSent = NDiscardStory = 0
    times = 0
    sents = 0
    logger = logging.getLogger('petr_log')

    for key, val in sorted(event_dict.items()):
        NStory += 1
        SkipStory = False
        print('\n\nProcessing story {}'.format(key))
        StoryDate = val['meta']['date']

        for sent in val['sents']:
            NSent += 1
            sent_data = val['sents'][sent]
            # Built before the parse check so the "no parse" message always
            # names the sentence actually being skipped.
            SentenceID = '{}_{}'.format(key, sent)
            if 'parsed' not in sent_data:
                logger.info(
                    '{} has no parse information. Passing.'.format(SentenceID))
                continue

            if 'config' in sent_data:
                for config in sent_data['config'].values():
                    change_Config_Options(config)

            SentenceText = sent_data['content']
            # A sentence-level date overrides the story date.
            SentenceDate = sent_data.get('date', StoryDate)
            Date = PETRreader.dstr_to_ordate(SentenceDate)
            print("\n", SentenceID)

            disc = check_discards(SentenceText)
            if disc[0] > 0:
                if disc[0] == 1:
                    print("Discard sentence:", disc[1])
                    logger.info('\tSentence discard. {}'.format(disc[1]))
                    NDiscardSent += 1
                    continue
                print("Discard story:", disc[1])
                logger.info('\tStory discard. {}'.format(disc[1]))
                SkipStory = True
                NDiscardStory += 1
                break

            t1 = time.time()
            sentence = PETRtree.Sentence(sent_data['parsed'], SentenceText, Date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            code_time = time.time() - t1

            # NOTE(review): the elif below can never run because its condition
            # is already covered by the first test; kept as found.
            if PETRglobals.NullVerbs or PETRglobals.NullActors:
                val['meta'] = meta
                val['text'] = sentence.txt
            elif PETRglobals.NullActors:
                val['events'] = coded_events
                coded_events = None  # skips additional processing
                val['text'] = sentence.txt
            else:
                # 16.04.30 pas: we're using the key value 'meta' at two very
                # different levels of event_dict -- see the code about ten
                # lines below -- and this is potentially confusing, so it
                # probably would be useful to change one of those
                val['meta']['verbs'] = meta

            del sentence
            times += code_time
            sents += 1

            if coded_events:
                sent_data['events'] = coded_events
                sent_data['meta'] = meta
                if (PETRglobals.WriteActorText or PETRglobals.WriteEventText
                        or PETRglobals.WriteActorRoot):
                    text_dict = utilities.extract_phrases(sent_data, SentenceID)
                    if text_dict:
                        sent_data['meta']['actortext'] = {}
                        sent_data['meta']['eventtext'] = {}
                        sent_data['meta']['actorroot'] = {}
                        for evt in coded_events:
                            # 16.04.30 pas bypasses problems with expansion
                            # of compounds
                            if evt in text_dict:
                                phrases = text_dict[evt]
                                sent_data['meta']['actortext'][evt] = phrases[:2]
                                sent_data['meta']['eventtext'][evt] = phrases[2]
                                sent_data['meta']['actorroot'][evt] = phrases[3:5]

            if coded_events and PETRglobals.IssueFileName != "":
                event_issues = get_issues(SentenceText)
                if event_issues:
                    sent_data['issues'] = event_issues

            if PETRglobals.PauseBySentence:
                # Any input other than a bare Enter stops the program.
                if len(input("Press Enter to continue...")) > 0:
                    sys.exit()

            n_coded = len(coded_events)
            NEvents += n_coded
            if n_coded == 0:
                NEmpty += 1

        if SkipStory:
            val['sents'] = None

    print("\nSummary:")
    print(
        "Stories read:",
        NStory,
        "   Sentences coded:",
        NSent,
        "  Events generated:",
        NEvents)
    print(
        "Discards:  Sentence",
        NDiscardSent,
        "  Story",
        NDiscardStory,
        "  Sentences without events:",
        NEmpty)
    print("Average Coding time = ", times / sents if sents else 0)
    return event_dict
def do_coding(event_dict):
    """
    Main coding loop (debugging variant).

    Stories are visited in sorted key order. For every sentence that has a
    'parsed' entry, any per-sentence 'config' overrides are applied, the
    sentence is checked with check_discards(), a PETRtree.Sentence is built
    and get_events() is called; the sentence text and the coded events are
    printed. Nothing is written back into event_dict in this variant: the
    storing of events, phrase extraction, issue lookup, the pause prompt,
    story clearing and the summary are all disabled.

    A sentence discard skips to the next sentence; a story discard stops
    processing the remaining sentences of that story.

    Parameters
    -----------
    event_dict: dict
                Story key -> {'meta': {'date': ...}, 'sents': {...}}.

    Returns
    -------
    event_dict: dict
                The same object that was passed in.
    """
    log = logging.getLogger('petr_log')

    for story_id, story in sorted(event_dict.items()):
        print('\n\nProcessing story {}'.format(story_id))
        story_date = story['meta']['date']

        for sent in story['sents']:
            print("sent:", sent)
            record = story['sents'][sent]
            if 'parsed' not in record:
                continue

            if 'config' in record:
                for config in record['config'].values():
                    change_Config_Options(config)

            sentence_id = '{}_{}'.format(story_id, sent)
            text = record['content']
            # A sentence-level date overrides the story date.
            date_string = record['date'] if 'date' in record else story_date
            ordinal_date = PETRreader.dstr_to_ordate(date_string)
            print("\n", sentence_id)
            tree_string = record['parsed']

            verdict = check_discards(text)
            level = verdict[0]
            if level == 1:
                print("Discard sentence:", verdict[1])
                log.info('\tSentence discard. {}'.format(verdict[1]))
                continue
            if level > 0:
                print("Discard story:", verdict[1])
                log.info('\tStory discard. {}'.format(verdict[1]))
                break

            t1 = time.time()  # start time; the timing report itself is disabled
            sentence = PETRtree.Sentence(tree_string, text, ordinal_date)
            print(sentence.txt)
            # this is the entry point into the processing in PETRtree
            coded_events, meta = sentence.get_events()
            print("coded_events:", coded_events)

    return event_dict
def test_reflexive2():
    """Check that "himself" inside the embedded clause is coded ["RUSGOV"].

    The parse is "OBAMA KNEW THAT PUTIN LIKED HIMSELF"; the asserted phrase is
    the NP holding "himself" in the embedded clause whose subject is Putin.
    """
    parse = "(S (NP (NNP Obama ) ) (VP (VBD knew ) (SBAR (IN that ) (S (NP (NNP Putin ) ) (VP (VBD liked ) (NP (PRP himself ) ) ) ) ) ) ) ".upper()
    # The sentence text previously read "Obama knew that Biden liked him",
    # which did not correspond to the parse above; it now matches the parse.
    test = ptree.Sentence(parse, "Obama knew that Putin liked himself",
                          PETRreader.dstr_to_ordate("20150813"))
    # Walk Root -> VP -> SBAR -> S -> VP -> NP "himself".
    phrase = test.tree
    for _ in range(5):
        phrase = phrase.children[1]
    assert phrase.get_meaning() == ["RUSGOV"]
def test_reflexive():
    """Check that the NP holding the reflexive pronoun "himself" is coded ["USAGOV"]."""
    text = "Obama asked himself why Biden was tired"
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD asked ) (NP (PRP himself ) ) (SBAR (WHADVP (WRB why ) ) (S (NP (NNP Biden ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) )"
    parse = bracketed.upper()
    test = ptree.Sentence(parse, text, PETRreader.dstr_to_ordate("20150813"))
    # Root -> VP (second child) -> NP "himself" (second child of the VP).
    verb_phrase = test.tree.children[1]
    phrase = verb_phrase.children[1]
    assert phrase.get_meaning() == ["USAGOV"]
def test_personal1():
    """Check that the NP holding the personal pronoun "he" is coded ["USAGOV"]."""
    text = "Obama said he was tired"
    bracketed = "(S (NP (NNP Obama ) ) (VP (VBD said ) (SBAR (S (NP (PRP he ) ) (VP (VBD was ) (ADJP (VBN tired ) ) ) ) ) ) ) "
    parse = bracketed.upper()
    test = ptree.Sentence(parse, text, PETRreader.dstr_to_ordate("20150813"))
    # Walk Root -> VP -> SBAR -> S -> NP "he".
    node = test.tree
    for index in (1, 1, 0, 0):
        node = node.children[index]
    assert node.get_meaning() == ["USAGOV"]