def pipe_output(event_dict):
    """
    Build the per-story event records consumed by the processing pipeline.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    Returns
    -------

    final_out: Dictionary.
               StoryIDs as the keys and a list of coded event tuples as the
               values, i.e., {StoryID: [(full_record), (full_record)]}.
               Each ``full_record`` is structured as
               (story_date, source, target, code, joined_issues, ids, url,
               StorySource); the ``joined_issues`` field is only present
               when the event carries issues. The issues are joined in the
               format of ISSUE,COUNT;ISSUE,COUNT. The IDs are joined as
               ID;ID;ID. ``url`` is '' and ``StorySource`` is 'NULL' when
               the story metadata lacks them. Stories without sentences or
               without filtered events get no entry at all.
    """
    records_by_story = {}
    for story_id in event_dict:
        story = event_dict[story_id]
        if not story['sents']:
            # skip cases eliminated by story-level discard
            continue

        coded = utilities.story_filter(story, story_id)
        meta = story['meta']
        story_source = meta['source'] if 'source' in meta else 'NULL'
        story_url = meta['url'] if 'url' in meta else ''
        if not coded:
            continue

        records = []
        for event in coded:
            details = coded[event]
            # (story_date, source, target, code) lead every record.
            head = (event[0], event[1], event[2], event[3])
            id_field = ';'.join(details['ids'])
            if 'issues' in details:
                issue_field = ';'.join(
                    '{},{}'.format(name, count)
                    for name, count in details['issues'].items())
                record = head + (issue_field, id_field, story_url,
                                 story_source)
            else:
                record = head + (id_field, story_url, story_source)
            records.append(record)
        records_by_story[story_id] = records

    return records_by_story
def write_events(event_dict, output_file):
    """
    Formats and writes the coded event data to a file in a standard
    event-data format.

    One tab-delimited line is produced per filtered event: the elements of
    the event tuple, the joined issues (ISSUE,COUNT;ISSUE,COUNT, or an empty
    field), the sentence IDs joined as ID;ID, the story URL (only when the
    story has one) and the story source. Depending on
    ``PETRglobals.WriteActorText``, ``PETRglobals.WriteEventText`` and
    ``PETRglobals.WriteActorRoot``, the actor text, event text and actor root
    columns are appended, with '---' standing in for missing values. Every
    event is also echoed to stdout.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    output_file: String.
                 Filepath to which events should be written. Nothing is
                 written when this is empty.
    """
    # StorySource is assigned below and therefore published at module scope.
    global StorySource
    global NEvents
    global StoryIssues

    event_output = []
    for key in event_dict:
        story_dict = event_dict[key]
        if not story_dict['sents']:
            continue    # skip cases eliminated by story-level discard
        story_output = []
        filtered_events = utilities.story_filter(story_dict, key)
        meta = story_dict['meta']
        StorySource = meta.get('source', 'NULL')
        url = meta.get('url', '')

        for event in filtered_events:
            event_info = filtered_events[event]
            story_date = event[0]
            source = event[1]
            target = event[2]
            # Only used for the console echo below.
            code = filter(lambda a: not a == '\n', event[3])
            ids = ';'.join(event_info['ids'])

            if 'issues' in event_info:
                joined_issues = ';'.join(
                    '{},{}'.format(k, v)
                    for k, v in event_info['issues'].items())
            else:
                joined_issues = ''

            print('Event: {}\t{}\t{}\t{}\t{}\t{}'.format(
                story_date, source, target, code, ids, StorySource))

            # 15.04.30: a very crude hack around an error involving
            # multi-word verbs: a non-string code is replaced by '010'.
            # The fields are joined in one pass so that a four-element event
            # does not pick up a spurious empty column after the code.
            if not isinstance(event[3], basestring):
                event_str = '\t'.join(
                    tuple(event[:3]) + ('010',) + tuple(event[4:]))
            else:
                event_str = '\t'.join(event)

            # The issues column is always present, possibly empty.
            event_str += '\t{}'.format(joined_issues)

            if url:
                event_str += '\t{}\t{}\t{}'.format(ids, url, StorySource)
            else:
                event_str += '\t{}\t{}'.format(ids, StorySource)

            if PETRglobals.WriteActorText:
                if 'actortext' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actortext'][0],
                        event_info['actortext'][1])
                else:
                    event_str += '\t---\t---'
            if PETRglobals.WriteEventText:
                if 'eventtext' in event_info:
                    event_str += '\t{}'.format(event_info['eventtext'])
                else:
                    event_str += '\t---'
            if PETRglobals.WriteActorRoot:
                if 'actorroot' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actorroot'][0],
                        event_info['actorroot'][1])
                else:
                    event_str += '\t---\t---'

            story_output.append(event_str)

        event_output.append('\n'.join(story_output))

    # Filter out blank lines (stories that produced no events)
    event_output = [story for story in event_output if story]

    if output_file:
        # The context manager closes the file even if a write fails.
        with codecs.open(output_file, encoding='utf-8', mode='w') as f:
            for story_events in event_output:
                f.write(story_events + '\n')
def write_events(event_dict, output_file):
    """
    Formats and writes the coded event data to a file in a standard
    event-data format.

    Each filtered event becomes one tab-delimited line holding the event
    tuple, the joined issues (ISSUE,COUNT;ISSUE,COUNT, or an empty field),
    the sentence IDs joined as ID;ID, the story URL (only when the story has
    one) and the story source. All lines are joined with newlines and
    written to ``output_file`` in a single call; the file is always opened,
    even when there is nothing to write. Each event is echoed to stdout
    twice: once as a summary and once as the raw joined tuple.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    output_file: String.
                 Filepath to which events should be written.
    """
    # StorySource is assigned below and therefore published at module scope.
    global StorySource
    global NEvents
    global StoryIssues

    story_blocks = []
    for story_id in event_dict:
        story = event_dict[story_id]
        if not story['sents']:
            # skip cases eliminated by story-level discard
            continue

        lines = []
        coded = utilities.story_filter(story, story_id)
        meta = story['meta']
        StorySource = meta['source'] if 'source' in meta else 'NULL'
        url = meta['url'] if 'url' in meta else ''

        for event in coded:
            details = coded[event]
            story_date, source, target, code = (
                event[0], event[1], event[2], event[3])
            ids = ';'.join(details['ids'])

            if 'issues' in details:
                joined_issues = ';'.join(
                    ['{},{}'.format(name, count)
                     for name, count in details['issues'].items()])
            else:
                joined_issues = []

            print('Event: {}\t{}\t{}\t{}\t{}\t{}'.format(
                story_date, source, target, code, ids, StorySource))

            # 15.04.30: a very crude hack around an error involving
            # multi-word verbs: a non-string code is replaced by '010'.
            if isinstance(code, basestring):
                event_str = '\t'.join(event)
            else:
                event_str = ('\t'.join(event[:3]) + '\t010\t' +
                             '\t'.join(event[4:]))

            print(event_str)

            # The issues column is always present, possibly empty.
            event_str += '\t{}'.format(joined_issues) if joined_issues else '\t'

            if url:
                trailer = '\t{}\t{}\t{}'.format(ids, url, StorySource)
            else:
                trailer = '\t{}\t{}'.format(ids, StorySource)
            lines.append(event_str + trailer)

        story_blocks.append('\n'.join(lines))

    # Filter out blank lines (stories that produced no events)
    final_event_str = '\n'.join([block for block in story_blocks if block])
    with open(output_file, 'w') as f:
        f.write(final_event_str)
def write_events(event_dict, output_file):
    """
    Formats and writes the coded event data to a file in a standard
    event-data format.

    One tab-delimited line is produced per filtered event: the elements of
    the event tuple, the joined issues (ISSUE,COUNT;ISSUE,COUNT, or an empty
    field), the sentence IDs joined as ID;ID, the story URL (only when the
    story has one) and the story source. Depending on
    ``PETRglobals.WriteActorText``, ``PETRglobals.WriteEventText`` and
    ``PETRglobals.WriteActorRoot``, the actor text, event text and actor root
    columns are appended, with '---' standing in for missing values. Events
    whose code is not a string are dropped. Within a story the lines are
    ordered by their joined IDs. Every written event is also echoed to
    stdout.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    output_file: String.
                 Filepath to which events should be written. Nothing is
                 written when this is empty.
    """
    # StorySource is assigned below and therefore published at module scope.
    global StorySource
    global NEvents
    global StoryIssues

    event_output = []
    for key in event_dict:
        story_dict = event_dict[key]
        if not story_dict['sents']:
            continue    # skip cases eliminated by story-level discard
        keyed_output = []
        filtered_events = utilities.story_filter(story_dict, key)
        meta = story_dict['meta']
        StorySource = meta.get('source', 'NULL')
        url = meta.get('url', '')

        for event in filtered_events:
            if not isinstance(event[3], basestring):
                # occasional issue in PETR-2 due to mishandling of
                # multi-word verb PAS 15.04.03, modified 18.06.01
                continue
            event_info = filtered_events[event]
            story_date = event[0]
            source = event[1]
            target = event[2]
            code = event[3]
            ids = ';'.join(event_info['ids'])

            if 'issues' in event_info:
                joined_issues = ';'.join(
                    '{},{}'.format(k, v)
                    for k, v in event_info['issues'].items())
            else:
                joined_issues = ''

            print('Event: {}\t{}\t{}\t{}\t{}\t{}'.format(
                story_date, source, target, code, ids, StorySource))

            event_str = '\t'.join(event)

            # The issues column is always present, possibly empty.
            event_str += '\t{}'.format(joined_issues)

            if url:
                event_str += '\t{}\t{}\t{}'.format(ids, url, StorySource)
            else:
                event_str += '\t{}\t{}'.format(ids, StorySource)

            if PETRglobals.WriteActorText:
                if 'actortext' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actortext'][0],
                        event_info['actortext'][1])
                else:
                    event_str += '\t---\t---'
            if PETRglobals.WriteEventText:
                if 'eventtext' in event_info:
                    event_str += '\t{}'.format(event_info['eventtext'])
                else:
                    event_str += '\t---'
            if PETRglobals.WriteActorRoot:
                if 'actorroot' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actorroot'][0],
                        event_info['actorroot'][1])
                else:
                    event_str += '\t---\t---'

            # Prefix with the ids so that a plain string sort orders the
            # story's events by ids; the prefix is stripped again below.
            keyed_output.append(ids + "\t" + event_str)

        # sort output by story ids, then drop the sort prefix (everything up
        # to and including the first tab)
        story_events = '\n'.join(
            keyed.partition('\t')[2] for keyed in sorted(keyed_output))
        event_output.append(story_events)

    # Filter out blank lines (stories that produced no events)
    event_output = [story for story in event_output if story]

    if output_file:
        # The context manager closes the file even if a write fails.
        with codecs.open(output_file, encoding='utf-8', mode='w') as f:
            for line in event_output:
                f.write(line + '\n')
def write_events(event_dict, output_file):
    """
    Formats and writes the coded event data to a file in a standard
    event-data format.

    One tab-delimited line is produced per filtered event: the elements of
    the event tuple, the joined issues (ISSUE,COUNT;ISSUE,COUNT, or an empty
    field), the sentence IDs joined as ID;ID, the story URL (only when the
    story has one) and the story source. Depending on
    ``PETRglobals.WriteActorText``, ``PETRglobals.WriteEventText`` and
    ``PETRglobals.WriteActorRoot``, the actor text, event text and actor root
    columns are appended, with '---' standing in for missing values. The
    last column is the ``reffilename`` recorded on the story's first
    sentence. Every event is also echoed to stdout.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    output_file: String.
                 Filepath to which events should be written. Nothing is
                 written when this is empty.
    """
    # StorySource is assigned below and therefore published at module scope.
    global StorySource
    global NEvents
    global StoryIssues

    event_output = []
    for key in event_dict:
        story_dict = event_dict[key]
        if not story_dict['sents']:
            continue    # skip cases eliminated by story-level discard

        # The reference file name is read from the first sentence in
        # iteration order; 'sents' is known to be non-empty here.
        first_sent = next(iter(story_dict['sents']))
        reffilename = story_dict['sents'][first_sent]['reffilename']

        story_output = []
        filtered_events = utilities.story_filter(story_dict, key)
        meta = story_dict['meta']
        StorySource = meta.get('source', 'NULL')
        url = meta.get('url', '')

        for event in filtered_events:
            event_info = filtered_events[event]
            story_date = event[0]
            source = event[1]
            target = event[2]
            # Only used for the console echo below.
            code = filter(lambda a: not a == '\n', event[3])
            ids = ';'.join(event_info['ids'])

            if 'issues' in event_info:
                joined_issues = ';'.join(
                    '{},{}'.format(k, v)
                    for k, v in event_info['issues'].items())
            else:
                joined_issues = ''

            print('Event: {}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                story_date, source, target, code, ids, StorySource,
                reffilename))

            # 15.04.30: a very crude hack around an error involving
            # multi-word verbs: a non-string code is replaced by '010'.
            # The fields are joined in one pass so that a four-element event
            # does not pick up a spurious empty column after the code.
            if not isinstance(event[3], basestring):
                event_str = '\t'.join(
                    tuple(event[:3]) + ('010',) + tuple(event[4:]))
            else:
                event_str = '\t'.join(event)

            # The issues column is always present, possibly empty.
            event_str += '\t{}'.format(joined_issues)

            if url:
                event_str += '\t{}\t{}\t{}'.format(ids, url, StorySource)
            else:
                event_str += '\t{}\t{}'.format(ids, StorySource)

            if PETRglobals.WriteActorText:  # default true
                if 'actortext' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actortext'][0],
                        event_info['actortext'][1])
                else:
                    event_str += '\t---\t---'
            if PETRglobals.WriteEventText:  # default true
                if 'eventtext' in event_info:
                    event_str += '\t{}'.format(event_info['eventtext'])
                else:
                    event_str += '\t---'
            if PETRglobals.WriteActorRoot:
                if 'actorroot' in event_info:
                    event_str += '\t{}\t{}'.format(
                        event_info['actorroot'][0],
                        event_info['actorroot'][1])
                else:
                    event_str += '\t---\t---'

            event_str += '\t{}'.format(reffilename)
            story_output.append(event_str)

        event_output.append('\n'.join(story_output))

    # Filter out blank lines (stories that produced no events)
    event_output = [story for story in event_output if story]

    if output_file:
        # The context manager closes the file even if a write fails.
        with codecs.open(output_file, encoding='utf-8', mode='w') as f:
            for story_events in event_output:
                f.write(story_events + '\n')
def write_events(event_dict, output_file, flag=True):
    """
    Formats and writes the coded event data to a file in a standard
    event-data format, optionally merging related events of a story first.

    For every story with sentences, the filtered events are collected into
    the module-level list ``event_temp`` (one dict per kept event). When
    merging is enabled, each new event is compared against the events
    already kept for the story and may be folded into one of them through
    ``modify_event`` instead of being kept itself. The kept events are then
    rendered by ``get_event_str`` and accumulated for output.

    Parameters
    ----------

    event_dict: Dictionary.
                The main event-holding dictionary within PETRARCH.

    output_file: String.
                 Filepath to which events should be written. No file is
                 written when this is empty.

    flag: Boolean.
          The value passed by the caller is ignored: it is reset to False
          on entry and then replaced by ``gcp.merge_event`` unless that
          setting is the empty string.

    NOTE(review): the nesting of this function was reconstructed from
    whitespace-collapsed source; confirm the block structure against the
    upstream file before relying on it.
    """
    global StorySource
    global NEvents
    global StoryIssues
    global StoryNer
    global StoryNer2
    global filtered_events
    global event_temp

    event_output = []
    # for testing
    flag = False
    import globalConfigPara as gcp
    if not gcp.merge_event == "":
        flag = gcp.merge_event
    for key in sorted(event_dict):
        story_dict = event_dict[key]
        if not story_dict['sents']:
            continue  # skip cases eliminated by story-level discard
        # print('WE1',story_dict)
        story_output = []  # event_str in one story
        event_temp = []  # event_origin in one story
        filtered_events = utilities.story_filter(story_dict, key)
        # print('WE2',filtered_events)
        if 'source' in story_dict['meta']:
            StorySource = story_dict['meta']['source']
        else:
            StorySource = 'NULL'
        if 'url' in story_dict['meta']:
            url = story_dict['meta']['url']
        else:
            url = ''
        StoryNer = ner_to_string(story_dict['meta']['ner'])
        # extract_location
        StoryNer2 = get_loction(story_dict)
        # event is tuple
        for event in filtered_events:
            temp_event_dict = {}
            skip_flag = False  # set when this event is folded into an earlier one
            story_date = event[0]
            source = event[1]
            target = event[2]
            code = filter(lambda a: not a == '\n', event[3])
            ids = filtered_events[event]["ids"]
            temp_event_dict.update({"origin": event})
            temp_event_dict.update({"ids": ids})
            if flag:
                # Compare against every event already kept for this story.
                for i, pre_dict in enumerate(event_temp):
                    pre_event = pre_dict["origin"]
                    pre_code = filter(lambda a: not a == '\n', pre_event[3])
                    pre_ids = pre_dict["ids"]
                    same_code = check_same_event(code, pre_code)
                    # 0: not the same event; 4: the same event but not the
                    # same detailed event
                    if same_code == 0 or same_code == 4:
                        continue
                    # fill in missing components
                    if check_successive_sent(ids, pre_ids):
                        # Per the case comments below, the result appears to
                        # mean 0 = source and target missing, 1 = source
                        # missing, 2 = target missing, 3 = complete -- TODO
                        # confirm against check_miss_component.
                        miss1 = check_miss_component(event)
                        miss2 = check_miss_component(pre_event)
                        # 4*4 = 16 cases in total
                        # Both lack source / both lack target / both lack
                        # source and target / both complete: no component
                        # filling, move on to the event-merging step below.
                        # (4 cases pass)
                        if miss1 == miss2:
                            pass
                        # event lacks every component: skip it and only merge
                        # the ids. (3 cases)
                        elif miss1 == 0:
                            skip_flag = True
                            modify_event(event, i, same_code, 0)
                        # If pre_event lacks every component, fill all of
                        # event's components into pre_event. (3 cases)
                        elif miss2 == 0:
                            skip_flag = True
                            modify_event(event, i, same_code, 3)
                        # If pre_event lacks its source, event has a source
                        # and both share the same target, merge. (2 cases)
                        elif miss2 == 1:
                            if miss1 == 2 or (miss1 == 3 and target == pre_event[2]):
                                modify_event(event, i, same_code, 1)
                                if miss1 == 2:
                                    skip_flag = True
                        # If pre_event lacks its target, event has a target
                        # and both share the same source, merge. (2 cases)
                        elif miss2 == 2:
                            if miss1 == 1 or (miss1 == 3 and source == pre_event[1]):
                                modify_event(event, i, same_code, 2)
                                if miss1 == 1:
                                    skip_flag = True
                        # If pre_event is complete while event lacks a
                        # component, merge the ids. (2 cases)
                        elif miss2 == 3:
                            if (miss1 == 1 and target == pre_event[2]) or (
                                    miss1 == 2 and source == pre_event[1]):
                                skip_flag = True
                                modify_event(event, i, same_code, 0)
                    # parent/child event replacement
                    if not skip_flag:
                        # pre_event is modified
                        pre_event = event_temp[i]["origin"]
                        if story_date == pre_event[0] and source == pre_event[
                                1] and target == pre_event[2]:
                            skip_flag = True
                            modify_event(event, i, same_code, 0)
            # A folded event is not kept as an entry of its own.
            if skip_flag:
                continue
            # Copy every optional attribute of the event that is present.
            if 'issues' in filtered_events[event]:
                iss = filtered_events[event]['issues']
                issues = ['{},{}'.format(k, v) for k, v in iss.items()]
                joined_issues = ';'.join(issues)
                temp_event_dict.update({"joined_issues": joined_issues})
            if url:
                temp_event_dict.update({"url": url})
            if 'content' in filtered_events[event]:
                temp_event_dict.update(
                    {"content": filtered_events[event]['content']})
            if 'Source' in filtered_events[event]:
                temp_event_dict.update(
                    {"Source": filtered_events[event]['Source']})
            if 'Target' in filtered_events[event]:
                temp_event_dict.update(
                    {"Target": filtered_events[event]['Target']})
            if 'actortext' in filtered_events[event]:
                temp_event_dict.update(
                    {"actortext": filtered_events[event]["actortext"]})
            if 'eventtext' in filtered_events[event]:
                temp_event_dict.update(
                    {"eventtext": filtered_events[event]['eventtext']})
            # if True:
            if 'actorroot' in filtered_events[event]:
                temp_event_dict.update(
                    {"actorroot": filtered_events[event]['actorroot']})
            if 'eventroot' in filtered_events[event]:
                temp_event_dict.update(
                    {"eventroot": filtered_events[event]['eventroot']})
            if 'sentenceTime' in filtered_events[event]:
                temp_event_dict.update(
                    {"sentenceTime": filtered_events[event]['sentenceTime']})
            if 'timeText' in filtered_events[event]:
                temp_event_dict.update(
                    {"timeText": filtered_events[event]['timeText']})
            if 'locationText' in filtered_events[event]:
                temp_event_dict.update(
                    {"locationText": filtered_events[event]['locationText']})
            event_temp.append(temp_event_dict)
        # presumably returns a list of output strings for the story (it is
        # concatenated onto event_output) or None -- verify against
        # get_event_str
        event_str = get_event_str(event_temp, event_dict)
        if event_str is not None:
            event_output += event_str
        # story_output is never appended to, so this joins an empty list and
        # the resulting '' is removed by the blank-line filter below.
        story_events = '\n'.join(story_output)
        event_output.append(story_events)
    # Filter out blank lines
    event_output = [event for event in event_output if event]
    if output_file:
        if flag:
            f = codecs.open(output_file, encoding='utf-8', mode='a')
            for strw in event_output:
                # field = str.split('\t') # debugging
                # f.write(field[5] + '\n')
                f.write(strw + '\n')
            f.close()
        else:
            with open("evets.result_before_merge.txt", 'a') as f:
                for strw in event_output:
                    f.write(strw + '\n')
    # NOTE(review): as laid out here, a truthy flag appends event_output to
    # output_file a second time (same code as the block above) -- confirm
    # whether the duplication is intended.
    if output_file:
        if flag:
            f = codecs.open(output_file, encoding='utf-8', mode='a')
            for strq in event_output:
                # field = str.split('\t') # debugging
                # f.write(field[5] + '\n')
                f.write(strq + '\n')
            f.close()
        else:
            # Collect characters 5..10 of the third line of every output
            # entry; presumably this is the story/article id -- verify
            # against the format produced by get_event_str.
            story_list = []
            for strp in event_output:
                f_list = strp.splitlines()
                for index in range(len(f_list)):
                    if (index == 2):
                        story_list.append(f_list[index][5:11])
            story_list2 = list(set(story_list))
            # One "<id>evets.result_before_merge.txt" file per distinct id,
            # opened in append mode.
            for i in range(len(story_list2)):
                str_name = story_list2[i] + "evets.result_before_merge.txt"
                with open(str_name, 'a') as f:
                    TEXT_ROOT = os.path.abspath(
                        os.path.dirname(os.path.dirname(__file__)))  # get the project root directory
                    path = os.path.join(TEXT_ROOT, "input\\test.txt")  # file path
                    # Copy the input lines whose first tab-separated field
                    # equals this id.
                    with open(path, 'r') as file_to_read:
                        while True:
                            line = file_to_read.readline()
                            if not line:
                                break
                            # if(line[0:6]==story_list2[i]):
                            #     f.write(line)
                            if (line.split("\t")[0] == story_list2[i]):
                                f.write(line)
                    event_num = 1
                    for strss in event_output:
                        listk = strss.splitlines()
                        for index in range(len(listk)):
                            # The id is taken from the last tab-separated
                            # field of the entry's third line, up to the
                            # first '-'.
                            temp = listk[2].split('\t')
                            article_id = temp[len(temp) - 1].split('-')[0]
                            if (article_id == story_list2[i]):
                                if (index == 0):
                                    # Start of an entry: blank line plus a
                                    # numbered "#e<N>" header.
                                    f.write('\n')
                                    ss = "#e" + str(event_num)
                                    f.write(ss + '\n')
                                    event_num = event_num + 1
                                # Only these line positions of an entry are
                                # written out.
                                if (index == 0 or index == 2 or index == 4 or index == 5 or index == 6 or index == 8 or index == 10):
                                    f.write(listk[index] + '\n')
                    f.close()
def write_events(event_dict, output_file): """ Formats and writes the coded event data to a file in a standard event-data format. Parameters ---------- event_dict: Dictionary. The main event-holding dictionary within PETRARCH. output_file: String. Filepath to which events should be written. """ global StorySource global NEvents global StoryIssues event_output = [] for key in event_dict: story_dict = event_dict[key] if not story_dict['sents']: continue # skip cases eliminated by story-level discard story_output = [] filtered_events = utilities.story_filter(story_dict, key) if 'source' in story_dict['meta']: StorySource = story_dict['meta']['source'] else: StorySource = 'NULL' if 'url' in story_dict['meta']: url = story_dict['meta']['url'] else: url = '' for event in filtered_events: story_date = event[0] source = event[1] target = event[2] code = filter(lambda a: not a == '\n', event[3]) ids = ';'.join(filtered_events[event]['ids']) if 'issues' in filtered_events[event]: iss = filtered_events[event]['issues'] issues = ['{},{}'.format(k, v) for k, v in iss.items()] joined_issues = ';'.join(issues) else: joined_issues = [] print('Event: {}\t{}\t{}\t{}\t{}\t{}'.format(story_date, source, target, code, ids, StorySource)) # event_str = '{}\t{}\t{}\t{}'.format(story_date,source,target,code) # 15.04.30: a very crude hack around an error involving multi-word # verbs if not isinstance(event[3], basestring): event_str = '\t'.join( event[:3]) + '\t010\t' + '\t'.join(event[4:]) else: event_str = '\t'.join(event) #print(event_str) if joined_issues: event_str += '\t{}'.format(joined_issues) else: event_str += '\t' if url: event_str += '\t{}\t{}\t{}'.format(ids, url, StorySource) story_output.append(event_str) else: event_str += '\t{}\t{}'.format(ids, StorySource) story_output.append(event_str) story_events = '\n'.join(story_output) event_output.append(story_events) # Filter out blank lines event_output = [event for event in event_output if event] final_event_str = 
'\n'.join(event_output)