from k54 import load_xml

# Build sentence_list so that sentence_list[sentence_index][token_index]
# yields any word in the document (both indices are 1-based, matching
# the CoreNLP XML numbering).
doc = load_xml()['root']['document']  # parse the XML once, not once per loop

sentence_list = [[]]  # dummy element so sentence indices start at 1
for sentence in doc['sentences']['sentence']:
    token_list = ['']  # dummy element so token indices start at 1
    tokens = sentence['tokens']['token']
    # xmltodict-style parsing returns a bare dict (not a list) when a
    # sentence contains exactly one token (e.g. sentence 13).
    if isinstance(tokens, dict):
        token_list.append(tokens['word'])
    else:
        for token in tokens:
            token_list.append(token['word'])
    sentence_list.append(token_list)

# Walk the coreference chains and decorate each referring expression with
# its representative mention.
for core in doc['coreference']['coreference']:
    rep = ''
    for mention in core['mention']:
        if '@representative' in mention:
            # Representative mention: remember its surface text.
            rep = mention['text']
        else:
            sent = int(mention['sentence'])
            start = int(mention['start'])
            # NOTE(review): `end` is read but never used below — presumably a
            # closing marker (e.g. ' ) 」') was meant to be appended at
            # sentence_list[sent][end - 1]; confirm against the exercise spec.
            end = int(mention['end'])
            # Prepend '「 <representative> ( ' to the referring expression.
            sentence_list[sent][start] = '「 ' + rep + ' ( ' + sentence_list[sent][start]
from k54 import load_xml


def print_person_name(word='', pos='', ner=''):
    """Print *word* when it is tagged as a proper noun naming a person."""
    if pos == 'NNP' and ner == 'PERSON':
        print(word)


for sentence in load_xml()['root']['document']['sentences']['sentence']:
    tokens = sentence['tokens']['token']
    # A sentence with exactly one token comes back as a bare dict rather
    # than a list; wrap it so a single loop handles both shapes.
    if isinstance(tokens, dict):
        tokens = [tokens]
    for tok in tokens:
        print_person_name(tok['word'], tok['POS'], tok['NER'])
from k54 import load_xml
import re

# Compiled once: removes the space before sentence-internal punctuation.
_PUNCT_SPACE = re.compile(r' ([,.;:?!])')

parse_strings = [s['parse']
                 for s in load_xml()['root']['document']['sentences']['sentence']]

# For every S-expression parse string, print the words under each (NP ...)
# subtree (nested NPs included, since the search resumes at start + 1).
for ps in parse_strings:
    start = ps.find('(NP')
    # BUG FIX: the original advanced `start` before testing, so an '(NP'
    # at index 0 could never match; str.find checks every position.
    while start != -1:
        # Scan forward to the matching close paren; `depth` counts the
        # currently-open parens (the '(NP' itself counts as one).
        end, depth = start + 1, 1
        while depth:  # until the opening paren is balanced
            if ps[end] == '(':
                depth += 1
            elif ps[end] == ')':
                depth -= 1
            end += 1
        # `end` now points one past the matching ')', so ps[start:end] is
        # exactly the NP subtree (the original's end + 1 grabbed one extra
        # character).  Leaf words are the tokens ending in ')'.
        words = [w.replace(')', '')
                 for w in ps[start:end].split(' ')
                 if w.endswith(')')]
        out = _PUNCT_SPACE.sub(r'\1', ' '.join(words))
        out = out.replace('-LRB- ', '(').replace(' -RRB-', ')')  # restore ()
        print(out)
        start = ps.find('(NP', start + 1)