def load_chunk_cabocha(file="./data/neko.txt.cabocha"):
    """Parse a CaboCha-annotated corpus into a list of sentences.

    Each sentence is a list of Chunk objects; after EOS the collected
    reverse links are written into each chunk's ``srcs``.

    Args:
        file: path to the CaboCha output file (UTF-8).

    Returns:
        list of sentences, each a list of Chunk objects.
    """
    results = []
    chunks = []
    current = -1   # index of the chunk currently being filled
    srcs = {}      # dst chunk index -> list of source chunk indices

    # 'with' guarantees the handle is closed (the original leaked it).
    with open(file, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if line.startswith("*"):
                # Dependency header: '* <idx> <dst>D <head/func> <score>'
                current += 1
                elem = line.split(" ")
                dst = int(elem[2][:-1])   # strip the trailing 'D'
                chunks.append(Chunk([], dst, []))
                if dst != -1:
                    # setdefault replaces the original if/else pair.
                    srcs.setdefault(dst, []).append(int(elem[1]))
            elif line == "EOS":
                if chunks == []:
                    continue
                # Wire up the reverse (source) links before storing.
                for key, value in srcs.items():
                    chunks[key].srcs = value
                results.append(chunks)
                chunks, current, srcs = [], -1, {}
            else:
                # Morpheme line: 'surface\tpos,pos1,...,base,reading,pron'
                splitted = re.split("[\t,]", line)
                chunks[current].morphs.append(
                    Morph(splitted[0], splitted[7], splitted[1], splitted[2]))
    return results
def __init__(self):
    """Build ``self.sentences``: one list of Chunk objects per sentence,
    read from 'neko.txt.cabocha'."""
    chunks = []
    self.sentences = []
    file = "neko.txt.cabocha"
    # 'with' closes the handle (the original leaked it).
    with open(file) as fp:
        for line in fp:
            # Strip only '\n': other whitespace may be a real character.
            line = line.strip("\n")
            if line == "EOS":
                # All dst values are known now; fill in reverse links.
                for index, item in enumerate(chunks):
                    dst = chunks[index].dst
                    if dst != -1:   # skip the sentence-final chunk
                        chunks[dst].srcs.append(index)
                self.sentences.append(chunks)
                chunks = []
            elif line.startswith("*"):
                # Header line, e.g. '* 0 -1D 0/0 0.000000'
                parts = line.split(" ")
                item = Chunk()
                item.dst = int(parts[2].replace("D", ""))
                chunks.append(item)
            else:
                # MeCab line: surface\tpos,pos1,...,base,reading,pron
                surface, rest = line.split("\t")
                mecab = rest.split(",")
                item = Morph(surface=surface, base=mecab[-3],
                             pos=mecab[0], pos1=mecab[1])
                chunks[-1].morphs.append(item)
def get_chunks_list(path):
    """Read a CaboCha file and return a list of sentences, each a list
    of Chunk objects ordered by bunsetsu index."""
    sentences = []
    pending = {}   # bunsetsu index -> Chunk (may be created ahead of time)
    with open(path, 'r', encoding='utf-8') as fh:
        for raw in fh:
            raw = raw.rstrip()
            if raw == 'EOS':
                if pending:
                    ordered = [pending[idx] for idx in sorted(pending.keys())]
                    sentences.append(ordered[:])
                    pending = {}
            elif raw[0] == '*':
                fields = raw.split()
                current_num = int(fields[1])
                dst = int(fields[2][:-1])
                if current_num not in pending:
                    pending[current_num] = Chunk([], dst, [])
                pending[current_num].dst = dst
                if dst != -1:
                    # The head chunk may not have been seen yet.
                    if dst not in pending:
                        pending[dst] = Chunk([], -1, [])
                    pending[dst].srcs.append(current_num)
            else:
                parts = re.split('[\t,]', raw)
                pending[current_num].morphs.append(
                    Morph(parts[0], parts[7], parts[1], parts[2]))
    return sentences
def F41():
    """Read CaboCha output from stdin and return a list of sentences,
    each a list of Chunk(morphs, dst, srcs)."""
    sentenses = list()
    morphs = list()
    chunks = list()
    for line in sys.stdin:
        if line.startswith("*"):
            # A new header closes the previous chunk (except the first).
            if not (len(morphs) == 0):
                c = Chunk(morphs, dst, srcs)
                chunks.append(c)
                morphs = list()
            sp_line = line.split(" ")
            chunk_index = int(sp_line[1])
            dst = int(sp_line[2].rstrip("D"))
            srcs = GetSrcs(chunks, chunk_index)
        elif line.startswith("EOS"):
            # EOS closes both the current chunk and the sentence.
            c = Chunk(morphs, dst, srcs)
            chunks.append(c)
            morphs = list()
            sentenses.append(chunks)
            chunks = list()
        else:
            sp_line2 = line.split("\t")
            sp_line3 = sp_line2[1].split(",")
            # BUGFIX: unicode() is Python 2 only and raises NameError on
            # Python 3; str is already Unicode.
            surface = sp_line2[0]
            pos = sp_line3[0]
            pos1 = sp_line3[1]
            base = sp_line3[6]
            m = Morph(surface, base, pos, pos1)
            morphs.append(m)
    return (sentenses)
def chunk_list(target):
    """Parse CaboCha lines from *target* into sentences (lists of
    Chunks objects); stops after 150 sentences."""
    sentence_count = 0
    morphs = []      # morphemes of the chunk being built
    chunks = []      # chunks of the sentence being built
    sentences = []   # one entry per EOS
    for raw in target:
        raw = raw.strip()
        if raw[0] == '*':
            # Header closes the previous chunk, then records the new dst.
            if morphs:
                chunks.append(Chunks(morphs, dst))
                morphs = []
            dst = int(raw.split(' ')[2].rstrip('D'))
        elif raw != 'EOS':
            morphs.append(Morph(raw))
        else:
            chunks.append(Chunks(morphs, dst))
            # Derive reverse links from the forward (dst) links.
            for pos, chunk in enumerate(chunks):
                if chunk.dst == -1 or chunk.dst is None:
                    continue
                chunks[chunk.dst].srcs.append(pos)
            sentences.append(chunks)
            morphs, chunks, dst = [], [], None
            sentence_count += 1
            if sentence_count == 150:
                break
    return sentences
def read_cabocha():
    """Load 'neko.txt.cabocha' into a list of sentences; each sentence
    is a list of Chunk objects with dst/srcs wired while reading."""
    sent_idx = 0
    chunk_idx = 0
    sentences = [[]]
    for raw in open("neko.txt.cabocha", "r"):
        raw = raw.strip("\n")
        if raw == "EOS":
            sent_idx += 1
            chunk_idx = 0
            sentences.append([])
        elif raw[0] == "*":
            header = raw.split()
            chunk_idx = int(header[1])
            dst = int(header[2].strip("D"))
            # Sources = earlier chunks in this sentence pointing here.
            srcs = [k for k in range(chunk_idx)
                    if sentences[sent_idx][k].getDst() == chunk_idx]
            sentences[sent_idx].append(Chunk(dst, srcs))
        else:
            fields = raw.replace("\t", ",").split(",")
            sentences[sent_idx][chunk_idx].appendMorphs(
                Morph(fields[0], fields[7], fields[1], fields[2]))
    del sentences[-1]   # drop the trailing empty sentence
    return sentences
def cabocha_data():
    """Parse 'neko.txt.cabocha' into a list of sentences; each is a
    list of Chunk objects whose srcs are shared defaultdict entries."""
    with open("neko.txt.cabocha", "r") as text:
        k_list = []   # all sentences
        n_list = []   # current sentence
        my_dict = defaultdict(list)   # dst index -> source indices
        for line in text:
            words = line.strip("\n").replace("\t", ",").split(",")
            if words[0].startswith("*"):
                words = words[0].split()
                index = int(words[1])
                kakari = int(words[2].strip("D"))
                my_dict[kakari].append(index)
                # BUGFIX: a chunk's srcs are the chunks that point at
                # *it* (my_dict[index]), not at its head
                # (my_dict[kakari]).  The defaultdict entry is a shared
                # list, so later appends still show up in the Chunk.
                chunk = Chunk(kakari, my_dict[index])
                n_list.append(chunk)
            elif line.startswith('EOS'):
                if len(n_list) > 0:
                    k_list.append(n_list)
                n_list = []
                my_dict = defaultdict(list)   # reset per sentence
            else:
                morphs = Morph(words[0], words[7], words[1], words[2])
                chunk.morphs.append(morphs)
        return k_list
def load_chunk():
    """Yield one sentence at a time from the parsed ai.ja corpus; each
    sentence is a dict mapping bunsetsu id (str) -> Chunk."""
    with open(r'/Users/Naoya/Downloads/ai.ja.txt.parsed',
              encoding="utf-8") as f:
        chunks = {}
        srcs_dicts = {}   # dst id (str) -> list of source ids (str)
        for line in f:
            col = line.split('\t')
            if col[0] == 'EOS\n' and len(chunks) != 0:
                # Attach collected reverse links, then emit the sentence.
                for key in srcs_dicts.keys():
                    chunks[key].srcs = srcs_dicts[key]
                yield chunks
                chunks, srcs_dicts = {}, {}
            elif line[0] == '*':
                s_line = line.split(' ')
                dst = s_line[2].rstrip('D')   # kept as str on purpose
                cid = s_line[1]
                if cid not in chunks:
                    chunks[cid] = Chunk()
                chunks[cid].dst = dst
                if dst != '-1':
                    if dst not in srcs_dicts:
                        srcs_dicts[dst] = []
                    srcs_dicts[dst].append(cid)
            elif col[0] != 'EOS\n' and line[0] != '*':
                other = col[1].split(',')
                chunks[cid].morphs.append(
                    Morph(col[0], other[6], other[0], other[1]))
def func41():
    """Yield sentences (lists of Chunk) parsed from neko.txt.cabocha.

    ``store`` maps a head (dst) index to the list of chunk indices that
    depend on it; each Chunk receives the shared, still-growing list
    for its own index.
    """
    sentence = list()
    store = defaultdict(list)
    with open('../data/neko.txt.cabocha', 'r') as neko:
        for line in neko:
            if line.startswith('*'):
                # NOTE: the original called str.replace() and discarded
                # the result (str is immutable) — those no-ops and the
                # commented-out code are removed.
                element = line.rstrip('\n').split()
                ind = int(element[1])             # this chunk's index
                num = int(element[2].strip('D'))  # head (dst) index
                store[num].append(ind)
                b = Chunk(num, ind, store[ind])
                sentence.append(b)
            elif line.startswith('EOS'):
                if len(sentence) > 0:
                    yield sentence
                sentence = list()
                store = defaultdict(list)
            else:
                word, morphs = line.rstrip('\n').split('\t')
                morphs = morphs.split(',')
                a = Morph(word, morphs[6], morphs[0], morphs[1])
                b.morphAppending(a)
def get_chunk_list():
    """Parse 'neko.cabocha' into a list of sentences.

    Each sentence is a list of Chunk objects (sorted by bunsetsu index)
    carrying their morphemes (morphs), head index (dst) and dependent
    indices (srcs).

    Returns:
        list of sentences, each a list of Chunk objects.
    """
    with open('neko.cabocha', 'r') as neko_file:
        result = []
        chunks = {}   # bunsetsu index -> Chunk
        for line in neko_file:
            line = line.rstrip()   # strip() would also eat leading blanks
            if line[0] == '*':
                # Dependency header: '* idx dstD head/func score'
                fields = line.split(' ')
                idx = int(fields[1])
                dst = int(fields[2][:-1])   # -1 means no head
                chunks.setdefault(idx, Chunk()).dst = dst
                if dst != -1:
                    # Create the head chunk on demand and record the
                    # reverse link.
                    chunks.setdefault(dst, Chunk()).srcs.append(idx)
            elif line != 'EOS' and line[:1] != '*':
                # Morpheme: 'surface\tpos,pos1,...,base,reading,pron'
                surface, feats = line.split('\t')
                columm = [surface] + feats.split(',')
                chunks[idx].morphs.append(Morph(
                    surface=columm[0], base=columm[7],
                    pos=columm[1], pos1=columm[2]))
            elif line == 'EOS' and len(chunks) != 0:
                # Emit a fresh list sorted by index, then reset.
                result.append([chunks[i] for i in sorted(chunks.keys())])
                chunks.clear()
        return result
def getChunk():
    """Parse 'neko.txt.cabocha' into a list of sentences, each a list
    of locally-defined Chunk objects.

    NOTE(review): dst is kept as a *string* (words[2][:-1]) and srcs
    are computed by comparing that string against the header's index
    string — both sides are strings, so the comparison is consistent.
    """
    class Chunk:
        # Local chunk type: morphemes, head index (str) and dependents.
        def __init__(self, morphs, dst, srcs):
            self.morphs = morphs
            self.dst = dst
            self.srcs = srcs

        def isPos(self, pos):
            # True if any morpheme in this chunk has part-of-speech pos.
            for x in self.morphs:
                if x.pos == pos:
                    return True
            return False

        def morph_str(self, morph):
            # Concatenate either surfaces or base forms, dropping
            # trailing Japanese punctuation.
            temp = ""
            for x in self.morphs:
                if (morph == "surface"):
                    temp += x.surface
                elif (morph == "base"):
                    temp += x.base
            return temp.strip("。").strip("、")

    morph_list = []   # every morpheme in the corpus (flat)
    chunks = []       # chunks of the current sentence
    sentence = []     # all sentences
    morphs = []       # morphemes of the chunk being built
    srcs = []
    dst = None
    for line in open("neko.txt.cabocha"):
        if line[0] != "*" and "EOS" not in line:
            # Morpheme line: split on tab or comma.
            words = re.split("\t|,", line)
            morph_obj = Morph(words[0], words[7], words[1], words[2])
            morphs.append(morph_obj)
            morph_list.append(morph_obj)
        else:
            if line[0] == "*":
                # A new header closes the chunk in progress.
                if dst != None:
                    chunks.append(Chunk(morphs, dst, srcs))
                    morphs = []
                    srcs = []
                    dst = None
                words = re.split("\s", line)
                dst = words[2][:-1]   # head index, kept as a string
                # Collect earlier chunks whose dst points at this one.
                for i, c in enumerate(chunks[:int(words[1])]):
                    if c.dst == words[1]:
                        srcs.append(i)
            # On EOS the last chunk is still pending; flush it.
            if morphs != []:
                chunks.append(Chunk(morphs, dst, srcs))
                morphs = []
                srcs = []
                dst = None
            if "EOS" in line:
                sentence.append(chunks)
                chunks = []
                i = 0
    return sentence
def morph2chunk(morphlists):
    """Group a flat list of Morph objects into Chunk objects.

    A morph whose ``show()`` starts with '*' is a chunk-header marker:
    it ends the previous chunk and starts the next one.

    BUGFIX: the original appended a sentinel Morph("*") directly to the
    caller's list, permanently mutating the argument; the sentinel is
    now added to a local copy only.
    """
    chunks = []
    chunk = []
    # Local copy + sentinel so the final chunk gets flushed without
    # mutating the caller's list.
    for elem in list(morphlists) + [Morph("*")]:
        if elem.show()[0] != "*":
            chunk.append(elem)
        else:
            if chunk != []:
                chunks.append(Chunk(chunk))
            # Start the next chunk with its header morph.
            chunk = [elem]
    return chunks
def get_chunk_list():
    """Read 'sentence.cabocha' and return a list of sentences, each a
    list of Chunk objects ordered by bunsetsu index."""
    result = []
    with open('sentence.cabocha', 'r') as sentence_file:
        chunks = {}   # bunsetsu index -> Chunk
        for raw in sentence_file:
            raw = raw.rstrip()   # strip() would also eat leading blanks
            if raw[0] == '*':
                # Header: '* idx dstD head/func score'
                parts = raw.split(' ')
                idx, dst = int(parts[1]), int(parts[2][:-1])
                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst
                if dst != -1:
                    # Pre-create the head chunk if needed and register
                    # this chunk as one of its dependents.
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)
            elif raw != 'EOS' and raw[:1] != '*':
                # Morpheme: 'surface\tpos,pos1,...,base,reading,pron'
                surface, _, feats = raw.partition('\t')
                cols = [surface] + feats.split(',')
                chunks[idx].morphs.append(Morph(
                    surface=cols[0], base=cols[7],
                    pos=cols[1], pos1=cols[2]))
            elif raw == 'EOS' and len(chunks) != 0:
                # Emit a fresh sorted list, then reset for next sentence.
                result.append([chunks[k] for k in sorted(chunks.keys())])
                chunks.clear()
    return result
def parse(lines) -> Sentence:
    """Build a Sentence from an iterable of CaboCha lines (one
    sentence's worth, without EOS)."""
    chunks = []
    chunk = Chunk([], -1, [])
    srcs = defaultdict(lambda: [])
    for line in lines:
        if line[0] != '*':
            chunk.add_morph(Morph.parse(line))
            continue
        # Header line: flush the chunk in progress, start a new one.
        if len(chunk.morphs) != 0:
            chunks.append(chunk)
        cur, dst, chunk = Chunk.parse(line)
        assert cur == len(chunks)   # headers must arrive in index order
        if dst != -1:
            srcs[dst].append(cur)
    # The last chunk has no trailing header to flush it.
    if len(chunk.morphs) != 0:
        chunks.append(chunk)
    # Attach the reverse (source) links.
    for dst, curs in srcs.items():
        chunks[dst].set_srcs(curs)
    return Sentence(chunks)
def get_chunk_list(file_path: str):
    """Yield one sentence at a time from *file_path*: a list of Chunk
    objects sorted by bunsetsu index ([] for empty sentences)."""
    with open(file_path) as f:
        chunks = {}   # bunsetsu index -> Chunk
        for raw in f:
            raw = raw.rstrip()
            if raw[0] == '*':
                header = raw.split(' ')
                idx = int(header[1])
                dst = int(header[2][:-1])   # strip the trailing 'D'
                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst
                if dst != -1:
                    # Register the reverse link, creating the head
                    # chunk on demand.
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)
            elif raw != 'EOS' and raw[:1] != '*':
                # Normalize the tab to a comma, then split all fields.
                cols = ','.join(raw.split("\t")).split(',')
                chunks[idx].morphs.append(
                    Morph(
                        cols[0],   # surface
                        cols[7],   # base
                        cols[1],   # pos
                        cols[2]    # pos1
                    ))
            elif raw == 'EOS':
                yield ([chunks[i] for i in sorted(chunks.keys())]
                       if len(chunks) != 0 else [])
                chunks = {}
def load_chunk(file_path):
    """Parse *file_path* (CaboCha output) into a list of Sentence
    objects, each built from Chunk(morphs, dst) bunsetsu."""
    sentences = []
    chunks = []
    morphs = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line[0] == '*':
                # Header '* 0 -1D 1/1 0.000000': close the previous
                # chunk, then remember the new head index.
                if morphs:
                    chunks.append(Chunk(morphs, dst))
                    morphs = []
                dst = int(line.split(' ')[2].rstrip('D'))
            elif line != 'EOS\n':
                morphs.append(Morph(line))
            else:
                # End of sentence: close the last chunk and wrap up.
                chunks.append(Chunk(morphs, dst))
                sentences.append(Sentence(chunks))
                morphs, chunks, dst = [], [], int()
    return sentences
def get_neko_list():
    """Yield sentences from './neko.txt.cabocha' as lists of Chunk.

    ``src`` maps a head index to the (shared) list of its dependents,
    so a Chunk's srcs list keeps growing as later headers are read.
    """
    sentence = list()
    src = defaultdict(list)
    for line in open('./neko.txt.cabocha'):
        if line.startswith('*'):
            fields = line.rstrip('\n').split()
            chunk_id = int(fields[1])
            dst = int(fields[2].rstrip('D'))
            src[dst].append(chunk_id)
            # srcs is the shared entry for this chunk's own index.
            chunk = Chunk(dst, src[chunk_id])
            sentence.append(chunk)
        elif line.startswith('EOS'):
            if len(sentence) > 0:
                yield sentence
            sentence = list()
            src = defaultdict(list)
        else:
            surface, feats = line.rstrip('\n').split('\t')
            feats = feats.split(',')
            chunk.append_morph(Morph(surface, feats[0], feats[1], feats[6]))
def cabocha_chunk_data(data):
    """Yield sentences (lists of Chunk) from an iterable of CaboCha
    lines; srcs lists are shared defaultdict entries keyed by head."""
    header_re = re.compile(r'\* ')
    chunk_sentence = []
    morphs = []
    srcs = collections.defaultdict(lambda: [])
    for line in data:
        if header_re.match(line):
            # New header: flush the chunk built so far.
            if morphs != []:
                chunk_sentence.append(Chunk(morphs, dst, srcs[num_chunk]))
                morphs = []
            temp = line.split()
            num_chunk = int(temp[1])
            dst = int(re.match(r'-*[0-9]+(?=D)', temp[2]).group())
            if dst != -1:
                srcs[dst].append(num_chunk)
        elif line == 'EOS\n':
            # Emit only non-empty sentences.
            if chunk_sentence != [] or morphs != []:
                chunk_sentence.append(Chunk(morphs, dst, srcs[num_chunk]))
                yield chunk_sentence
            chunk_sentence = []
            morphs = []
            srcs = collections.defaultdict(lambda: [])
        else:
            morphs.append(Morph(line.replace('\t', ',').split(',')))
def chunk_list(text):
    """Parse the CaboCha file named *text* into a list of sentences
    (lists of Chunk objects) with dst/srcs links filled in."""
    sentences = []
    sentence = []
    phrase = Chunk()
    for line in open(text, 'r'):
        tokens = line.split()
        # BUGFIX: the original used `tokens[0] not in 'EOS'`, which is a
        # substring test and would misclassify surfaces such as 'E',
        # 'OS' or 'S' as end-of-sentence; compare for equality instead.
        if tokens[0] != '*' and tokens[0] != 'EOS':
            feats = tokens[0].split(',') + tokens[1].split(',')
            phrase.morphs.append(Morph(surface=feats[0], base=feats[7],
                                       pos=feats[1], pos1=feats[2]))
        else:
            if len(phrase.morphs) > 0:
                sentence.append(phrase)
                phrase = Chunk()
            if tokens[0] == '*':
                phrase.dst = int(tokens[2].strip('D'))
            elif tokens[0] == 'EOS':
                for no, s in enumerate(sentence, 0):
                    # BUGFIX: the original did `sentence[s.dst].srcs = no`,
                    # overwriting the srcs list with an int and indexing
                    # with dst == -1 (the last chunk); append the
                    # dependent index and skip headless chunks instead.
                    if s.dst is not None and s.dst != -1:
                        sentence[s.dst].srcs.append(no)
                sentences.append(sentence)
                sentence = []
    return sentences
def cabocha_data():
    """Return a list of sentences from 'neko.txt.cabocha'; each is a
    list of Chunk objects whose srcs are shared defaultdict lists."""
    with open('neko.txt.cabocha') as text:
        sent = []   # chunks of the current sentence
        doc = []    # all sentences
        src_dict = defaultdict(list)   # head index -> dependent indices
        for line in text:
            fields = line.strip('\n').replace('\t', ',').split(',')
            if fields[0].startswith('*'):
                header = fields[0].split()
                dst = int(header[2].strip('D'))
                # The srcs list is the shared defaultdict entry for this
                # chunk's own index; later appends still show up in it.
                chunk = Chunk(dst, src_dict[int(header[1])])
                src_dict[dst].append(int(header[1]))
                sent.append(chunk)
            elif fields[0] == 'EOS':
                src_dict = defaultdict(list)
                if len(sent) > 0:
                    doc.append(sent)
                sent = []
            else:
                chunk.morphs.append(
                    Morph(fields[0], fields[7], fields[1], fields[2]))
        return doc
def read_file(file_path='./ai.ja.txt.parsed'):
    """Parse *file_path* into a list of sentences, each a list of Chunk
    objects; empty sentences are skipped."""
    doc = []
    chunk = Chunk(None, None)
    with open(file_path) as fp:
        sentence = []
        for line in fp:
            line = line.strip()
            if line[0] == '*':
                # Header: close the previous chunk, open a new one with
                # Chunk(dst, own-index).
                if len(chunk.morphs) > 0:
                    sentence.append(chunk)
                items = line.split()
                chunk = Chunk(int(items[2][:-1]), int(items[1]))
            elif line[:3] == 'EOS':
                if len(chunk.morphs) > 0:
                    sentence.append(chunk)
                if len(sentence) == 0:
                    continue   # skip empty sentences
                chunk = Chunk(None, None)
                doc.append(sentence)
                sentence = []
            else:
                # Morpheme: 'surface\tpos,pos1,...,base,reading,pron'
                word, feat_str = line.split('\t')
                feats = feat_str.split(',')
                chunk.morphs.append(
                    Morph(word, feats[6], feats[0], feats[1]))
    return doc
def func2():
    """Generator: yield one sentence at a time from 'neko.txt.cabocha',
    as a list of Chunk objects (dst/srcs/index kept as strings)."""
    with open('neko.txt.cabocha', 'r') as f:
        m_tmp = list()   # morphemes collected for the current chunk
        c_tmp = list()   # chunks collected for the current sentence
        chunk = Chunk()
        flag = False     # True once a '*' header has been seen
        relate = defaultdict(lambda: list())  # dst -> source indices
        for line in f:
            # Normalize separators so every field splits on ','.
            line = line.replace(' ', ',')
            line = line.replace('D', '')
            words = (line.replace('\t', ',')).strip().split(',')
            kts = Morph()
            if (words[0] != '*' and words[0] != 'EOS'):
                # Morpheme line: record surface/base/pos/pos1.
                kts.surface = words[0]
                kts.base = words[7]
                kts.pos = words[1]
                kts.pos1 = words[2]
                m_tmp.append(kts)
            elif (words[0] == 'EOS'):
                # End of sentence: close the last chunk and emit.
                chunk.morphs = m_tmp
                m_tmp = list()
                c_tmp.append(chunk)
                yield c_tmp
                relate = defaultdict(lambda: list())
                c_tmp = list()
                flag = False
                # NOTE(review): `chunk` is NOT re-created here, so the
                # next sentence's first '*' mutates the object that was
                # just yielded as this sentence's last chunk — confirm
                # callers consume each yielded sentence eagerly.
            elif (words[0] == '*'):
                if flag:
                    # A later header closes the chunk in progress.
                    chunk.morphs = m_tmp
                    m_tmp = list()
                    c_tmp.append(chunk)
                    chunk = Chunk()
                flag = True
                # dst/srcs are string indices; srcs is the shared
                # defaultdict entry for this chunk's own index.
                chunk.dst = words[2]
                relate[words[2]].append(words[1])
                chunk.srcs = relate[words[1]]
                chunk.index = words[1]
from collections import defaultdict from knock40 import Morph, morph2sents from knock41 import Chunk, morph2chunk, sources if __name__ == "__main__": with open("ai.ja1.txt.parsed", "r") as ai: ai = ai.readlines() ai_morphs = [] for i in range(len(ai)): ai_morphs.append(Morph(ai[i])) sents = morph2sents(ai_morphs) for sent in sents: chunks = morph2chunk(sent) sources(chunks) for i in range(len(chunks)): #print(chunk.meta) #print(chunk.show_bunsetsu_tag()) if "サ変接続" in chunks[i].show_morph_pos1(): if "を" in chunks[i].show_only_listwords(): print(chunks[i].show_only_words(), end="") goto = chunks[i].dst verb = chunks[goto].show_base_for_X("動詞") adpos = set() dep = [] if verb != None: if chunks[goto].srcs != []: for head_id in chunks[goto].srcs: adp = chunks[head_id].show_base_for_X("助詞") if adp != None and head_id != i:
morphs = [] chunks = [] #文(Chunkオブジェクトのリスト) sentences = [] #文のリスト with open('ai.ja.txt.parsed') as file: for line in file: if line == '\n': continue #形態素をmorphsに追加 if line != 'EOS\n' and line[0] != '*': line = line.replace('\n', '').split('\t') morph = [line[0]] morph.extend(line[1].split(',')) morphs.append(Morph(morph)) #EOSか係り受け関係の時、chunksに追加 elif len(morphs) > 0: chunks.append(Chunk(morphs, dst)) morphs = [] #係り受け関係の時、dst更新 if line[0] == '*': line = line.replace('\n', '').split() dst = int(line[2].strip('D')) #EOSで文節がある時、sentencesに追加 if line == 'EOS\n' and len(chunks) > 0: for i, chunk in enumerate(chunks): if chunk.dst != -1: #係り先が存在
chunk.dst = int(line[2][:-1]) elif line == 'EOS': if lines[i - 1] == "EOS": continue elif len(chunk.morphs) > 0: sen.append(chunk) sentences.append(sen) sen = [] chunk = Chunk() else: surface, info = line.split('\t') info = info.split(',') base = info[-3] pos = info[0] pos1 = info[1] morph = Morph(surface, base, pos, pos1) chunk.morphs.append(morph) #srcs for sen in sentences: for chunk in sen: dst = chunk.dst if not dst == -1: sen[dst].srcs.append(chunk.index) if __name__ == "__main__": #out with open('chunks.txt', 'w') as f: for sen in sentences: print('------------', file=f) for chunk in sen: print(chunk, file=f)