def __load_document__(self, lines):
    docs = []
    self.ids.sent.reset()
    doc = nlelement.Document()
    docs.append(doc)
    sentence = nlelement.Sentence()
    sentence.sid = self.ids.sent.get()
    self.ids.chunk.reset()
    self.ids.tok.reset()
    self.entity_ids = dict()
    chunk = None
    for self.line_num, self.line in enumerate(
            map(lambda x: x.rstrip('\r\n'), lines)):
        if self.line == 'EOT':
            self.__resolve_entity_id__(doc)
            self.ids.sent.reset()
            self.entity_ids = dict()
            doc = nlelement.Document()
            docs.append(doc)
        elif self.line == 'EOS':
            # NOTE: the underlying cabocha module never assigns sid,
            # so propagate the sentence id to every token here
            for tok in sentence.tokens:
                tok.sid = sentence.sid
            self.__validate_sentence__(sentence)
            doc.sentences.append(sentence)
            sentence = nlelement.Sentence()
            sentence.sid = self.ids.sent.get()
            chunk = None
            self.ids.chunk.reset()
            self.ids.tok.reset()
        elif len(self.line) == 0:
            # skip blank lines before indexing into the string below
            pass
        elif self.line[0] == '#':
            if self.line[1] == '!':
                self.__handle_comment__(self.line)
        elif self.line[0] == '*' or self.line[0] == '+':
            if chunk:
                chunk.set_token_info()
            chunk = self.__load_chunk__(self.line, sentence.sid)
            chunk.cid = self.ids.chunk.get()
            sentence.chunks.append(chunk)
        else:
            token = self.__load_token__(self.line)
            token.tid = self.ids.tok.get()
            chunk.tokens.append(token)
            chunk.token_num += 1
            self.__token_post_process__(chunk, token)
            sentence.tokens.append(token)
    if doc.sentences and sentence.tokens:
        self.__add_exophora__()
    self.__resolve_entity_id__(doc)
    self.ids.sent.reset()
    self.entity_ids = dict()
    return docs

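# A minimal usage sketch for __load_document__ (hypothetical: the reader
# class name `CabochaLoader` and the exact chunk/token line formats consumed
# by __load_chunk__/__load_token__ are assumptions, not shown in this module):
#
#     reader = CabochaLoader()
#     with open('corpus.txt', encoding='utf-8') as f:
#         docs = reader.__load_document__(f)
#
# The input is line-oriented: '*' or '+' opens a chunk, '#!' lines carry
# comment directives, 'EOS' closes a sentence, and 'EOT' closes a document.
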
def sample_pth_annotation(self):
    """Sample for the annotation-sentence side of a pth file."""
    doc = nlelement.Document()
    doc.pt_annotated = True
    sentence = nlelement.Sentence()
    sentence.pt_annotated = True
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('そこ', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('美香', '名詞'))
    chunk.tokens.append(self.maker.token('が', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('い', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    self.maker.add_semantic_role(doc, 0, 4, "経験者", 0, 2)
    self.maker.add_semantic_role(doc, 0, 4, "場所", 0, 0)
    self.maker.add_verb_semantic(doc, 0, 4, "状態変化なし(状態)-位置-存在")
    return doc

def sample_deppara_merge_b(self):
    """Expected result when sentences are split the way m_xml does it."""
    doc = nlelement.Document()
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('私', '名詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('彼', '名詞'))
    chunk.tokens.append(self.maker.token('を', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('助け', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 0)
    chunk.tokens.append(self.maker.token('>', '記号'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('今日', '名詞'))
    chunk.tokens.append(self.maker.token('の', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 0)
    chunk.tokens.append(self.maker.token('思い出', '名詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    return doc

def sample_diffreference_converter_a(self):
    """Test data (1) for DiffReferenceConverter."""
    doc = nlelement.Document()
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('アプリケーション', '名詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(1, 2)
    chunk.tokens.append(self.maker.token('終了', '名詞'))
    chunk.tokens.append(self.maker.token('し', '動詞'))
    chunk.tokens.append(self.maker.token('まし', '助動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    chunk.tokens.append(self.maker.token('。', '記号'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[1]
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(3, 3)
    chunk.tokens.append(self.maker.token('(', '記号'))
    chunk.tokens.append(self.maker.token('Help', '名詞'))
    chunk.tokens.append(self.maker.token(':', '名詞'))
    chunk.tokens.append(self.maker.token('H', '名詞'))
    chunk.tokens.append(self.maker.token(')', '記号'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(1, 3)
    chunk.tokens.append(self.maker.token('続行', '名詞'))
    chunk.tokens.append(self.maker.token('する', '動詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(1, 2)
    chunk.tokens.append(self.maker.token('Esc', '名詞'))
    chunk.tokens.append(self.maker.token('キー', '名詞'))
    chunk.tokens.append(self.maker.token('を', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 2)
    chunk.tokens.append(self.maker.token('押し', '動詞'))
    chunk.tokens.append(self.maker.token('て', '助詞'))
    chunk.tokens.append(self.maker.token('ください', '動詞'))
    chunk.tokens.append(self.maker.token('。', '記号'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[1].link = sentence.chunks[2]
    sentence.chunks[0].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    return doc

def sample_pas_original(self):
    """Sample for the original-text side of a pas file.

    Carries no annotations; some text has been added relative to the
    annotation side.
    """
    doc = nlelement.Document()
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('私', '名詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('花屋', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('行っ', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('そこ', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('美香', '名詞'))
    chunk.tokens.append(self.maker.token('が', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('い', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('次回', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('続き', '動詞'))
    chunk.tokens.append(self.maker.token('ます', '助動詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[1]
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    return doc

def sample_pas_annotation(self):
    """Sample for the annotation side of a pas file.

    Carries predicate-argument and coreference relations, and covers only
    part of the sentences.
    """
    doc = nlelement.Document()
    doc.pas_annotated = True
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('私', '名詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('花屋', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('行っ', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('そこ', '名詞'))
    chunk.tokens.append(self.maker.token('に', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('美香', '名詞'))
    chunk.tokens.append(self.maker.token('が', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('い', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    sentence.chunks[0].link = sentence.chunks[2]
    sentence.chunks[1].link = sentence.chunks[2]
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    # NOTE: values such as coreference labels cannot be converted exactly,
    # so assign the ids here
    self.maker.add_coreference_link(doc, 0, 4, 'ga', 0, 0)
    self.maker.add_coreference_link(doc, 0, 4, 'ni', 0, 2)
    self.maker.add_coreference_link(doc, 1, 4, 'ga', 1, 2)
    self.maker.add_coreference_link(doc, 1, 4, 'ni', 1, 0)
    self.maker.add_coreference_link(doc, 1, 0, 'coref', 0, 2)
    return doc

def parse_document(self, raw_sentences, delimiter=None, name='mecab_parsed'):
    if delimiter is None:
        rawsent_iter = raw_sentences.splitlines()
    elif isinstance(delimiter, str):
        rawsent_iter = raw_sentences.split(delimiter)
    else:
        # fail early instead of iterating over None below
        raise TypeError('delimiter must be None or str')
    document = nlelement.Document()
    document.name = name
    for i, raw_sent in enumerate(rawsent_iter):
        document.sentences.append(self.parse(raw_sent, i))
    return document

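# A minimal usage sketch for parse_document (hypothetical: `MecabParser` is
# an assumed name for the enclosing class; only the method signature above
# comes from this module):
#
#     parser = MecabParser()
#     doc = parser.parse_document('今日は晴れ。\n明日は雨。')
#     assert len(doc.sentences) == 2
#
# With delimiter=None the text is split on line breaks; passing a string
# (e.g. '。') splits on that delimiter instead.
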
def sample1(self):
    """Generate a simple sample, for now."""
    doc = nlelement.Document()
    sentence = nlelement.Sentence()
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('私', '名詞'))
    chunk.tokens.append(self.maker.token('は', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('彼', '名詞'))
    chunk.tokens.append(self.maker.token('を', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    chunk = self.maker.chunk(0, 1)
    chunk.tokens.append(self.maker.token('助け', '動詞'))
    chunk.tokens.append(self.maker.token('た', '助詞'))
    self.maker.append_chunk_to_sentence(sentence, chunk)
    self.maker.set_link(sentence, 0, 2)
    self.maker.set_link(sentence, 1, 2)
    doc.sentences.append(sentence)
    self.maker.set_id_to_sentences(doc)
    return doc