def test_create_sent(self):
    """Build a sentence, tag senses onto tokens three different ways, and verify the links."""
    doc = ttl.Document('test', TEST_DATA)
    # import the token layer
    sent = doc.sents.new('Some happy guard-dogs barked.', 1)
    sent._import_tokens('Some happy guard dogs barked .'.split())
    self.assertEqual(len(sent), 6)
    # way 1: bind tokens at concept-creation time
    sent.concepts.new('01148283-a', 'wn', clemma='happy', tokens=[sent[1]])
    # way 2: create the concept first, then append a token object
    dog_concept = sent.concepts.new('02084071-n', 'wn', 'dog')
    dog_concept.tokens.append(sent[3])
    sent.concepts.new(BARK_SID, "wn", 'bark').tokens.append(sent[4])
    # way 3: assign token indices directly (MWE spanning tokens 2-3)
    sent.concepts.new(GDOG_SID, "wn", 'guard-dog').tokens = (2, 3)
    # verify the token -> concepts map
    token_concepts = sent.tcmap()
    self.assertEqual(token_concepts[sent[3]][0].clemma, 'dog')
    self.assertEqual(token_concepts[sent[3]][1].clemma, 'guard-dog')
    self.assertEqual(token_concepts[sent[3]][1].value, GDOG_SID)
    self.assertEqual(token_concepts[sent[4]][0].value, BARK_SID)
    # the multi-token concept must be reported as an MWE
    mwes = list(sent.mwe())
    self.assertTrue(mwes)
    self.assertEqual(mwes[0].value, GDOG_SID)
def new_doc(self, name, corpusID, title='', lang='', ctx=None, **kwargs):
    """Create a new :class:`ttl.Document`, persist it, and return it with its new ID.

    :param name: document name
    :param corpusID: ID of the corpus this document belongs to
    :param title: optional document title
    :param lang: optional language code
    :param ctx: database context used to save the document
        (presumably injected by a caller/decorator — TODO confirm)
    :param kwargs: extra fields forwarded to ``ttl.Document``
    :returns: the saved document with ``doc.ID`` set to the new row ID
    :raises ValueError: if *ctx* is None (the original code would have
        failed with a less helpful AttributeError on ``ctx.doc``)
    """
    if ctx is None:
        raise ValueError("ctx is required to save the new document")
    doc = ttl.Document(name=name, corpusID=corpusID, title=title, lang=lang, **kwargs)
    # persist and record the generated ID on the object before returning
    doc.ID = ctx.doc.save(doc)
    return doc
def test_sentids(self):
    """Auto-assigned sentence IDs skip any ID that was added manually."""
    doc = ttl.Document('boo')
    manual = ttl.Sentence('odd', ID=3)
    self.assertEqual(manual.ID, "3")
    # sentence #3 goes in before any auto-ID sentence
    doc.sents.append(manual)
    doc.sents.new('foo')  # auto ID -> 1
    doc.sents.new('boo')  # auto ID -> 2
    # 3 is taken, so the next auto ID jumps to 4
    moo = doc.sents.new('moo')
    self.assertEqual(moo.ID, "4")
    # document iteration preserves insertion order
    self.assertEqual([s.ID for s in doc], ["3", "1", "2", "4"])
def test_write_json(self):
    """Round-trip a document through write_json/read_json and compare sentence dicts."""
    doc = ttl.Document('manual', TEST_DATA)
    # create sents in doc from raw text + pre-computed mecab output
    raws = (sent1, sent2, sent3)
    mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
    # NOTE: the original looped with enumerate() but never used the index
    for text, mecab_output in zip(raws, mecab_outputs):
        deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
    ttl.write_json(TEST_DATA / 'test.write.json', doc)
    doc_json = doc.to_dict()
    doc2 = ttl.read_json(TEST_DATA / 'test.write.json')
    doc2_json = doc2.to_dict()
    # the re-read document must serialize to the same sentence data
    self.assertEqual(doc_json['sents'], doc2_json['sents'])
def test_export_to_streams(self):
    """Tag a document and export it through TxtWriter into in-memory streams.

    Verifies that every output stream (sents, words, concepts, links, tags)
    receives non-empty content.
    """
    doc = ttl.Document('manual', TEST_DATA)
    # create sents in doc from raw text + pre-computed mecab output
    raws = (sent1, sent2, sent3)
    mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
    # NOTE: the original looped with enumerate() but never used the index
    for text, mecab_output in zip(raws, mecab_outputs):
        deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
    # sense tagging
    doc[2][4].comment = 'to eat'
    doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
    doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2, ))
    doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0, ))
    doc[2].concepts.new("01166351-v", "wn", "食べる", (4, ))
    # sentence-level tags
    doc[0].tags.new("WIKI", "src", 0, 3)
    doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3)
    doc[2].tags.new("WIKI", "src", 0, 3)
    doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3)
    # export doc to string-backed streams
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
        writer.write_doc(doc)
    getLogger().debug("sents\n{}".format(sents.content()))
    getLogger().debug("words\n{}".format(words.content()))
    getLogger().debug("concepts\n{}".format(concepts.content()))
    getLogger().debug("links\n{}".format(links.content()))
    getLogger().debug("tags\n{}".format(tags.content()))
    # every stream must have received output
    self.assertTrue(sents.content())
    self.assertTrue(words.content())
    self.assertTrue(concepts.content())
    self.assertTrue(links.content())
    self.assertTrue(tags.content())
    for text in doc:
        logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
def test_sentid(self):
    """Mixing auto-generated and manual sentence IDs: collisions are rejected."""
    doc = ttl.Document('mydoc')
    first = doc.sents.new('First sentence.')
    self.assertEqual(first.ID, "1")
    second = doc.sents.new('Second sentence.')
    self.assertEqual(second.ID, "2")
    # append sentences whose IDs were chosen by hand (int and str forms)
    doc.sents.append(ttl.Sentence('Another one', ID=3))
    doc.sents.append(ttl.Sentence('Another one 2', ID='5'))
    doc.sents.new('Third sentence.')
    doc.sents.new('Fourth sentence.')
    fifth = doc.sents.new('Fifth sentence.')
    # auto-IDs must have skipped the manually-taken 3 and 5
    self.assertEqual(fifth.ID, "7")
    # re-using a taken ID is an error
    duplicate = ttl.Sentence('Foo sentence.', ID=3)
    self.assertRaises(Exception, lambda: doc._add_sent_obj(duplicate))
    # so is adding a None sentence
    self.assertRaises(Exception, lambda: doc._add_sent_obj(None))
    # 5 created + 2 manually imported sentences
    self.assertEqual(len(doc), 7)