Ejemplo n.º 1
0
 def test_create_sent(self):
     """Build one sentence by hand, sense-tag it, and verify the lookups.

     Exercises three ways of attaching tokens to a concept: passing them
     to ``concepts.new``, appending a token object afterwards, and
     assigning a tuple of token indices (used here for a multi-word
     expression).
     """
     document = ttl.Document('test', TEST_DATA)
     # Register the raw text first, then import its tokens
     sentence = document.sents.new('Some happy guard-dogs barked.', 1)
     words = 'Some happy guard dogs barked .'.split()
     sentence._import_tokens(words)
     self.assertEqual(len(sentence), 6)
     # Concept created with its token supplied up front
     sentence.concepts.new('01148283-a',
                           'wn',
                           clemma='happy',
                           tokens=[sentence[1]])
     # Concept created bare; the token object is bound in afterwards
     dog = sentence.concepts.new('02084071-n', 'wn', 'dog')
     dog.tokens.append(sentence[3])
     bark = sentence.concepts.new(BARK_SID, "wn", 'bark')
     bark.tokens.append(sentence[4])  # add token object
     # MWE example: tokens are given by index rather than by object
     guard_dog = sentence.concepts.new(GDOG_SID, "wn", 'guard-dog')
     guard_dog.tokens = (2, 3)
     # Verification: token 3 is expected to carry both 'dog' and 'guard-dog'
     token_concepts = sentence.tcmap()
     on_dogs = token_concepts[sentence[3]]
     self.assertEqual(on_dogs[0].clemma, 'dog')
     self.assertEqual(on_dogs[1].clemma, 'guard-dog')
     self.assertEqual(on_dogs[1].value, GDOG_SID)
     on_barked = token_concepts[sentence[4]]
     self.assertEqual(on_barked[0].value, BARK_SID)
     multiword = list(sentence.mwe())
     self.assertTrue(multiword)
     self.assertEqual(multiword[0].value, GDOG_SID)
Ejemplo n.º 2
0
 def new_doc(self, name, corpusID, title='', lang='', ctx=None, **kwargs):
     """Create a ``ttl.Document``, save it through ``ctx`` and return it.

     Args:
         name: Forwarded to ``ttl.Document`` as ``name``.
         corpusID: Forwarded to ``ttl.Document`` as ``corpusID``.
         title: Forwarded to ``ttl.Document`` as ``title``.
         lang: Forwarded to ``ttl.Document`` as ``lang``.
         ctx: Object whose ``doc.save`` stores the document. It defaults
             to ``None`` but is dereferenced unconditionally.
             NOTE(review): presumably a wrapper outside this view
             injects it -- confirm.
         **kwargs: Extra keyword arguments passed to ``ttl.Document``.

     Returns:
         The new document, with ``ID`` set to the value returned by
         ``ctx.doc.save``.
     """
     document = ttl.Document(name=name, corpusID=corpusID, title=title,
                             lang=lang, **kwargs)
     # Save first, then copy the returned identifier onto the object
     document.ID = ctx.doc.save(document)
     return document
Ejemplo n.º 3
0
 def test_sentids(self):
     """Check that generated sentence IDs skip one that is already taken."""
     document = ttl.Document('boo')
     preset = ttl.Sentence('odd', ID=3)
     # The integer ID passed in is expected to read back as a string
     self.assertEqual(preset.ID, "3")
     document.sents.append(preset)  # add sent#3 first
     for text in ('foo', 'boo'):  # expected to become #1 and #2
         document.sents.new(text)
     # moo will be #4 because sent #3 exists
     moo = document.sents.new('moo')
     self.assertEqual(moo.ID, "4")
     # Iteration is expected to follow insertion order, not ID order
     seen_ids = [sentence.ID for sentence in document]
     self.assertEqual(seen_ids, ["3", "1", "2", "4"])
Ejemplo n.º 4
0
 def test_write_json(self):
     """Write a document to JSON, read it back and compare the sentences.

     Only the ``'sents'`` entry of the two ``to_dict()`` results is
     compared.
     """
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc
     raws = (sent1, sent2, sent3)
     mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
     # Plain zip: the positional index was never used
     for text, mecab_output in zip(raws, mecab_outputs):
         deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
     # Single path shared by the writer and the reader
     json_path = TEST_DATA / 'test.write.json'
     ttl.write_json(json_path, doc)
     doc_json = doc.to_dict()
     doc2 = ttl.read_json(json_path)
     doc2_json = doc2.to_dict()
     self.assertEqual(doc_json['sents'], doc2_json['sents'])
Ejemplo n.º 5
0
 def test_export_to_streams(self):
     """Export a tagged document through ``ttl.TxtWriter`` into five streams.

     Builds three sentences from MeCab output, adds concepts, a token
     comment and tags, writes the document, and asserts that each of
     the sents/words/concepts/links/tags streams has content.
     """
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc
     raws = (sent1, sent2, sent3)
     mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
     # Plain zip: the positional index was never used
     for text, mecab_output in zip(raws, mecab_outputs):
         deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
     # sense tagging
     doc[2][4].comment = 'to eat'
     doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫",
                         tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
     doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2, ))
     doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0, ))
     doc[2].concepts.new("01166351-v", "wn", "食べる", (4, ))
     # tags
     doc[0].tags.new("WIKI", "src", 0, 3)
     doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3)
     doc[2].tags.new("WIKI", "src", 0, 3)
     doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3)
     # export doc: one in-memory report per output stream
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file,
                        tags.file) as writer:
         writer.write_doc(doc)
         # NOTE(review): the streams are read while the writer is still
         # open -- presumably leaving the context closes them; confirm
         # before moving these checks out of the ``with`` block.
         getLogger().debug("sents\n{}".format(sents.content()))
         getLogger().debug("words\n{}".format(words.content()))
         getLogger().debug("concepts\n{}".format(concepts.content()))
         getLogger().debug("links\n{}".format(links.content()))
         getLogger().debug("tags\n{}".format(tags.content()))
         self.assertTrue(sents.content())
         self.assertTrue(words.content())
         self.assertTrue(concepts.content())
         self.assertTrue(links.content())
         self.assertTrue(tags.content())
         # Log each sentence as JSON, keeping non-ASCII text readable
         for text in doc:
             logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
Ejemplo n.º 6
0
 def test_sentid(self):
     """Check ID assignment when generated and hand-made sentences are mixed.

     Also checks that ``_add_sent_obj`` raises for a duplicate ID and
     for ``None``.
     """
     document = ttl.Document('mydoc')
     first = document.sents.new('First sentence.')
     self.assertEqual(first.ID, "1")
     second = document.sents.new('Second sentence.')
     self.assertEqual(second.ID, "2")
     # add some sentences manually (one int ID, one string ID)
     manual_three = ttl.Sentence('Another one', ID=3)
     manual_five = ttl.Sentence('Another one 2', ID='5')
     document.sents.append(manual_three)
     document.sents.append(manual_five)
     document.sents.new('Third sentence.')
     document.sents.new('Fourth sentence.')
     fifth = document.sents.new('Fifth sentence.')
     # IDs 3 and 5 are taken, so the fifth generated sentence should be "7"
     self.assertEqual(fifth.ID, "7")
     # cannot add 3 again
     duplicate = ttl.Sentence('Foo sentence.', ID=3)
     with self.assertRaises(Exception):
         document._add_sent_obj(duplicate)
     # cannot add a None sentence
     with self.assertRaises(Exception):
         document._add_sent_obj(None)
     # document should have 5 created sentences + 2 imported sentences
     self.assertEqual(len(document), 7)