Example #1
0
 def test_kata2hira(self):
     """Katakana-to-hiragana conversion over the test range matches ALL_MAPPING."""
     buf = TextReport.string()
     for kata in KATAKANA[1:87]:
         buf.write(simple_kata2hira(kata), kata, '|', separator='')
     self.assertEqual(buf.content(), TestTool.ALL_MAPPING)
Example #2
0
 def test_kata2hira(self):
     """simple_kata2hira() maps every katakana in the test slice to hiragana."""
     report = TextReport.string()
     pairs = ((simple_kata2hira(kata), kata) for kata in KATAKANA[1:87])
     for hira, kata in pairs:
         report.write(hira, kata, '|', separator='')
     expected = TestTool.ALL_MAPPING
     self.assertEqual(report.content(), expected)
Example #3
0
 def test_export_to_streams(self):
     """Build a three-sentence annotated document and export it to five text streams.

     Verifies that ttl.TxtWriter produces non-empty output on each of the
     sents/words/concepts/links/tags streams.
     """
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc: parse each raw string with MeCab and import the tokens
     raws = ("三毛猫が好きです。", "雨が降る。", "女の子はケーキを食べる。")
     for sid, r in enumerate(raws):
         msent = txt2mecab(r)
         tsent = doc.new_sent(msent.surface, sid)
         tsent.import_tokens(msent.words)
         # pos tagging: copy POS and reading from each MeCab token to the TTL token
         for mtk, tk in zip(msent, tsent):
             tk.pos = mtk.pos3()
             tk.new_tag(mtk.reading_hira(), tagtype="Reading", source=ttl.Tag.MECAB)
     # sense tagging
     doc[2][4].comment = 'to eat'
     doc[0].new_concept("三毛猫", "wiki.ja:三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
     doc[1].new_concept("降る", "02756821-v", tokens=(2,))
     doc[2].new_concept("女の子", "10084295-n", tokens=(0,))
     doc[2].new_concept("食べる", "01166351-v", (4,))
     # tags: sentence-level source/URL annotations
     doc[0].new_tag("WIKI", 0, 3, tagtype="SRC")
     doc[0].new_tag("https://ja.wikipedia.org/wiki/三毛猫", 0, 3, tagtype="URL")
     doc[2].new_tag("WIKI", 0, 3, tagtype="SRC")
     doc[2].new_tag("https://ja.wikipedia.org/wiki/少女", 0, 3, tagtype="URL")
     # export doc: one in-memory string report per TTL TSV stream
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
         writer.write_doc(doc)
         getLogger().debug("sents\n{}".format(sents.content()))
         getLogger().debug("words\n{}".format(words.content()))
         getLogger().debug("concepts\n{}".format(concepts.content()))
         getLogger().debug("links\n{}".format(links.content()))
         getLogger().debug("tags\n{}".format(tags.content()))
         # each stream must have received some content
         self.assertTrue(sents.content())
         self.assertTrue(words.content())
         self.assertTrue(concepts.content())
         self.assertTrue(links.content())
         self.assertTrue(tags.content())
         for sent in doc:
             logging.debug(json.dumps(sent.to_json(), ensure_ascii=False))
Example #4
0
 def test_export_to_streams(self):
     """Build a three-sentence annotated document and export it to five text streams.

     Builds the document from canned MeCab outputs, adds concept/tag
     annotations, then verifies that ttl.TxtWriter produces non-empty
     output on each of the sents/words/concepts/links/tags streams.
     """
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc from pre-computed MeCab analyses
     # (sentence IDs are assigned by the doc; the old enumerate index was unused)
     raws = (sent1, sent2, sent3)
     mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
     for text, mecab_output in zip(raws, mecab_outputs):
         deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
     # sense tagging
     doc[2][4].comment = 'to eat'
     doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫",
                         tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
     doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2, ))
     doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0, ))
     # pass tokens by keyword for consistency with the sibling calls above
     doc[2].concepts.new("01166351-v", "wn", "食べる", tokens=(4, ))
     # tags: sentence-level source/URL annotations
     doc[0].tags.new("WIKI", "src", 0, 3)
     doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3)
     doc[2].tags.new("WIKI", "src", 0, 3)
     doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3)
     # export doc: one in-memory string report per TTL TSV stream
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file,
                        tags.file) as writer:
         writer.write_doc(doc)
         getLogger().debug("sents\n{}".format(sents.content()))
         getLogger().debug("words\n{}".format(words.content()))
         getLogger().debug("concepts\n{}".format(concepts.content()))
         getLogger().debug("links\n{}".format(links.content()))
         getLogger().debug("tags\n{}".format(tags.content()))
         # each stream must have received some content
         self.assertTrue(sents.content())
         self.assertTrue(words.content())
         self.assertTrue(concepts.content())
         self.assertTrue(links.content())
         self.assertTrue(tags.content())
         for text in doc:
             logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
Example #5
0
 def test_ttl_tsv_serialization(self):
     """Round-trip a sentence through the TTL TSV writer/reader and compare JSON."""
     sent = self.build_test_sent()
     # one in-memory string report per TTL TSV stream
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     writer = ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file)
     writer.write_sent(sent)
     # capture each stream's serialized text (insertion order matches original)
     streams = {name: report.content()
                for name, report in (('sents', sents), ('words', words),
                                     ('concepts', concepts), ('links', links),
                                     ('tags', tags))}
     for name, text in streams.items():
         getLogger().debug("{}\n{}".format(name, text))
     # read it back from the captured text
     reader = ttl.TxtReader(io.StringIO(streams['sents']),
                            io.StringIO(streams['words']),
                            io.StringIO(streams['concepts']),
                            io.StringIO(streams['links']),
                            io.StringIO(streams['tags']))
     docx = reader.read()
     # patch sent.ID — presumably the reader assigns ID=1 on read-back; verify
     sent.ID = 1
     jo = sent.to_json()
     jr = docx[0].to_json()
     getLogger().debug(jo)
     getLogger().debug(jr)
     # compare field-by-field first for clearer failure messages, then the whole dict
     for field in ('text', 'tokens', 'concepts', 'tags', 'flag', 'comment'):
         self.assertEqual(jo[field], jr[field])
     self.assertEqual(jo, jr)
Example #6
0
 def test_ttl_tsv_serialization(self):
     """Round-trip a sentence through ttl.TxtWriter/TxtReader and compare dicts."""
     sent = self.build_test_sent()
     # one in-memory string report per TTL TSV stream
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     writer = ttl.TxtWriter(sents.file, words.file, concepts.file,
                            links.file, tags.file)
     writer.write_sent(sent)
     # capture each stream's serialized text
     sents_txt = sents.content()
     words_txt = words.content()
     concepts_txt = concepts.content()
     links_txt = links.content()
     tags_txt = tags.content()
     getLogger().debug("sents\n{}".format(sents_txt))
     getLogger().debug("words\n{}".format(words_txt))
     getLogger().debug("concepts\n{}".format(concepts_txt))
     getLogger().debug("links\n{}".format(links_txt))
     getLogger().debug("tags\n{}".format(tags_txt))
     # read it back from the captured text
     reader = ttl.TxtReader(io.StringIO(sents_txt), io.StringIO(words_txt),
                            io.StringIO(concepts_txt),
                            io.StringIO(links_txt), io.StringIO(tags_txt))
     docx = reader.read()
     # patch sent.ID — presumably the reader assigns ID=1 on read-back; verify
     sent.ID = 1
     jo = sent.to_dict()
     jr = docx[0].to_dict()
     getLogger().debug(jo)
     getLogger().debug(jr)
     # compare field-by-field first for clearer failure messages, then the whole dict
     self.assertEqual(jo['text'], jr['text'])
     self.assertEqual(jo['tokens'], jr['tokens'])
     self.assertEqual(jo['concepts'], jr['concepts'])
     self.assertEqual(jo['tags'], jr['tags'])
     self.assertEqual(jo['flag'], jr['flag'])
     self.assertEqual(jo['comment'], jr['comment'])
     self.assertEqual(jo, jr)
Example #7
0
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)


# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report -- the constructors below demonstrate the different output
# targets; each assignment simply replaces the previous report object
rp = TextReport()  # by default, TextReport will write to standard output, i.e. terminal
rp = TextReport(TextReport.STDOUT)  # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()  # output to /dev/null, i.e. nowhere
rp = TextReport.string()  # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())


# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
Example #8
0
# ------------------------------------------------------------------------------
# a string report -- each rpN below demonstrates a different output target
rp = TextReport(
)  # by default, TextReport will write to standard output, i.e. terminal
rp.write("This line goes to standard output")

rp1 = TextReport(TextReport.STDOUT)  # same as above
rp1.write("This line goes to standard output")

rp2 = TextReport('~/tmp/my-report.txt')  # output to a file
rp2.write("This is a line in my-report.txt")

rp3 = TextReport.null()  # output to /dev/null, i.e. nowhere
rp3.write("This line goes no where")

rp4 = TextReport.string(
)  # output to a string. Call rp.content() to get the string
rp4.write("This line will be stored in a string buffer")

rp5 = TextReport(TextReport.STRINGIO)  # same as above
rp5.write("This line will also be stored in a string buffer")

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())

# ------------------------------------------------------------------------------