def test_kata2hira(self):
    """simple_kata2hira() must reproduce the known katakana→hiragana mapping.

    Writes each pair as ``<hira><kata>|`` into a string report and compares
    the whole buffer against the expected mapping table in one shot.
    """
    report = TextReport.string()
    # Slice [1:87] selects the kana range covered by TestTool.ALL_MAPPING.
    for kata in KATAKANA[1:87]:
        hira = simple_kata2hira(kata)
        report.write(hira, kata, '|', separator='')
    self.assertEqual(report.content(), TestTool.ALL_MAPPING)
def test_export_to_streams(self): doc = ttl.Document('manual', TEST_DATA) # create sents in doc raws = ("三毛猫が好きです。", "雨が降る。", "女の子はケーキを食べる。") for sid, r in enumerate(raws): msent = txt2mecab(r) tsent = doc.new_sent(msent.surface, sid) tsent.import_tokens(msent.words) # pos tagging for mtk, tk in zip(msent, tsent): tk.pos = mtk.pos3() tk.new_tag(mtk.reading_hira(), tagtype="Reading", source=ttl.Tag.MECAB) # sense tagging doc[2][4].comment = 'to eat' doc[0].new_concept("三毛猫", "wiki.ja:三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?' doc[1].new_concept("降る", "02756821-v", tokens=(2,)) doc[2].new_concept("女の子", "10084295-n", tokens=(0,)) doc[2].new_concept("食べる", "01166351-v", (4,)) # tags doc[0].new_tag("WIKI", 0, 3, tagtype="SRC") doc[0].new_tag("https://ja.wikipedia.org/wiki/三毛猫", 0, 3, tagtype="URL") doc[2].new_tag("WIKI", 0, 3, tagtype="SRC") doc[2].new_tag("https://ja.wikipedia.org/wiki/少女", 0, 3, tagtype="URL") # export doc concepts = TextReport.string() links = TextReport.string() sents = TextReport.string() tags = TextReport.string() words = TextReport.string() with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer: writer.write_doc(doc) getLogger().debug("sents\n{}".format(sents.content())) getLogger().debug("words\n{}".format(words.content())) getLogger().debug("concepts\n{}".format(concepts.content())) getLogger().debug("links\n{}".format(links.content())) getLogger().debug("tags\n{}".format(tags.content())) self.assertTrue(sents.content()) self.assertTrue(words.content()) self.assertTrue(concepts.content()) self.assertTrue(links.content()) self.assertTrue(tags.content()) for sent in doc: logging.debug(json.dumps(sent.to_json(), ensure_ascii=False))
def test_export_to_streams(self): doc = ttl.Document('manual', TEST_DATA) # create sents in doc raws = (sent1, sent2, sent3) mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab) for sid, (text, mecab_output) in enumerate(zip(raws, mecab_outputs)): deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc) # sense tagging doc[2][4].comment = 'to eat' doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?' doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2, )) doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0, )) doc[2].concepts.new("01166351-v", "wn", "食べる", (4, )) # tags doc[0].tags.new("WIKI", "src", 0, 3) doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3) doc[2].tags.new("WIKI", "src", 0, 3) doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3) # export doc concepts = TextReport.string() links = TextReport.string() sents = TextReport.string() tags = TextReport.string() words = TextReport.string() with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer: writer.write_doc(doc) getLogger().debug("sents\n{}".format(sents.content())) getLogger().debug("words\n{}".format(words.content())) getLogger().debug("concepts\n{}".format(concepts.content())) getLogger().debug("links\n{}".format(links.content())) getLogger().debug("tags\n{}".format(tags.content())) self.assertTrue(sents.content()) self.assertTrue(words.content()) self.assertTrue(concepts.content()) self.assertTrue(links.content()) self.assertTrue(tags.content()) for text in doc: logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
def test_ttl_tsv_serialization(self):
    """Round-trip one sentence through the TTL TSV writer and reader.

    A test sentence is serialized into five in-memory streams, read back
    with TxtReader, and the original and reconstructed sentences must
    produce identical JSON (field by field, then as a whole).
    """
    sent = self.build_test_sent()
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    # Fix: use the writer as a context manager (as test_export_to_streams
    # does) so its output is deterministically flushed/closed before we
    # read the stream contents back.
    with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
        writer.write_sent(sent)
    sents_txt = sents.content()
    words_txt = words.content()
    concepts_txt = concepts.content()
    links_txt = links.content()
    tags_txt = tags.content()
    getLogger().debug("sents\n{}".format(sents_txt))
    getLogger().debug("words\n{}".format(words_txt))
    getLogger().debug("concepts\n{}".format(concepts_txt))
    getLogger().debug("links\n{}".format(links_txt))
    getLogger().debug("tags\n{}".format(tags_txt))
    # read it back
    reader = ttl.TxtReader(io.StringIO(sents_txt),
                           io.StringIO(words_txt),
                           io.StringIO(concepts_txt),
                           io.StringIO(links_txt),
                           io.StringIO(tags_txt))
    docx = reader.read()
    # patch sent.ID — the reader assigns its own ID, so align before diffing
    sent.ID = 1
    jo = sent.to_json()
    jr = docx[0].to_json()
    getLogger().debug(jo)
    getLogger().debug(jr)
    # field-by-field first for readable failure messages, then the whole dict
    self.assertEqual(jo['text'], jr['text'])
    self.assertEqual(jo['tokens'], jr['tokens'])
    self.assertEqual(jo['concepts'], jr['concepts'])
    self.assertEqual(jo['tags'], jr['tags'])
    self.assertEqual(jo['flag'], jr['flag'])
    self.assertEqual(jo['comment'], jr['comment'])
    self.assertEqual(jo, jr)
def test_ttl_tsv_serialization(self):
    """Round-trip one sentence through the TTL TSV writer and reader.

    A test sentence is serialized into five in-memory streams, read back
    with TxtReader, and the original and reconstructed sentences must
    produce identical dicts (field by field, then as a whole).
    """
    sent = self.build_test_sent()
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    # Fix: use the writer as a context manager (as test_export_to_streams
    # does) so its output is deterministically flushed/closed before we
    # read the stream contents back.
    with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
        writer.write_sent(sent)
    sents_txt = sents.content()
    words_txt = words.content()
    concepts_txt = concepts.content()
    links_txt = links.content()
    tags_txt = tags.content()
    getLogger().debug("sents\n{}".format(sents_txt))
    getLogger().debug("words\n{}".format(words_txt))
    getLogger().debug("concepts\n{}".format(concepts_txt))
    getLogger().debug("links\n{}".format(links_txt))
    getLogger().debug("tags\n{}".format(tags_txt))
    # read it back
    reader = ttl.TxtReader(io.StringIO(sents_txt),
                           io.StringIO(words_txt),
                           io.StringIO(concepts_txt),
                           io.StringIO(links_txt),
                           io.StringIO(tags_txt))
    docx = reader.read()
    # patch sent.ID — the reader assigns its own ID, so align before diffing
    sent.ID = 1
    jo = sent.to_dict()
    jr = docx[0].to_dict()
    getLogger().debug(jo)
    getLogger().debug(jr)
    # field-by-field first for readable failure messages, then the whole dict
    self.assertEqual(jo['text'], jr['text'])
    self.assertEqual(jo['tokens'], jr['tokens'])
    self.assertEqual(jo['concepts'], jr['concepts'])
    self.assertEqual(jo['tags'], jr['tags'])
    self.assertEqual(jo['flag'], jr['flag'])
    self.assertEqual(jo['comment'], jr['comment'])
    self.assertEqual(jo, jr)
# NOTE(review): this chunk opens mid-statement — the `if` that owns the
# `else:` below (and the loop that wraps both) lives outside this view;
# the fragment is kept verbatim and re-indented on a best-effort basis.
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)

# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report
rp = TextReport()  # by default, TextReport will write to standard output, i.e. terminal
rp = TextReport(TextReport.STDOUT)  # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()  # output to /dev/null, i.e. nowhere
rp = TextReport.string()  # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())

# ------------------------------------------------------------------------------
# Web fetcher
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------ # a string report rp = TextReport( ) # by default, TextReport will write to standard output, i.e. terminal rp.write("This line goes to standard output") rp1 = TextReport(TextReport.STDOUT) # same as above rp1.write("This line goes to standard output") rp2 = TextReport('~/tmp/my-report.txt') # output to a file rp2.write("This is a line in my-report.txt") rp3 = TextReport.null() # ouptut to /dev/null, i.e. nowhere rp3.write("This line goes no where") rp4 = TextReport.string( ) # output to a string. Call rp.content() to get the string rp4.write("This line will be stored in a string buffer") rp5 = TextReport(TextReport.STRINGIO) # same as above rp5.write("This line will also be stored in a string buffer") # TextReport will close the output stream automatically by using the with statement with TextReport.string() as rp: rp.header("Lorem Ipsum Analysis", level="h0") rp.header("Raw", level="h1") rp.print(LOREM_IPSUM) rp.header("Character Frequency") ct.summarise(report=rp) print(rp.content()) # ------------------------------------------------------------------------------