def get_synset_by_sk(gwn, sk, report_file=None, compact=True):
    ''' Search synset in WordNet Gloss Corpus by sensekey '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by sensekey (Provided: %s)\n" % sk)
    synset = gwn.get_synset_by_sk(sk)
    dump_synset(synset, report_file=report_file, compact=compact)
    return synset
def get_synsets_by_term(gwn, t, pos=None, report_file=None, compact=True):
    ''' Search synsets in WordNet Gloss Corpus by term '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by term (Provided: %s | pos = %s)\n" % (t, pos))
    synsets = gwn.get_synsets_by_term(t, pos)
    dump_synsets(synsets, report_file, compact=compact)
    return synsets
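# A minimal usage sketch for the two lookup helpers above. It assumes a GWN
# instance from get_gwn() as in TestHelperMethods further down; the sensekey
# and term values mirror the ones exercised by those tests.
def demo_lookup():
    gwn = get_gwn()
    # look up a single synset by sensekey, dump it to stdout
    get_synset_by_sk(gwn, 'test%2:41:00::')
    # look up all synsets for a term (any POS), dump them to stdout
    get_synsets_by_term(gwn, 'test')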
def get_synsets_by_term(wng_db_loc, t, pos, report_file=None):
    ''' Search synsets in WordNet Gloss Corpus by term '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by term (Provided: %s | pos = %s)\n" % (t, pos))
    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.get_synsets_by_term(t, pos)
    dump_synsets(synsets, report_file)
def get_synset_by_sk(wng_db_loc, sk, report_file=None):
    ''' Search synset in WordNet Gloss Corpus by sensekey '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by sensekey (Provided: %s)\n" % sk)
    db = SQLiteGWordNet(wng_db_loc)
    synsets = db.get_synset_by_sk(sk)
    dump_synsets(synsets, report_file)
def get_synset_by_id(wng_db_loc, synsetid, report_file=None):
    ''' Search synset in WordNet Gloss Corpus by synset ID '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by synsetid (Provided: %s)\n" % synsetid)
    db = SQLiteGWordNet(wng_db_loc)
    # Normalise a canonical ID (e.g. '12345678-n') into the GWN SQL format
    # (POS letter first, e.g. 'n12345678') before querying
    if len(synsetid) == 10 and synsetid[8] == '-':
        synsetid = synsetid[9] + synsetid[:8]
    synsets = db.get_synset_by_id(synsetid)
    dump_synsets(synsets, report_file)
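# A quick illustration of the ID normalisation above (pure string slicing,
# no database needed); the sample ID is the one used in the test suite below.
def demo_synsetid_flip():
    synsetid = '01775535-v'
    if len(synsetid) == 10 and synsetid[8] == '-':
        synsetid = synsetid[9] + synsetid[:8]
    assert synsetid == 'v01775535'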
def omw_vs_gwn():
    rp = TextReport("data/omw_new.txt")
    omw_ssids = set(get_omw_synsets())
    gwn_ssids = set(get_gwn_synsets())
    # synsets that exist in OMW but not in GWN
    omw_new = omw_ssids - gwn_ssids
    for ss in omw_new:
        s = omw.get_synset(ss)
        lemma = s.lemma if s.lemma else "** no lex **"
        if len(s.definitions) == 0:
            rp.print(join("\t", s.ID, lemma, "** no def **"))
        elif len(s.definitions) == 1:
            rp.print(join("\t", s.ID, lemma, s.definition))
        else:
            # multiple definitions: print the first, indent the rest
            rp.print(join("\t", s.ID, lemma, s.definitions[0]))
            for d in s.definitions[1:]:
                rp.print("\t\t" + d)
def dump_synsets(synsets, report_file=None):
    ''' Dump a SynsetCollection to stdout

    Arguments:
        synsets     -- List of synsets to dump
        report_file -- An instance of TextReport
    '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    if synsets:
        for synset in synsets:
            dump_synset(synset, report_file=report_file)
        report_file.print("Found %s synset(s)" % synsets.count())
    else:
        report_file.print("None was found!")
class TestHelperMethods(unittest.TestCase):

    nullrep = TextReport('/dev/null')

    def test_dump_synset(self):
        print("Test get synset by ID")
        gwn = get_gwn()
        ss = get_synset_by_id(gwn, '01775535-v')
        self.assertIsNotNone(ss)
        self.assertGreater(len(ss.lemmas), 0)
        self.assertGreater(len(ss.keys), 0)
        self.assertGreater(len(ss.glosses), 0)
        dump_synset(ss)

    def test_dump_synsets(self):
        dump_synsets(None)

    def test_get_by_term(self):
        sses = get_synsets_by_term(GWNSQL(YLConfig.GWN30_DB), 'test', report_file=self.nullrep)
        self.assertEqual(len(sses), 13)

    def test_get_by_sk(self):
        ss = get_synset_by_sk(get_gwn(), 'test%2:41:00::', report_file=self.nullrep)
        self.assertIsNotNone(ss)
def test_timer(self):
    rp = TextReport.string()
    t = Timer(report=rp)
    msg = "Do something expensive"
    t.start(msg)
    do_expensive()
    t.stop(msg)
    getLogger().debug(rp.content())
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())
    # test do()
    rp = TextReport.string()
    t = Timer(report=rp)
    t.do(lambda: do_expensive(), desc=msg)
    self.assertIn("Started", rp.content())
    self.assertIn("Stopped", rp.content())
    getLogger().debug(rp.content())
def get_synset_by_id(gwn, synsetid_str, report_file=None, compact=True):
    ''' Search synset in WordNet Gloss Corpus by synset ID '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    report_file.print("Looking for synsets by synsetid (Provided: %s)\n" % synsetid_str)
    # Get synset info from GlossWordnet
    try:
        synsetid = SynsetID.from_string(synsetid_str)
        synset = gwn.get_synset_by_id(synsetid.to_gwnsql())
        dump_synset(synset, report_file=report_file, compact=compact)
        return synset
    except Exception as e:
        logger.exception(e)
        logger.error(" >>>> Error: (Synset ID should be in this format 12345678-n)")
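# Usage sketch for the SynsetID-based variant above (assumes get_gwn() as in
# the tests). A canonical ID succeeds; a malformed ID should be caught by the
# except branch, which logs the format hint and lets the function return None.
def demo_get_by_id():
    gwn = get_gwn()
    ss = get_synset_by_id(gwn, '01775535-v')   # canonical format: 8 digits, dash, POS
    assert ss is not None
    assert get_synset_by_id(gwn, 'not-a-synset-id') is None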
def test_textreport(self):
    with TextReport.null() as rp:
        rp.writeline("null")
        rp.writeline(123)
        self.assertEqual(rp.content(), '')
    with TextReport() as rp:
        rp.writeline("stdout")
        rp.writeline(123)
        self.assertEqual(rp.content(), '')
    with TextReport(os.path.join(TEST_DATA, "del.me")) as rp:
        rp.writeline("ABC")
        rp.writeline(123)
        self.assertEqual(rp.content(), '')
    self.assertTrue(rp.closed)
    # test string report
    with TextReport.string() as rp:
        rp.writeline("ABC")
        rp.writeline(123, 456, 789)
        self.assertEqual(rp.content(), 'ABC\n123 456 789\n')
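# The four TextReport sinks exercised by the test above, side by side. A
# sketch: only the string-backed report keeps content readable via content();
# the others report '' as the test demonstrates.
def demo_textreport_sinks():
    TextReport()                     # write to stdout (the default everywhere above)
    TextReport('/dev/null')          # discard output, as in TestHelperMethods.nullrep
    TextReport.null()                # no-op sink
    with TextReport.string() as rp:  # in-memory buffer
        rp.writeline("hello")
        return rp.content()          # 'hello\n', per test_textreport above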
def test_locate_config_file(self):
    cfg = AppConfig(name='foo', mode=AppConfig.JSON)
    actual = cfg.potentials()
    expected = ['./.foo', './.foo.json', './foo', './foo.json',
                './data/foo', './data/foo.json', './data/.foo', './data/.foo.json',
                '~/.foo', '~/.foo.json', '~/.foo/config', '~/.foo.json/config',
                '~/.foo/config.json', '~/.foo.json/config.json',
                '~/.config/foo', '~/.config/foo.json',
                '~/.config/.foo', '~/.config/.foo.json',
                '~/.config/foo/config', '~/.config/foo.json/config',
                '~/.config/foo/config.json', '~/.config/foo.json/config.json',
                '~/.config/foo/foo', '~/.config/foo.json/foo.json']
    self.assertEqual(actual, expected)
    # default mode is INI
    cfg_ini = AppConfig('chirptest', working_dir=os.path.dirname(__file__))
    self.assertEqual(cfg_ini.config.sections(), ['AUTHOR'])
    self.assertEqual(cfg_ini.config['DEFAULT']['package'], 'chirptext.test')
    self.assertEqual(cfg_ini.config['DEFAULT']['tester'], 'unittest')
    self.assertEqual(cfg_ini.config['AUTHOR']['name'], 'Le Tuan Anh')
    self.assertEqual(cfg_ini.config['AUTHOR']['tester'], 'unittest')
    self.assertEqual(cfg_ini.config.get('AUTHOR', 'desc', fallback='nothing'), 'nothing')
    # test writing config
    with TextReport.string() as strfile:
        cfg_ini.config['AUTHOR']['desc'] = 'An author'
        cfg_ini.config.write(strfile.file)
        self.assertIn('desc = An author', strfile.content())
def dump_synsets(synsets, report_file=None, compact=True):
    ''' Dump a SynsetCollection to stdout

    Arguments:
        synsets     -- List of synsets to dump
        report_file -- An instance of TextReport
    '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    if synsets is not None:
        for synset in synsets:
            dump_synset(synset, report_file=report_file, compact=compact)
        report_file.print("Found %s synset(s)" % synsets.count())
    else:
        report_file.print("None was found!")
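# Because every dump helper takes a report_file, output can be captured in
# memory instead of going to stdout. A sketch combining the helpers above
# with TextReport.string():
def demo_capture_dump(gwn):
    rp = TextReport.string()
    get_synset_by_id(gwn, '01775535-v', report_file=rp)
    return rp.content()  # the dumped synset as a plain string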
def gen_stats(corpus_file, report_path=None):
    ''' Generate statistics for a text corpus (word count, most frequent words, etc.) '''
    report = TextReport(report_path)
    report.header("Stat for %s" % corpus_file)
    line_count = -1
    word_count = 0
    c = Counter()
    with open(corpus_file, 'r', encoding='utf8') as infile:
        lines = infile.readlines()
        line_count = len(lines)
        for line in lines:
            tokens = line.split()
            for token in tokens:
                # tokens come in word/POS format; the POS part may be missing
                parts = token.split("/")
                if len(parts) == 2:
                    word = parts[0]
                    POS = parts[1]
                else:
                    word = parts[0]
                    POS = None
                for spechar in SPECIAL_CHARS:
                    word = word.replace(spechar, '')
                word = word.lower().replace("_", " ")  # original word form
                if word == '':
                    print(token)  # debug: token was reduced to nothing
                c.count(word)
                word_count += 1
    report.writeline("Line count: %s" % line_count)
    report.writeline("Word count: %s" % word_count)
    report.writeline("Word class: %s" % len(c.sorted_by_count()))
    report.writeline("Top %d :" % TOP_K)
    for item in c.sorted_by_count()[:TOP_K]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("Bottom %d :" % TOP_K)
    for item in c.sorted_by_count()[-TOP_K:]:
        report.writeline("%s: %s" % (item[0], item[1]), level=1)
    report.writeline("-" * 80)
    for item in c.group_by_count():
        report.writeline("%s: %s" % (item[0], ', '.join(DaoPhay.vn_sorted(item[1]))))
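# A tiny illustration of the word/POS split logic in gen_stats (pure string
# handling; the sample tokens are hypothetical):
def demo_token_split():
    for token in ("dog/NN", "hello"):
        parts = token.split("/")
        word = parts[0]
        POS = parts[1] if len(parts) == 2 else None
        print(word, POS)  # -> "dog NN", then "hello None"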
def dump_synset(ss, compact_gloss=False, compact_tags=False, more_compact=True, report_file=None, compact=True):
    ''' Print synset details for debugging purpose

    Arguments:
        ss            -- Synset object to dump
        compact_gloss -- Don't dump gloss tokens' details
        compact_tags  -- Don't dump tagged senses' details
        more_compact  -- Don't dump full details of synset
        report_file   -- Report file to write to
    '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    if more_compact:
        report_file.header("Synset: %s (lemmas=%s | keys=%s)" % (ss.sid.to_canonical(), ss.lemmas, ss.keys), 'h0')
    else:
        report_file.header("Synset: %s" % ss, 'h0')
    for rgloss in ss.raw_glosses:
        if more_compact:
            if rgloss.cat != 'orig':
                continue
        report_file.print(rgloss)
    gloss_count = itertools.count(1)
    for gloss in ss.glosses:
        if compact:
            report_file.print("({cat}) {txt}".format(cat=gloss.cat, txt=gloss.text()))
        else:
            report_file.print('')
            report_file.header("Gloss #%s: %s" % (next(gloss_count), gloss), 'h2')
            # Dump gloss items
            if compact_gloss:
                report_file.print("Tokens => %s" % gloss.get_gramwords(), level=2)
            else:
                for item in gloss.items:
                    # print("\t%s - { %s }" % (uniquify(item.get_gramwords()), item))
                    report_file.print("%s - { %s }" % (set(item.get_gramwords()), item), level=2)
                report_file.print(("-" * 10), level=1)
            # Dump tags
            if compact_tags:
                report_file.print("Tags => %s" % gloss.get_tagged_sensekey(), level=2)
            else:
                for tag in gloss.tags:
                    report_file.print("%s" % tag, level=1)
    report_file.print('')
def omw_vs_gwn_def():
    rp = TextReport("data/omw_gwn_report.txt")
    rpdiff = TextReport("data/omw_gwn_diff.txt")
    rptypo = TextReport("data/omw_gwn_typo.txt")
    rpsn = TextReport("data/omw_gwn_sciname.txt")
    c = Counter(TAGS.ORDER)
    # ssids to compare
    diff_ssids = []
    ssids = read_diff_ssids()
    if not ssids:
        print("Generating synset ID list")
        omw_ssids = set(get_omw_synsets())
        gwn_ssids = set(get_gwn_synsets())
        # only care about old GWN synsets
        ssids = omw_ssids.intersection(gwn_ssids)
    else:
        print("Comparing {} synsets loaded from {}".format(len(ssids), ssid_filepath))
    lang = 'eng'
    with omw.ctx() as omw_ctx, gwn.ctx() as gwn_ctx:
        print("Comparing {} synsets".format(len(ssids)))
        for ss in list(ssids):
            ss = str(ss)
            c.count("total")
            tags, odef, gdef = compare_synset(omw, gwn, ss, omw_ctx, gwn_ctx)
            omwss = omw.get_synset(ss, ctx=omw_ctx)
            tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
            if TAGS.DIFF in tags:
                diff_ssids.append(ss)
                # [FCB] why did we change?
                gwnss = gwn.get_synset(ss, ctx=gwn_ctx)
                glosses = gwn_ctx.gloss.select('surface = ?', (gwnss.definition,))
                if glosses and len(glosses) > 1:
                    tags.add(TAGS.DUP)
                    ssids = [str(SynsetID.from_string(g.sid)) for g in glosses]
                    reason = "Not unique (Shared among {}) so OMW team changed it".format(' '.join(ssids))
                else:
                    tags.add(TAGS.OMW)
                    defs = omw.sdef.select('synset=? and lang=?', (ss, lang))
                    usrs = {d.usr for d in defs if d.usr}
                    usrs_str = ', '.join(usrs) if usrs else "someone in NTU"
                    reason = "{} made this change.".format(usrs_str)
                tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
                rpdiff.header("{} {}".format(tags_str, ss))
                rpdiff.print("OMW: {}".format(omwss.definition))
                rpdiff.print("GWN: {}".format(gdef))
                rpdiff.print("Reason: {}".format(reason))
            if TAGS.SCINAME in tags:
                rpsn.header("{} {}".format(tags_str, ss))
                rpsn.print("OMW: {}".format(omwss.definition))
                rpsn.print("GWN: {}".format(gdef))
            if TAGS.REP in tags or TAGS.TYPO in tags:
                rptypo.header("{} {}".format(tags_str, ss))
                rptypo.print("OMW: {}".format(omwss.definition))
                rptypo.print("GWN: {}".format(gdef))
            # master report
            for tag in tags:
                c.count(tag)
            if not tags:
                c.count(TAGS.IDENT)
            rp.header("{} {}".format(tags_str, ss))
            rp.print("OMW: {}".format(omwss.definition))
            rp.print("GWN: {}".format(gdef))
    # done
    c.summarise(report=rp)
    with open('data/omw_gwn_diff_ssids.txt', 'wt') as diff_ssid_file:
        for ss in diff_ssids:
            diff_ssid_file.write('{}\n'.format(ss))
def dump_synset(ss, compact_gloss=False, compact_tags=False, more_compact=True, report_file=None):
    ''' Print synset details for debugging purpose

    Arguments:
        ss            -- Synset object to dump
        compact_gloss -- Don't dump gloss tokens' details
        compact_tags  -- Don't dump tagged senses' details
        more_compact  -- Don't dump full details of synset
        report_file   -- Report file to write to
    '''
    if report_file is None:
        report_file = TextReport()  # Default to stdout
    if more_compact:
        report_file.header("Synset: %s (terms=%s | keys=%s)" % (ss.get_synsetid(), ss.terms, ss.keys), 'h0')
    else:
        report_file.header("Synset: %s" % ss, 'h0')
    for rgloss in ss.raw_glosses:
        if more_compact:
            if rgloss.cat != 'orig':
                continue
        report_file.print(rgloss)
    gloss_count = itertools.count(1)
    for gloss in ss.glosses:
        report_file.print('')
        report_file.header("Gloss #%s: %s" % (next(gloss_count), gloss), 'h2')
        # Dump gloss items
        if compact_gloss:
            report_file.print("Tokens => %s" % gloss.get_gramwords(), level=2)
        else:
            for item in gloss.items:
                # print("\t%s - { %s }" % (uniquify(item.get_gramwords()), item))
                report_file.print("%s - { %s }" % (set(item.get_gramwords()), item), level=2)
            report_file.print(("-" * 10), level=1)
        # Dump tags
        if compact_tags:
            report_file.print("Tags => %s" % gloss.get_tagged_sensekey(), level=2)
        else:
            for tag in gloss.tags:
                report_file.print("%s" % tag, level=1)
    report_file.print('')
def do_expensive(n=10000):
    s = TextReport.string()
    for i in range(n):
        s.print("This is string number #{}".format(i))
    return str(s.content())
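# do_expensive() is just a busy-work payload for test_timer above; a condensed
# sketch of the two ways that test drives Timer (start/stop vs do()):
def demo_timer():
    rp = TextReport.string()
    t = Timer(report=rp)
    t.start("build string")
    do_expensive()
    t.stop("build string")
    t.do(lambda: do_expensive(), desc="build string again")  # start/stop in one call
    return rp.content()  # contains "Started ..." / "Stopped ..." lines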