def mark_adj_concepts(self):
    """Mark concepts for the first 50 processed adjective lexical units."""
    with self.db.ctx() as ctx:
        adj_units = self.db.find_lexunits(flag=LexUnit.PROCESSED, pos='a', ctx=ctx)
        for unit in adj_units[:50]:
            # populate the lexunit before reporting and marking it
            self.db.get_lexunit(unit, ctx=ctx)
            header(unit)
            mark_adj_concepts(unit, ctx)
def main():
    """Demo entry point: exercise the Counter, Timer and TextReport helpers."""
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")
    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        c.count("even" if i % 2 == 0 else "odd")
    c.summarise()
    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)
    report = TextReport(REPORT_LOC)
    logging.info("Now try to create a text report (Located at: %s)" % (report.get_path()))
    generate_report(report)
    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
def list_unksense(args):
    """List sensekeys in Semcor that cannot be resolved to a SynsetID.

    Reads TTL-JSON documents from SEMCOR_TTL, splits the concept tags into
    known (parseable as SynsetID) and unknown groups, and writes a
    tab-separated report to ``args.out`` (stdout when not given).
    ``args.limit`` caps the number of files processed.
    """
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    # BUGFIX: the original iterated and resolved paths via ``ttl.files``,
    # but the file set being scanned belongs to ``semxml`` (ttl is the
    # texttaglib module) -- use semxml.files consistently
    files = semxml.files[:args.limit] if args.limit else semxml.files
    for f in files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    # narrowed from a bare ``except:`` so that
                    # KeyboardInterrupt/SystemExit are not swallowed
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
def test_parsing(self):
    """Parse one word per POS under a restricted root and check the predicate."""
    ghub = GrammarHub()
    cases = [('dog', 'root_wn_n', 'n'),
             ('love', 'root_wn_v', 'v'),
             ('nice', 'root_wn_adj', 'a')]
    for text, root, pos in cases:
        sent = ghub.ERG.parse(text, extra_args=['-r', root])
        self.assertGreater(len(sent), 0)
        eps = sent[0].dmrs().obj().eps()
        if pos == 'a':
            # the adjective case also dumps its MRS for inspection
            print(sent[0].mrs())
        self.assertEqual(len(eps), 1)
        self.assertEqual(eps[0].pred.pos, pos)
    # test iterative parsing
    words = ['drink', 'eat', 'eat', 'drink']
    for parses in ghub.ERG.parse_many_iterative(words, extra_args=['-r', 'root_frag'], ignore_cache=True):
        header(parses)
        for p in parses:
            print(p.mrs())
def test_get_usr(self):
    """There is at least one non-empty contributor (usr) on the definitions."""
    header("Test get information")
    sid = '02386612-a'
    lang = 'eng'
    rows = omw.sdef.select('synset=? and lang=?', (sid, lang))
    contributors = set()
    for row in rows:
        if row.usr:
            contributors.add(row.usr)
    self.assertTrue(contributors)
def test_remove_sciname(self):
    """Bracketed scientific names are detected and stripped from a gloss."""
    header("Test removing scientific name")
    definition = 'triggerfishes ❲Balistidae❳'
    self.assertTrue(has_sciname(definition))
    self.assertTrue(SCIENTIFIC_NAME.search(definition))
    self.assertEqual(remove_sciname(definition), 'triggerfishes')
def test_compare_synset(self):
    """compare_synset yields the expected tag sets for two known synsets."""
    header("Test compare synset")
    with omw.ctx() as omw_ctx, gwn.ctx() as gwn_ctx:
        tags, odef, gdef = compare_synset(omw, gwn, '01850676-n', omw_ctx, gwn_ctx)
        self.assertEqual(tags, {TAGS.SAME, TAGS.SCINAME, TAGS.REP})
        # this synset should produce no tags at all
        tags, odef, gdef = compare_synset(omw, gwn, '00445467-v', omw_ctx, gwn_ctx)
        self.assertEqual(tags, set())
def test_merge_compound(self):
    """Merging a compound folds the parts into the head's CARG."""
    header("Test merge compound")
    sent = self.data.get_named()
    edited = sent.edit(0)
    head = edited.top['ARG']
    comps = Integral.get_comps(head)
    self.assertEqual(len(comps), 1)
    Integral.merge_compound(comps[0], True)
    self.assertEqual(head.predstr, "named")
    self.assertEqual(head.carg, "Charles Bond")
def test_detect_dup(self):
    """join_definitions removes repeated sub-definitions and reports them."""
    header("Detect duplication in OMW definition")
    synset = omw.get_synset('01850676-n')
    fixed_def, dup_defs = join_definitions(synset)
    self.assertEqual(
        synset.definition,
        'canvasback; redhead; pochard; canvasback; redhead; pochard; etc. ❲Aythya❳'
    )
    self.assertEqual(fixed_def, 'canvasback; redhead; pochard; etc. ❲Aythya❳')
    self.assertEqual(set(dup_defs), {'canvasback', 'redhead', 'pochard'})
def wn2ttl(args):
    """Convert a GWordnet XML dump into TTL format (JSON or XML)."""
    inpath = FileHelper.abspath(args.inpath)
    header("WN to TTL format")
    wn = GWordnetXML()
    wn.read(inpath)
    print("Found senses: {}".format(len(wn.synsets)))
    outpath = FileHelper.abspath(args.outpath) if args.outpath else None
    with TextReport(outpath, 'w') as outfile:
        # dispatch on the requested output format; unknown formats are a no-op
        converters = {'json': convert_json, 'xml': convert_xml}
        convert = converters.get(args.format)
        if convert is not None:
            convert(wn.synsets, outfile)
    print("Done!")
def test_tag_one_sent(self):
    """Tag one gold sentence (#10081) with senses and verify the XML export."""
    getLogger().debug("Test tagging one sentence")
    sid = '10081'
    sents = self.gold()
    # map sentence ident -> sentence object for direct lookup
    smap = {str(s.ident): s for s in sents}
    sent = smap[sid]
    # attach the shallow (token/concept) layer read from the gold TTL document
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    sent.shallow = doc.get(sid)
    if sent.text != sent.shallow.text:
        # surface texts should line up; log both sides when they do not
        getLogger().debug("WARNING: Inconsistent")
        getLogger().debug(sent.text)
        getLogger().debug(sent.shallow.text)
    dmrs = sent[0].dmrs()
    # print(sent.to_xml_str())
    # align gold concepts against DMRS predicates:
    # m = matched (concept, node-id, pred), n = unmatched concepts
    m, n, ignored = tag_gold(dmrs, sent.shallow, sent.text)
    header("#{}: {}".format(sid, sent.text), 'h0')
    getLogger().debug(sent[0].dmrs())
    header('Available concepts')
    for c in sent.shallow.concepts:
        getLogger().debug("{} {}".format(c, c.tokens))
    header('Matched')
    for con, nid, pred in m:
        getLogger().debug("{}::{} => #{}::{}".format(con.tag, con.clemma, nid, pred))
    header("Not matched")
    if n:
        for c in n:
            getLogger().debug(c)
    else:
        getLogger().debug("All was matched.")
    # tag with most-frequent-sense and check that sense tags reach the XML
    sent.tag(method=ttl.Tag.MFS)
    xml_str = sent.tag_xml().to_xml_str()
    self.assertTrue(xml_str)
    self.assertIn("<sensegold", xml_str)
    self.assertIn("<sense", xml_str)
    getLogger().debug(sent.to_xml_str())
def test_generate_from_gold(self):
    """Generate surface strings from the first two gold sentences.

    Counts outcomes instead of failing hard: OK (generation produced
    output), ERR (empty result), BROKEN (generator raised), SKIP
    (sentence has no parse).
    """
    header("Test generating from gold (THIS CAN BE VERY SLOW)")
    sents = self.gold()
    c = Counter()
    for sent in sents[:2]:
        if len(sent) > 0:
            try:
                gsents = ERG.generate(sent[0])
                c.count("OK" if gsents else "ERR")
            except Exception:
                # was a bare ``except:`` -- that also traps KeyboardInterrupt
                # and SystemExit; Exception keeps the deliberate best-effort
                # behaviour without blocking interpreter shutdown
                c.count("BROKEN")
        else:
            c.count("SKIP")
    c.summarise()
def test_find_surface(self):
    """Report OMW/GWN definition mismatches for a known synset."""
    sid = '00445467-v'
    omwss = omw.get_synset(sid)
    gwnss = gwn.get_synset(sid)
    odef_orig = '; '.join(omwss.definitions)
    odef, is_duplicated = join_definitions(omwss)
    gdef = gwnss.get_def().text()
    # normalise a trailing semicolon on the GWN side before comparing
    if gdef.endswith(";"):
        gdef = gdef[:-1].strip()
    if odef == gdef:
        return
    header(sid)
    print("OMW-orig: {} | is_dup: {}".format(odef_orig, is_duplicated))
    print("OMW:", odef)
    print("GWN:", gdef)
def test_sent(self):
    """Round-trip one MRS through DMRS XML/layout and back, then check exports."""
    header("Test model")
    doc = Document("test")
    s = doc.new("It rains.")
    s.add('''[ TOP: h0 INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] > HCONS: < h0 qeq h1 > ]''')
    # test full work flow:
    # mrs_str > dmrs() > xml > layout > dmrs > mrs > mrs_str
    expected = '''[ TOP: h0 RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ] > HCONS: < h0 qeq h1 > ]'''
    actual = DMRSLayout.from_xml(s[0].edit().to_dmrs().xml()).to_dmrs().to_mrs().tostring(False)
    self.assertEqual(actual, expected)
    # Test different formats
    xstr = s.to_xml_str()
    self.assertTrue(xstr)
    lts = s.to_latex()  # LaTeX scripts
    self.assertTrue(lts)
def test_def_dup(self):
    """The short GWN gloss is shared by many different synsets."""
    header("Check if a definition is not unique")
    sid = '11937102-n'
    omwss = omw.get_synset(sid)
    gwnss = gwn.get_synset(sid)
    self.assertEqual(omwss.definition, 'a variety of aster (Symphyotrichum lateriflorum)')
    self.assertEqual(gwnss.definition, 'a variety of aster;')
    # every synset whose gloss surface is exactly the same string
    glosses = gwn.gloss.select('surface = ?', (gwnss.definition, ))
    ssids = {str(SynsetID.from_string(g.sid)) for g in glosses}
    expected_ids = {
        '11935627-n', '11935715-n', '11935794-n', '11935877-n', '11935953-n',
        '11936027-n', '11936113-n', '11936199-n', '11936287-n', '11936369-n',
        '11936448-n', '11936539-n', '11936624-n', '11936707-n', '11936782-n',
        '11936864-n', '11936946-n', '11937023-n', '11937102-n', '11937195-n',
        '11937278-n', '11937360-n', '11937446-n'}
    self.assertEqual(ssids, expected_ids)
def test_dmrs(self):
    """Build a DMRS by hand (one node plus a top link), tag it, export XML."""
    header("Test building a DMRS from scratch")
    corpus = Corpus(name="manual")
    doc = corpus.new(name="testdoc")
    sent = doc.new("It rains.")
    self.assertIsInstance(sent, Sentence)
    reading = sent.add('[]')
    dmrs = reading.dmrs()
    layout = dmrs.layout
    rain = Node(10000, "_rain_v_1", 3, 9)
    rain.sortinfo.update({'sf': 'prop', 'tense': 'pres', 'mood': 'indicative',
                          'prog': '-', 'perf': '-', 'sarcasm': '-'})
    layout.add_node(rain)
    layout.add_link(Link(0, 10000, '', 'H'))  # this is top
    layout.save()
    # sense tag the DMRS
    sent.tag(ttl.Tag.MFS)
    self.assertGreaterEqual(len(dmrs.tags), 1)
    self.assertTrue(sent.to_xml_str())
def mine_rdf_values(sc, limit=None):
    """Scan semcor files and tally token RDF/OT values (optionally limited)."""
    ot_values = set()
    rdf_counter = Counter()
    rdf_with_key_counter = Counter()
    files = sc.files if not limit else sc.files[:limit]
    for fname in files:
        for sj in sc.iterparse(fname):
            for token in sj['tokens']:
                if 'rdf' in token:
                    lemma = token['lemma'] if 'lemma' in token else token.text
                    sensekey = token['sk'] if 'sk' in token else ''
                    item = (lemma, token['rdf'], sensekey)
                    rdf_counter.count(item)
                    if sensekey:
                        # tally separately when a sensekey is present
                        rdf_with_key_counter.count(item)
                if 'ot' in token:
                    ot_values.add(token['ot'])
    header("RDF values")
    for k, v in rdf_counter.most_common():
        print("{}: {}".format(k, v))
    header("RDF values (with valid keys)")
    for k, v in rdf_with_key_counter.most_common():
        print("{}: {}".format(k, v))
    header("OT values")
    for o in ot_values:
        print(o)
def dump(lu):
    """Print every reading (mode + DMRS) of a lexunit under a header."""
    header(lu)
    for reading in lu:
        print(reading.mode, reading.dmrs())
def compare_gwn_wn30(self):
    """GWN and WN30 must contain exactly the same synset IDs."""
    header("GWN and WN30 are equal")
    # compare synset IDs
    self.assertEqual(set(get_gwn_synsets()), set(get_wn30_synsets()))
def archive_corpus(bib, corpus, ctx):
    """Archive every document belonging to the given corpus."""
    header("Archiving corpus {}".format(corpus.name))
    for doc in bib.sqldao.get_docs(corpus.ID, ctx=ctx):
        archive_doc(bib, corpus, doc.name, ctx=ctx)
def archive_collection(bib, ctx):
    """Archive every corpus found in the collection's database context."""
    header("Archiving collection {}".format(bib.name), level="h0")
    for corpus in ctx.corpus.select():
        archive_corpus(bib, corpus, ctx)
def test_all_grammars(self):
    """The grammar hub must know about JACY, VRG and ERG."""
    header("Verify available grammars (JACY/VRG/ERG)")
    for grammar in ('JACY', 'VRG', 'ERG'):
        self.assertIn(grammar, self.ghub.names)
def test_omw(self):
    """Smoke-test OMW access by listing a few synset IDs."""
    header("Ensure that OMW is working")
    ssids = get_omw_synsets()
    print("OMW synsets: {}".format(len(ssids)))
    print(ssids[:5])
def test_xml_to_ttl(self):
    """Convert one fixed 3rada file to TTL, dropping non-sense tags."""
    header("Test fixed 3rada to TTL format")
    semcor = SemcorXML(SEMCOR_FIXED)
    out_files = FileSet(SEMCOR_TTL)
    semcor.convert_to_ttl(out_files, limit=1, with_nonsense=False)
def test_xml2json(self):
    """Convert every fixed 3rada XML file into JSON."""
    header("Test fixed 3rada to JSON")
    semcor = SemcorXML(SEMCOR_FIXED)
    json_files = FileSet(SEMCOR_JSON)
    for fname in semcor.files:
        xml2json(fname, semcor, json_files)
def test_fix_3rada(self):
    """Run the fixing routine over the original 3rada dataset."""
    header("Test fix original 3rada dataset")
    fix_3rada(SEMCOR_ORIG, SEMCOR_FIXED)
def test_gwn(self):
    """Smoke-test GWN access by listing a few synset IDs."""
    header("Ensure that GWN is working")
    ssids = get_gwn_synsets()
    print("GWN synsets: {}".format(len(ssids)))
    print(ssids[:5])
def test_wn30(self):
    """Smoke-test PWN-3.0 access by listing a few synset IDs."""
    header("Ensure that PWN-3.0 is working")
    ssids = get_wn30_synsets()
    print("WN synsets: {}".format(len(ssids)))
    print(ssids[:5])