Beispiel #1
0
 def mark_adj_concepts(self):
     """Run concept marking over the first 50 processed adjective lexunits."""
     with self.db.ctx() as ctx:
         adj_units = self.db.find_lexunits(flag=LexUnit.PROCESSED, pos='a', ctx=ctx)
         # only look at the first 50 hits
         for unit in adj_units[:50]:
             self.db.get_lexunit(unit, ctx=ctx)
             header(unit)
             mark_adj_concepts(unit, ctx)
Beispiel #2
0
def main():
    """Demo entry point: exercise Counter/Timer and generate text reports."""
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")

    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        if i % 2 == 0:
            c.count("even")
        else:
            c.count("odd")
    c.summarise()

    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)

    report = TextReport(REPORT_LOC)
    # use lazy %-style arguments so the message is only formatted when the
    # INFO level is actually enabled (and formatting errors can't crash us)
    logging.info("Now try to create a text report (Located at: %s)",
                 report.get_path())
    generate_report(report)

    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
Beispiel #3
0
def list_unksense(args):
    """List sensekeys in Semcor that cannot be parsed into synset IDs.

    Writes two frequency tables (known synset IDs and unknown sensekeys,
    each paired with its lemma) plus totals to ``args.out`` or stdout.
    """
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    # NOTE(review): the original fell back to ``ttl.files`` when no limit was
    # given, which is inconsistent with the limited branch -- iterate the
    # same file set in both cases
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(ttl.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    # tag is not a valid synset ID -> record as unknown
                    # (narrowed from a bare ``except:`` which would also
                    # trap KeyboardInterrupt/SystemExit)
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
Beispiel #4
0
 def test_parsing(self):
     """Parse single words with WordNet-specific root conditions and check
     that exactly one EP with the expected POS comes back; then exercise
     iterative parsing with fragments."""
     ghub = GrammarHub()
     # (word, root condition, expected POS, print the MRS?)
     cases = (('dog', 'root_wn_n', 'n', False),
              ('love', 'root_wn_v', 'v', False),
              ('nice', 'root_wn_adj', 'a', True))
     for text, root, expected_pos, show_mrs in cases:
         sent = ghub.ERG.parse(text, extra_args=['-r', root])
         self.assertGreater(len(sent), 0)
         eps = sent[0].dmrs().obj().eps()
         if show_mrs:
             print(sent[0].mrs())
         self.assertEqual(len(eps), 1)
         self.assertEqual(eps[0].pred.pos, expected_pos)
     # test iterative parsing
     words = ['drink', 'eat', 'eat', 'drink']
     for parses in ghub.ERG.parse_many_iterative(words, extra_args=['-r', 'root_frag'], ignore_cache=True):
         header(parses)
         for p in parses:
             print(p.mrs())
Beispiel #5
0
def list_unksense(args):
    """List sensekeys in Semcor that cannot be parsed into synset IDs.

    Writes two frequency tables (known synset IDs and unknown sensekeys,
    each paired with its lemma) plus totals to ``args.out`` or stdout.
    """
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    # NOTE(review): the original fell back to ``ttl.files`` when no limit was
    # given, which is inconsistent with the limited branch -- iterate the
    # same file set in both cases
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(ttl.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    # tag is not a valid synset ID -> record as unknown
                    # (narrowed from a bare ``except:`` which would also
                    # trap KeyboardInterrupt/SystemExit)
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
Beispiel #6
0
def main():
    """Demo entry point: exercise Counter/Timer and generate text reports."""
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")

    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        if i % 2 == 0:
            c.count("even")
        else:
            c.count("odd")
    c.summarise()

    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)

    report = TextReport(REPORT_LOC)
    # use lazy %-style arguments so the message is only formatted when the
    # INFO level is actually enabled (and formatting errors can't crash us)
    logging.info("Now try to create a text report (Located at: %s)",
                 report.get_path())
    generate_report(report)

    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
Beispiel #7
0
 def test_get_usr(self):
     """At least one English definition of synset 02386612-a carries a usr."""
     header("Test get information")
     rows = omw.sdef.select('synset=? and lang=?', ('02386612-a', 'eng'))
     contributors = {row.usr for row in rows if row.usr}
     self.assertTrue(contributors)
Beispiel #8
0
 def test_remove_sciname(self):
     """Scientific names in a definition are detected and stripped."""
     header("Test removing scientific name")
     definition = 'triggerfishes ❲Balistidae❳'
     # both the high-level check and the raw regex should fire
     self.assertTrue(has_sciname(definition))
     self.assertTrue(SCIENTIFIC_NAME.search(definition))
     self.assertEqual(remove_sciname(definition), 'triggerfishes')
Beispiel #9
0
 def test_compare_synset(self):
     """compare_synset() reports the expected tag sets for two known synsets."""
     header("Test compare synset")
     with omw.ctx() as omw_ctx, gwn.ctx() as gwn_ctx:
         expectations = (('01850676-n', {TAGS.SAME, TAGS.SCINAME, TAGS.REP}),
                         ('00445467-v', set()))
         for ss, expected_tags in expectations:
             tags, odef, gdef = compare_synset(omw, gwn, ss, omw_ctx, gwn_ctx)
             self.assertEqual(tags, expected_tags)
Beispiel #10
0
 def test_merge_compound(self):
     """Merging the single compound under the top node yields one "named"
     node carrying the full CARG string."""
     header("Test merge compound")
     editable = self.data.get_named().edit(0)
     head = editable.top['ARG']
     compounds = Integral.get_comps(head)
     self.assertEqual(len(compounds), 1)
     Integral.merge_compound(compounds[0], True)
     self.assertEqual(head.predstr, "named")
     self.assertEqual(head.carg, "Charles Bond")
Beispiel #11
0
 def test_detect_dup(self):
     """join_definitions() removes repeated fragments from an OMW gloss."""
     header("Detect duplication in OMW definition")
     synset = omw.get_synset('01850676-n')
     fixed_def, dup_defs = join_definitions(synset)
     # the raw definition contains the same three lemmas twice
     self.assertEqual(
         synset.definition,
         'canvasback; redhead; pochard; canvasback; redhead; pochard; etc. ❲Aythya❳'
     )
     self.assertEqual(fixed_def,
                      'canvasback; redhead; pochard; etc. ❲Aythya❳')
     self.assertEqual(set(dup_defs), {'canvasback', 'redhead', 'pochard'})
Beispiel #12
0
def wn2ttl(args):
    """Convert a GWordnet XML dump into TTL format (JSON or XML flavour)."""
    inpath = FileHelper.abspath(args.inpath)
    header("WN to TTL format")
    wn = GWordnetXML()
    wn.read(inpath)
    print("Found senses: {}".format(len(wn.synsets)))
    outpath = FileHelper.abspath(args.outpath) if args.outpath else None
    # pick the converter by requested format; unknown formats write nothing
    converters = {'json': convert_json, 'xml': convert_xml}
    with TextReport(outpath, 'w') as outfile:
        convert = converters.get(args.format)
        if convert is not None:
            convert(wn.synsets, outfile)
    print("Done!")
Beispiel #13
0
 def test_tag_one_sent(self):
     """Tag a single gold sentence (#10081) against its shallow TTL layer
     and verify that sense-tagged XML output can be produced."""
     getLogger().debug("Test tagging one sentence")
     sid = '10081'
     sents = self.gold()
     # map sentence ident -> sentence for quick lookup
     smap = {str(s.ident): s for s in sents}
     sent = smap[sid]
     doc = ttl.Document('gold', TEST_GOLD_DIR).read()
     sent.shallow = doc.get(sid)
     # warn (but continue) if the deep and shallow layers disagree on text
     if sent.text != sent.shallow.text:
         getLogger().debug("WARNING: Inconsistent")
         getLogger().debug(sent.text)
         getLogger().debug(sent.shallow.text)
     dmrs = sent[0].dmrs()
     # print(sent.to_xml_str())
     # align gold concepts with DMRS predicates:
     # m = matched, n = unmatched, ignored = skipped concepts
     m, n, ignored = tag_gold(dmrs, sent.shallow, sent.text)
     header("#{}: {}".format(sid, sent.text), 'h0')
     getLogger().debug(sent[0].dmrs())
     header('Available concepts')
     for c in sent.shallow.concepts:
         getLogger().debug("{} {}".format(c, c.tokens))
     header('Matched')
     for con, nid, pred in m:
         getLogger().debug("{}::{} => #{}::{}".format(con.tag, con.clemma, nid, pred))
     header("Not matched")
     if n:
         for c in n:
             getLogger().debug(c)
     else:
         getLogger().debug("All was matched.")
     # tag with most-frequent-sense and make sure the XML carries senses
     sent.tag(method=ttl.Tag.MFS)
     xml_str = sent.tag_xml().to_xml_str()
     self.assertTrue(xml_str)
     self.assertIn("<sensegold", xml_str)
     self.assertIn("<sense", xml_str)
     getLogger().debug(sent.to_xml_str())
Beispiel #14
0
 def test_generate_from_gold(self):
     """Try to generate surface strings from the first two gold parses.

     Outcomes are counted rather than asserted so a broken grammar run
     does not abort the whole suite.
     """
     header("Test generating from gold (THIS CAN BE VERY SLOW)")
     sents = self.gold()
     c = Counter()
     for sent in sents[:2]:
         if len(sent) > 0:
             try:
                 gsents = ERG.generate(sent[0])
                 c.count("OK" if gsents else "ERR")
             except Exception:
                 # generation crashed -- record it; narrowed from a bare
                 # ``except:`` which would also trap KeyboardInterrupt
                 c.count("BROKEN")
         else:
             c.count("SKIP")
     c.summarise()
Beispiel #15
0
 def test_find_surface(self):
     """Compare OMW and GWN definitions for one synset, printing any diff."""
     sid = '00445467-v'
     omwss = omw.get_synset(sid)
     gwnss = gwn.get_synset(sid)
     odef_orig = '; '.join(omwss.definitions)
     odef, is_duplicated = join_definitions(omwss)
     # normalise the GWN gloss: drop a single trailing semicolon
     gdef = gwnss.get_def().text()
     if gdef.endswith(";"):
         gdef = gdef[:-1].strip()
     if odef == gdef:
         return
     header(sid)
     print("OMW-orig: {} | is_dup: {}".format(odef_orig, is_duplicated))
     print("OMW:", odef)
     print("GWN:", gdef)
Beispiel #16
0
   def test_sent(self):
       """Build a document with one MRS reading and round-trip it through
       DMRS XML, DMRSLayout, and back to an MRS string."""
       header("Test model")
       doc = Document("test")
       s = doc.new("It rains.")
       s.add('''[ TOP: h0
 INDEX: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ]
 RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 ] >
 HCONS: < h0 qeq h1 > ]''')
       # test full work flow:
       #   mrs_str > dmrs() > xml > layout > dmrs > mrs > mrs_str
       expected = '''[ TOP: h0 RELS: < [ _rain_v_1<3:9> LBL: h1 ARG0: e2 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ] > HCONS: < h0 qeq h1 > ]'''
       actual = DMRSLayout.from_xml(s[0].edit().to_dmrs().xml()).to_dmrs().to_mrs().tostring(False)
       self.assertEqual(actual, expected)
       # Test different formats
       xstr = s.to_xml_str()
       self.assertTrue(xstr)
       lts = s.to_latex()  # LaTeX scripts
       self.assertTrue(lts)
Beispiel #17
0
 def test_def_dup(self):
     """A GWN gloss shared by many synsets is not unique in the gloss table."""
     header("Check if a definition is not unique")
     sid = '11937102-n'
     omwss = omw.get_synset(sid)
     gwnss = gwn.get_synset(sid)
     self.assertEqual(omwss.definition,
                      'a variety of aster (Symphyotrichum lateriflorum)')
     self.assertEqual(gwnss.definition, 'a variety of aster;')
     # collect every synset that shares exactly this surface string
     glosses = gwn.gloss.select('surface = ?', (gwnss.definition, ))
     sharing_ids = {str(SynsetID.from_string(g.sid)) for g in glosses}
     expected_ids = {
         '11935627-n', '11935715-n', '11935794-n', '11935877-n',
         '11935953-n', '11936027-n', '11936113-n', '11936199-n',
         '11936287-n', '11936369-n', '11936448-n', '11936539-n',
         '11936624-n', '11936707-n', '11936782-n', '11936864-n',
         '11936946-n', '11937023-n', '11937102-n', '11937195-n',
         '11937278-n', '11937360-n', '11937446-n'
     }
     self.assertEqual(sharing_ids, expected_ids)
Beispiel #18
0
 def test_dmrs(self):
     """Construct a one-node DMRS by hand, sense-tag it, and serialise it."""
     header("Test building a DMRS from scratch")
     corpus = Corpus(name="manual")
     doc = corpus.new(name="testdoc")
     sent = doc.new("It rains.")
     self.assertIsInstance(sent, Sentence)
     reading = sent.add('[]')
     dmrs = reading.dmrs()
     layout = dmrs.layout
     # a single "_rain_v_1" node spanning characters 3..9
     rain = Node(10000, "_rain_v_1", 3, 9)
     rain.sortinfo.update({'sf': 'prop', 'tense': 'pres', 'mood': 'indicative',
                           'prog': '-', 'perf': '-', 'sarcasm': '-'})
     layout.add_node(rain)
     layout.add_link(Link(0, 10000, '', 'H'))  # link from 0 marks the top
     layout.save()
     # sense tag the hand-built DMRS
     sent.tag(ttl.Tag.MFS)
     self.assertGreaterEqual(len(dmrs.tags), 1)
     self.assertTrue(sent.to_xml_str())
Beispiel #19
0
def mine_rdf_values(sc, limit=None):
    """Scan Semcor files for token-level 'rdf' and 'ot' values.

    Prints frequency tables for all (lemma, rdf, sensekey) triples, for the
    subset that has a sensekey, and the set of distinct OT values.

    :param sc: corpus wrapper exposing ``files`` and ``iterparse``
    :param limit: if truthy, only scan the first ``limit`` files
    """
    ot_values = set()
    rdf_counter = Counter()
    rdf_with_key_counter = Counter()
    files = sc.files[:limit] if limit else sc.files
    for f in files:
        for sj in sc.iterparse(f):
            for t in sj['tokens']:
                if 'rdf' in t:
                    # NOTE(review): ``t.text`` would fail on a plain dict --
                    # confirm the token type really exposes ``.text``
                    lemma = t['lemma'] if 'lemma' in t else t.text
                    rdf = t['rdf']
                    sk = t['sk'] if 'sk' in t else ''
                    item = (lemma, rdf, sk)
                    rdf_counter.count(item)
                    if sk:
                        rdf_with_key_counter.count(item)
                if 'ot' in t:
                    ot_values.add(t['ot'])
    header("RDF values")
    for k, v in rdf_counter.most_common():
        print("{}: {}".format(k, v))
    header("RDF values (with valid keys)")
    for k, v in rdf_with_key_counter.most_common():
        print("{}: {}".format(k, v))
    header("OT values")
    for o in ot_values:
        print(o)
Beispiel #20
0
def dump(lu):
    """Print every reading of a lexunit with its mode and DMRS."""
    header(lu)
    for reading in lu:
        print(reading.mode, reading.dmrs())
Beispiel #21
0
 def compare_gwn_wn30(self):
     """Verify that GWN and WN30 cover exactly the same synset IDs."""
     header("GWN and WN30 are equal")
     # compare the two synset ID collections as sets
     self.assertEqual(set(get_gwn_synsets()), set(get_wn30_synsets()))
Beispiel #22
0
def archive_corpus(bib, corpus, ctx):
    """Archive every document belonging to ``corpus``."""
    header("Archiving corpus {}".format(corpus.name))
    for doc in bib.sqldao.get_docs(corpus.ID, ctx=ctx):
        archive_doc(bib, corpus, doc.name, ctx=ctx)
Beispiel #23
0
def archive_collection(bib, ctx):
    """Archive every corpus in the collection ``bib``."""
    header("Archiving collection {}".format(bib.name), level="h0")
    for corpus in ctx.corpus.select():
        archive_corpus(bib, corpus, ctx)
Beispiel #24
0
 def test_all_grammars(self):
     """The grammar hub must know about JACY, VRG and ERG."""
     header("Verify available grammars (JACY/VRG/ERG)")
     for grammar_name in ('JACY', 'VRG', 'ERG'):
         self.assertIn(grammar_name, self.ghub.names)
Beispiel #25
0
 def test_omw(self):
     """Smoke-test OMW access by listing a few synset IDs."""
     header("Ensure that OMW is working")
     synset_ids = get_omw_synsets()
     print("OMW synsets: {}".format(len(synset_ids)))
     print(synset_ids[:5])
Beispiel #26
0
 def test_xml_to_ttl(self):
     """Convert one file of the fixed 3rada corpus into TTL."""
     header("Test fixed 3rada to TTL format")
     source = SemcorXML(SEMCOR_FIXED)
     target = FileSet(SEMCOR_TTL)
     source.convert_to_ttl(target, limit=1, with_nonsense=False)
Beispiel #27
0
 def test_xml2json(self):
     """Convert every fixed 3rada XML file into JSON."""
     header("Test fixed 3rada to JSON")
     source = SemcorXML(SEMCOR_FIXED)
     target = FileSet(SEMCOR_JSON)
     for xml_file in source.files:
         xml2json(xml_file, source, target)
Beispiel #28
0
 def test_fix_3rada(self):
     """Run the 3rada repair pipeline over the original dataset."""
     header("Test fix original 3rada dataset")
     fix_3rada(SEMCOR_ORIG, SEMCOR_FIXED)
Beispiel #29
0
 def test_gwn(self):
     """Smoke-test GWN access by listing a few synset IDs."""
     header("Ensure that GWN is working")
     synset_ids = get_gwn_synsets()
     print("GWN synsets: {}".format(len(synset_ids)))
     print(synset_ids[:5])
Beispiel #30
0
 def test_wn30(self):
     """Smoke-test PWN-3.0 access by listing a few synset IDs."""
     header("Ensure that PWN-3.0 is working")
     synset_ids = get_wn30_synsets()
     print("WN synsets: {}".format(len(synset_ids)))
     print(synset_ids[:5])