def test_as_dict_key(self):
    s = SynsetID.from_string('12345678n')
    s2 = SynsetID.from_string('112345678')
    d = {}
    d['12345678-n'] = 'abc'
    self.assertEqual(d[s], 'abc')
    self.assertIn(s2, d)
def test_unusual_sid(self):
    s = SynsetID.from_string('80000683-x')
    s = SynsetID.from_string(s)  # from_string() also accepts a SynsetID
    self.assertEqual(s.pos, 'x')
    self.assertEqual(s.offset, '80000683')
    # an adjective ID should parse as well
    s = SynsetID.from_string('02315002-a')
    self.assertEqual(s.pos, 'a')
    self.assertEqual(s.offset, '02315002')
def test_pos(self):
    self.assertEqual(POS.num2pos(1), 'n')
    self.assertEqual(POS.num2pos("1"), 'n')
    self.assertRaises(WordnetException, lambda: POS.num2pos("7"))
    self.assertRaises(WordnetException, lambda: POS.num2pos(None))
    self.assertRaises(WordnetException, lambda: POS.num2pos(0))
    self.assertEqual(POS.pos2num("n"), '1')
    self.assertEqual(POS.pos2num("v"), '2')
    self.assertEqual(POS.pos2num("a"), '3')
    self.assertEqual(POS.pos2num("r"), '4')
    self.assertEqual(POS.pos2num("s"), '5')
    self.assertEqual(POS.pos2num("x"), '6')
    self.assertRaises(WordnetException, lambda: POS.pos2num(None))
    self.assertRaises(WordnetException, lambda: POS.pos2num("g"))
    self.assertRaises(WordnetException, lambda: POS.pos2num(""))
    self.assertRaises(WordnetException, lambda: POS.pos2num(1))
    self.assertRaises(WordnetException, lambda: POS.pos2num("n "))
    self.assertEqual(SynsetID.from_string('112345678').pos, 'n')
    self.assertEqual(SynsetID.from_string('212345678').pos, 'v')
    self.assertEqual(SynsetID.from_string('312345678').pos, 'a')
    self.assertEqual(SynsetID.from_string('412345678').pos, 'r')
    self.assertEqual(SynsetID.from_string('512345678').pos, 's')
    self.assertEqual(SynsetID.from_string('n12345678').pos, 'n')
    self.assertEqual(SynsetID.from_string('v12345678').pos, 'v')
    self.assertEqual(SynsetID.from_string('a12345678').pos, 'a')
    self.assertEqual(SynsetID.from_string('r12345678').pos, 'r')
    self.assertEqual(SynsetID.from_string('s12345678').pos, 's')
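# A quick-reference sketch (not part of the test suite) of the input formats
# that the assertions above exercise; all four spellings below parse to one
# and the same SynsetID, and str() yields the canonical form:
#
#   SynsetID.from_string('12345678-n')   # canonical
#   SynsetID.from_string('12345678n')    # compact (no hyphen)
#   SynsetID.from_string('n12345678')    # POS-first
#   SynsetID.from_string('112345678')    # WNSQL (leading POS digit, 1 = noun)
#
#   str(SynsetID.from_string('112345678')) == '12345678-n'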
def test_synset_wrong_format(self):
    print("Test invalid synset formats")
    self.assertRaises(Exception, lambda: SynsetID.from_string(None))
    # wrong POS (canonical)
    self.assertRaises(Exception, lambda: SynsetID.from_string('12345678g'))
    # wrong POS (WNSQL)
    self.assertRaises(Exception, lambda: SynsetID.from_string('712345678'))
    # wrong POS (POS-first)
    self.assertRaises(Exception, lambda: SynsetID.from_string('k12345678'))
    # no POS
    self.assertRaises(Exception, lambda: SynsetID.from_string('12345678'))
def export_wnsql_synsets(args):
    print("Exporting synsets' info (lemmas/defs/examples) from WordnetSQL (Princeton Wordnet 3.0) to text files")
    show_info(args)
    output_with_sid_file = os.path.abspath('./data/wn30_lemmas.txt')
    output_without_sid_file = os.path.abspath('./data/wn30_lemmas_noss.txt')
    output_defs = os.path.abspath('./data/wn30_defs.txt')
    output_exes = os.path.abspath('./data/wn30_exes.txt')
    wn = get_wn(args)
    # Extract lemmas
    records = wn.get_all_synsets()
    synsets_lemmas = []
    for r in records:
        synsets_lemmas.append((SynsetID.from_string(str(r.synsetid)).to_canonical(), r.lemma))
    synsets_lemmas.sort(key=itemgetter(0, 1))
    with open(output_with_sid_file, 'w') as with_sid, open(output_without_sid_file, 'w') as without_sid:
        for row in synsets_lemmas:
            with_sid.write('%s\t%s\n' % row)
            without_sid.write('%s\n' % (row[1],))  # just the lemma
    # Extract synset definitions
    records = wn.schema.ss.select(orderby='synsetid')
    synsets_defs = []
    for r in records:
        synsets_defs.append((SynsetID.from_string(r.synsetid).to_canonical(), r.definition))
    synsets_defs.sort(key=itemgetter(0))
    with open(output_defs, 'w') as def_file:
        for row in synsets_defs:
            def_file.write('%s\t%s\n' % row)
    # Extract examples
    records = wn.schema.ex.select(orderby='synsetid')
    synsets_examples = []
    for r in records:
        synsets_examples.append((SynsetID.from_string(r.synsetid).to_canonical(), r.sample))
    synsets_examples.sort(key=itemgetter(0))
    with open(output_exes, 'w') as ex_file:
        for row in synsets_examples:
            ex_file.write('%s\t%s\n' % row)
    # Summary
    print("Data has been extracted to:")
    print("  + {}".format(output_with_sid_file))
    print("  + {}".format(output_without_sid_file))
    print("  + {}".format(output_defs))
    print("  + {}".format(output_exes))
    print("Done!")
def list_unksense(args):
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    # the file list lives on semxml (the original mixed in `ttl.files`, which
    # is the module, not a file collection)
    files = semxml.files[:args.limit] if args.limit else semxml.files
    for f in files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
def omw_fix_dup(cli, args):
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset',))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except Exception:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print("DELETE FROM synset_def WHERE synset='{}' and def='{}';".format(ss.ID, to_sqlite_string(dup)))
                rp.print()
        c.summarise()
def test_assign_synsetid(self):
    s1 = SynsetID.from_string('12345678-n')
    ss = Synset(s1)
    ssid = ss.ID
    self.assertIs(ss.ID, ssid)
    self.assertEqual(ss.ID, s1)
    self.assertIsNot(ss.ID, s1)  # must not be the same SynsetID instance
def get_omw_synsets():
    omw_ss = omw.sense.select()
    omw_ssids = set()
    for x in omw_ss:
        try:
            omw_ssids.add(SynsetID.from_string(x.synset))
        except Exception:
            print(x)  # report the unparseable record
    return list(omw_ssids)
def bake_doc(doc):
    ''' Convert concepts to tags '''
    for sent in doc:
        for concept in sent.concepts:
            cfrom = min(t.cfrom for t in concept.tokens)
            cto = max(t.cto for t in concept.tokens)  # the span ends at the last token
            # the tag must be a valid synset ID
            sid = SynsetID.from_string(concept.tag, default=None)
            if cfrom >= 0 and cto >= 0 and sid is not None:
                sent.new_tag(concept.tag, cfrom, cto, tagtype='WN')
    return doc
def filter_bad_synsetids(concepts):
    good_concepts = []
    bad_concepts = []
    for c in concepts:
        valid_sid = SynsetID.from_string(c.tag, default=None)
        if valid_sid is None:
            getLogger().warning("{} is not a valid synset ID".format(c.tag))
            bad_concepts.append(c)
        else:
            good_concepts.append(c)
    return good_concepts, bad_concepts
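# A minimal usage sketch for the two helpers above (commented out: the input
# path is hypothetical; loading is assumed to follow the ttl.Document API
# used elsewhere in this module):
#
# doc = ttl.Document.from_json_file('data/semcor_sample.json')  # hypothetical path
# for sent in doc:
#     good, bad = filter_bad_synsetids(sent.concepts)
#     getLogger().info("Concepts: {} valid / {} invalid".format(len(good), len(bad)))
# # bake_doc() itself skips concepts whose tag is not a valid synset ID
# doc = bake_doc(doc)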
def test_synset_id(self):
    print("Test synset ID")
    sid = SynsetID('12345678', 'n')
    expected_canonical = '12345678-n'
    expected_wnsql = '112345678'
    # validate formats
    self.assertEqual(str(sid), expected_canonical)
    self.assertEqual(sid.to_canonical(), expected_canonical)
    self.assertEqual(sid.to_wnsql(), expected_wnsql)
    # comparing synsets
    sid2 = SynsetID.from_string('112345678')
    sid3 = SynsetID.from_string('12345678-n')
    sid4 = SynsetID.from_string('12345678n')
    sid5 = SynsetID.from_string('n12345678')
    sid6 = SynsetID.from_string(112345678)
    self.assertEqual(sid, sid2)
    self.assertEqual(sid, sid3)
    self.assertEqual(sid, sid4)
    self.assertEqual(sid, sid5)
    self.assertEqual(sid, sid6)
def to_ttl(sent, with_nonsense=True, sk_map=None, wnctx=None):
    tokens = sent['tokens']
    text = detokenize(tokens)
    s = ttl.Sentence(text=text)
    s.new_tag(sent['sid'], tagtype='origid')
    s.import_tokens((t.text for t in tokens))
    for tinfo, tk in zip(tokens, s):
        for k, v in tinfo.data:
            if (k, v) == ('tag', 'wf') or k == 'sk':
                continue
            if k == 'lemma':
                tk.lemma = v
            elif k == 'pos':
                tk.pos = v
            else:
                tk.new_tag(label=v, tagtype=k)
        # if a sensekey exists, add it as a concept
        lemma = tinfo.lemma
        sk = fix_sensekey(tinfo.get('sk'))
        rdf = tinfo.get('rdf')
        comment = None
        if sk and (with_nonsense or not is_nonsense(lemma, sk, rdf)):
            sensetag = sk
            if sk_map is not None and sk in sk_map:
                sensetag = sk_map[sk]
            elif wnctx is not None:
                # try to determine the synset ID
                ss = wnctx.senses.select_single('sensekey=?', (sk,))
                if ss is not None:
                    sid = str(SynsetID.from_string(ss.synsetid))
                    if sk_map is not None:
                        sk_map[sk] = sid
                    sensetag = sid
                else:
                    # sensekey not found
                    getLogger().warning("There is no synsetID with sensekey={} | rdf={}".format(sk, rdf))
                    comment = 'sensekey'
            s.new_concept(clemma=lemma, tag=sensetag, tokens=(tk,), comment=comment)
    return s
def test_get_mfs(self):
    words = 'we this sing full cat tongue name dry die horn sun with mountain eye belly old big red woman live head animal because cloud louse sleep ear wet know salt walk eat seed green bite say person all child count thin stand father laugh night give stone heavy if bone sister other yellow small work snake smoke kill white swim short grease worm narrow flower neck path drink flesh good sharp ash snow hot fire mouth see dirty hand egg skin cold fly wood mother come I warm where one play foot sea year new earth smooth two water what burn fish vomit bird how long hunt sit rope feather nose dust round wind tooth correct bark root ice not blood tail dull brother man heart lie liver many pig rain claw who day grass knee when leaf wide hair meat black dog star dance breasts wife sand husband You bad hear moon river tree that'.split()
    with omw.ctx() as ctx, TextReport('data/mfs1500.txt') as rp, TextReport('data/wndef.txt') as deffile:
        query = 'wordid in (SELECT wordid FROM word WHERE lemma in {})'.format(repr(tuple(words)))
        rows = ctx.sense.select(query)
        ssids = [SynsetID.from_string(r.synset) for r in rows]
        for ssid in ssids:
            ss = omw.get_synset(ssid, ctx=ctx)
            if ss.lemmas and ss.definition:
                rp.print("{id} ({lm}): {df}".format(id=ss.ID, lm=", ".join(ss.lemmas), df=ss.definition.strip()))
                deffile.print(ss.definition.strip())
    print("Done!")
def test_def_dup(self):
    header("Check if a definition is not unique")
    sid = '11937102-n'
    omwss = omw.get_synset(sid)
    gwnss = gwn.get_synset(sid)
    self.assertEqual(omwss.definition, 'a variety of aster (Symphyotrichum lateriflorum)')
    self.assertEqual(gwnss.definition, 'a variety of aster;')
    glosses = gwn.gloss.select('surface = ?', (gwnss.definition,))
    ssids = {str(SynsetID.from_string(g.sid)) for g in glosses}
    self.assertEqual(ssids, {
        '11935627-n', '11935715-n', '11935794-n', '11935877-n', '11935953-n',
        '11936027-n', '11936113-n', '11936199-n', '11936287-n', '11936369-n',
        '11936448-n', '11936539-n', '11936624-n', '11936707-n', '11936782-n',
        '11936864-n', '11936946-n', '11937023-n', '11937102-n', '11937195-n',
        '11937278-n', '11937360-n', '11937446-n'})
def search(query):
    # assume that the query is a synset ID first
    try:
        sid = SynsetID.from_string(query)
        ss = wsql.get_synset_by_id(sid)
        if ss is not None:
            return SynsetCollection().add(ss).to_json_str()
    except Exception:
        # not a synset ID
        logger.exception("Invalid synset ID: {}".format(query))
    # try to search by lemma
    synsets = wsql.get_synsets_by_lemma(query)
    if synsets:
        return synsets.to_json_str()
    else:
        # search by sensekey
        ss = wsql.get_synset_by_sk(query)
        if ss:
            return SynsetCollection().add(ss).to_json_str()
    # invalid query
    abort(404)
def search(request, query):
    """ Search by lemma, sensekey or synsetID
    Mapping: /yawol/search/<query>
    """
    # assume that the query is a synset ID first
    sid = SynsetID.from_string(query.strip(), default=None)
    if sid:
        ss = wsql.get_synset(sid)
        if ss is None and omwsql is not None:
            # fall back to OMW
            print("Searching in OMW")
            ss = omwsql.get_synset(sid)
            print("OMW SS", ss)
        if ss is not None:
            return SynsetCollection().add(ss).to_json()
    # try to search by lemma
    synsets = wsql.search(lemma=query)
    if synsets:
        logger.info("Query: {} - Results: {}".format(query, synsets))
        return synsets.to_json()
    elif omwsql is not None:
        print("Try to search {} in OMW".format(query))
        synsets = omwsql.search(lemma=query)
        if synsets:
            logger.info("Query: {} - Results: {}".format(query, synsets))
            return synsets.to_json()
        else:
            logger.warning("Not found {} in OMW".format(query))
    else:
        # try to search by sensekey
        try:
            ss = wsql.get_by_key(query)
        except Exception:
            ss = None
        if ss:
            return SynsetCollection().add(ss).to_json()
    # invalid query
    raise Http404('Invalid query')
def search(request, query):
    ''' Search by lemma, sensekey or synsetID
    Mapping: /yawol/search/<query>
    '''
    # assume that the query is a synset ID first
    try:
        sid = SynsetID.from_string(query)
        ss = wsql.get_synset_by_id(sid)
        if ss is not None:
            return SynsetCollection().add(ss).to_json()
    except Exception:
        pass
    # try to search by lemma
    synsets = wsql.get_synsets_by_lemma(query)
    if synsets:
        return synsets.to_json()
    else:
        # try to search by sensekey
        ss = wsql.get_synset_by_sk(query)
        if ss:
            return SynsetCollection().add(ss).to_json()
    # invalid query
    raise Http404('Invalid query')
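# A hedged sketch of the URL wiring for the Django view above; the module
# path ('yawol.views') and the pattern name are assumptions based only on
# the "/yawol/search/<query>" mapping in the docstring:
#
# from django.urls import path
# from yawol import views
#
# urlpatterns = [
#     path('yawol/search/<str:query>', views.search, name='yawol_search'),
# ]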
def get_gwn_synsets():
    gwn_ss = gwn.synset.select(columns=('ID',))
    return [SynsetID.from_string(x.ID) for x in gwn_ss]
def generate_lelesk_test_profile():
    '''Generate test profile for lelesk (new format 31st Mar 2015) '''
    # Read all tags
    tagmap = dd(list)  # sentence ID > tags list
    # a sample line: 10000 4 13 00796315-n adventure NN
    TagInfo = namedtuple('TagInfo', 'sentid cfrom cto synsetid word pos'.split())
    c = Counter()
    with open(TAG_FILE, 'r') as tags:
        for tag in tags:
            # br-k22-1	8	11	not%4:02:00::	not
            parts = [x.strip() for x in tag.split('\t')]
            if len(parts) == 6:
                if parts[3][0] in '=!':
                    parts[3] = parts[3][1:]
                parts[3] = SynsetID.from_string(parts[3])
                tag = TagInfo(*parts)
                tagmap[tag.sentid].append(tag)
            else:
                print("WARNING: Invalid line | ``{}''".format(parts))
    # read in words
    wordmap = dd(list)
    with open(TOKEN_FILE, 'r') as wordfile:
        for line in wordfile:
            if line.startswith('#') or len(line.strip()) == 0:
                continue
            # sid	word
            parts = [x.strip() for x in line.split('\t')]
            if len(parts) == 2:
                (sid, word) = parts
                wordmap[sid].append(word)
    # build test profile
    sentences = []
    with open(RAW_FILE, 'r') as lines:
        for line in lines:
            if line.startswith('#'):
                continue
            parts = [x.strip() for x in line.split('\t')]
            if len(parts) == 2:
                sid, sent = parts
                tokens = ''
                if sid in wordmap:
                    tokens = '|'.join(wordmap[sid])
                if sid in tagmap:
                    print(sent)  # found tags
                    for tag in tagmap[sid]:
                        pos = pos2wnpos(tag.pos, tag.word)
                        c.count(pos)
                        sentences.append((tag.word, tag.synsetid, pos, sent, tokens))
    # write to file
    with open(TEST_PROFILE_OUTPUT, 'w') as outputfile:
        for k, v in c.sorted_by_count():
            outputfile.write("# %s: %s\n" % (k, v))
        for sentence in sentences:
            outputfile.write("%s\t%s\t%s\t%s\t%s\n" % sentence)
    # write dev profile (a random sample of the collected items)
    random.seed(31032015)
    itemids = sorted(random.sample(range(len(sentences)), 500))
    with open(TEST_PROFILE_OUTPUT_DEV, 'w') as outputfile:
        for itemid in itemids:
            outputfile.write("%s\t%s\t%s\t%s\t%s\n" % sentences[itemid])
def get_wn30_synsets():
    wn_ss = wn30.schema.ss.select()
    wn_ssids = [SynsetID.from_string(x.synsetid) for x in wn_ss]
    return wn_ssids
def omw_vs_gwn_def():
    rp = TextReport("data/omw_gwn_report.txt")
    rpdiff = TextReport("data/omw_gwn_diff.txt")
    rptypo = TextReport("data/omw_gwn_typo.txt")
    rpsn = TextReport("data/omw_gwn_sciname.txt")
    c = Counter(TAGS.ORDER)
    # synset IDs to compare
    diff_ssids = []
    ssids = read_diff_ssids()
    if not ssids:
        print("Generating synset ID list")
        omw_ssids = set(get_omw_synsets())
        gwn_ssids = set(get_gwn_synsets())
        # only care about old GWN synsets
        ssids = omw_ssids.intersection(gwn_ssids)
    else:
        print("Comparing {} synsets loaded from {}".format(len(ssids), ssid_filepath))
    lang = 'eng'
    with omw.ctx() as omw_ctx, gwn.ctx() as gwn_ctx:
        print("Comparing {} synsets".format(len(ssids)))
        for ss in list(ssids):
            ss = str(ss)
            c.count("total")
            tags, odef, gdef = compare_synset(omw, gwn, ss, omw_ctx, gwn_ctx)
            omwss = omw.get_synset(ss, ctx=omw_ctx)
            tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
            if TAGS.DIFF in tags:
                diff_ssids.append(ss)
                # [FCB] why did we change?
                gwnss = gwn.get_synset(ss, ctx=gwn_ctx)
                glosses = gwn_ctx.gloss.select('surface = ?', (gwnss.definition,))
                if glosses and len(glosses) > 1:
                    tags.add(TAGS.DUP)
                    dup_ssids = [str(SynsetID.from_string(g.sid)) for g in glosses]
                    reason = "Not unique (shared among {}) so the OMW team changed it".format(' '.join(dup_ssids))
                else:
                    tags.add(TAGS.OMW)
                    defs = omw.sdef.select('synset=? and lang=?', (ss, lang))
                    usrs = {d.usr for d in defs if d.usr}
                    usrs_str = ', '.join(usrs) if usrs else "someone in NTU"
                    reason = "{} made this change.".format(usrs_str)
                tags_str = ' '.join('[{}]'.format(t.upper()) for t in tags)
                rpdiff.header("{} {}".format(tags_str, ss))
                rpdiff.print("OMW: {}".format(omwss.definition))
                rpdiff.print("GWN: {}".format(gdef))
                rpdiff.print("Reason: {}".format(reason))
            if TAGS.SCINAME in tags:
                rpsn.header("{} {}".format(tags_str, ss))
                rpsn.print("OMW: {}".format(omwss.definition))
                rpsn.print("GWN: {}".format(gdef))
            if TAGS.REP in tags or TAGS.TYPO in tags:
                rptypo.header("{} {}".format(tags_str, ss))
                rptypo.print("OMW: {}".format(omwss.definition))
                rptypo.print("GWN: {}".format(gdef))
            # master report
            for tag in tags:
                c.count(tag)
            if not tags:
                c.count(TAGS.IDENT)
            rp.header("{} {}".format(tags_str, ss))
            rp.print("OMW: {}".format(omwss.definition))
            rp.print("GWN: {}".format(gdef))
    # done
    c.summarise(report=rp)
    with open('data/omw_gwn_diff_ssids.txt', 'wt') as diff_ssid_file:
        for ss in diff_ssids:
            diff_ssid_file.write('{}\n'.format(ss))