def omw_fix_dup(cli, args):
    ''' Find synsets with duplicated definitions and emit SQL to fix them '''
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset',))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except Exception:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print("DELETE FROM synset_def WHERE synset='{}' and def='{}';".format(
                        ss.ID, to_sqlite_string(dup)))
                rp.print()
        c.summarise()

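# `join_definitions` and `to_sqlite_string` are defined elsewhere in this
# project; below are minimal sketches consistent with their usage above
# (assumptions for illustration, not the actual implementations -- named with
# a `_sketch` suffix to avoid shadowing the real helpers).
def _to_sqlite_string_sketch(value):
    # double up single quotes so the value can sit inside a SQLite string literal
    return value.replace("'", "''")


def _join_definitions_sketch(ss):
    # merge a synset's definitions in order, returning (merged_def, duplicates);
    # `ss.definitions` is a hypothetical attribute name
    seen, dups = [], []
    for d in ss.definitions:
        (dups if d in seen else seen).append(d)
    return '; '.join(seen), dups
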
def manual_patch(cli, args):
    ''' Check manual definition patches (JSON) against the current OMW definitions '''
    rp = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        synsets = json.loads(infile.read())
        for sinfo in synsets:
            sid, fixed_def = sinfo['synset'], sinfo['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            orig_def = remove_puncs(ss.definition)
            # warn when a patch changes more than just punctuation
            if remove_puncs(fixed_def) != orig_def:
                rp.header("WARNING:", sid)
                rp.print(ss.definition)
                rp.print(fixed_def)

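# `remove_puncs` is defined elsewhere; a minimal sketch inferred from its use
# above -- normalizing definitions so that comparison ignores punctuation and
# spacing differences. This is an assumption, not the project's implementation.
def _remove_puncs_sketch(text):
    import string
    return ''.join(c for c in text
                   if c not in string.punctuation and not c.isspace())
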
def verify_patch(cli, args):
    ''' Verify definition patches against OMW '''
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # validate against OMW (GWN-30 is not used for now)
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                # strip a trailing semicolon before comparing definitions
                ssdef = ss.definition[:-1] if ss.definition.endswith(';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New: ", "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print("     ", ssdef)
                    rp.print("Note: ", patch.comment)
            except Exception:
                getLogger().warning("sensekey `{}' couldn't be found".format(patch.sensekey))
                c.count("Not found")
                continue
    c.summarise(report=rp)

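# `DefPatch` is defined elsewhere; a minimal sketch consistent with its usage
# above. Only the attribute names (sensekey, orig_def, new_def, comment) are
# attested in this file -- the dict keys in `from_dict` are assumptions.
class _DefPatchSketch:
    def __init__(self, sensekey, orig_def, new_def, comment=None):
        self.sensekey = sensekey
        self.orig_def = orig_def
        self.new_def = new_def
        self.comment = comment

    @staticmethod
    def from_dict(data):
        return _DefPatchSketch(data['sensekey'], data['orig_def'],
                               data['new_def'], data.get('comment'))
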
def find_omw_typo(cli, args):
    ''' Find (and optionally patch) typos in OMW definitions '''
    omw = get_omw()
    with omw.ctx() as ctx:
        # matches a space before ')', double spaces, a dangling ' e.g.',
        # a space before ',' and a trailing ':'
        defs = ctx.synset_def.select(
            "lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')")
        if args.action == 'list':
            print("Found {} definitions with typos".format(len(defs)))
            for d in defs:
                print(d)
                print("Fixed: {}".format(repr(fix_typo(d._2))))
        elif args.action == 'patch':
            patch_script = TextReport(args.output)
            for d in defs:
                fixed_def = fix_typo(d._2)
                patch_script.writeline("-- Orig : {} [{}]".format(d._2, d.synset))
                patch_script.writeline("-- Fixed: {}".format(fixed_def))
                patch_script.writeline(
                    "UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n".format(
                        to_sqlite_string(fixed_def), d.synset, to_sqlite_string(d._2)))

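# `fix_typo` is defined elsewhere; a sketch of the repairs implied by the
# LIKE patterns above (an assumption, not the actual implementation).
def _fix_typo_sketch(definition):
    import re
    fixed = re.sub(r' {2,}', ' ', definition)   # collapse runs of spaces
    fixed = re.sub(r' +([),])', r'\1', fixed)   # drop spaces before ')' and ','
    fixed = re.sub(r' e\.g\.$', '', fixed)      # strip a dangling ' e.g.'
    return fixed.rstrip(':').strip()            # drop a trailing colon
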
def extract_omw(cli, args):
    ''' OMW extractor: dump (lemma, synset, definition) triples per POS '''
    rp = TextReport()
    omw = get_omw()
    WN_POS = 'nvar'
    with omw.ctx() as ctx:
        for pos in WN_POS:
            rp.header("POS: {}".format(pos))
            query = '''SELECT lemma, sense.synset, def AS sdef
                       FROM sense
                           LEFT JOIN word ON sense.wordid = word.wordid AND sense.lang = word.lang
                           LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang
                       WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=?
                       ORDER BY freq DESC'''
            params = [pos]
            if args.topk:
                query += ' LIMIT ?'
                params.append(args.topk)
            results = ctx.select(query, params)
            senses = OrderedDict()
            potential_names = 0
            for lemma, sid, sdef in results:
                # lemmas containing upper-case letters are potential proper names
                if lemma.lower() != lemma:
                    potential_names += 1
                # merge multiple definitions of the same (lemma, synset) pair
                if (lemma, sid) in senses:
                    senses[(lemma, sid)] += "; " + sdef
                else:
                    senses[(lemma, sid)] = sdef
            print("Found {} senses in OMW".format(len(senses)))
            print("Potential names: {}".format(potential_names))
            if args.output:
                out_path = "{}_{}.txt".format(args.output, pos)
                wordsenses = (k + (v,) for k, v in senses.items())
                CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
                print("Written to {}".format(out_path))
                lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
                with open(lemma_out_path, 'w') as outfile:
                    for lemma, sid in senses:
                        outfile.write(lemma)
                        outfile.write('\n')
                print("Written to {}".format(lemma_out_path))

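# For reference, each row of the TSV written above has three columns:
# lemma, synset ID, and the merged definition. A hypothetical way to read a
# dump back (the path 'omw_n.txt' is made up for illustration):
def _read_omw_dump_sketch(path='omw_n.txt'):
    import csv
    with open(path) as f:
        for lemma, sid, sdef in csv.reader(f, delimiter='\t'):
            yield lemma, sid, sdef
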
def compare_ttls(cli, args):
    ''' Compare a TTL profile against a gold profile '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    # read the list of sentence IDs to ignore, if given
    ignored_ids = []
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    # read the gold profile
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read the profile to evaluate
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(
                    len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        # guard against empty tag sets to avoid ZeroDivisionError
        precision = len(true_positive) / profile_tags_len if profile_tags_len else 0
        recall = len(true_positive) / gold_tags_len if gold_tags_len else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(
                gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(
                profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall:    {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1:        {}".format(f1_text))
        if args.org:
            # output an org-mode table row
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            debugfile.print("[Missing senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                    sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # group missing senses by synset
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(
                    synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False negative: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(
                gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(
                profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags)   : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1 (2*p*r/(p+r))    : {}".format(f1_text))
    ctx.close()

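# The scoring above reduces to standard precision/recall/F1. A tiny worked
# example with hypothetical counts (80 true positives, 100 gold tags, 90
# predicted tags):
#
#     recall    = 80 / 100                                       # 0.80
#     precision = 80 / 90                                        # ~0.889
#     f1 = 2 * precision * recall / (precision + recall)         # ~0.842
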
import os

from puchikarui import Schema, with_ctx
from coolisf import GrammarHub
from chirptext.leutile import grouper
from chirptext.io import CSV
from chirptext import TextReport, FileHelper, Counter, FileHub
from chirptext.cli import CLIApp, setup_logging
from yawlib.helpers import get_gwn
from yawlib.helpers import get_wn, get_omw

# -------------------------------------------------------------------------------
# Configuration
# -------------------------------------------------------------------------------

DATA_FOLDER = os.path.abspath(os.path.expanduser('./data'))
omw = get_omw()
gwn = get_gwn()
wn = get_wn()
setup_logging('logging.json', 'logs')
ghub = GrammarHub()
MY_DIR = os.path.dirname(__file__)
SETUP_FILE = os.path.join(MY_DIR, 'scripts', 'ewdb.sql')
ROOTS = {'n': 'root_wn_n', 'v': 'root_wn_v', 'a': 'root_wn_adj', 'r': ''}
DEFAULT_DB_PATH = FileHelper.abspath('data/ewmap.db')


class EWDB(Schema):

    class Flags:
        PROCESSED = 1
        NO_PARSE = 2
        MWE = 3

# :license: MIT, see LICENSE for more details.

import os
import json
import logging

import django
from django.http import HttpResponse, Http404

from yawlib import SynsetID, SynsetCollection
from yawlib.helpers import get_omw, get_wn

# ---------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------

logger = logging.getLogger(__name__)
wsql = get_wn()
omwsql = get_omw()
print("OMW: {}".format(omwsql))


def jsonp(func):
    """ JSON/JSONP decorator """
    def decorator(request, *args, **kwargs):
        objects = func(request, *args, **kwargs)
        # pass HttpResponse objects through untouched
        if isinstance(objects, HttpResponse):
            return objects
        # JSON/JSONP response
        data = json.dumps(objects)
        if 'callback' in request.GET:
            callback = request.GET['callback']
        elif 'callback' in request.POST:
            callback = request.POST['callback']