def process_lemma(cli, args):
    ''' Flag senses in the EWDB: mark gold senses, otherwise mark them processed '''
    # a positive --topk caps how many senses are pulled from the DB
    row_cap = int(args.topk) if args.topk and int(args.topk) > 0 else None
    pos = args.pos
    db = EWDB(args.db)
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        # build the WHERE clause: unflagged senses (optionally also a given flag),
        # optionally restricted to one part-of-speech
        if args.flag:
            conditions = ['(flag IS NULL OR flag = ?)']
            bindings = [args.flag]
        else:
            conditions = ['flag IS NULL']
            bindings = []
        if pos:
            conditions.append('pos=?')
            bindings.append(pos)
        senses = ctx.sense.select(' AND '.join(conditions), bindings, limit=row_cap)
        print("Found {} senses for {}".format(len(senses), pos))
        for idx, sense in enumerate(senses):
            # progress heartbeat every 50 rows
            if idx % 50 == 0:
                print("Processed {} / {}".format(idx, len(senses)))
            gold_flag = is_gold(sense, db, ctx)  # non zero = True
            if gold_flag:
                # flag this sense as gold
                db.flag_sense(sense.ID, gold_flag, ctx=ctx)
            elif sense.flag != EWDB.Flags.PROCESSED:
                db.flag_sense(sense.ID, EWDB.Flags.PROCESSED, ctx=ctx)
def gen_mfs_5500(cli, args):
    ''' Generate 3rd round tree banking '''
    report = TextReport(args.output)
    top_synsets = topk_mfs(5500)
    # synsets already treebanked in the first two rounds
    done_synsets = set(read_lines('data/omw3000_synsets.txt') +
                       read_lines('data/omw5000_synsets.txt'))
    # remaining new synsets for this round
    third_round = top_synsets.difference(done_synsets)
    print("All :", len(top_synsets))
    print("Done :", len(done_synsets))
    print("New :", len(third_round))
    # persist the round-three synset IDs
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for sid in third_round:
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub[profile].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    # [FIXED] original test was `args.topk and args.topk <= 0`, so k == 0 slipped
    # through (0 is falsy) and silently produced an empty vocab via most_common(0)
    if args.topk is not None and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    # optional stopword list: one word per line
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab: top-k (or all) words by frequency
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
def map_preds(cli, args):
    ''' Map ERG predicates to candidate wordnet senses and report the misses '''
    report = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    unmapped = []
    # --all switches from the curated pred list to the full sorted list
    pred_file = 'data/erg_preds_sorted.txt' if args.all else 'data/erg_preds_interesting.txt'
    stem, ext = os.path.splitext(pred_file)
    not_found_file = stem + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                report.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for candidate in candidates:
                    report.print(candidate.ID, candidate.lemmas, candidate.definition)
            if not candidates:
                unmapped.append(p_str)
    # dump the unmapped predicate strings for later inspection
    with TextReport(not_found_file, 'w') as outfile:
        for p in unmapped:
            outfile.print(p)
    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")
def find_lesk_candidates(cli, args):
    ''' Check gold sense tags against Lesk candidate synsets and report preds
    whose gold synset is not among the candidates. '''
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)       # sent ident -> list of unmatched (node, pred, gold-synset, ...) tuples
    ident_sent_map = {}       # sent ident -> sentence (for the per-sentence report)
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # return a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred), tag.synset.ID, tag.synset.lemma, [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    # [FIXED] guard against ZeroDivisionError when no tagged preds were seen at all
    total_tagged = total_found + total_missing
    ratio = (total_missing * 100) / total_tagged if total_tagged else 0.0
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")
def show_stats(cli, args):
    ''' Print per-POS sense counts (all senses and gold-flagged senses) from an EWDB '''
    db = EWDB(args.db)
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            all_senses = ctx.sense.select("pos=?", (pos, ))
            print("pos={}: {}".format(pos, len(all_senses)))
            gold_senses = ctx.sense.select("pos=? AND flag=?", (pos, EWDB.Flags.GOLD))
            print("GOLD pos={}: {}".format(pos, len(gold_senses)))
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    # write to args.output when the argument exists, otherwise to stdout
    report = TextReport(args.output) if 'output' in args else TextReport()
    report.print("Jamdict " + version_info.__version__)
    report.print(version_info.__description__)
    report.header("Basic configuration")
    report.print("JAMDICT_HOME : {}".format(config.home_dir()))
    report.print("Config file location: {}".format(config._get_config_manager().locate_config()))
    report.header("Data files")
    # each data file path is shown together with its on-disk status
    report.print("Jamdict DB location: {} - {}".format(args.jdb, file_status(args.jdb)))
    report.print("JMDict XML file : {} - {}".format(args.jmdxml, file_status(args.jmdxml)))
    report.print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml)))
    report.print("JMnedict XML file : {} - {}".format(args.jmnexml, file_status(args.jmnexml)))
def create_ewdb(cli, args):
    ''' Populate an EWDB from the OMW skeleton TSV files (one file per POS) '''
    db = EWDB(args.db)
    added = Counter()
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            report.print("Reading file: {}".format(file_name))
            # each row is (lemma, synset-id, definition)
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                added.count("Added")
    added.summarise()
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    # write to args.output when the argument exists, otherwise to stdout
    output = TextReport(args.output) if 'output' in args else TextReport()
    # a custom config file may be supplied via --config
    if args.config:
        jamdict.config.read_config(args.config)
    output.print("Jamdict " + jamdict.version_info.__version__)
    output.print(jamdict.version_info.__description__)
    jam = get_jam(cli, args)
    output.header("Basic configuration")
    # annotate JAMDICT_HOME with its on-disk status
    jamdict_home = jamdict.config.home_dir()
    if not os.path.isdir(os.path.expanduser(jamdict_home)):
        jamdict_home += " [Missing]"
    else:
        jamdict_home += " [OK]"
    output.print(f"JAMDICT_HOME: {jamdict_home}")
    # report whether the optional jamdict-data package is installed
    if jamdict.util._JAMDICT_DATA_AVAILABLE:
        import jamdict_data
        data_pkg = f"version {jamdict_data.__version__} [OK]"
    else:
        data_pkg = "Not installed"
    output.print(f"jamdict-data: {data_pkg}")
    # locate the active config file (custom > discovered > none)
    if args.config:
        _config_path = args.config + " [Custom]"
        if not os.path.isfile(args.config):
            _config_path += " [Missing]"
    else:
        _config_path = jamdict.config._get_config_manager().locate_config()
        if not _config_path:
            _config_path = "Not available.\n Run `python3 -m jamdict config` to create configuration file if needed."
    output.print(f"Config file : {_config_path}")
    output.header("Data files")
    # each data file path is reported with its on-disk status
    output.print(
        f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}")
    output.print(
        f"JMDict XML file : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}"
    )
    output.print(
        f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}"
    )
    output.print(
        f"JMnedict XML file : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}"
    )
    # dump DB metadata when the database is usable
    if jam.ready:
        output.header("Jamdict database metadata")
        try:
            for meta in jam.jmdict.meta.select():
                output.print(f"{meta.key}: {meta.value}")
        except Exception as e:
            print(e)
            output.print("Error happened while retrieving database meta data")
    output.header("Others")
    output.print(f"puchikarui: version {puchikarui_version}")
    output.print(f"chirptext : version {chirptext_version}")
    output.print(f"lxml : {jamdict.jmdict._LXML_AVAILABLE}")
def manual_patch(cli, args):
    ''' Verify manually-fixed synset definitions (JSON) against the OMW database,
    warning when a fix differs by more than punctuation. '''
    report = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        patches = json.loads(infile.read())
        for patch in patches:
            sid = patch['synset']
            fixed_def = patch['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            # compare with punctuation stripped; a mismatch means the patch
            # changed more than punctuation, so flag it for review
            if remove_puncs(fixed_def) != remove_puncs(ss.definition):
                report.header("WARNING:", sid)
                report.print(ss.definition)
                report.print(fixed_def)
def gen_mfs_3000(cli, args):
    ''' Generate the first-round (top-3000 MFS) treebanking files '''
    report = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        halfway = len(ssids) / 2
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            # second half of the shuffled list goes into file B
            if idx > halfway:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub['omw3000'].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def list_unksense(args):
    ''' List sense tags in Semcor that cannot be parsed as synset IDs '''
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()   # (sensekey, lemma) -> count for unparsable tags
    sids = Counter()  # (SynsetID, lemma) -> count for known tags
    c = Counter()     # overall known/unknown instance counts
    out = TextReport() if not args.out else TextReport(args.out)
    # [FIXED] original iterated `ttl.files` and called `ttl.files.abspath(f)` --
    # `ttl` is the TTL module, not this corpus reader (looks like an incomplete
    # rename); use the SemcorXML file collection consistently
    files = semxml.files[:args.limit] if args.limit else semxml.files
    for f in files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    # tag is not a valid synset ID -- record it as unknown
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
def gen_mfs_5000(cli, args):
    ''' Generate the second-round (top-5000 MFS minus round one) treebanking files '''
    report = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    # synsets already covered by the first round
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            # everything after the first 200 synsets goes into file B
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub['omw5000'].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def extract_omw(cli, args):
    ''' OMW Extractor '''
    rp = TextReport()
    omw = get_omw()
    WN_POS = 'nvar'  # noun, verb, adjective, adverb
    with omw.ctx() as ctx:
        for pos in WN_POS:
            rp.header("POS: {}".format(pos))
            # English senses joined with their words and definitions, most frequent first
            query = '''SELECT lemma, sense.synset, def as sdef FROM sense LEFT JOIN word ON sense.wordid = word.wordid and sense.lang=word.lang LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=? ORDER By freq DESC '''
            params = [pos]
            if args.topk:
                query += ' LIMIT ?'
                params.append(args.topk)
            results = ctx.select(query, params)
            # (lemma, synset-id) -> merged definition text
            senses = OrderedDict()
            potential_names = 0  # lemmas with uppercase chars are likely proper names
            for lemma, sid, sdef in results:
                if lemma.lower() != lemma:
                    # if pos not in 'nar':
                    #     rp.print("{} - {}".format(lemma, pos))
                    potential_names += 1
                # a sense may have several definition rows; concatenate them
                if (lemma, sid) in senses:
                    senses[(lemma, sid)] += "; " + sdef
                else:
                    senses[(lemma, sid)] = sdef
            print("Found {} sense in OMW".format(len(senses.keys())))
            print("Potential name: {}".format(potential_names))
            if args.output:
                # write (lemma, synset, definition) rows as TSV, one file per POS
                out_path = "{}_{}.txt".format(args.output, pos)
                wordsenses = (k + (v, ) for k, v in senses.items())
                CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
                print("Written to {}".format(out_path))
                # plus a lemma-only list per POS
                lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
                with open(lemma_out_path, 'w') as outfile:
                    for l, sid in senses.keys():
                        outfile.write(l)
                        outfile.write('\n')
                print("Written to {}".format(lemma_out_path))
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    # [FIXED] original test was `args.topk and args.topk <= 0`, so k == 0 slipped
    # through (0 is falsy) and silently produced an empty vocab via most_common(0)
    if args.topk is not None and args.topk <= 0:
        topk = None
        cli.logger.warning(
            "Invalid k will be ignored (k should be greater than or equal to 1)"
        )
    else:
        topk = args.topk
    # optional stopword list: one word per line
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(
            args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab: top-k (or all) words by frequency
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(
                textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
def test_tagging_all(self):
    ''' Tag the gold sentences and report perfect matches vs concepts to be checked. '''
    getLogger().debug("Tagging everything ...")
    sents = self.gold()
    smap = {str(s.ident): s for s in sents}  # ident -> sentence lookup
    # read gold tags and drop known-bad senses
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    filter_wrong_senses(doc)
    count_good_bad = Counter()     # "Perfect" vs "To be checked" sentence counts
    perfects = []                  # (sentence, matched-concepts) pairs
    to_be_checked = dd(list)       # sent ident -> unmatched concepts
    tbc_concepts = dd(list)        # tag -> sentence idents where it failed
    concept_count = Counter()
    fix_texts = []                 # sentences whose text differs from the tagged copy
    instances = Counter()
    tag_map = dd(set)              # tag -> set of clemmas seen with it
    report = TextReport('data/gold_report.txt')
    matched_report = TextReport('data/gold_matched.txt')
    not_matched_report = TextReport('data/gold_notmatched.txt')
    # NOTE(review): only the first 5 sentences are processed here — presumably to
    # keep the test fast; confirm this slice is intentional
    for s in sents[:5]:
        sid = str(s.ident)
        if not doc.has_id(sid):
            raise Exception("Cannot find sentence {}".format(sid))
        elif len(s) == 0:
            logging.warning("Empty sentence: {}".format(s))
        else:
            tagged = doc.get(sid)
            if s.text != tagged.text:
                fix_texts.append((s.ident, s.text, tagged.text))
            # try to tag ...
            dmrs = s[0].dmrs()
            matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
            if not not_matched:
                count_good_bad.count("Perfect")
                perfects.append((s, matched))
            else:
                for nm in not_matched:
                    tag_map[nm.tag].add(nm.clemma)
                    tbc_concepts[nm.tag].append(s.ident)
                    concept_count.count(nm.tag)
                    instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                count_good_bad.count("To be checked")
    # report matched
    for sent, m in perfects:
        tagged = doc.get(str(sent.ident))
        matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
        matched_report.writeline(sent[0].dmrs())
        matched_report.header("Concepts")
        for c, nid, pred in m:
            matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
        matched_report.writeline()
        matched_report.writeline()
    # report not matched
    not_matched_report.header("By senses", "h0")
    for k, v in concept_count.most_common():
        sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
        not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
    not_matched_report.header("By sentences", "h0")
    for sid, nm in to_be_checked.items():
        not_matched_report.print("#{}: {} | {}".format(sid, nm, smap[str(sid)].text))
    # full details
    for sid, nm in to_be_checked.items():
        sent = smap[str(sid)]
        tagged = doc.get(str(sid))
        not_matched_report.header("#{}: {}".format(sid, sent.text))
        not_matched_report.writeline(sent[0].dmrs())
        for n in nm:
            not_matched_report.writeline(n)
    # for i, t1, t2 in fix_texts:
    #     getLogger().debug(i)
    #     getLogger().debug(t1)
    #     getLogger().debug(t2)
    count_good_bad.summarise(report=report)
    instances.summarise(report=report)
def remove_msw_ttl(cli, args):
    ''' Remove multi-sense-word (MSW) conflicts from a TTL profile, keeping at most
    one concept per word, guided by an optional manual decision file. '''
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    # manual[sent ID][word idx][tag or (tag, lemma)] -> keep flag (0/1)
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            # the sentinel row (-1, -1, -1, 'U') marks a tag as nonsense everywhere
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()  # cache: tags known to be absent from PWN 3.0
    ok_synsets = set()    # cache: tags known to exist in PWN 3.0
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())  # words with more than one concept
        tcmap = sent.tcmap()    # token -> concepts map
        # remove_tags = set()
        if msw:
            keep_remove = []  # unresolved words: (word, kept concepts, removed concepts)
            for w in msw:
                max_len = 0  # longest multi-token concept seen so far for this word
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    # manual decisions take precedence (keyed by tag, or by (tag, lemma))
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    # heuristic: prefer the longest multi-word concept
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    # ambiguity not resolved to a single concept -- defer to report
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        # write the cleaned profile out as a new TTL document
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
rp2 = TextReport('~/tmp/my-report.txt') # output to a file rp2.write("This is a line in my-report.txt") rp3 = TextReport.null() # ouptut to /dev/null, i.e. nowhere rp3.write("This line goes no where") rp4 = TextReport.string( ) # output to a string. Call rp.content() to get the string rp4.write("This line will be stored in a string buffer") rp5 = TextReport(TextReport.STRINGIO) # same as above rp5.write("This line will also be stored in a string buffer") # TextReport will close the output stream automatically by using the with statement with TextReport.string() as rp: rp.header("Lorem Ipsum Analysis", level="h0") rp.header("Raw", level="h1") rp.print(LOREM_IPSUM) rp.header("Character Frequency") ct.summarise(report=rp) print(rp.content()) # ------------------------------------------------------------------------------ # Web fetcher # ------------------------------------------------------------------------------ from chirptext import WebHelper web = WebHelper('~/tmp/webcache.db') data = web.fetch('https://letuananh.github.io/test/data.json') print(data) data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    # sentence IDs listed in the --ignore file are dropped from both profiles
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        # NOTE(review): these divisions raise ZeroDivisionError when either tag
        # count is 0 (the warnings above do not stop execution) -- confirm whether
        # a guard should be added
        precision = len(true_positive) / profile_tags_len
        recall = len(true_positive) / gold_tags_len
        f1 = 2 * precision * recall / (precision + recall)
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False Negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall: {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1 : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            # NOTE(review): "[Misisng senses]" is misspelled and does not match the
            # "[Missing senses]" header below -- confirm before fixing the output
            debugfile.print("[Misisng senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}  # synset label -> synset object cache
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False positive: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags) : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1 (2*p*r/(p+r)) : {}".format(f1_text))
    ctx.close()
# ------------------------------------------------------------------------------ # Sample text report # ------------------------------------------------------------------------------ # a string report rp = TextReport() # by default, TextReport will write to standard output, i.e. terminal rp = TextReport(TextReport.STDOUT) # same as above rp = TextReport('~/tmp/my-report.txt') # output to a file rp = TextReport.null() # ouptut to /dev/null, i.e. nowhere rp = TextReport.string() # output to a string. Call rp.content() to get the string rp = TextReport(TextReport.STRINGIO) # same as above # TextReport will close the output stream automatically by using the with statement with TextReport.string() as rp: rp.header("Lorem Ipsum Analysis", level="h0") rp.header("Raw", level="h1") rp.print(LOREM_IPSUM) rp.header("Character Frequency") ct.summarise(report=rp) print(rp.content()) # ------------------------------------------------------------------------------ # Web fetcher # ------------------------------------------------------------------------------ from chirptext import WebHelper web = WebHelper('~/tmp/webcache.db') data = web.fetch('https://letuananh.github.io/test/data.json') print(data)
def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)  # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()  # (pred string, lemma, pos) -> count
    all_pos = Counter()        # POS distribution of unknown preds
    not_found = None
    if args.ttl:
        # sentence IDs present in the TTL profile but missing from the document
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    # [FIXED] counter label typo: was "Unnown predicates"
                    stats.count("Unknown predicates")
                    # unknown preds encode "lemma/pos" in the lemma slot
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except Exception:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)
def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentence
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    # pair up MRS sentences with their TTL counterparts by ident
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()   # total
    cm = Counter()   # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense, lemma, map
    sense_sents = dd(set)   # not-matched senses to sentences
    lemma_sents = dd(set)   # not matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
        sent.shallow = doc.get(sent.ident)
        # import_shallow yields (matched, not-matched, ignored) concept groups
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss, fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
        # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
        # print("          Matched    : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        # NOTE(review): ct.summarise() is called without report=rp here, so it
        # presumably prints to stdout rather than the report -- confirm intent
        ct.summarise()
        rp.header("Ignored", level='h0')
        for sid, c in cig.most_common():
            rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
        # show sense - sentences
        rp.header("Sense - Sentences", level='h0')
        for sid, c in cnm.most_common():
            sents = sense_sents[sid]
            rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
            for sent in sents:
                ttl_sent = doc.get(sent.ident)
                rp.print(ttl_sent)
                for concept in ttl_sent.concepts:
                    if concept.tag == sid:
                        rp.print(' -> {}'.format(concept))
        rp.header("Lemma - Sentences", level='h0')
        for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
            rp.header("#{}".format(clemma,))
            for sent in sents:
                ttl_sent = doc.get(sent.ident)
                rp.print(ttl_sent)
                for concept in ttl_sent.concepts:
                    if concept.clemma == clemma:
                        rp.print(' -> {}'.format(concept))
            rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        # echo the summary to stdout as well when writing to a file
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored