def map_preds(cli, args):
    rp = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    not_found = []
    pred_file = 'data/erg_preds_interesting.txt'
    if args.all:
        pred_file = 'data/erg_preds_sorted.txt'
    name, ext = os.path.splitext(pred_file)
    not_found_file = name + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                rp.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for c in candidates:
                    rp.print(c.ID, c.lemmas, c.definition)
            if not candidates:
                not_found.append(p_str)
    with TextReport(not_found_file, 'w') as outfile:
        for p in not_found:
            outfile.print(p)
    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")
def main():
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")
    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        if i % 2 == 0:
            c.count("even")
        else:
            c.count("odd")
    c.summarise()
    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)
    report = TextReport(REPORT_LOC)
    logging.info("Now try to create a text report (Located at: %s)" % (report.get_path()))
    generate_report(report)
    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
def process_lemma(cli, args):
    limit = int(args.topk) if args.topk and int(args.topk) > 0 else None
    pos = args.pos
    db = EWDB(args.db)
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        if args.flag:
            query = ['(flag IS NULL OR flag = ?)']
            params = [args.flag]
        else:
            query = ['flag IS NULL']
            params = []
        if pos:
            query.append('pos=?')
            params.append(pos)
        senses = ctx.sense.select(' AND '.join(query), params, limit=limit)
        print("Found {} senses for {}".format(len(senses), pos))
        for idx, sense in enumerate(senses):
            if idx % 50 == 0:
                print("Processed {} / {}".format(idx, len(senses)))
            found_gold = is_gold(sense, db, ctx)  # non-zero = True
            if found_gold:
                # flag this sense as gold
                db.flag_sense(sense.ID, found_gold, ctx=ctx)
            elif sense.flag != EWDB.Flags.PROCESSED:
                db.flag_sense(sense.ID, EWDB.Flags.PROCESSED, ctx=ctx)
def task_mine_mwe(cli, args, db=None):
    if db is None:
        db = EWDB(args.db)
    with db.ctx() as ctx:
        with TextReport('data/mwe_extra.txt') as outfile_extra:
            senses_extra = mine_mwe_extra(db, ctx)
            for sense in senses_extra:
                outfile_extra.print(sense.lemma)
        with TextReport('data/mwe.txt', 'w') as outfile:
            senses = mine_mwe(db, ctx)
            for sense in senses:
                outfile.print(sense.lemma)
        with TextReport('data/mwe_nospace.txt', 'w') as outfile_nospace:
            nospaces = mine_mwe_nospace(db, ctx, senses)
            for sense in nospaces:
                outfile_nospace.print(sense.lemma)
        with TextReport('data/mwe_of.txt') as outfile_of:
            senses_of = mine_mwe_of(db, ctx)
            for sense in senses_of:
                outfile_of.print(sense.lemma)
        with TextReport('data/mwe_apos_s.txt') as outfile_apos_s:
            senses_apos_s = mine_mwe_apos_s(db, ctx)
            for sense in senses_apos_s:
                outfile_apos_s.print(sense.lemma)
        # report
        getLogger().debug("Found MWE: {}".format(len(senses)))
        getLogger().debug("Found MWE-of: {}".format(len(senses_of)))
        getLogger().debug("No space: {}".format(len(nospaces)))
        getLogger().debug("Extra: {}".format(len(senses_extra)))
def list_gpreds(cli, args):
    rp = TextReport(args.output)
    with open('data/erg_preds_sorted.txt', 'r') as infile:
        sorted_preds = (Predicate.from_string(l) for l in infile)
        for pred in sorted_preds:
            if pred.ptype == Predicate.GRAMMARPRED:
                rp.print(pred)
def list_preds(cli, args):
    rp = TextReport(args.output)
    lexdb = read_erg_lex()
    keyrels = set(l.keyrel for l in lexdb if l.keyrel)
    preds = [Predicate.from_string(p) for p in keyrels]
    sorted_preds = sorted(preds, key=lambda x: x.pos or '')
    # All preds
    with open('data/erg_preds_sorted.txt', 'w') as outfile:
        for pred in sorted_preds:
            outfile.write('{}\n'.format(pred))
    poses = set(p.pos for p in preds)
    trivial_preds = [p for p in preds if p.pos and p.pos in TRIVIAL_POS]
    if not args.trivial:
        preds = [p for p in preds if not p.pos or p.pos not in TRIVIAL_POS]
    interesting_poses = set(p.pos for p in preds)
    # write interesting preds to file
    c = Counter()
    with open('data/erg_preds_interesting.txt', 'w') as outfile:
        for pred in sorted(preds, key=lambda x: "cqpx".index(x.pos) if x.pos else 0):
            c.count(pred.pos if pred.pos else 'NONE')
            outfile.write('{}\n'.format(pred))
    # report
    rp.print("Interesting preds: {}".format(len(preds)))
    rp.print("Trivial preds: {}".format(len(trivial_preds)))
    rp.print("POS: {}".format(poses))
    rp.print("Interesting POS: {}".format(interesting_poses))
    c.summarise(rp)
def show_stats(cli, args):
    db = EWDB(args.db)
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            senses = ctx.sense.select("pos=?", (pos,))
            print("pos={}: {}".format(pos, len(senses)))
            senses = ctx.sense.select("pos=? AND flag=?", (pos, EWDB.Flags.GOLD))
            print("GOLD pos={}: {}".format(pos, len(senses)))
def test_kata2hira(self):
    rp = TextReport.string()
    for k in KATAKANA[1:87]:
        h = simple_kata2hira(k)
        rp.write(h, k, '|', separator='')
    expected = TestTool.ALL_MAPPING
    self.assertEqual(rp.content(), expected)
def wn31_to_wn30(cli, args):
    csvlines = CSV.read('data/ili-map-pwn30.tab')
    ili_map = {k: v for k, v in csvlines}
    notfound = []
    c = Counter()
    print("ILI-wn30 map: {}".format(len(ili_map)))
    wn31 = CSV.read('data/wn31.csv', dialect='excel-tab')
    with omw.ctx() as ctx, TextReport('data/wn31_diff.txt') as diff_file:
        for sid, iid, sdef in wn31:
            if iid in ili_map:
                c.count("Found")
                wn30_id = ili_map[iid]
                try:
                    if wn30_id.endswith('s'):
                        wn30_id = wn30_id[:-1] + 'a'
                    wn30_ss = omw.get_synset(wn30_id, ctx=ctx)
                except:
                    cli.logger.exception("Cannot find synset {}".format(wn30_id))
                    raise
                # compare def
                if sdef != wn30_ss.definition and not sdef.startswith(wn30_ss.definition + ";"):
                    diff_file.print("synset: {} | Lemmas: {}".format(wn30_id, ', '.join(wn30_ss.lemmas)))
                    diff_file.print("OMW : {}".format(wn30_ss.definition))
                    diff_file.print("Wn31 : {}".format(sdef))
                    diff_file.print("")
                    c.count("Diff")
            else:
                notfound.append(iid)
                c.count("Not in ILI")
    c.summarise()
    print(notfound)
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
def hello_jamdict(cli, args):
    ''' Say hello and test if Jamdict is working '''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup("一期一会")
        dump_result(results, report=TextReport())
    else:
        getLogger().warning("Hello there, unfortunately jamdict data is not available. "
                            "Please try to install using `pip install jamdict-data`")
def test_export_to_streams(self):
    doc = ttl.Document('manual', TEST_DATA)
    # create sents in doc
    raws = ("三毛猫が好きです。", "雨が降る。", "女の子はケーキを食べる。")
    for sid, r in enumerate(raws):
        msent = txt2mecab(r)
        tsent = doc.new_sent(msent.surface, sid)
        tsent.import_tokens(msent.words)
        # pos tagging
        for mtk, tk in zip(msent, tsent):
            tk.pos = mtk.pos3()
            tk.new_tag(mtk.reading_hira(), tagtype="Reading", source=ttl.Tag.MECAB)
    # sense tagging
    doc[2][4].comment = 'to eat'
    doc[0].new_concept("三毛猫", "wiki.ja:三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
    doc[1].new_concept("降る", "02756821-v", tokens=(2,))
    doc[2].new_concept("女の子", "10084295-n", tokens=(0,))
    doc[2].new_concept("食べる", "01166351-v", (4,))
    # tags
    doc[0].new_tag("WIKI", 0, 3, tagtype="SRC")
    doc[0].new_tag("https://ja.wikipedia.org/wiki/三毛猫", 0, 3, tagtype="URL")
    doc[2].new_tag("WIKI", 0, 3, tagtype="SRC")
    doc[2].new_tag("https://ja.wikipedia.org/wiki/少女", 0, 3, tagtype="URL")
    # export doc
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
        writer.write_doc(doc)
    getLogger().debug("sents\n{}".format(sents.content()))
    getLogger().debug("words\n{}".format(words.content()))
    getLogger().debug("concepts\n{}".format(concepts.content()))
    getLogger().debug("links\n{}".format(links.content()))
    getLogger().debug("tags\n{}".format(tags.content()))
    self.assertTrue(sents.content())
    self.assertTrue(words.content())
    self.assertTrue(concepts.content())
    self.assertTrue(links.content())
    self.assertTrue(tags.content())
    for sent in doc:
        logging.debug(json.dumps(sent.to_json(), ensure_ascii=False))
def test_get_mfs(self):
    words = 'we this sing full cat tongue name dry die horn sun with mountain eye belly old big red woman live head animal because cloud louse sleep ear wet know salt walk eat seed green bite say person all child count thin stand father laugh night give stone heavy if bone sister other yellow small work snake smoke kill white swim short grease worm narrow flower neck path drink flesh good sharp ash snow hot fire mouth see dirty hand egg skin cold fly wood mother come I warm where one play foot sea year new earth smooth two water what burn fish vomit bird how long hunt sit rope feather nose dust round wind tooth correct bark root ice not blood tail dull brother man heart lie liver many pig rain claw who day grass knee when leaf wide hair meat black dog star dance breasts wife sand husband You bad hear moon river tree that'.split()
    with omw.ctx() as ctx, TextReport('data/mfs1500.txt') as rp, TextReport("data/wndef.txt") as deffile:
        query = 'wordid in (SELECT wordid FROM word WHERE lemma in {})'.format(repr(tuple(words)))
        rows = ctx.sense.select(query)
        ssids = [SynsetID.from_string(r.synset) for r in rows]
        for ssid in ssids:
            ss = omw.get_synset(ssid, ctx=ctx)
            if ss.lemmas and ss.definition:
                rp.print("{id} ({lm}): {df}".format(id=ss.ID, lm=", ".join(ss.lemmas), df=ss.definition.strip()))
                deffile.print(ss.definition.strip())
    print("Done!")
def convert(cli, args):
    ''' Convert patches from CSV format to YAML '''
    rp = TextReport()
    # validate input file
    if not args.input:
        patch_path = os.path.join(DATA_FOLDER, 'patches', '20171112_Wn31_glitches_def.csv')
    else:
        patch_path = args.input
    if not os.path.isfile(patch_path):
        raise Exception("File {} does not exist.".format(patch_path))
    # validate output file
    out_path = args.output if args.output else None
    if out_path == '*.yaml':
        out_path = FileHelper.replace_ext(patch_path, 'yaml')
    rp.print("Input:", patch_path)
    rp.print("Output:", out_path if out_path else '*stdout*')
    # convert patches
    patches = read_csv(patch_path)
    json_patches = [p.to_json() for p in patches]
    yaml_str = yaml.dump(json_patches, default_flow_style=False)
    # dump output
    if out_path:
        with open(out_path, 'w') as outfile:
            outfile.write(yaml_str)
        if args.echo:
            print(yaml_str)
    else:
        print(yaml_str)
def gen_mfs_5500(cli, args):
    ''' Generate 3rd round treebanking '''
    rp = TextReport(args.output)
    topk_synsets = topk_mfs(5500)
    # finished treebanking
    first_round = read_lines('data/omw3000_synsets.txt')
    second_round = read_lines('data/omw5000_synsets.txt')
    done_synsets = set(first_round + second_round)
    # new
    third_round = topk_synsets.difference(done_synsets)
    # report
    print("All :", len(topk_synsets))
    print("Done :", len(done_synsets))
    print("New :", len(third_round))
    # write to a synset file
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for idx, sid in enumerate(third_round):
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub[profile].print(d, level=1)
    rp.header("Generated files")
    for f in hub.files.keys():
        rp.print(hub[f].path)
def import_data(cli, args):
    '''Import XML data into SQLite database'''
    rp = TextReport()
    t = Timer(report=rp)
    db_loc = os.path.abspath(os.path.expanduser(args.jdb))
    rp.print("Jamdict DB location : {}".format(db_loc))
    rp.print("JMDict XML file location : {}".format(args.jmdxml))
    rp.print("Kanjidic2 XML file location: {}".format(args.kd2xml))
    jam = get_jam(cli, args)
    if args and (args.jdb or args.kd2):
        if os.path.isfile(db_loc):
            if not confirm("Database file exists. Do you want to overwrite it? (This action cannot be undone!) yes/no? "):
                cli.logger.warning("Program aborted.")
                exit()
            else:
                os.unlink(db_loc)
        # perform import
        t.start("Creating Jamdict SQLite database. This process may take a very long time ...")
        jam.import_data()
        t.stop()
    else:
        print("Database paths were not provided. Process aborted.")
def test_export_to_streams(self):
    doc = ttl.Document('manual', TEST_DATA)
    # create sents in doc
    raws = (sent1, sent2, sent3)
    mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
    for sid, (text, mecab_output) in enumerate(zip(raws, mecab_outputs)):
        deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
    # sense tagging
    doc[2][4].comment = 'to eat'
    doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
    doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2,))
    doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0,))
    doc[2].concepts.new("01166351-v", "wn", "食べる", (4,))
    # tags
    doc[0].tags.new("WIKI", "src", 0, 3)
    doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3)
    doc[2].tags.new("WIKI", "src", 0, 3)
    doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3)
    # export doc
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
        writer.write_doc(doc)
    getLogger().debug("sents\n{}".format(sents.content()))
    getLogger().debug("words\n{}".format(words.content()))
    getLogger().debug("concepts\n{}".format(concepts.content()))
    getLogger().debug("links\n{}".format(links.content()))
    getLogger().debug("tags\n{}".format(tags.content()))
    self.assertTrue(sents.content())
    self.assertTrue(words.content())
    self.assertTrue(concepts.content())
    self.assertTrue(links.content())
    self.assertTrue(tags.content())
    for text in doc:
        logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
def wn2ttl(args):
    inpath = FileHelper.abspath(args.inpath)
    header("WN to TTL format")
    wn = GWordnetXML()
    wn.read(inpath)
    print("Found senses: {}".format(len(wn.synsets)))
    outpath = FileHelper.abspath(args.outpath) if args.outpath else None
    with TextReport(outpath, 'w') as outfile:
        if args.format == 'json':
            convert_json(wn.synsets, outfile)
        elif args.format == 'xml':
            convert_xml(wn.synsets, outfile)
    print("Done!")
def test_ttl_tsv_serialization(self):
    sent = self.build_test_sent()
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    writer = ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file)
    writer.write_sent(sent)
    sents_txt = sents.content()
    words_txt = words.content()
    concepts_txt = concepts.content()
    links_txt = links.content()
    tags_txt = tags.content()
    getLogger().debug("sents\n{}".format(sents_txt))
    getLogger().debug("words\n{}".format(words_txt))
    getLogger().debug("concepts\n{}".format(concepts_txt))
    getLogger().debug("links\n{}".format(links_txt))
    getLogger().debug("tags\n{}".format(tags_txt))
    # read it back
    reader = ttl.TxtReader(io.StringIO(sents_txt), io.StringIO(words_txt),
                           io.StringIO(concepts_txt), io.StringIO(links_txt),
                           io.StringIO(tags_txt))
    docx = reader.read()
    # patch sent.ID
    sent.ID = 1
    jo = sent.to_json()
    jr = docx[0].to_json()
    getLogger().debug(jo)
    getLogger().debug(jr)
    self.assertEqual(jo['text'], jr['text'])
    self.assertEqual(jo['tokens'], jr['tokens'])
    self.assertEqual(jo['concepts'], jr['concepts'])
    self.assertEqual(jo['tags'], jr['tags'])
    self.assertEqual(jo['flag'], jr['flag'])
    self.assertEqual(jo['comment'], jr['comment'])
    self.assertEqual(jo, jr)
def extract_omw(cli, args):
    ''' OMW Extractor '''
    rp = TextReport()
    omw = get_omw()
    WN_POS = 'nvar'
    with omw.ctx() as ctx:
        for pos in WN_POS:
            rp.header("POS: {}".format(pos))
            query = '''SELECT lemma, sense.synset, def as sdef FROM sense
                       LEFT JOIN word ON sense.wordid = word.wordid AND sense.lang = word.lang
                       LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang
                       WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=?
                       ORDER BY freq DESC'''
            params = [pos]
            if args.topk:
                query += ' LIMIT ?'
                params.append(args.topk)
            results = ctx.select(query, params)
            senses = OrderedDict()
            potential_names = 0
            for lemma, sid, sdef in results:
                if lemma.lower() != lemma:
                    # if pos not in 'nar':
                    #     rp.print("{} - {}".format(lemma, pos))
                    potential_names += 1
                if (lemma, sid) in senses:
                    senses[(lemma, sid)] += "; " + sdef
                else:
                    senses[(lemma, sid)] = sdef
            print("Found {} senses in OMW".format(len(senses.keys())))
            print("Potential names: {}".format(potential_names))
            if args.output:
                out_path = "{}_{}.txt".format(args.output, pos)
                wordsenses = (k + (v,) for k, v in senses.items())
                CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
                print("Written to {}".format(out_path))
                lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
                with open(lemma_out_path, 'w') as outfile:
                    for l, sid in senses.keys():
                        outfile.write(l)
                        outfile.write('\n')
                print("Written to {}".format(lemma_out_path))
def test_ttl_tsv_serialization(self):
    sent = self.build_test_sent()
    concepts = TextReport.string()
    links = TextReport.string()
    sents = TextReport.string()
    tags = TextReport.string()
    words = TextReport.string()
    writer = ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file)
    writer.write_sent(sent)
    sents_txt = sents.content()
    words_txt = words.content()
    concepts_txt = concepts.content()
    links_txt = links.content()
    tags_txt = tags.content()
    getLogger().debug("sents\n{}".format(sents_txt))
    getLogger().debug("words\n{}".format(words_txt))
    getLogger().debug("concepts\n{}".format(concepts_txt))
    getLogger().debug("links\n{}".format(links_txt))
    getLogger().debug("tags\n{}".format(tags_txt))
    # read it back
    reader = ttl.TxtReader(io.StringIO(sents_txt), io.StringIO(words_txt),
                           io.StringIO(concepts_txt), io.StringIO(links_txt),
                           io.StringIO(tags_txt))
    docx = reader.read()
    # patch sent.ID
    sent.ID = 1
    jo = sent.to_dict()
    jr = docx[0].to_dict()
    getLogger().debug(jo)
    getLogger().debug(jr)
    self.assertEqual(jo['text'], jr['text'])
    self.assertEqual(jo['tokens'], jr['tokens'])
    self.assertEqual(jo['concepts'], jr['concepts'])
    self.assertEqual(jo['tags'], jr['tags'])
    self.assertEqual(jo['flag'], jr['flag'])
    self.assertEqual(jo['comment'], jr['comment'])
    self.assertEqual(jo, jr)
def babelfy_doc(docpath=DEFAULT_DOCPATH, docname=DEFAULT_DOCNAME, outfile=None, **kwargs):
    ''' Babelfy a tagged document '''
    speckled = ttl.Document(docname, docpath).read()
    sents = []
    for s in speckled:
        output = json.loads(babelfy(s.text, **kwargs))
        sents.append((s.ID, output))
    if outfile:
        with TextReport(outfile) as rp:
            rp.write(json.dumps(sents, indent=2))
    return sents
def manual_patch(cli, args):
    rp = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        synsets = json.loads(infile.read())
        # for ss in synsets:
        #     rp.print(ss['synset'], ss['definition'])
        # rp.print("Found synsets:", len(synsets))
        for sinfo in synsets:
            sid, fixed_def = sinfo['synset'], sinfo['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            orig_def = remove_puncs(ss.definition)
            if remove_puncs(fixed_def) != orig_def:
                rp.header("WARNING:", sid)
                rp.print(ss.definition)
                rp.print(fixed_def)
def test_long_sentence(self):
    r = Reading('''[ TOP: h0 RELS: < [ part_of<3:6> LBL: h1 ARG0: x25 [ x NUM: pl PERS: 3 ] ARG1: x27 [ x NUM: pl PERS: 3 IND: + ] ] [ _all_q<3:6> LBL: h2 ARG0: x25 RSTR: h49 ] [ _these_q_dem<7:12> LBL: h3 ARG0: x27 RSTR: h50 ] [ _vary_v_cause<13:19> LBL: h4 ARG0: e26 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ARG2: x27 ] [ _case_n_of<20:26> LBL: h4 ARG0: x27 ] [ _however_a_1<27:35> LBL: h5 ARG0: i28 ARG1: h51 ] [ pron<36:37> LBL: h6 ARG0: x29 [ x NUM: sg PERS: 1 IND: + PT: std ] ] [ pronoun_q<36:37> LBL: h7 ARG0: x29 RSTR: h52 ] [ neg<38:44> LBL: h8 ARG0: e30 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: h53 ] [ _can_v_modal<38:44> LBL: h9 ARG0: e31 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ARG1: h54 ] [ _recall_v_1<45:51> LBL: h10 ARG0: e32 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x29 ARG2: x33 [ x NUM: pl PERS: 3 ] ] [ _any_q<52:55> LBL: h11 ARG0: x33 RSTR: h55 ] [ part_of<52:55> LBL: h12 ARG0: x33 ARG1: x25 ] [ _present_v_to<62:71> LBL: h12 ARG0: e34 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ] ARG1: x33 ARG2: x37 [ x NUM: pl PERS: 3 IND: + ] ] [ udef_q<72:191> LBL: h13 ARG0: x37 RSTR: h56 ] [ _more_x_comp<72:76> LBL: h14 ARG0: e35 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: e36 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ARG2: x38 [ x NUM: sg PERS: 3 GEND: n ] ] [ _singular_a_1<77:85> LBL: h14 ARG0: e36 ARG1: x37 ] [ _feature_n_1<86:94> LBL: h14 ARG0: x37 ] [ generic_entity<100:104> LBL: h15 ARG0: x38 ] [ _that_q_dem<100:104> LBL: h16 ARG0: x38 RSTR: h57 ] [ _associate_v_with<115:125> LBL: h15 ARG0: e39 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ] ARG2: x38 ARG3: x44 [ x NUM: sg PERS: 3 IND: + ] ] [ _the_q<131:134> LBL: h17 ARG0: x44 RSTR: h58 ] [ _well_x_deg<135:140> LBL: h18 ARG0: e40 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: e41 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ] [ _know_v_1<140:145> LBL: h18 ARG0: e41 ARG2: x44 ] [ compound<146:159> LBL: h18 ARG0: e42 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x44 ARG2: x43 [ x NUM: sg PERS: 3 IND: + PT: notpro ] ] [ proper_q<146:152> LBL: h19 ARG0: x43 RSTR: h59 ] [ named<146:152> LBL: h20 ARG0: x43 CARG: "Surrey" ] [ _family_n_of<153:159> LBL: h18 ARG0: x44 ] [ _of_p<160:162> LBL: h18 ARG0: e45 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x44 ARG2: x46 [ x NUM: pl PERS: 3 IND: + ] ] [ _the_q<163:166> LBL: h21 ARG0: x46 RSTR: h60 ] [ named<167:175> LBL: h22 ARG0: x46 CARG: "Roylotts" ] [ _of_p<176:178> LBL: h22 ARG0: e47 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x46 ARG2: x48 [ x NUM: sg PERS: 3 IND: + ] ] [ proper_q<179:191> LBL: h23 ARG0: x48 RSTR: h61 ] [ named<179:191> LBL: h24 ARG0: x48 CARG: "Stoke Moran" ] > HCONS: < h0 qeq h5 h49 qeq h1 h50 qeq h4 h51 qeq h8 h52 qeq h6 h53 qeq h9 h54 qeq h10 h55 qeq h12 h56 qeq h14 h57 qeq h15 h58 qeq h18 h59 qeq h20 h60 qeq h22 h61 qeq h24 > ] ''')
    optimus = Transformer()
    rules = optimus.find_rules(r.dmrs().layout.nodes, limit=100)
    with TextReport.null() as outfile:
        for rule in rules:
            outfile.print(rule.lemma, rule.head(), rule.construction.to_dmrs())
    optimus.apply(r)
    getLogger().debug(r.edit().nodes)
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    results = jam.lookup(args.query, strict_lookup=args.strict)
    report = TextReport(args.output)
    if args.format == 'json':
        report.print(json.dumps(results.to_json(),
                                ensure_ascii=args.ensure_ascii,
                                indent=args.indent if args.indent else None))
    else:
        if args.compact:
            report.print(results.text(separator='\n------\n', entry_sep='\n'))
        else:
            dump_result(results, report=report)
def find_omw_typo(cli, args):
    omw = get_omw()
    with omw.ctx() as ctx:
        defs = ctx.synset_def.select(
            "lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')")
        if args.action == 'list':
            print("Found {} definitions with typos".format(len(defs)))
            for d in defs:
                print(d)
                print("Fixed: {}".format(repr(fix_typo(d._2))))
        elif args.action == 'patch':
            patch_script = TextReport(args.output)
            for d in defs:
                fixed_def = fix_typo(d._2)
                patch_script.writeline("-- Orig : {} [{}]".format(d._2, d.synset))
                patch_script.writeline("-- Fixed: {}".format(fixed_def))
                patch_script.writeline(
                    "UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n".format(
                        to_sqlite_string(fixed_def), d.synset, to_sqlite_string(d._2)))
def create_ewdb(cli, args):
    db = EWDB(args.db)
    c = Counter()
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            rp.print("Reading file: {}".format(file_name))
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                c.count("Added")
    c.summarise()
def import_data(cli, args):
    '''Generate Jamdict SQLite database from XML data files'''
    rp = TextReport()
    t = Timer(report=rp)
    show_info(cli, args)
    jam = get_jam(cli, args)
    if not jam.db_file:
        print("Database path is not available")
    elif os.path.isfile(jam.db_file):
        if not confirm("Database file exists. Do you want to overwrite it? (This action cannot be undone!) yes/no? "):
            cli.logger.warning("Program aborted.")
            exit()
        else:
            os.unlink(jam.db_file)
    # perform import
    print(f"Importing data to: {jam.db_file}")
    t.start("Creating Jamdict SQLite database. This process may take a very long time ...")
    jam.import_data()
    t.stop()
def gen_mfs_3000(cli, args):
    rp = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > len(ssids) / 2:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw3000'].print(d, level=1)
    rp.header("Generated files")
    for f in hub.files.keys():
        rp.print(hub[f].path)
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup(args.query, strict_lookup=args.strict)
        report = TextReport(args.output)
        if args.format == 'json':
            report.print(json.dumps(results.to_dict(),
                                    ensure_ascii=args.ensure_ascii,
                                    indent=args.indent if args.indent else None))
        else:
            if args.compact:
                report.print(results.text(separator='\n------\n', entry_sep='\n'))
            else:
                dump_result(results, report=report)
    else:
        getLogger().warning(
            f"Jamdict database is not available.\n"
            f"There are 3 ways to install data: \n"
            f" 1) install jamdict_data via PyPI using `pip install jamdict_data` \n"
            f" 2) download prebuilt dictionary database file from: {jamdict.__url__}, \n"
            f" 3) or build your own database file from XML source files.")
def extract_wn31(cli, args):
    c = Counter()
    rp = TextReport()
    entries = []
    infile = FileHelper.abspath(args.input)
    if not os.path.isfile(infile):
        rp.print("File not found")
    else:
        rp.print("Processing {}".format(infile))
        tree = etree.iterparse(infile)
        for event, element in tree:
            if event == 'end' and element.tag == 'Synset':
                for child in element:
                    if child.tag == 'Definition':
                        entries.append((element.get('id'), element.get('ili'), child.text))
                        c.count('Definition')
                c.count("Synset")
                element.clear()
        c.summarise(report=rp)
        # Format: wn31sid ili definition
        CSV.write_tsv(args.output, entries)
def gen_mfs_5000(cli, args):
    rp = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw5000'].print(d, level=1)
    rp.header("Generated files")
    for f in hub.files.keys():
        rp.print(hub[f].path)
def order_preds(cli, args):
    doc = Document.from_file(args.gold)
    output = TextReport(args.output)
    if not args.ident:
        output.print("No ident was provided")
    for ident in args.ident:
        sent = doc.by_ident(ident, default=None)
        if sent is None:
            output.print("Sent #{} is missing".format(ident))
        else:
            output.print(sent)
            eps = sent[0].dmrs().obj().eps()
            sort_eps(eps)
            output.print(["{}<{}:{}>".format(str(x.pred), x.cfrom, x.cto) for x in eps])
    output.print("Done")
def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)     # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()
    all_pos = Counter()
    not_found = None
    if args.ttl:
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    stats.count("Unknown predicates")
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    output.header("Jamdict | {} - Version: {}".format(
        version_info.__description__, version_info.__version__), level='h0')
    output.header("Basic configuration")
    output.print("JAMDICT_HOME: {}".format(config.home_dir()))
    output.print("Configuration location: {}".format(config._get_config_manager().locate_config()))
    output.header("Data files")
    output.print("Jamdict DB location: {} - {}".format(args.jdb, file_status(args.jdb)))
    output.print("JMDict XML file    : {} - {}".format(args.jmdxml, file_status(args.jmdxml)))
    output.print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml)))
def test_tagging_all(self):
    getLogger().debug("Tagging everything ...")
    sents = self.gold()
    smap = {str(s.ident): s for s in sents}
    # read tags
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    filter_wrong_senses(doc)
    count_good_bad = Counter()
    perfects = []
    to_be_checked = dd(list)
    tbc_concepts = dd(list)
    concept_count = Counter()
    fix_texts = []
    instances = Counter()
    tag_map = dd(set)
    report = TextReport('data/gold_report.txt')
    matched_report = TextReport('data/gold_matched.txt')
    not_matched_report = TextReport('data/gold_notmatched.txt')
    for s in sents[:5]:
        sid = str(s.ident)
        if not doc.has_id(sid):
            raise Exception("Cannot find sentence {}".format(sid))
        elif len(s) == 0:
            logging.warning("Empty sentence: {}".format(s))
        else:
            tagged = doc.get(sid)
            if s.text != tagged.text:
                fix_texts.append((s.ident, s.text, tagged.text))
            # try to tag ...
            dmrs = s[0].dmrs()
            matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
            if not not_matched:
                count_good_bad.count("Perfect")
                perfects.append((s, matched))
            else:
                for nm in not_matched:
                    tag_map[nm.tag].add(nm.clemma)
                    tbc_concepts[nm.tag].append(s.ident)
                    concept_count.count(nm.tag)
                    instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                count_good_bad.count("To be checked")
    # report matched
    for sent, m in perfects:
        tagged = doc.get(str(sent.ident))
        matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
        matched_report.writeline(sent[0].dmrs())
        matched_report.header("Concepts")
        for c, nid, pred in m:
            matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
        matched_report.writeline()
        matched_report.writeline()
    # report not matched
    not_matched_report.header("By senses", "h0")
    for k, v in concept_count.most_common():
        sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
        not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
    not_matched_report.header("By sentences", "h0")
    for sid, nm in to_be_checked.items():
        not_matched_report.print("#{}: {} | {}".format(sid, nm, smap[str(sid)].text))
    # full details
    for sid, nm in to_be_checked.items():
        sent = smap[str(sid)]
        tagged = doc.get(str(sid))
        not_matched_report.header("#{}: {}".format(sid, sent.text))
        not_matched_report.writeline(sent[0].dmrs())
        for n in nm:
            not_matched_report.writeline(n)
    # for i, t1, t2 in fix_texts:
    #     getLogger().debug(i)
    #     getLogger().debug(t1)
    #     getLogger().debug(t2)
    count_good_bad.summarise(report=report)
    instances.summarise(report=report)
        continue
    ct.count(char)
    vc.count("Letters")
    if char in 'auieo':
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)

# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report
rp = TextReport()                     # by default, TextReport will write to standard output, i.e. terminal
rp = TextReport(TextReport.STDOUT)    # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()                # output to /dev/null, i.e. nowhere
rp = TextReport.string()              # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for idx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # In UKB's lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=idx))
                tokenfile.writeline('\t'.join((str(sentid), str(idx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
def find_lesk_candidates(cli, args):
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)
    ident_sent_map = {}
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # return a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred), tag.synset.ID,
                                                             tag.synset.lemma,
                                                             [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found    : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    ratio = (total_missing * 100) / (total_found + total_missing)
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")
def verify_patch(cli, args):
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # Validate against GWN-30
    # gwn = get_gwn()  # don't use GWN, for now
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                ssdef = ss.definition[:-1] if ss.definition.endswith(';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]", "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New: ", "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print(" ", ssdef)
                    rp.print("Note: ", patch.comment)
            except:
                getLogger().warn("sensekey `{}' couldn't be found".format(patch.sensekey))
                c.count("Not found")
                continue
    c.summarise(report=rp)
def list_unksense(args):
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except:
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({"synset": sid, "lemmas": ss.lemmas, "definition": sdef})
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(
                    len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        precision = len(true_positive) / profile_tags_len
        recall = len(true_positive) / gold_tags_len
        f1 = 2 * precision * recall / (precision + recall)
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False Negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(
                gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(
                profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall: {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1 : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            debugfile.print("[Missing senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
                    sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False negative: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(
                gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(
                profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags) : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1 (2*p*r/(p+r)) : {}".format(f1_text))
    ctx.close()
def omw_fix_dup(cli, args):
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset',))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print("DELETE FROM synset_def WHERE synset='{}' and def='{}';".format(
                        ss.ID, to_sqlite_string(dup)))
                rp.print()
    c.summarise()
def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentences
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()   # total
    cm = Counter()   # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense, lemma, map
    sense_sents = dd(set)   # not-matched senses to sentences
    lemma_sents = dd(set)   # not matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
        sent.shallow = doc.get(sent.ident)
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss,
                                        fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
            # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
            # print(" Matched : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        ct.summarise()
        rp.header("Ignored", level='h0')
        for sid, c in cig.most_common():
            rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    # show sense - sentences
    rp.header("Sense - Sentences", level='h0')
    for sid, c in cnm.most_common():
        sents = sense_sents[sid]
        rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.tag == sid:
                    rp.print(' -> {}'.format(concept))
    rp.header("Lemma - Sentences", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.header("#{}".format(clemma,))
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.clemma == clemma:
                    rp.print(' -> {}'.format(concept))
        rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored