def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
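
# --- Usage sketch (illustrative only, not part of the original module) ---
# A minimal example of how gen_vocab() might be driven from argparse. The
# option names simply mirror the `args` attributes the function reads
# (input, output, topk, stopwords); DemoCli and the logger setup are
# assumptions made for this sketch, not the project's real CLI object.
def _demo_gen_vocab_cli():
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)

    class DemoCli:
        # stand-in for the real cli object; gen_vocab() only needs a .logger
        logger = logging.getLogger("gen_vocab_demo")

    parser = argparse.ArgumentParser(description="Generate a vocabulary list from a tokenized file")
    parser.add_argument("input", help="tokenized text file, one sentence per line")
    parser.add_argument("-o", "--output", help="path for the report (stdout if omitted)")
    parser.add_argument("-k", "--topk", type=int, help="keep only the k most frequent words")
    parser.add_argument("--stopwords", help="optional stopword file, one word per line")
    args = parser.parse_args()
    gen_vocab(DemoCli(), args)
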
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for widx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # UKB joins multi-word lemmas with '_'; ISF uses '+'
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=widx))
                tokenfile.writeline('\t'.join((str(sentid), str(widx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
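
# Output format illustration (hypothetical sentence): for a sentence with ident
# 1000 and the tokens "dog", "bark" and the multi-part ISF lemma "loud+noise",
# the UKB context file written above contains the sentence id on its own line,
# one "lemma#pos#w<index>#1" entry per token (the trailing 1 marks the token as
# a disambiguation target in UKB's context format), and a blank line between
# sentences:
#
#   1000
#   dog#n#w0#1 bark#v#w1#1 loud_noise#n#w2#1
#
# Note that '+' in ISF lemmas is rewritten as '_' before emission, and
# <output>.tokens.txt keeps a "sentid <TAB> token-index <TAB> cfrom <TAB> cto"
# line per token for mapping UKB's results back onto the original text.
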
def find_omw_typo(cli, args):
    omw = get_omw()
    with omw.ctx() as ctx:
        defs = ctx.synset_def.select("lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')")
        if args.action == 'list':
            print("Found {} definitions with typo".format(len(defs)))
            for d in defs:
                print(d)
                print("Fixed: {}".format(repr(fix_typo(d._2))))
        elif args.action == 'patch':
            patch_script = TextReport(args.output)
            for d in defs:
                fixed_def = fix_typo(d._2)
                patch_script.writeline("-- Orig : {} [{}]".format(d._2, d.synset))
                patch_script.writeline("-- Fixed: {}".format(fixed_def))
                patch_script.writeline("UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n".format(
                    to_sqlite_string(fixed_def), d.synset, to_sqlite_string(d._2)))
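
# Patch output illustration (made-up definition and synset id): assuming
# fix_typo() removes the stray space before the comma, each entry produced
# when args.action is 'patch' looks roughly like this:
#
#   -- Orig : a building where fruit is stored , usually in crates [12345678-n]
#   -- Fixed: a building where fruit is stored, usually in crates
#   UPDATE synset_def SET def = 'a building where fruit is stored, usually in crates' WHERE synset='12345678-n' AND def='a building where fruit is stored , usually in crates';
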
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
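
# Manual-correction file illustration (hypothetical rows): the TSV read via
# args.manual is unpacked as (sentence-id, word-id, sense-tag, keep-flag, lemma).
# keep=1 keeps the concept for that word, keep=0 removes it, and a row of the
# form "-1 <TAB> -1 <TAB> <tag> <TAB> -1 <TAB> U" blacklists that sense id
# globally (it is added to `nonsenses`):
#
#   10315   4    02084071-n    1   dog
#   10315   4    01317541-n    0   domestic_dog
#   -1      -1   00002137-n   -1   U
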
def list_unksense(args):
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except:
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
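
# Report illustration (made-up counts): the summary written by list_unksense()
# is plain tab-separated text, one block per header, along these lines (the
# exact header decoration comes from TextReport.header()):
#
#   Known concepts
#   synsetID     lemma       count
#   02084071-n   dog         183
#
#   Unknown concepts
#   sensekey              lemma       count
#   live_down%2:42:00::   live_down   2
#
#   Total
#   Known: 4985
#   Unknown: 132
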
def test_tagging_all(self):
    getLogger().debug("Tagging everything ...")
    sents = self.gold()
    smap = {str(s.ident): s for s in sents}
    # read tags
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    filter_wrong_senses(doc)
    count_good_bad = Counter()
    perfects = []
    to_be_checked = dd(list)
    tbc_concepts = dd(list)
    concept_count = Counter()
    fix_texts = []
    instances = Counter()
    tag_map = dd(set)
    report = TextReport('data/gold_report.txt')
    matched_report = TextReport('data/gold_matched.txt')
    not_matched_report = TextReport('data/gold_notmatched.txt')
    for s in sents[:5]:
        sid = str(s.ident)
        if not doc.has_id(sid):
            raise Exception("Cannot find sentence {}".format(sid))
        elif len(s) == 0:
            logging.warning("Empty sentence: {}".format(s))
        else:
            tagged = doc.get(sid)
            if s.text != tagged.text:
                fix_texts.append((s.ident, s.text, tagged.text))
            # try to tag ...
            dmrs = s[0].dmrs()
            matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
            if not not_matched:
                count_good_bad.count("Perfect")
                perfects.append((s, matched))
            else:
                for nm in not_matched:
                    tag_map[nm.tag].add(nm.clemma)
                    tbc_concepts[nm.tag].append(s.ident)
                    concept_count.count(nm.tag)
                    instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                count_good_bad.count("To be checked")
    # report matched
    for sent, m in perfects:
        tagged = doc.get(str(sent.ident))
        matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
        matched_report.writeline(sent[0].dmrs())
        matched_report.header("Concepts")
        for c, nid, pred in m:
            matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
        matched_report.writeline()
        matched_report.writeline()
    # report not matched
    not_matched_report.header("By senses", "h0")
    for k, v in concept_count.most_common():
        sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
        not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
    not_matched_report.header("By sentences", "h0")
    for sid, nm in to_be_checked.items():
        not_matched_report.print("#{}: {} | {}".format(sid, nm, smap[str(sid)].text))
    # full details
    for sid, nm in to_be_checked.items():
        sent = smap[str(sid)]
        tagged = doc.get(str(sid))
        not_matched_report.header("#{}: {}".format(sid, sent.text))
        not_matched_report.writeline(sent[0].dmrs())
        for n in nm:
            not_matched_report.writeline(n)
    # for i, t1, t2 in fix_texts:
    #     getLogger().debug(i)
    #     getLogger().debug(t1)
    #     getLogger().debug(t2)
    count_good_bad.summarise(report=report)
    instances.summarise(report=report)