Example no. 1
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk is not None and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
Example no. 2
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentence is OK ...
            output.print(sentid)
            for widx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # ISF joins multi-word lemmas with '+'; UKB expects '_' as the separator
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=widx))
                tokenfile.writeline('\t'.join((str(sentid), str(widx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
Example no. 3
def find_omw_typo(cli, args):
    omw = get_omw()
    with omw.ctx() as ctx:
        defs = ctx.synset_def.select(
            "lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')"
        )
        if args.action == 'list':
            print("Found {} definitions with typo".format(len(defs)))
            for d in defs:
                print(d)
                print("Fixed: {}".format(repr(fix_typo(d._2))))
        elif args.action == 'patch':
            patch_script = TextReport(args.output)
            for d in defs:
                fixed_def = fix_typo(d._2)
                patch_script.writeline("-- Orig : {} [{}]".format(
                    d._2, d.synset))
                patch_script.writeline("-- Fixed: {}".format(fixed_def))
                patch_script.writeline(
                    "UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n"
                    .format(to_sqlite_string(fixed_def), d.synset,
                            to_sqlite_string(d._2)))
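fix_typo itself is not shown in this example. Judging only from the SQL filter above (definitions containing " )", doubled spaces, a trailing " e.g.", " ,", or a trailing ":"), a hypothetical cleanup could look like the sketch below; fix_typo_sketch is an assumption, and the real implementation may differ.

# Hypothetical stand-in for fix_typo(), based only on the patterns the SQL
# filter above looks for; the real implementation may differ.
import re

def fix_typo_sketch(definition):
    fixed = re.sub(r'\s{2,}', ' ', definition)      # collapse doubled spaces
    fixed = re.sub(r'\s+([),:;.])', r'\1', fixed)   # drop space before ) , : ; .
    fixed = re.sub(r'\s+e\.g\.$', '', fixed)        # drop a dangling trailing "e.g."
    return fixed.rstrip(':').strip()                # drop a trailing colon

# fix_typo_sketch("a small dog (often kept as a pet ) :")
# -> 'a small dog (often kept as a pet)'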
Example no. 4
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
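For each multi-sense word, the loop above keeps the concept spanning the most tokens and drops single-token or shorter competitors, unless a manual decision or a known nonsense tag overrides it. A simplified, self-contained sketch of that keep-longest heuristic (Concept and split_keep_remove are stand-ins; manual overrides and nonsense tags are omitted):

# Simplified sketch of the keep-longest heuristic applied to concepts
# competing over a multi-sense word; Concept is a stand-in structure
# and the tags in the usage example are placeholders.
from collections import namedtuple

Concept = namedtuple('Concept', 'tag clemma tokens')

def split_keep_remove(concepts):
    ''' Keep the concept(s) covering the most tokens; drop the rest (concepts must be non-empty). '''
    max_len = max(len(c.tokens) for c in concepts)
    keep = [c for c in concepts if len(c.tokens) == max_len]
    remove = [c for c in concepts if len(c.tokens) < max_len]
    return keep, remove

# candidates = [Concept('00000000-n', 'dog', (3,)),
#               Concept('11111111-n', 'guard dog', (3, 4))]
# split_keep_remove(candidates)
# -> keeps the two-token 'guard dog' concept, removes the single-token 'dog'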
Example no. 5
def list_unksense(args):
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(ttl.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
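The known/unknown split above relies on SynsetID.from_string raising for anything that is not a synset ID (for example a raw sensekey). A stdlib-only sketch of the same parse-or-count-as-unknown pattern, with a regex standing in for the parser (the regex and the sample tags are assumptions):

# Stdlib-only sketch of the known/unknown tally in list_unksense; the regex
# below is an assumed stand-in for SynsetID.from_string's validation.
import re
from collections import Counter

SYNSET_ID = re.compile(r'^\d{8}-[nvarsx]$')   # e.g. '02084071-n'

def tally(tags):
    known, unknown = Counter(), Counter()
    for tag, lemma in tags:
        if SYNSET_ID.match(tag):
            known[(tag, lemma)] += 1
        else:
            unknown[(tag, lemma)] += 1
    return known, unknown

# known, unknown = tally([('02084071-n', 'dog'), ('dog%1:05:00::', 'dog')])
# -> the synset ID is counted as known, the raw sensekey as unknown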
Example no. 6
    def test_tagging_all(self):
        getLogger().debug("Tagging everything ...")
        sents = self.gold()
        smap = {str(s.ident): s for s in sents}
        # read tags
        doc = ttl.Document('gold', TEST_GOLD_DIR).read()
        filter_wrong_senses(doc)
        count_good_bad = Counter()
        perfects = []
        to_be_checked = dd(list)
        tbc_concepts = dd(list)
        concept_count = Counter()
        fix_texts = []
        instances = Counter()
        tag_map = dd(set)
        report = TextReport('data/gold_report.txt')
        matched_report = TextReport('data/gold_matched.txt')
        not_matched_report = TextReport('data/gold_notmatched.txt')
        for s in sents[:5]:
            sid = str(s.ident)
            if not doc.has_id(sid):
                raise Exception("Cannot find sentence {}".format(sid))
            elif len(s) == 0:
                logging.warning("Empty sentence: {}".format(s))
            else:
                tagged = doc.get(sid)
                if s.text != tagged.text:
                    fix_texts.append((s.ident, s.text, tagged.text))
                # try to tag ...
                dmrs = s[0].dmrs()
                matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
                if not not_matched:
                    count_good_bad.count("Perfect")
                    perfects.append((s, matched))
                else:
                    for nm in not_matched:
                        tag_map[nm.tag].add(nm.clemma)
                        tbc_concepts[nm.tag].append(s.ident)
                        concept_count.count(nm.tag)
                        instances.count('instances')
                        to_be_checked[s.ident].append(nm)
                    count_good_bad.count("To be checked")
        # report matched
        for sent, m in perfects:
            tagged = doc.get(str(sent.ident))
            matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
            matched_report.writeline(sent[0].dmrs())
            matched_report.header("Concepts")
            for c, nid, pred in m:
                matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
            matched_report.writeline()
            matched_report.writeline()
        # report not matched
        not_matched_report.header("By senses", "h0")
        for k, v in concept_count.most_common():
            sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
            not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
        not_matched_report.header("By sentences", "h0")
        for sid, nm in to_be_checked.items():
            not_matched_report.print("#{}: {}  | {}".format(sid, nm, smap[str(sid)].text))
        # full details
        for sid, nm in to_be_checked.items():
            sent = smap[str(sid)]
            tagged = doc.get(str(sid))
            not_matched_report.header("#{}: {}".format(sid, sent.text))
            not_matched_report.writeline(sent[0].dmrs())
            for n in nm:
                not_matched_report.writeline(n)

        # for i, t1, t2 in fix_texts:
        #     getLogger().debug(i)
        #     getLogger().debug(t1)
        #     getLogger().debug(t2)
        count_good_bad.summarise(report=report)
        instances.summarise(report=report)