Example 1
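# Reads ERG predicate strings from a text file, looks up WordNet sense
# candidates for each one via PredSense, prints the candidates to a TextReport,
# and collects predicates with no candidates into a companion "*_notfound" file.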
def map_preds(cli, args):
    rp = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    not_found = []
    pred_file = 'data/erg_preds_interesting.txt'
    if args.all:
        pred_file = 'data/erg_preds_sorted.txt'
    name, ext = os.path.splitext(pred_file)
    not_found_file = name + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                rp.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for c in candidates:
                    rp.print(c.ID, c.lemmas, c.definition)
            if not candidates:
                not_found.append(p_str)
    with TextReport(not_found_file, 'w') as outfile:
        for p in not_found:
            outfile.print(p)

    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")
Example 2
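# Demo routine: counts even/odd numbers with a Counter, times the work with a
# Timer, then writes the same generated report both to a file and to stdout.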
def main():
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")

    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        if i % 2 == 0:
            c.count("even")
        else:
            c.count("odd")
    c.summarise()

    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)

    report = TextReport(REPORT_LOC)
    logging.info("Now try to create a text report (Located at: %s)" % (report.get_path()))
    generate_report(report)

    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
Example 3
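# Iterates over unflagged (or specifically flagged) senses in an EWDB database,
# optionally filtered by POS and limited to the top-k rows, and flags each sense
# as gold or PROCESSED depending on what is_gold() returns.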
def process_lemma(cli, args):
    limit = int(args.topk) if args.topk and int(args.topk) > 0 else None
    pos = args.pos
    db = EWDB(args.db)
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        if args.flag:
            query = ['(flag IS NULL OR flag = ?)']
            params = [args.flag]
        else:
            query = ['flag IS NULL']
            params = []
        if pos:
            query.append('pos=?')
            params.append(pos)
        senses = ctx.sense.select(' AND '.join(query), params, limit=limit)
        print("Found {} senses for {}".format(len(senses), pos))
        for idx, sense in enumerate(senses):
            if idx % 50 == 0:
                print("Processed {} / {}".format(idx, len(senses)))
            found_gold = is_gold(sense, db, ctx)  # non zero = True
            if found_gold:
                # flag this sense as gold
                db.flag_sense(sense.ID, found_gold, ctx=ctx)
            elif sense.flag != EWDB.Flags.PROCESSED:
                db.flag_sense(sense.ID, EWDB.Flags.PROCESSED, ctx=ctx)
    pass
Example 4
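# Mines multi-word expressions (MWEs) from an EWDB database and writes each
# category (extra, plain MWE, no-space, "of"-type, apostrophe-s) to its own
# text file, then logs the counts per category.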
def task_mine_mwe(cli, args, db=None):
    if db is None:
        db = EWDB(args.db)
    with db.ctx() as ctx:
        with TextReport('data/mwe_extra.txt') as outfile_extra:
            senses_extra = mine_mwe_extra(db, ctx)
            for sense in senses_extra:
                outfile_extra.print(sense.lemma)
        with TextReport('data/mwe.txt', 'w') as outfile:
            senses = mine_mwe(db, ctx)
            for sense in senses:
                outfile.print(sense.lemma)
        with TextReport('data/mwe_nospace.txt', 'w') as outfile_nospace:
            nospaces = mine_mwe_nospace(db, ctx, senses)
            for sense in nospaces:
                outfile_nospace.print(sense.lemma)
        with TextReport('data/mwe_of.txt') as outfile_of:
            senses_of = mine_mwe_of(db, ctx)
            for sense in senses_of:
                outfile_of.print(sense.lemma)
        with TextReport('data/mwe_apos_s.txt') as outfile_apos_s:
            senses_apos_s = mine_mwe_apos_s(db, ctx)
            for sense in senses_apos_s:
                outfile_apos_s.print(sense.lemma)
        # report
        getLogger().debug("Found MWE: {}".format(len(senses)))
        getLogger().debug("Found MWE-of: {}".format(len(senses_of)))
        getLogger().debug("No space: {}".format(len(nospaces)))
        getLogger().debug("Extra: {}".format(len(senses_extra)))
Example 5
def main():
    header("Main method")
    c = Counter()
    t = Timer()
    t.start("Doing some time-consuming tasks ...")

    logging.info("Count even & odd numbers ...")
    for i in range(10000):
        if i % 2 == 0:
            c.count("even")
        else:
            c.count("odd")
    c.summarise()

    logging.info("Creating report dir ...")
    FileHelper.create_dir(DATA_DIR)

    report = TextReport(REPORT_LOC)
    logging.info("Now try to create a text report (Located at: %s)" %
                 (report.get_path()))
    generate_report(report)

    # try to report to stdout
    logging.info("The same report to stdout ...")
    generate_report(TextReport())
    t.end("Done")
Example 6
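# Filters the sorted ERG predicate list down to grammar predicates
# (Predicate.GRAMMARPRED) and prints them to the chosen output.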
def list_gpreds(cli, args):
    rp = TextReport(args.output)
    with open('data/erg_preds_sorted.txt', 'r') as infile:
        sorted_preds = (Predicate.from_string(l) for l in infile)
        for pred in sorted_preds:
            if pred.ptype == Predicate.GRAMMARPRED:
                rp.print(pred)
    pass
Example 7
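# Extracts key relations from the ERG lexicon, writes all predicates sorted by
# POS to one file and the non-trivial ("interesting") ones to another, then
# reports the counts per POS.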
def list_preds(cli, args):
    rp = TextReport(args.output)
    lexdb = read_erg_lex()
    keyrels = set(l.keyrel for l in lexdb if l.keyrel)
    preds = [Predicate.from_string(p) for p in keyrels]
    sorted_preds = sorted(preds, key=lambda x: x.pos or '')
    # All preds
    with open('data/erg_preds_sorted.txt', 'w') as outfile:
        for pred in sorted_preds:
            outfile.write('{}\n'.format(pred))
    poses = set(p.pos for p in preds)
    trivial_preds = [p for p in preds if p.pos and p.pos in TRIVIAL_POS]
    if not args.trivial:
        preds = [p for p in preds if not p.pos or p.pos not in TRIVIAL_POS]
    interesting_poses = set(p.pos for p in preds)
    # write interesting preds to file
    c = Counter()
    with open('data/erg_preds_interesting.txt', 'w') as outfile:
        for pred in sorted(preds, key=lambda x: "cqpx".index(x.pos) if x.pos else 0):
            c.count(pred.pos if pred.pos else 'NONE')
            outfile.write('{}\n'.format(pred))
    # report
    rp.print("Interesting preds: {}".format(len(preds)))
    rp.print("Trivial preds: {}".format(len(trivial_preds)))
    rp.print("POS: {}".format(poses))
    rp.print("Interesting POS: {}".format(interesting_poses))
    c.summarise(rp)
Example 8
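# Prints per-POS sense counts (total and GOLD-flagged) for an EWDB database.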
def show_stats(cli, args):
    db = EWDB(args.db)
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            senses = ctx.sense.select("pos=?", (pos, ))
            print("pos={}: {}".format(pos, len(senses)))
            senses = ctx.sense.select("pos=? AND flag=?",
                                      (pos, EWDB.Flags.GOLD))
            print("GOLD pos={}: {}".format(pos, len(senses)))
    pass
Example 9
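 # Unit test: converts a range of katakana characters to hiragana with
 # simple_kata2hira, accumulating the output in TextReport.string(), and
 # compares the resulting string against the expected mapping.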
 def test_kata2hira(self):
     rp = TextReport.string()
     for k in KATAKANA[1:87]:
         h = simple_kata2hira(k)
         rp.write(h, k, '|', separator='')
     expected = TestTool.ALL_MAPPING
     self.assertEqual(rp.content(), expected)
Example 10
 def test_kata2hira(self):
     rp = TextReport.string()
     for k in KATAKANA[1:87]:
         h = simple_kata2hira(k)
         rp.write(h, k, '|', separator='')
     expected = TestTool.ALL_MAPPING
     self.assertEqual(rp.content(), expected)
Example 11
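# Maps WordNet 3.1 synsets to WordNet 3.0 through an ILI mapping table, writes
# definition mismatches between Wn31 and OMW to data/wn31_diff.txt, and counts
# matches, differences, and identifiers missing from the ILI map.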
def wn31_to_wn30(cli, args):
    csvlines = CSV.read('data/ili-map-pwn30.tab')
    ili_map = {k: v for k, v in csvlines}
    notfound = []
    c = Counter()
    print("ILI-wn30 map: {}".format(len(ili_map)))
    wn31 = CSV.read('data/wn31.csv', dialect='excel-tab')
    with omw.ctx() as ctx, TextReport('data/wn31_diff.txt') as diff_file:
        for sid, iid, sdef in wn31:
            if iid in ili_map:
                c.count("Found")
                wn30_id = ili_map[iid]
                try:
                    if wn30_id.endswith('s'):
                        wn30_id = wn30_id[:-1] + 'a'
                    wn30_ss = omw.get_synset(wn30_id, ctx=ctx)
                except:
                    cli.logger.exception(
                        "Cannot find synset {}".format(wn30_id))
                    raise
                # compare def
                if sdef != wn30_ss.definition and not sdef.startswith(
                        wn30_ss.definition + ";"):
                    diff_file.print("synset: {} | Lemmas: {}".format(
                        wn30_id, ', '.join(wn30_ss.lemmas)))
                    diff_file.print("OMW   : {}".format(wn30_ss.definition))
                    diff_file.print("Wn31  : {}".format(sdef))
                    diff_file.print("")
                    c.count("Diff")
            else:
                notfound.append(iid)
                c.count("Not in ILI")
    c.summarise()
    print(notfound)
    pass
Example 12
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
Example 13
def hello_jamdict(cli, args):
    ''' Say hello and test if Jamdict is working '''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup("一期一会")
        dump_result(results, report=TextReport())
    else:
        getLogger().warning(
            "Hello there, unfortunately jamdict data is not available. Please try to install using `pip install jamdict-data`"
        )
Example 14
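 # Unit test: builds a small Japanese TTL document with MeCab tokenisation,
 # readings, concepts and tags, exports it to in-memory streams
 # (TextReport.string()) via ttl.TxtWriter, and asserts that every stream
 # received content.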
 def test_export_to_streams(self):
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc
     raws = ("三毛猫が好きです。", "雨が降る。", "女の子はケーキを食べる。")
     for sid, r in enumerate(raws):
         msent = txt2mecab(r)
         tsent = doc.new_sent(msent.surface, sid)
         tsent.import_tokens(msent.words)
         # pos tagging
         for mtk, tk in zip(msent, tsent):
             tk.pos = mtk.pos3()
             tk.new_tag(mtk.reading_hira(), tagtype="Reading", source=ttl.Tag.MECAB)
     # sense tagging
     doc[2][4].comment = 'to eat'
     doc[0].new_concept("三毛猫", "wiki.ja:三毛猫", tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
     doc[1].new_concept("降る", "02756821-v", tokens=(2,))
     doc[2].new_concept("女の子", "10084295-n", tokens=(0,))
     doc[2].new_concept("食べる", "01166351-v", (4,))
     # tags
     doc[0].new_tag("WIKI", 0, 3, tagtype="SRC")
     doc[0].new_tag("https://ja.wikipedia.org/wiki/三毛猫", 0, 3, tagtype="URL")
     doc[2].new_tag("WIKI", 0, 3, tagtype="SRC")
     doc[2].new_tag("https://ja.wikipedia.org/wiki/少女", 0, 3, tagtype="URL")
     # export doc
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file) as writer:
         writer.write_doc(doc)
         getLogger().debug("sents\n{}".format(sents.content()))
         getLogger().debug("words\n{}".format(words.content()))
         getLogger().debug("concepts\n{}".format(concepts.content()))
         getLogger().debug("links\n{}".format(links.content()))
         getLogger().debug("tags\n{}".format(tags.content()))
         self.assertTrue(sents.content())
         self.assertTrue(words.content())
         self.assertTrue(concepts.content())
         self.assertTrue(links.content())
         self.assertTrue(tags.content())
         for sent in doc:
             logging.debug(json.dumps(sent.to_json(), ensure_ascii=False))
Example 15
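 # Unit test: looks up OMW senses for a fixed basic-vocabulary word list and
 # writes each synset's ID, lemmas and definition to data/mfs1500.txt, plus the
 # bare definitions to data/wndef.txt.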
 def test_get_mfs(self):
     words = 'we this sing full cat tongue name dry die horn sun with mountain eye belly old big red woman live head animal because cloud louse sleep ear wet know salt walk eat seed green bite say person all child count thin stand father laugh night give stone heavy if bone sister other yellow small work snake smoke kill white swim short grease worm narrow flower neck path drink flesh good sharp ash snow hot fire mouth see dirty hand egg skin cold fly wood mother come I warm where one play foot sea year new earth smooth two water what burn fish vomit bird how long hunt sit rope feather nose dust round wind tooth correct bark root ice not blood tail dull brother man heart lie liver many pig rain claw who day grass knee when leaf wide hair meat black dog star dance breasts wife sand husband You bad hear moon river tree that'.split(
     )
     with omw.ctx() as ctx, TextReport(
             'data/mfs1500.txt') as rp, TextReport(
                 "data/wndef.txt") as deffile:
         query = 'wordid in (SELECT wordid FROM word WHERE lemma in {})'.format(
             repr(tuple(words)))
         rows = ctx.sense.select(query)
         ssids = [SynsetID.from_string(r.synset) for r in rows]
         for ssid in ssids:
             ss = omw.get_synset(ssid, ctx=ctx)
             if ss.lemmas and ss.definition:
                 rp.print("{id} ({lm}): {df}".format(
                     id=ss.ID,
                     lm=", ".join(ss.lemmas),
                     df=ss.definition.strip()))
                 deffile.print(ss.definition.strip())
     print("Done!")
Example 16
def convert(cli, args):
    ''' Convert patches from CSV format to YAML '''
    rp = TextReport()
    # validate input file
    if not args.input:
        patch_path = os.path.join(DATA_FOLDER, 'patches',
                                  '20171112_Wn31_glitches_def.csv')
    else:
        patch_path = args.input
    if not os.path.isfile(patch_path):
        raise Exception("File {} does not exist.".format(patch_path))
    # validate output file
    out_path = args.output if args.output else None
    if out_path == '*.yaml':
        out_path = FileHelper.replace_ext(patch_path, 'yaml')
    rp.print("Input:", patch_path)
    rp.print("Output:", out_path if out_path else '*stdout*')
    # convert patches
    patches = read_csv(patch_path)
    json_patches = [p.to_json() for p in patches]
    yaml_str = yaml.dump(json_patches, default_flow_style=False)
    # dump output
    if out_path:
        with open(out_path, 'w') as outfile:
            outfile.write(yaml_str)
        if args.echo:
            print(yaml_str)
    else:
        print(yaml_str)
Example 17
def gen_mfs_5500(cli, args):
    ''' Generate 3rd round tree banking '''
    rp = TextReport(args.output)
    topk_synsets = topk_mfs(5500)
    # finished treebanking
    first_round = read_lines('data/omw3000_synsets.txt')
    second_round = read_lines('data/omw5000_synsets.txt')
    done_synsets = set(first_round + second_round)
    # new
    third_round = topk_synsets.difference(done_synsets)
    # report
    print("All     :", len(topk_synsets))
    print("Done    :", len(done_synsets))
    print("New     :", len(third_round))
    # write to a synset file
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for idx, sid in enumerate(third_round):
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID,
                                'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub[profile].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example 18
def import_data(cli, args):
    '''Import XML data into SQLite database'''
    rp = TextReport()
    t = Timer(report=rp)
    db_loc = os.path.abspath(os.path.expanduser(args.jdb))
    rp.print("Jamdict DB location        : {}".format(db_loc))
    rp.print("JMDict XML file location   : {}".format(args.jmdxml))
    rp.print("Kanjidic2 XML file location: {}".format(args.kd2xml))
    jam = get_jam(cli, args)
    if args and (args.jdb or args.kd2):
        if os.path.isfile(db_loc):
            if not confirm(
                    "Database file exists. Do you want to overwite (This action cannot be undone! yes/no?) "
            ):
                cli.logger.warning("Program aborted.")
                exit()
            else:
                os.unlink(db_loc)
        # perform input
        t.start(
            "Creating Jamdict SQLite database. This process may take very long time ..."
        )
        jam.import_data()
        t.stop()
    else:
        print("Database paths were not provided. Process aborted.")
Example 19
 def test_export_to_streams(self):
     doc = ttl.Document('manual', TEST_DATA)
     # create sents in doc
     raws = (sent1, sent2, sent3)
     mecab_outputs = (sent1_mecab, sent2_mecab, sent3_mecab)
     for sid, (text, mecab_output) in enumerate(zip(raws, mecab_outputs)):
         deko.mecab._mecab_output_to_sent(text, mecab_output, doc=doc)
     # sense tagging
     doc[2][4].comment = 'to eat'
     doc[0].concepts.new("三毛猫", "wiki_ja", "三毛猫",
                         tokens=[0, 1, 2]).comment = 'Calico cat, you know?'
     doc[1].concepts.new("02756821-v", "wn", "降る", tokens=(2, ))
     doc[2].concepts.new("10084295-n", "wn", "女の子", tokens=(0, ))
     doc[2].concepts.new("01166351-v", "wn", "食べる", (4, ))
     # tags
     doc[0].tags.new("WIKI", "src", 0, 3)
     doc[0].tags.new("https://ja.wikipedia.org/wiki/三毛猫", "url", 0, 3)
     doc[2].tags.new("WIKI", "src", 0, 3)
     doc[2].tags.new("https://ja.wikipedia.org/wiki/少女", "url", 0, 3)
     # export doc
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     with ttl.TxtWriter(sents.file, words.file, concepts.file, links.file,
                        tags.file) as writer:
         writer.write_doc(doc)
         getLogger().debug("sents\n{}".format(sents.content()))
         getLogger().debug("words\n{}".format(words.content()))
         getLogger().debug("concepts\n{}".format(concepts.content()))
         getLogger().debug("links\n{}".format(links.content()))
         getLogger().debug("tags\n{}".format(tags.content()))
         self.assertTrue(sents.content())
         self.assertTrue(words.content())
         self.assertTrue(concepts.content())
         self.assertTrue(links.content())
         self.assertTrue(tags.content())
         for text in doc:
             logging.debug(json.dumps(text.to_dict(), ensure_ascii=False))
Example 20
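# Reads WordNet synsets from an XML dump via GWordnetXML and converts them to
# TTL format, serialised as either JSON or XML depending on args.format.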
def wn2ttl(args):
    inpath = FileHelper.abspath(args.inpath)
    header("WN to TTL format")
    wn = GWordnetXML()
    wn.read(inpath)
    print("Found senses: {}".format(len(wn.synsets)))
    outpath = FileHelper.abspath(args.outpath) if args.outpath else None
    with TextReport(outpath, 'w') as outfile:
        if args.format == 'json':
            convert_json(wn.synsets, outfile)
        elif args.format == 'xml':
            convert_xml(wn.synsets, outfile)
    print("Done!")
Example 21
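 # Unit test: serialises a test sentence to the five TTL TSV streams with
 # ttl.TxtWriter, reads it back with ttl.TxtReader, and asserts that the
 # round-tripped JSON matches the original field by field.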
 def test_ttl_tsv_serialization(self):
     sent = self.build_test_sent()
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     writer = ttl.TxtWriter(sents.file, words.file, concepts.file, links.file, tags.file)
     writer.write_sent(sent)
     sents_txt = sents.content()
     words_txt = words.content()
     concepts_txt = concepts.content()
     links_txt = links.content()
     tags_txt = tags.content()
     getLogger().debug("sents\n{}".format(sents_txt))
     getLogger().debug("words\n{}".format(words_txt))
     getLogger().debug("concepts\n{}".format(concepts_txt))
     getLogger().debug("links\n{}".format(links_txt))
     getLogger().debug("tags\n{}".format(tags_txt))
     # read it back
     reader = ttl.TxtReader(io.StringIO(sents_txt),
                            io.StringIO(words_txt),
                            io.StringIO(concepts_txt),
                            io.StringIO(links_txt),
                            io.StringIO(tags_txt))
     docx = reader.read()
     # patch sent.ID
     sent.ID = 1
     jo = sent.to_json()
     jr = docx[0].to_json()
     getLogger().debug(jo)
     getLogger().debug(jr)
     self.assertEqual(jo['text'], jr['text'])
     self.assertEqual(jo['tokens'], jr['tokens'])
     self.assertEqual(jo['concepts'], jr['concepts'])
     self.assertEqual(jo['tags'], jr['tags'])
     self.assertEqual(jo['flag'], jr['flag'])
     self.assertEqual(jo['comment'], jr['comment'])
     self.assertEqual(jo, jr)
Example 22
def extract_omw(cli, args):
    ''' OMW Extractor '''
    rp = TextReport()
    omw = get_omw()
    WN_POS = 'nvar'
    with omw.ctx() as ctx:
        for pos in WN_POS:
            rp.header("POS: {}".format(pos))
            query = '''SELECT lemma, sense.synset, def as sdef FROM sense LEFT JOIN word ON sense.wordid = word.wordid and sense.lang=word.lang LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=? ORDER By freq DESC '''
            params = [pos]
            if args.topk:
                query += ' LIMIT ?'
                params.append(args.topk)
            results = ctx.select(query, params)
            senses = OrderedDict()
            potential_names = 0
            for lemma, sid, sdef in results:
                if lemma.lower() != lemma:
                    # if pos not in 'nar':
                    #     rp.print("{} - {}".format(lemma, pos))
                    potential_names += 1
                if (lemma, sid) in senses:
                    senses[(lemma, sid)] += "; " + sdef
                else:
                    senses[(lemma, sid)] = sdef
            print("Found {} sense in OMW".format(len(senses.keys())))
            print("Potential name: {}".format(potential_names))
            if args.output:
                out_path = "{}_{}.txt".format(args.output, pos)
                wordsenses = (k + (v, ) for k, v in senses.items())
                CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
                print("Written to {}".format(out_path))
                lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
                with open(lemma_out_path, 'w') as outfile:
                    for l, sid in senses.keys():
                        outfile.write(l)
                        outfile.write('\n')
                    print("Written to {}".format(lemma_out_path))
Example 23
 def test_ttl_tsv_serialization(self):
     sent = self.build_test_sent()
     concepts = TextReport.string()
     links = TextReport.string()
     sents = TextReport.string()
     tags = TextReport.string()
     words = TextReport.string()
     writer = ttl.TxtWriter(sents.file, words.file, concepts.file,
                            links.file, tags.file)
     writer.write_sent(sent)
     sents_txt = sents.content()
     words_txt = words.content()
     concepts_txt = concepts.content()
     links_txt = links.content()
     tags_txt = tags.content()
     getLogger().debug("sents\n{}".format(sents_txt))
     getLogger().debug("words\n{}".format(words_txt))
     getLogger().debug("concepts\n{}".format(concepts_txt))
     getLogger().debug("links\n{}".format(links_txt))
     getLogger().debug("tags\n{}".format(tags_txt))
     # read it back
     reader = ttl.TxtReader(io.StringIO(sents_txt), io.StringIO(words_txt),
                            io.StringIO(concepts_txt),
                            io.StringIO(links_txt), io.StringIO(tags_txt))
     docx = reader.read()
     # patch sent.ID
     sent.ID = 1
     jo = sent.to_dict()
     jr = docx[0].to_dict()
     getLogger().debug(jo)
     getLogger().debug(jr)
     self.assertEqual(jo['text'], jr['text'])
     self.assertEqual(jo['tokens'], jr['tokens'])
     self.assertEqual(jo['concepts'], jr['concepts'])
     self.assertEqual(jo['tags'], jr['tags'])
     self.assertEqual(jo['flag'], jr['flag'])
     self.assertEqual(jo['comment'], jr['comment'])
     self.assertEqual(jo, jr)
Example 24
def babelfy_doc(docpath=DEFAULT_DOCPATH,
                docname=DEFAULT_DOCNAME,
                outfile=None,
                **kwargs):
    ''' Babelfy a tagged document
    '''
    speckled = ttl.Document(docname, docpath).read()
    sents = []
    for s in speckled:
        output = json.loads(babelfy(s.text, **kwargs))
        sents.append((s.ID, output))
    if outfile:
        with TextReport(outfile) as rp:
            rp.write(json.dumps(sents, indent=2))
    return sents
Example 25
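# Verifies a manual patch file: for every synset/definition pair in the JSON
# input, warns when the patched definition (ignoring punctuation) no longer
# matches the current OMW definition.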
def manual_patch(cli, args):
    rp = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        synsets = json.loads(infile.read())
        # for ss in synsets:
        #     rp.print(ss['synset'], ss['definition'])
        # rp.print("Found synsets:", len(synsets))
        for sinfo in synsets:
            sid, fixed_def = sinfo['synset'], sinfo['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            orig_def = remove_puncs(ss.definition)
            if remove_puncs(fixed_def) != orig_def:
                rp.header("WARNING:", sid)
                rp.print(ss.definition)
                rp.print(fixed_def)
Example 26
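    # Unit test: parses a long MRS reading, lets a Transformer mine rules from
    # its DMRS nodes, prints the rules to a null TextReport, and applies the
    # transformations back to the reading.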
    def test_long_sentence(self):
        r = Reading('''[ TOP: h0
  RELS: < [ part_of<3:6> LBL: h1 ARG0: x25 [ x NUM: pl PERS: 3 ] ARG1: x27 [ x NUM: pl PERS: 3 IND: + ] ]
          [ _all_q<3:6> LBL: h2 ARG0: x25 RSTR: h49 ]
          [ _these_q_dem<7:12> LBL: h3 ARG0: x27 RSTR: h50 ]
          [ _vary_v_cause<13:19> LBL: h4 ARG0: e26 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ARG2: x27 ]
          [ _case_n_of<20:26> LBL: h4 ARG0: x27 ]
          [ _however_a_1<27:35> LBL: h5 ARG0: i28 ARG1: h51 ]
          [ pron<36:37> LBL: h6 ARG0: x29 [ x NUM: sg PERS: 1 IND: + PT: std ] ]
          [ pronoun_q<36:37> LBL: h7 ARG0: x29 RSTR: h52 ]
          [ neg<38:44> LBL: h8 ARG0: e30 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: h53 ]
          [ _can_v_modal<38:44> LBL: h9 ARG0: e31 [ e SF: prop TENSE: pres MOOD: indicative PROG: - PERF: - ] ARG1: h54 ]
          [ _recall_v_1<45:51> LBL: h10 ARG0: e32 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x29 ARG2: x33 [ x NUM: pl PERS: 3 ] ]
          [ _any_q<52:55> LBL: h11 ARG0: x33 RSTR: h55 ]
          [ part_of<52:55> LBL: h12 ARG0: x33 ARG1: x25 ]
          [ _present_v_to<62:71> LBL: h12 ARG0: e34 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ] ARG1: x33 ARG2: x37 [ x NUM: pl PERS: 3 IND: + ] ]
          [ udef_q<72:191> LBL: h13 ARG0: x37 RSTR: h56 ]
          [ _more_x_comp<72:76> LBL: h14 ARG0: e35 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: e36 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ARG2: x38 [ x NUM: sg PERS: 3 GEND: n ] ]
          [ _singular_a_1<77:85> LBL: h14 ARG0: e36 ARG1: x37 ]
          [ _feature_n_1<86:94> LBL: h14 ARG0: x37 ]
          [ generic_entity<100:104> LBL: h15 ARG0: x38 ]
          [ _that_q_dem<100:104> LBL: h16 ARG0: x38 RSTR: h57 ]
          [ _associate_v_with<115:125> LBL: h15 ARG0: e39 [ e SF: prop TENSE: past MOOD: indicative PROG: - PERF: - ] ARG2: x38 ARG3: x44 [ x NUM: sg PERS: 3 IND: + ] ]
          [ _the_q<131:134> LBL: h17 ARG0: x44 RSTR: h58 ]
          [ _well_x_deg<135:140> LBL: h18 ARG0: e40 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: e41 [ e SF: prop TENSE: untensed MOOD: indicative PROG: bool PERF: - ] ]
          [ _know_v_1<140:145> LBL: h18 ARG0: e41 ARG2: x44 ]
          [ compound<146:159> LBL: h18 ARG0: e42 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x44 ARG2: x43 [ x NUM: sg PERS: 3 IND: + PT: notpro ] ]
          [ proper_q<146:152> LBL: h19 ARG0: x43 RSTR: h59 ]
          [ named<146:152> LBL: h20 ARG0: x43 CARG: "Surrey" ]
          [ _family_n_of<153:159> LBL: h18 ARG0: x44 ]
          [ _of_p<160:162> LBL: h18 ARG0: e45 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x44 ARG2: x46 [ x NUM: pl PERS: 3 IND: + ] ]
          [ _the_q<163:166> LBL: h21 ARG0: x46 RSTR: h60 ]
          [ named<167:175> LBL: h22 ARG0: x46 CARG: "Roylotts" ]
          [ _of_p<176:178> LBL: h22 ARG0: e47 [ e SF: prop TENSE: untensed MOOD: indicative PROG: - PERF: - ] ARG1: x46 ARG2: x48 [ x NUM: sg PERS: 3 IND: + ] ]
          [ proper_q<179:191> LBL: h23 ARG0: x48 RSTR: h61 ]
          [ named<179:191> LBL: h24 ARG0: x48 CARG: "Stoke Moran" ] >
  HCONS: < h0 qeq h5 h49 qeq h1 h50 qeq h4 h51 qeq h8 h52 qeq h6 h53 qeq h9 h54 qeq h10 h55 qeq h12 h56 qeq h14 h57 qeq h15 h58 qeq h18 h59 qeq h20 h60 qeq h22 h61 qeq h24 > ]
''')
        optimus = Transformer()
        rules = optimus.find_rules(r.dmrs().layout.nodes, limit=100)
        with TextReport.null() as outfile:
            for rule in rules:
                outfile.print(rule.lemma, rule.head(), rule.construction.to_dmrs())
        optimus.apply(r)
        getLogger().debug(r.edit().nodes)
Example 27
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    results = jam.lookup(args.query, strict_lookup=args.strict)
    report = TextReport(args.output)
    if args.format == 'json':
        report.print(json.dumps(results.to_json(),
                                ensure_ascii=args.ensure_ascii,
                                indent=args.indent if args.indent else None))
    else:
        if args.compact:
            report.print(results.text(separator='\n------\n', entry_sep='\n'))
        else:
            dump_result(results, report=report)
Example 28
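# Searches OMW English definitions for common typos (stray spaces, misplaced
# punctuation, trailing colons) and either lists them or emits an SQL patch
# script that rewrites each definition with fix_typo().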
def find_omw_typo(cli, args):
    omw = get_omw()
    with omw.ctx() as ctx:
        defs = ctx.synset_def.select(
            "lang='eng' and (def like '% )%' or def like '%  %' or def like '% e.g.' or def like '% ,%' or def like '%:')"
        )
        if args.action == 'list':
            print("Found {} definitions with typo".format(len(defs)))
            for d in defs:
                print(d)
                print("Fixed: {}".format(repr(fix_typo(d._2))))
        elif args.action == 'patch':
            patch_script = TextReport(args.output)
            for d in defs:
                fixed_def = fix_typo(d._2)
                patch_script.writeline("-- Orig : {} [{}]".format(
                    d._2, d.synset))
                patch_script.writeline("-- Fixed: {}".format(fixed_def))
                patch_script.writeline(
                    "UPDATE synset_def SET def = '{}' WHERE synset='{}' AND def='{}';\n"
                    .format(to_sqlite_string(fixed_def), d.synset,
                            to_sqlite_string(d._2)))
Example 29
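# Populates an EWDB database from per-POS TSV skeleton files, adding one sense
# per (lemma, synset, POS, definition) row and summarising how many were added.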
def create_ewdb(cli, args):
    db = EWDB(args.db)
    c = Counter()
    rp = TextReport()
    rp.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            rp.print("Reading file: {}".format(file_name))
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                c.count("Added")
    c.summarise()
    pass
Example 30
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    if args.topk and args.topk <= 0:
        topk = None
        cli.logger.warning(
            "Invalid k will be ignored (k should be greater than or equal to 1)"
        )
    else:
        topk = args.topk
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(
            args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(
                textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
Example 31
def import_data(cli, args):
    '''Generate Jamdict SQLite database from XML data files'''
    rp = TextReport()
    t = Timer(report=rp)
    show_info(cli, args)
    jam = get_jam(cli, args)
    if not jam.db_file:
        print("Database path is not available")
    elif os.path.isfile(jam.db_file):
        if not confirm(
                "Database file exists. Do you want to overwite (This action cannot be undone! yes/no?) "
        ):
            cli.logger.warning("Program aborted.")
            exit()
        else:
            os.unlink(jam.db_file)
    # perform input
    print(f"Importing data to: {jam.db_file}")
    t.start(
        "Creating Jamdict SQLite database. This process may take very long time ..."
    )
    jam.import_data()
    t.stop()
Example 32
def gen_mfs_3000(cli, args):
    rp = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > len(ssids) / 2:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID,
                                  'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw3000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example 33
def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup(args.query, strict_lookup=args.strict)
        report = TextReport(args.output)
        if args.format == 'json':
            report.print(
                json.dumps(results.to_dict(),
                           ensure_ascii=args.ensure_ascii,
                           indent=args.indent if args.indent else None))
        else:
            if args.compact:
                report.print(
                    results.text(separator='\n------\n', entry_sep='\n'))
            else:
                dump_result(results, report=report)
    else:
        getLogger().warning(
            f"Jamdict database is not available.\nThere are 3 ways to install data: \n    1) install jamdict_data via PyPI using `pip install jamdict_data` \n    2) download prebuilt dictionary database file from: {jamdict.__url__}, \n    3) or build your own database file from XML source files."
        )
Example 34
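# Streams a WordNet 3.1 XML dump with etree.iterparse, collects
# (synset id, ILI, definition) triples, and writes them out as a TSV file.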
def extract_wn31(cli, args):
    c = Counter()
    rp = TextReport()
    entries = []
    infile = FileHelper.abspath(args.input)
    if not os.path.isfile(infile):
        rp.print("File not found")
    else:
        rp.print("Processing {}".format(infile))
        tree = etree.iterparse(infile)
        for event, element in tree:
            if event == 'end' and element.tag == 'Synset':
                for child in element:
                    if child.tag == 'Definition':
                        entries.append((element.get('id'), element.get('ili'),
                                        child.text))
                        c.count('Definition')
                c.count("Synset")
                element.clear()
        c.summarise(report=rp)
    # Format: wn31sid ili definition
    CSV.write_tsv(args.output, entries)
Example 35
def gen_mfs_5000(cli, args):
    rp = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data',
                 default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID,
                                  'lemmas: {}'.format(", ".join(ss.lemmas)))
            for d in ss.definitions:
                hub[filename].writeline(d)
                hub['omw5000'].print(d, level=1)
        rp.header("Generated files")
        for f in hub.files.keys():
            rp.print(hub[f].path)
Example 36
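# For each requested sentence ident in a gold Document, prints the sentence and
# its DMRS elementary predications sorted with sort_eps(), rendered as
# pred<cfrom:cto>.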
def order_preds(cli, args):
    doc = Document.from_file(args.gold)
    output = TextReport(args.output)
    if not args.ident:
        output.print("No ident was provided")
    for ident in args.ident:
        sent = doc.by_ident(ident, default=None)
        if sent is None:
            output.print("Sent #{} is missing".format(ident))
        else:
            output.print(sent)
            eps = sent[0].dmrs().obj().eps()
            sort_eps(eps)
            output.print(["{}<{}:{}>".format(str(x.pred), x.cfrom, x.cto) for x in eps])
    output.print("Done")
Example 37
def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)  # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()
    all_pos = Counter()
    not_found = None
    if args.ttl:
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    stats.count("Unnown predicates")
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)
Example 38
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    output.header("Jamdict | {} - Version: {}".format(
        version_info.__description__, version_info.__version__),
                  level='h0')
    output.header("Basic configuration")
    output.print("JAMDICT_HOME:           {}".format(config.home_dir()))
    output.print("Configuration location: {}".format(
        config._get_config_manager().locate_config()))
    output.header("Data files")
    output.print("Jamdict DB location: {} - {}".format(args.jdb,
                                                       file_status(args.jdb)))
    output.print("JMDict XML file    : {} - {}".format(
        args.jmdxml, file_status(args.jmdxml)))
    output.print("KanjiDic2 XML file : {} - {}".format(
        args.kd2xml, file_status(args.kd2xml)))
Example 39
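    # Integration test: aligns gold sense tags in a TTL document against DMRS
    # structures with tag_gold(), collects perfectly matched sentences and
    # unmatched concepts, and writes matched/not-matched reports under data/.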
    def test_tagging_all(self):
        getLogger().debug("Tagging everything ...")
        sents = self.gold()
        smap = {str(s.ident): s for s in sents}
        # read tags
        doc = ttl.Document('gold', TEST_GOLD_DIR).read()
        filter_wrong_senses(doc)
        count_good_bad = Counter()
        perfects = []
        to_be_checked = dd(list)
        tbc_concepts = dd(list)
        concept_count = Counter()
        fix_texts = []
        instances = Counter()
        tag_map = dd(set)
        report = TextReport('data/gold_report.txt')
        matched_report = TextReport('data/gold_matched.txt')
        not_matched_report = TextReport('data/gold_notmatched.txt')
        for s in sents[:5]:
            sid = str(s.ident)
            if not doc.has_id(sid):
                raise Exception("Cannot find sentence {}".format(sid))
            elif len(s) == 0:
                logging.warning("Empty sentence: {}".format(s))
            else:
                tagged = doc.get(sid)
                if s.text != tagged.text:
                    fix_texts.append((s.ident, s.text, tagged.text))
                # try to tag ...
                dmrs = s[0].dmrs()
                matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
                if not not_matched:
                    count_good_bad.count("Perfect")
                    perfects.append((s, matched))
                else:
                    for nm in not_matched:
                        tag_map[nm.tag].add(nm.clemma)
                        tbc_concepts[nm.tag].append(s.ident)
                        concept_count.count(nm.tag)
                        instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                    count_good_bad.count("To be checked")
        # report matched
        for sent, m in perfects:
            tagged = doc.get(str(sent.ident))
            matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
            matched_report.writeline(sent[0].dmrs())
            matched_report.header("Concepts")
            for c, nid, pred in m:
                matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
            matched_report.writeline()
            matched_report.writeline()
        # report not matched
        not_matched_report.header("By senses", "h0")
        for k, v in concept_count.most_common():
            sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
            not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
        not_matched_report.header("By sentences", "h0")
        for sid, nm in to_be_checked.items():
            not_matched_report.print("#{}: {}  | {}".format(sid, nm, smap[str(sid)].text))
        # full details
        for sid, nm in to_be_checked.items():
            sent = smap[str(sid)]
            tagged = doc.get(str(sid))
            not_matched_report.header("#{}: {}".format(sid, sent.text))
            not_matched_report.writeline(sent[0].dmrs())
            for n in nm:
                not_matched_report.writeline(n)

        # for i, t1, t2 in fix_texts:
        #     getLogger().debug(i)
        #     getLogger().debug(t1)
        #     getLogger().debug(t2)
        count_good_bad.summarise(report=report)
        instances.summarise(report=report)
Example 40
# Tally the letters of a sample text (this assumes LOREM_IPSUM is a plain text
# string and that Counter/TextReport are the same utility classes used above).
ct = Counter()   # per-character frequencies
vc = Counter()   # letters / vowels / consonants tallies
for char in LOREM_IPSUM:
    if not char.isalpha():
        # skip spaces and punctuation
        continue
    ct.count(char)
    vc.count("Letters")
    if char in 'auieo':
        vc.count("Vowels")
    else:
        vc.count("Consonants")
vc.summarise()
ct.summarise(byfreq=True, limit=5)


# ------------------------------------------------------------------------------
# Sample text report
# ------------------------------------------------------------------------------
# a string report
rp = TextReport()  # by default, TextReport will write to standard output, i.e. terminal
rp = TextReport(TextReport.STDOUT)  # same as above
rp = TextReport('~/tmp/my-report.txt')  # output to a file
rp = TextReport.null()  # output to /dev/null, i.e. nowhere
rp = TextReport.string()  # output to a string. Call rp.content() to get the string
rp = TextReport(TextReport.STRINGIO)  # same as above

# TextReport will close the output stream automatically by using the with statement
with TextReport.string() as rp:
    rp.header("Lorem Ipsum Analysis", level="h0")
    rp.header("Raw", level="h1")
    rp.print(LOREM_IPSUM)
    rp.header("Character Frequency")
    ct.summarise(report=rp)
    print(rp.content())
Example 41
def isf_to_ukb(cli, args):
    ''' ISF to UKB '''
    doc = Document.from_file(args.input)
    output = TextReport(args.output)
    tokenfile = TextReport(args.output + '.tokens.txt')
    report = TextReport(args.report)
    report.print("Output file: {}".format(args.output))
    processed = 0
    if not args.ident:
        report.print("No ident was provided")
    for idx, sent in enumerate(doc):
        # sent = doc.by_ident(ident, default=None)
        if args.topk and idx > args.topk:
            break
        if args.ident and sent.ident not in args.ident:
            continue
        if sent is None:
            report.print("Sent #{} is missing".format(sent.ident))
        elif len(sent) == 0:
            report.print("Sent #{} is empty (i.e. there is no parse)".format(sent.ident))
        else:
            sentid = sent.ID if sent.ID else sent.ident
            report.print("Processing {}".format(sentid))
            tokens = sent.readings[0].dmrs().tokenize_pos(strict=args.strict)
            if not tokens:
                report.print("Empty DMRS: {} (no pred???)".format(sentid))
                continue
            # sentense is OK ...
            output.print(sentid)
            for idx, (isf_lemma, pos, cfrom, cto) in enumerate(tokens):
                # In UKB's lemmas, use _ to represent a space
                lemma = isf_lemma.replace('+', '_')
                output.write("{text}#{p}#w{wid}#1 ".format(text=lemma, p=pos, wid=idx))
                tokenfile.writeline('\t'.join((str(sentid), str(idx), str(cfrom), str(cto))))
            output.write('\n\n')
            processed += 1
    report.print("Processed {} sentence(s)".format(processed))
    report.print("Done")
Example 42
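# Cleans up multi-sense words (MSW) in a TTL document: drops concepts missing
# from PWN 3.0 when args.wn30 is set, applies manual keep/remove decisions from
# a TSV file, removes shorter competing concepts, and writes the fixed document
# to a new TTL location (optionally "baking" it first).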
def remove_msw_ttl(cli, args):
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()
    ok_synsets = set()
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())
        tcmap = sent.tcmap()
        # remove_tags = set()
        if msw:
            keep_remove = []
            for w in msw:
                max_len = 0
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
Example 43
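# Checks, for every tagged DMRS node in a gold Document, whether the gold
# synset appears among the sense candidates returned by PredSense.search_ep(),
# then reports found/missing counts by sentence and by predicate.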
def find_lesk_candidates(cli, args):
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)
    ident_sent_map = {}
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # return a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred), tag.synset.ID, tag.synset.lemma, [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found    : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    ratio = (total_missing * 100) / (total_found + total_missing)
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")
Example 44
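# Validates definition patches from a YAML file: resolves each sensekey to a
# synset, compares the patch's original definition with the current OMW
# definition, and reports exact matches, differences, and missing sensekeys.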
def verify_patch(cli, args):
    rp = TextReport()
    c = Counter()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Patch file not found")
    # load patches
    with open(args.input) as infile:
        patches = [DefPatch.from_dict(p) for p in yaml.safe_load(infile)]
    rp.print("Found {} patches.".format(len(patches)))
    # Validate against GWN-30
    # gwn = get_gwn()  # don't use GWN, for now
    omw = get_omw()
    wn = get_wn()
    with omw.ctx() as ctx, wn.ctx() as wnctx:
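        # for each patch: resolve the sensekey to a synset ID, then compare the patch's original definition with the current OMW definition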
        for patch in patches:
            try:
                sid = wn.sk2sid(patch.sensekey, ctx=wnctx)
                if not sid:
                    raise Exception("sensekey `{}' does not exist.".format(
                        patch.sensekey))
                ss = omw.get_synset(sid, ctx=ctx)
                ssdef = ss.definition[:-1] if ss.definition.endswith(
                    ';') else ss.definition
                if patch.orig_def == ssdef:
                    c.count("Found")
                    rp.print("-", "{} [{}]".format(patch.orig_def,
                                                   patch.sensekey))
                    rp.print(" ", patch.new_def)
                    if patch.comment:
                        rp.print("C", patch.comment)
                else:
                    c.count("Found - diff")
                    rp.print("[DIFF]",
                             "{} [{}]".format(patch.orig_def, patch.sensekey))
                    rp.print("New:  ",
                             "{} [{}]".format(patch.new_def, patch.sensekey))
                    rp.print("      ", ssdef)
                    rp.print("Note: ", patch.comment)
            except Exception:
                getLogger().warning("sensekey `{}' couldn't be found".format(
                    patch.sensekey))
                c.count("Not found")
                continue
        c.summarise(report=rp)
Example no. 45
0
def list_unksense(args):
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()
    sids = Counter()
    c = Counter()
    out = TextReport() if not args.out else TextReport(args.out)
    for f in semxml.files[:args.limit] if args.limit else semxml.files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
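                # a concept tag that parses as a SynsetID counts as known; anything else is treated as an unknown sensekey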
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
Example no. 46
0
def read_nttat(cli, args):
    ''' Convert NTTAT patch to JSON '''
    stdout = TextReport()
    ext = 'json'
    rp = TextReport("{}_1.{}".format(args.output, ext))
    rp2 = TextReport("{}_2.{}".format(args.output, ext))
    gwn = get_gwn()
    data = []
    with open(args.input, 'r') as infile, gwn.ctx() as ctx:
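        # synset IDs in the patch file look like 12345678-n (8-digit offset plus a POS letter)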
        ssids = re.findall(r'\d{8}-[nvarx]', infile.read())
        print(len(ssids))
        print(ssids)
        for sid in ssids:
            ss = gwn.get_synset(sid, ctx=ctx)
            sdef = fix_gwn_def(ss.definition)
            stdout.header(sid, "Lemmas: {}".format(", ".join(ss.lemmas)))
            stdout.print(sdef)
            data.append({
                "synset": sid,
                "lemmas": ss.lemmas,
                "definition": sdef
            })
    cut = int(len(data) / 2)
    # first half
    first_half = json.dumps(data[:cut], indent=2)
    rp.write(first_half)
    # second half
    second_half = json.dumps(data[cut:], indent=2)
    rp2.write(second_half)
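# A small hedged illustration (added, not from the original example): the synset-ID pattern
# used in read_nttat matches an 8-digit offset followed by a POS letter. The sample string
# below is made up for demonstration only.
import re
sample = "see 00001740-n and 02084071-n in the patch notes"
print(re.findall(r'\d{8}-[nvarx]', sample))  # ['00001740-n', '02084071-n']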
Example no. 47
0
def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
        if not args.batch:
            rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
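        # precision = TP / predicted senses, recall = TP / gold senses, F1 = harmonic mean of the two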
        precision = len(true_positive) / profile_tags_len if profile_tags_len else 0.0
        recall = len(true_positive) / gold_tags_len if gold_tags_len else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False Negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall:    {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1       : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            debugfile.print("[Misisng senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}
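            # cache synsets by ID so each synset is fetched from OMW only once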
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False positive: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags)   : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1  (2*p*r/(p+r))   : {}".format(f1_text))
    ctx.close()
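# A minimal hedged sketch (added for illustration, not part of the original example):
# the precision/recall/F1 arithmetic used by compare_ttls above, with hypothetical counts.
# prepare_tags() and score() are not reproduced here; only the final formulas are shown.
tp_count = 80           # hypothetical number of true-positive sense tags
profile_tags_len = 100  # hypothetical number of predicted tags
gold_tags_len = 120     # hypothetical number of gold tags
precision = tp_count / profile_tags_len             # 0.80
recall = tp_count / gold_tags_len                   # ~0.667
f1 = 2 * precision * recall / (precision + recall)  # ~0.727
print("P={:.2f}% R={:.2f}% F1={:.2f}%".format(precision * 100, recall * 100, f1 * 100))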
Example no. 48
0
def omw_fix_dup(cli, args):
    rp = TextReport(args.output)
    omw = get_omw()
    c = Counter()
    with omw.ctx() as ctx:
        senses = ctx.sense.select(limit=args.topk, columns=('synset', ))
        synsetids = {s.synset for s in senses}
        rp.print("-- OMW synsets: {}\n".format(len(synsetids)))
        for sid in synsetids:
            try:
                sid = SynsetID.from_string(sid)
            except Exception:
                cli.logger.warning("Ignored synset ID: {}".format(sid))
                continue
            ss = omw.get_synset(sid, ctx=ctx)
            fixed_def, dup_defs = join_definitions(ss)
            if dup_defs:
                c.count("Duplicated")
                rp.print("-- Original {}: {}".format(ss.ID, ss.definition))
                rp.print("-- Fixed    {}: {}".format(ss.ID, fixed_def))
                for dup in dup_defs:
                    rp.print(
                        "DELETE FROM synset_def WHERE synset='{}' and def='{}';"
                        .format(ss.ID, to_sqlite_string(dup)))
                rp.print()
        c.summarise()
Example no. 49
0
def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentence
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()  # total
    cm = Counter()  # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense, lemma, map
    sense_sents = dd(set)  # not-matched senses to sentences
    lemma_sents = dd(set)  # not matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
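        # attach the matching TTL (shallow) sentence to the MRS sentence before mapping its concepts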
        sent.shallow = doc.get(sent.ident)
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss, fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
            # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
            # print("           Matched    : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        ct.summarise()
    rp.header("Ignored", level='h0')
    for sid, c in cig.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    # show sense - sentences
    rp.header("Sense - Sentences", level='h0')
    for sid, c in cnm.most_common():
        sents = sense_sents[sid]
        rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.tag == sid:
                    rp.print('  -> {}'.format(concept))
    rp.header("Lemma - Sentences", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.header("#{}".format(clemma,))
        for sent in sents:
            ttl_sent = doc.get(sent.ident)
            rp.print(ttl_sent)
            for concept in ttl_sent.concepts:
                if concept.clemma == clemma:
                    rp.print('  -> {}'.format(concept))
        rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored