def process_lemma(cli, args):
    ''' Flag senses in the EWDB: mark gold senses, otherwise mark them processed '''
    # a positive --topk caps how many senses are pulled from the DB
    row_cap = int(args.topk) if args.topk and int(args.topk) > 0 else None
    pos = args.pos
    db = EWDB(args.db)
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        # build the WHERE clause: unflagged senses (optionally also a given flag),
        # optionally restricted to one part-of-speech
        if args.flag:
            conditions = ['(flag IS NULL OR flag = ?)']
            bindings = [args.flag]
        else:
            conditions = ['flag IS NULL']
            bindings = []
        if pos:
            conditions.append('pos=?')
            bindings.append(pos)
        senses = ctx.sense.select(' AND '.join(conditions), bindings, limit=row_cap)
        print("Found {} senses for {}".format(len(senses), pos))
        for idx, sense in enumerate(senses):
            # progress heartbeat every 50 rows
            if idx % 50 == 0:
                print("Processed {} / {}".format(idx, len(senses)))
            gold_flag = is_gold(sense, db, ctx)  # non zero = True
            if gold_flag:
                # flag this sense as gold
                db.flag_sense(sense.ID, gold_flag, ctx=ctx)
            elif sense.flag != EWDB.Flags.PROCESSED:
                db.flag_sense(sense.ID, EWDB.Flags.PROCESSED, ctx=ctx)
def gen_mfs_5500(cli, args):
    ''' Generate 3rd round tree banking '''
    report = TextReport(args.output)
    top_synsets = topk_mfs(5500)
    # synsets already treebanked in the first two rounds
    done_synsets = set(read_lines('data/omw3000_synsets.txt') +
                       read_lines('data/omw5000_synsets.txt'))
    # remaining new synsets for this round
    third_round = top_synsets.difference(done_synsets)
    print("All :", len(top_synsets))
    print("Done :", len(done_synsets))
    print("New :", len(third_round))
    # persist the round-three synset IDs
    with open('data/omw5300_synsets.txt', 'w') as outfile:
        outfile.write('\n'.join(third_round))
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        profile = 'omw5300'
        filename = 'omw5300A'
        for sid in third_round:
            ss = omw.get_synset(sid, ctx=ctx)
            hub[profile].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub[profile].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    # [FIXED] original test was `args.topk and args.topk <= 0`, so k == 0 slipped
    # through (0 is falsy) and silently produced an empty vocab via most_common(0)
    if args.topk is not None and args.topk <= 0:
        topk = None
        cli.logger.warning("Invalid k will be ignored (k should be greater than or equal to 1)")
    else:
        topk = args.topk
    # optional stopword list: one word per line
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab: top-k (or all) words by frequency
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
def map_preds(cli, args):
    ''' Map ERG predicates to candidate wordnet senses and report the misses '''
    report = TextReport(args.output)
    ctx = PredSense.wn.ctx()
    unmapped = []
    # --all switches from the curated pred list to the full sorted list
    pred_file = 'data/erg_preds_sorted.txt' if args.all else 'data/erg_preds_interesting.txt'
    stem, ext = os.path.splitext(pred_file)
    not_found_file = stem + "_notfound" + ext
    with open(pred_file, 'r') as infile:
        for p_str in infile.read().splitlines():
            p = Predicate.from_string(p_str)
            candidates = None
            if p.pos == 'x' and p.sense == 'subord':
                continue  # ignore these for now
            # if (p.pos == 'x' and p.sense == 'deg') or p.pos == 'p':
            if args.all or (p.pos and p.pos in 'xpq'):
                report.header(p, p.lemma, p.pos, p.sense)
                candidates = PredSense.search_pred_string(p, ctx=ctx)
                for candidate in candidates:
                    report.print(candidate.ID, candidate.lemmas, candidate.definition)
            if not candidates:
                unmapped.append(p_str)
    # dump the unmapped predicate strings for later inspection
    with TextReport(not_found_file, 'w') as outfile:
        for p in unmapped:
            outfile.print(p)
    if args.output:
        print("Written to: {}".format(args.output))
    print("Done")
def find_lesk_candidates(cli, args):
    ''' Check gold sense tags against Lesk candidate synsets and report preds
    whose gold synset is not among the candidates. '''
    doc = Document.from_file(args.gold)
    ne = 0
    for s in doc:
        if len(s):
            ne += 1
    print("Gold ISF: {} | not empty sents: {}".format(args.gold, ne))
    # candidates = dd(lambda: dd(set))
    notfound = dd(list)       # sent ident -> list of unmatched (node, pred, gold-synset, ...) tuples
    ident_sent_map = {}       # sent ident -> sentence (for the per-sentence report)
    all_preds = Counter()
    missing_preds = Counter()
    found_preds = Counter()
    with PredSense.wn.ctx() as ctx:
        for idx, sent in enumerate(doc):
            if not len(sent):
                continue
            elif args.ident and sent.ident not in args.ident:
                continue
            if args.topk and args.topk < idx:
                break
            print(sent)
            ident_sent_map[sent.ident] = sent
            dmrs = sent[0].dmrs()
            if dmrs.tags:
                for ep in dmrs.get_lexical_preds():
                    all_preds.count(str(ep.pred))
                    if ep.nodeid in dmrs.tags:
                        # if there is a tag for this node
                        ep_synsets = PredSense.search_ep(ep, ctx=ctx)  # return a SynsetCollection()
                        for tag in dmrs.tags[ep.nodeid]:
                            if tag.synset.ID not in ep_synsets:
                                notfound[sent.ident].append((ep.nodeid, str(ep.pred), tag.synset.ID, tag.synset.lemma, [(x.ID, x.lemma) for x in ep_synsets]))
                                missing_preds.count(str(ep.pred))
                            else:
                                found_preds.count(str(ep.pred))
    output = TextReport(args.output)
    # summarise
    total_found = sum(c for pred, c in found_preds.most_common())
    total_missing = sum(c for pred, c in missing_preds.most_common())
    output.print("Found : {}".format(total_found))
    output.print("Not found: {}".format(total_missing))
    # [FIXED] guard against ZeroDivisionError when no tagged preds were seen at all
    total_tagged = total_found + total_missing
    ratio = (total_missing * 100) / total_tagged if total_tagged else 0.0
    output.print("Missing %: {}".format(ratio))
    # preds by sentences
    output.header("By sentences")
    for sid in sorted(notfound.keys()):
        sent = ident_sent_map[sid]
        output.print((sid, sent.text))
        items = notfound[sid]
        for item in items:
            output.print(item)
        output.print()
    # by preds
    output.header("By preds")
    for pred, occurrence in missing_preds.most_common():
        output.print("{}: {}".format(pred, occurrence))
    print("Done")
def show_stats(cli, args):
    ''' Print per-POS sense counts (all senses and gold-flagged senses) from an EWDB '''
    db = EWDB(args.db)
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            all_senses = ctx.sense.select("pos=?", (pos, ))
            print("pos={}: {}".format(pos, len(all_senses)))
            gold_senses = ctx.sense.select("pos=? AND flag=?", (pos, EWDB.Flags.GOLD))
            print("GOLD pos={}: {}".format(pos, len(gold_senses)))
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    # write to args.output when the argument exists, otherwise to stdout
    report = TextReport(args.output) if 'output' in args else TextReport()
    report.print("Jamdict " + version_info.__version__)
    report.print(version_info.__description__)
    report.header("Basic configuration")
    report.print("JAMDICT_HOME : {}".format(config.home_dir()))
    report.print("Config file location: {}".format(config._get_config_manager().locate_config()))
    report.header("Data files")
    # each data file path is shown together with its on-disk status
    report.print("Jamdict DB location: {} - {}".format(args.jdb, file_status(args.jdb)))
    report.print("JMDict XML file : {} - {}".format(args.jmdxml, file_status(args.jmdxml)))
    report.print("KanjiDic2 XML file : {} - {}".format(args.kd2xml, file_status(args.kd2xml)))
    report.print("JMnedict XML file : {} - {}".format(args.jmnexml, file_status(args.jmnexml)))
def create_ewdb(cli, args):
    ''' Populate an EWDB from the OMW skeleton TSV files (one file per POS) '''
    db = EWDB(args.db)
    added = Counter()
    report = TextReport()
    report.header("DB location: {}".format(db.ds.path))
    with db.ctx() as ctx:
        for pos in 'nvar':
            file_name = 'data/tsdb/skeletons/omw_{}.txt'.format(pos)
            report.print("Reading file: {}".format(file_name))
            # each row is (lemma, synset-id, definition)
            for idx, row in enumerate(iter_tsv(file_name)):
                lemma, sid, sdef = row
                db.add_sense(sid, lemma, pos, sdef, ctx=ctx)
                added.count("Added")
    added.summarise()
def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    # write to args.output when the argument exists, otherwise to stdout
    output = TextReport(args.output) if 'output' in args else TextReport()
    # a custom config file may be supplied via --config
    if args.config:
        jamdict.config.read_config(args.config)
    output.print("Jamdict " + jamdict.version_info.__version__)
    output.print(jamdict.version_info.__description__)
    jam = get_jam(cli, args)
    output.header("Basic configuration")
    # annotate JAMDICT_HOME with its on-disk status
    jamdict_home = jamdict.config.home_dir()
    if not os.path.isdir(os.path.expanduser(jamdict_home)):
        jamdict_home += " [Missing]"
    else:
        jamdict_home += " [OK]"
    output.print(f"JAMDICT_HOME: {jamdict_home}")
    # report whether the optional jamdict-data package is installed
    if jamdict.util._JAMDICT_DATA_AVAILABLE:
        import jamdict_data
        data_pkg = f"version {jamdict_data.__version__} [OK]"
    else:
        data_pkg = "Not installed"
    output.print(f"jamdict-data: {data_pkg}")
    # locate the active config file (custom > discovered > none)
    if args.config:
        _config_path = args.config + " [Custom]"
        if not os.path.isfile(args.config):
            _config_path += " [Missing]"
    else:
        _config_path = jamdict.config._get_config_manager().locate_config()
        if not _config_path:
            _config_path = "Not available.\n Run `python3 -m jamdict config` to create configuration file if needed."
    output.print(f"Config file : {_config_path}")
    output.header("Data files")
    # each data file path is reported with its on-disk status
    output.print(
        f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}")
    output.print(
        f"JMDict XML file : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}"
    )
    output.print(
        f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}"
    )
    output.print(
        f"JMnedict XML file : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}"
    )
    # dump DB metadata when the database is usable
    if jam.ready:
        output.header("Jamdict database metadata")
        try:
            for meta in jam.jmdict.meta.select():
                output.print(f"{meta.key}: {meta.value}")
        except Exception as e:
            print(e)
            output.print("Error happened while retrieving database meta data")
    output.header("Others")
    output.print(f"puchikarui: version {puchikarui_version}")
    output.print(f"chirptext : version {chirptext_version}")
    output.print(f"lxml : {jamdict.jmdict._LXML_AVAILABLE}")
def manual_patch(cli, args):
    ''' Verify manually-fixed synset definitions (JSON) against the OMW database,
    warning when a fix differs by more than punctuation. '''
    report = TextReport()
    omw = get_omw()
    if not args.input or not os.path.isfile(args.input):
        raise Exception("Input file could not be found")
    with open(args.input, 'r') as infile, omw.ctx() as ctx:
        patches = json.loads(infile.read())
        for patch in patches:
            sid = patch['synset']
            fixed_def = patch['definition']
            ss = omw.get_synset(sid, ctx=ctx)
            # compare with punctuation stripped; a mismatch means the patch
            # changed more than punctuation, so flag it for review
            if remove_puncs(fixed_def) != remove_puncs(ss.definition):
                report.header("WARNING:", sid)
                report.print(ss.definition)
                report.print(fixed_def)
def gen_mfs_3000(cli, args):
    ''' Generate the first-round (top-3000 MFS) treebanking files '''
    report = TextReport(args.output)
    ssids = list(topk_mfs(3000))
    random.shuffle(ssids)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw3000A'
        halfway = len(ssids) / 2
        for idx, sid in enumerate(ssids):
            ss = omw.get_synset(sid, ctx=ctx)
            # second half of the shuffled list goes into file B
            if idx > halfway:
                filename = 'omw3000B'
            hub['omw3000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub['omw3000'].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def list_unksense(args):
    ''' List sense tags in Semcor that cannot be parsed as synset IDs '''
    header("List unknown sensekeys in Semcor")
    semxml = SemcorXML(SEMCOR_TTL)
    unk = Counter()   # (sensekey, lemma) -> count for unparsable tags
    sids = Counter()  # (SynsetID, lemma) -> count for known tags
    c = Counter()     # overall known/unknown instance counts
    out = TextReport() if not args.out else TextReport(args.out)
    # [FIXED] original iterated `ttl.files` and called `ttl.files.abspath(f)` --
    # `ttl` is the TTL module, not this corpus reader (looks like an incomplete
    # rename); use the SemcorXML file collection consistently
    files = semxml.files[:args.limit] if args.limit else semxml.files
    for f in files:
        doc = ttl.Document.from_json_file(semxml.files.abspath(f))
        for s in doc:
            for concept in s.concepts:
                try:
                    sid = SynsetID.from_string(concept.tag)
                    sids.count((sid, concept.clemma))
                    c.count("Known instances")
                except Exception:
                    # tag is not a valid synset ID -- record it as unknown
                    sid = None
                    unk.count((concept.tag, concept.clemma))
                    c.count("Unknown instances")
    out.header("Known concepts")
    out.writeline("\t".join(("synsetID", "lemma", "count")))
    for k, v in sids.sorted_by_count():
        sid, lemma = k
        out.writeline("\t".join((str(sid), lemma, str(v))))
    out.header("Unknown concepts")
    out.writeline("\t".join(("sensekey", "lemma", "count")))
    for k, v in unk.sorted_by_count():
        sk, lemma = k
        out.writeline("\t".join((sk, lemma, str(v))))
    out.header("Total")
    out.writeline("Known: {}".format(len(sids)))
    out.writeline("Unknown: {}".format(len(unk)))
    c.summarise(out)
def gen_mfs_5000(cli, args):
    ''' Generate the second-round (top-5000 MFS minus round one) treebanking files '''
    report = TextReport(args.output)
    from omwtk.wn_ntumc_top3000 import WN_NTUMC_TOP3000
    # synsets already covered by the first round
    first_round = set(x['synset'] for x in WN_NTUMC_TOP3000)
    top5000 = topk_mfs(5000)
    round2 = list(top5000.difference(first_round))
    random.shuffle(round2)
    with FileHub(working_dir='data', default_mode='w') as hub, omw.ctx() as ctx:
        filename = 'omw5000A'
        for idx, sid in enumerate(round2):
            ss = omw.get_synset(sid, ctx=ctx)
            # everything after the first 200 synsets goes into file B
            if idx > 200:
                filename = 'omw5000B'
            hub['omw5000'].header(ss.ID, 'lemmas: {}'.format(", ".join(ss.lemmas)))
            for definition in ss.definitions:
                hub[filename].writeline(definition)
                hub['omw5000'].print(definition, level=1)
        report.header("Generated files")
        for key in hub.files.keys():
            report.print(hub[key].path)
def extract_omw(cli, args):
    ''' OMW Extractor '''
    rp = TextReport()
    omw = get_omw()
    WN_POS = 'nvar'  # noun, verb, adjective, adverb
    with omw.ctx() as ctx:
        for pos in WN_POS:
            rp.header("POS: {}".format(pos))
            # English senses joined with their words and definitions, most frequent first
            query = '''SELECT lemma, sense.synset, def as sdef FROM sense LEFT JOIN word ON sense.wordid = word.wordid and sense.lang=word.lang LEFT JOIN synset_def ON sense.synset = synset_def.synset AND sense.lang = synset_def.lang WHERE sense.lang='eng' AND word.lang='eng' AND synset_def.lang='eng' AND pos=? ORDER By freq DESC '''
            params = [pos]
            if args.topk:
                query += ' LIMIT ?'
                params.append(args.topk)
            results = ctx.select(query, params)
            # (lemma, synset-id) -> merged definition text
            senses = OrderedDict()
            potential_names = 0  # lemmas with uppercase chars are likely proper names
            for lemma, sid, sdef in results:
                if lemma.lower() != lemma:
                    # if pos not in 'nar':
                    #     rp.print("{} - {}".format(lemma, pos))
                    potential_names += 1
                # a sense may have several definition rows; concatenate them
                if (lemma, sid) in senses:
                    senses[(lemma, sid)] += "; " + sdef
                else:
                    senses[(lemma, sid)] = sdef
            print("Found {} sense in OMW".format(len(senses.keys())))
            print("Potential name: {}".format(potential_names))
            if args.output:
                # write (lemma, synset, definition) rows as TSV, one file per POS
                out_path = "{}_{}.txt".format(args.output, pos)
                wordsenses = (k + (v, ) for k, v in senses.items())
                CSV.write_tsv(out_path, wordsenses, quoting=CSV.QUOTE_MINIMAL)
                print("Written to {}".format(out_path))
                # plus a lemma-only list per POS
                lemma_out_path = "{}_{}_lemma.txt".format(args.output, pos)
                with open(lemma_out_path, 'w') as outfile:
                    for l, sid in senses.keys():
                        outfile.write(l)
                        outfile.write('\n')
                print("Written to {}".format(lemma_out_path))
def gen_vocab(cli, args):
    ''' Generate vocabulary list from a tokenized file '''
    # [FIXED] original test was `args.topk and args.topk <= 0`, so k == 0 slipped
    # through (0 is falsy) and silently produced an empty vocab via most_common(0)
    if args.topk is not None and args.topk <= 0:
        topk = None
        cli.logger.warning(
            "Invalid k will be ignored (k should be greater than or equal to 1)"
        )
    else:
        topk = args.topk
    # optional stopword list: one word per line
    if args.stopwords:
        with open(args.stopwords, 'r') as swfile:
            stopwords = swfile.read().splitlines()
    else:
        stopwords = []
    if os.path.isfile(args.input):
        cli.logger.info("Generating vocabulary list from file {}".format(
            args.input))
        with codecs.open(args.input, encoding='utf-8') as infile:
            if args.output:
                cli.logger.info("Output: {}".format(args.output))
            rp = TextReport(args.output)
            lines = infile.read().splitlines()
            c = Counter()
            for line in lines:
                words = line.split()
                c.update(w for w in words if w not in stopwords)
            # report vocab: top-k (or all) words by frequency
            word_freq = c.most_common(topk)
            words = [k for k, v in word_freq]
            rp.header("Lexicon")
            rp.writeline("\n".join(
                textwrap.wrap(" ".join(w for w in words), width=70)))
            for k, v in word_freq:
                rp.print("{}: {}".format(k, v))
    else:
        cli.logger.warning("File {} does not exist".format(args.input))
def test_tagging_all(self):
    ''' Tag the gold sentences and report perfect matches vs concepts to be checked. '''
    getLogger().debug("Tagging everything ...")
    sents = self.gold()
    smap = {str(s.ident): s for s in sents}  # ident -> sentence lookup
    # read gold tags and drop known-bad senses
    doc = ttl.Document('gold', TEST_GOLD_DIR).read()
    filter_wrong_senses(doc)
    count_good_bad = Counter()     # "Perfect" vs "To be checked" sentence counts
    perfects = []                  # (sentence, matched-concepts) pairs
    to_be_checked = dd(list)       # sent ident -> unmatched concepts
    tbc_concepts = dd(list)        # tag -> sentence idents where it failed
    concept_count = Counter()
    fix_texts = []                 # sentences whose text differs from the tagged copy
    instances = Counter()
    tag_map = dd(set)              # tag -> set of clemmas seen with it
    report = TextReport('data/gold_report.txt')
    matched_report = TextReport('data/gold_matched.txt')
    not_matched_report = TextReport('data/gold_notmatched.txt')
    # NOTE(review): only the first 5 sentences are processed here — presumably to
    # keep the test fast; confirm this slice is intentional
    for s in sents[:5]:
        sid = str(s.ident)
        if not doc.has_id(sid):
            raise Exception("Cannot find sentence {}".format(sid))
        elif len(s) == 0:
            logging.warning("Empty sentence: {}".format(s))
        else:
            tagged = doc.get(sid)
            if s.text != tagged.text:
                fix_texts.append((s.ident, s.text, tagged.text))
            # try to tag ...
            dmrs = s[0].dmrs()
            matched, not_matched, ignored = tag_gold(dmrs, tagged, s.text, mode=Lexsem.ROBUST)
            if not not_matched:
                count_good_bad.count("Perfect")
                perfects.append((s, matched))
            else:
                for nm in not_matched:
                    tag_map[nm.tag].add(nm.clemma)
                    tbc_concepts[nm.tag].append(s.ident)
                    concept_count.count(nm.tag)
                    instances.count('instances')
                    to_be_checked[s.ident].append(nm)
                count_good_bad.count("To be checked")
    # report matched
    for sent, m in perfects:
        tagged = doc.get(str(sent.ident))
        matched_report.header("#{}: {}".format(sent.ident, sent.text), "h0")
        matched_report.writeline(sent[0].dmrs())
        matched_report.header("Concepts")
        for c, nid, pred in m:
            matched_report.writeline("{} ===> {}:{}".format(c, nid, pred))
        matched_report.writeline()
        matched_report.writeline()
    # report not matched
    not_matched_report.header("By senses", "h0")
    for k, v in concept_count.most_common():
        sids = ' '.join(["#{}".format(x) for x in tbc_concepts[k]])
        not_matched_report.print("{}: {} | {} => {}".format(k, v, sids, tag_map[k]))
    not_matched_report.header("By sentences", "h0")
    for sid, nm in to_be_checked.items():
        not_matched_report.print("#{}: {} | {}".format(sid, nm, smap[str(sid)].text))
    # full details
    for sid, nm in to_be_checked.items():
        sent = smap[str(sid)]
        tagged = doc.get(str(sid))
        not_matched_report.header("#{}: {}".format(sid, sent.text))
        not_matched_report.writeline(sent[0].dmrs())
        for n in nm:
            not_matched_report.writeline(n)
    # for i, t1, t2 in fix_texts:
    #     getLogger().debug(i)
    #     getLogger().debug(t1)
    #     getLogger().debug(t2)
    count_good_bad.summarise(report=report)
    instances.summarise(report=report)
def remove_msw_ttl(cli, args):
    ''' Remove multi-sense-word (MSW) conflicts from a TTL profile, keeping at most
    one concept per word, guided by an optional manual decision file. '''
    doc = read_ttl(args.path)
    rp = TextReport(args.debug)
    rp.print("Doc size: {}".format(len(doc)))
    orig_tag_count = 0
    orig_concept_count = 0
    for s in doc:
        orig_concept_count += len(s.concepts)
        orig_tag_count += len(s.tags)
    print("# tags: {}".format(orig_tag_count))
    print("# concepts: {}".format(orig_concept_count))
    # manual[sent ID][word idx][tag or (tag, lemma)] -> keep flag (0/1)
    manual = dd(lambda: dd(dict))
    nonsenses = set()  # just ignore any tag with these sense IDs
    if args.manual:
        entries = CSV.read_tsv(args.manual)
        for sid, wid, tag, keep, lemma in entries:
            sid, wid, keep = int(sid), int(wid), int(keep)
            # the sentinel row (-1, -1, -1, 'U') marks a tag as nonsense everywhere
            if (sid, wid, keep, lemma) == (-1, -1, -1, 'U'):
                nonsenses.add(tag)
            if not lemma:
                manual[sid][wid][tag] = keep
            else:
                manual[sid][wid][(tag, lemma)] = keep
    wn = get_wn()
    ctx = wn.ctx()
    nope_synsets = set()  # cache: tags known to be absent from PWN 3.0
    ok_synsets = set()    # cache: tags known to exist in PWN 3.0
    if args.wn30:
        rp.print("WN30 filter is activated")
    for sidx, sent in enumerate(doc):
        if args.topk and sidx > int(args.topk):
            break
        getLogger().debug("Processing sentence {}/{}".format(sidx + 1, len(doc)))
        getLogger().debug("Before concepts: {}".format(sent.concepts))
        getLogger().debug("Before tags: {}".format(sent.tags))
        # remove concepts that are not in PWN 3.0
        if args.wn30:
            remove_tags = set()
            for tag in sent.tags:
                if tag.tagtype == 'OMW' or tag.label in nonsenses:
                    remove_tags.add(tag)
            for tag in remove_tags:
                sent.tags.remove(tag)
            remove_concepts = set()
            for c in sent.concepts:
                if c.tag in ok_synsets:
                    pass
                elif c.tag in nope_synsets:
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                elif wn.get_synset(c.tag, ctx=ctx) is None:
                    # remove it
                    nope_synsets.add(c.tag)
                    remove_concepts.add(c)
                    # pop_concept(sent, c)
                else:
                    ok_synsets.add(c.tag)
            for c in remove_concepts:
                pop_concept(sent, c)
        msw = list(sent.msw())  # words with more than one concept
        tcmap = sent.tcmap()    # token -> concepts map
        # remove_tags = set()
        if msw:
            keep_remove = []  # unresolved words: (word, kept concepts, removed concepts)
            for w in msw:
                max_len = 0  # longest multi-token concept seen so far for this word
                keep = []
                remove = set()
                wid = sent.tokens.index(w)
                for c in tcmap[w]:
                    # manual decisions take precedence (keyed by tag, or by (tag, lemma))
                    if c.tag in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][c.tag]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    elif (c.tag, c.clemma) in manual[sent.ID][wid]:
                        if manual[sent.ID][wid][(c.tag, c.clemma)]:
                            keep.append(c)
                        else:
                            remove.add(c)
                    # heuristic: prefer the longest multi-word concept
                    elif len(c.tokens) == 1 or len(c.tokens) < max_len:
                        remove.add(c)
                    elif c.tag in nonsenses:
                        remove.add(c)
                    else:
                        max_len = len(c.tokens)
                        keep.append(c)
                if len(keep) != 1:
                    # ambiguity not resolved to a single concept -- defer to report
                    keep_remove.append((w, keep, remove))
                else:
                    # everything is OK, remove them now
                    for c in remove:
                        if args.debug:
                            rp.print("Removing concept {} from {}".format(c, sent.ID))
                        getLogger().debug("Removing concept {} from {}".format(c, sent.ID))
                        pop_concept(sent, c)
            if keep_remove:
                rp.header(sent)
                for w, keep, remove in keep_remove:
                    rp.write(w)
                    rp.writeline(" - Keep: {} | Remove: {}".format(keep, remove))
        # remove sent's tags
        # for tag in remove_tags:
        #     getLogger().debug("removing tag: {}".format(tag))
        #     sent.tags.remove(tag)
        getLogger().debug("After concepts: {}".format(sent.concepts))
        getLogger().debug("After tags: {}".format(sent.tags))
    if nope_synsets:
        rp.print("Noped synsets: {}".format(nope_synsets))
    if args.output:
        # write the cleaned profile out as a new TTL document
        doc_path = os.path.dirname(args.output)
        doc_name = os.path.basename(args.output)
        new_doc = ttl.Document(doc_name, doc_path)
        sents = doc if not args.topk else list(doc)[:int(args.topk)]
        for s in sents:
            new_doc.add_sent(s)
        tag_count = 0
        concept_count = 0
        for s in sents:
            concept_count += len(s.concepts)
            tag_count += len(s.tags)
        # baking ...
        if args.bake:
            print("Baking doc ...")
            bake_doc(new_doc)
        print("[New] # tags: {}".format(tag_count))
        print("[New] # concepts: {}".format(concept_count))
        rp.print("Writing fixed TTL to {}".format(new_doc.sent_path))
        new_doc.write_ttl()
rp2 = TextReport('~/tmp/my-report.txt') # output to a file rp2.write("This is a line in my-report.txt") rp3 = TextReport.null() # ouptut to /dev/null, i.e. nowhere rp3.write("This line goes no where") rp4 = TextReport.string( ) # output to a string. Call rp.content() to get the string rp4.write("This line will be stored in a string buffer") rp5 = TextReport(TextReport.STRINGIO) # same as above rp5.write("This line will also be stored in a string buffer") # TextReport will close the output stream automatically by using the with statement with TextReport.string() as rp: rp.header("Lorem Ipsum Analysis", level="h0") rp.header("Raw", level="h1") rp.print(LOREM_IPSUM) rp.header("Character Frequency") ct.summarise(report=rp) print(rp.content()) # ------------------------------------------------------------------------------ # Web fetcher # ------------------------------------------------------------------------------ from chirptext import WebHelper web = WebHelper('~/tmp/webcache.db') data = web.fetch('https://letuananh.github.io/test/data.json') print(data) data_json = web.fetch_json('https://letuananh.github.io/test/data.json')
def compare_ttls(cli, args):
    ''' Compare TTL to gold '''
    rp = TextReport()
    omw = get_omw()
    ctx = omw.ctx()
    gold = None
    profile = None
    ignored_ids = []
    # sentence IDs listed in the --ignore file are dropped from both profiles
    if args.ignore:
        ignored_ids = [x.strip() for x in read_file(args.ignore).splitlines() if x.strip()]
        getLogger().debug("Ignored sentence IDs: {}".format(', '.join(ignored_ids)))
    if args.gold_profile:
        gold = read_ttl(args.gold_profile, ttl_format=args.ttl_format)
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                gold.pop(sid, default=None)
        if not args.batch:
            rp.header("Gold sentences: {} | Loc: {}".format(len(gold), args.gold_profile))
        if args.verbose and not args.batch:
            for s in gold:
                rp.print("Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no gold!")
    # read profile
    if args.profile:
        profile = read_ttl(args.profile, ttl_format=args.ttl_format)
        if not args.batch:
            rp.header("Profile sentences: {} | Loc: {}".format(len(profile), args.profile))
        # remove ignored sentences
        if ignored_ids:
            for sid in ignored_ids:
                profile.pop(sid, default=None)
            if not args.batch:
                rp.header("Profile sentences: {} (ignored: {}) | Loc: {}".format(len(profile), len(ignored_ids), args.profile))
        if args.verbose and not args.batch:
            for s in profile:
                getLogger().debug("Profile/Sent #{}: {} tags".format(s.ID, len(s.tags)))
    elif not args.batch:
        print("Oops, no profile to evaluate")
    # calculate precision and recall
    if gold and profile:
        gold_tags, gold_tags_len, gold_ignored = prepare_tags(gold, args=args, nonsense=args.nonsense)
        profile_tags, profile_tags_len, profile_ignored = prepare_tags(profile, args=args, nonsense=args.nonsense)
        if gold_tags_len == 0:
            rp.print("WARNING: There was no tag found in the gold profile. Please make sure that the tags for comparison are *sentence level* tags")
        if profile_tags_len == 0:
            rp.print("WARNING: There was no tag found in the evaluating profile. Please make sure that the tags for comparison are *sentence level* tags")
        getLogger().debug("Gold tags: {}".format(gold_tags_len))
        getLogger().debug(list(gold_tags.items())[:5])
        getLogger().debug("Profile tags: {}".format(profile_tags_len))
        getLogger().debug(list(profile_tags.items())[:5])
        true_positive, false_negative = score(gold_tags, profile_tags, args=args)
        # NOTE(review): these divisions raise ZeroDivisionError when either tag
        # count is 0 (the warnings above do not stop execution) -- confirm whether
        # a guard should be added
        precision = len(true_positive) / profile_tags_len
        recall = len(true_positive) / gold_tags_len
        f1 = 2 * precision * recall / (precision + recall)
        getLogger().debug("TP: {}".format(len(true_positive)))
        getLogger().debug("FN: {}".format(len(false_negative)))
        getLogger().debug("Recall (TP/Gtags): {}".format(recall))
        getLogger().debug("Precision (TP/Ptags): {}".format(precision))
        getLogger().debug("F1 (2*p*r/(p+r)): {}".format(f1))
        rc_text = "{:.2f}%".format(recall * 100)
        pr_text = "{:.2f}%".format(precision * 100)
        f1_text = "{:.2f}%".format(f1 * 100)
        if not args.batch:
            rp.print("True positive: {}".format(len(true_positive)))
            rp.print("False Negative: {}".format(len(false_negative)))
            rp.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            rp.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            rp.print("Recall: {}".format(rc_text))
            rp.print("Precision: {}".format(pr_text))
            rp.print("F1 : {}".format(f1_text))
        if args.org:
            # output org-mode
            columns = [rc_text, pr_text, f1_text]
            if args.cols:
                columns = args.cols + columns
            rp.print('| {} |'.format(' | '.join(columns)))
        if args.debug:
            if not args.batch:
                print("Debug file: {}".format(args.debug))
            debugfile = TextReport(args.debug)
            debugfile.print(".:: Table of content ::.")
            debugfile.print("")
            # NOTE(review): "[Misisng senses]" is misspelled and does not match the
            # "[Missing senses]" header below -- confirm before fixing the output
            debugfile.print("[Misisng senses]")
            debugfile.print("[By classes]")
            debugfile.print("[Summary]")
            debugfile.print("")
            ss_map = {}  # synset label -> synset object cache
            debugfile.header("[Missing senses]")
            for sid, cfrom, cto, label in sorted(false_negative):
                if label not in ss_map:
                    ss = omw.get_synset(label, ctx=ctx)
                    ss_map[label] = ss
                else:
                    ss = ss_map[label]
                # get the surface form
                surface = gold.get(sid).text[int(cfrom):int(cto)]
                debugfile.print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(sid, cfrom, cto, surface, label, ss.definition, ss.lemmas))
            # by classes
            c = Counter()
            c.update(synsetID for sentID, cfrom, cto, synsetID in false_negative)
            debugfile.header("[By classes]")
            for synsetID, freq in c.most_common():
                ss = ss_map[synsetID]
                debugfile.print("{}: {} | ({}) - {}".format(synsetID, freq, ', '.join(ss.lemmas), ss.definition))
            # summary
            debugfile.header("[Summary]")
            debugfile.print("True positive: {}".format(len(true_positive)))
            debugfile.print("False positive: {}".format(len(false_negative)))
            debugfile.print("Gold # senses: {} | Ignored: {} | Total: {}".format(gold_tags_len, gold_ignored, gold_tags_len + gold_ignored))
            debugfile.print("Predicted # senses: {} | Ignored: {} | Total: {}".format(profile_tags_len, profile_ignored, profile_tags_len + profile_ignored))
            debugfile.print("Recall (TP/Gtags) : {}".format(rc_text))
            debugfile.print("Precision (TP/Ptags): {}".format(pr_text))
            debugfile.print("F1 (2*p*r/(p+r)) : {}".format(f1_text))
    ctx.close()
# ------------------------------------------------------------------------------ # Sample text report # ------------------------------------------------------------------------------ # a string report rp = TextReport() # by default, TextReport will write to standard output, i.e. terminal rp = TextReport(TextReport.STDOUT) # same as above rp = TextReport('~/tmp/my-report.txt') # output to a file rp = TextReport.null() # ouptut to /dev/null, i.e. nowhere rp = TextReport.string() # output to a string. Call rp.content() to get the string rp = TextReport(TextReport.STRINGIO) # same as above # TextReport will close the output stream automatically by using the with statement with TextReport.string() as rp: rp.header("Lorem Ipsum Analysis", level="h0") rp.header("Raw", level="h1") rp.print(LOREM_IPSUM) rp.header("Character Frequency") ct.summarise(report=rp) print(rp.content()) # ------------------------------------------------------------------------------ # Web fetcher # ------------------------------------------------------------------------------ from chirptext import WebHelper web = WebHelper('~/tmp/webcache.db') data = web.fetch('https://letuananh.github.io/test/data.json') print(data)
def doc_stats(cli, args):
    ''' Show document statistics '''
    doc = Document.from_file(args.path)  # input
    output = TextReport(args.output)  # output
    stats = Counter()
    pred_counter = Counter()
    empty_sentences = []
    unknown_preds = Counter()  # (pred string, lemma, pos) -> count
    all_pos = Counter()        # POS distribution of unknown preds
    not_found = None
    if args.ttl:
        # sentence IDs present in the TTL profile but missing from the document
        ttl_doc = ttl.Document.read_ttl(args.ttl)
        not_found = set(s.ID for s in ttl_doc).difference(s.ident for s in doc)
    for sent in doc:
        stats.count("Sentences")
        if not len(sent):
            stats.count("Sentences-empty")
            empty_sentences.append(sent.ident)
        for reading in sent:
            stats.count("Readings")
            stats['Predicates'] += len(reading.dmrs().layout.nodes)
            # pred_counter.update(n.predstr for n in reading.dmrs().layout.nodes)
            for n in reading.dmrs().layout.nodes:
                if n.pred.pos == 'u' and n.pred.sense == 'unknown':
                    # [FIXED] counter label typo: was "Unnown predicates"
                    stats.count("Unknown predicates")
                    # unknown preds encode "lemma/pos" in the lemma slot
                    if '/' in n.pred.lemma:
                        try:
                            lemma, pos = n.pred.lemma.rsplit('/', 1)
                        except Exception:
                            getLogger().warning("Invalid unknown pred: {}".format(n.pred))
                            raise
                        all_pos.count(pos)
                        unknown_preds.count((str(n.pred), lemma, pos))
                    else:
                        stats.count("UFO")
                else:
                    stats.count("Known predicates")
                    pred_counter.count(n.predstr)
    output.header("Summary", level="h0")
    stats.summarise(output)
    output.header("Empty sentences")
    output.print("\n".join(empty_sentences))
    if not_found is not None:
        output.header("Missing from TTL")
        for sid in not_found:
            output.print(sid)
    output.header("Unknown preds POS")
    for pos, count in all_pos.most_common():
        output.print(pos, count, separator='\t')
    output.header("Unknown preds")
    for (pred, lemma, pos), count in unknown_preds.most_common():
        output.print(pred, lemma, pos, count, separator='\t')
    output.header("Known preds", level="h1")
    pred_counter.summarise(output)
def map_predsense(cli, args):
    ''' Pred-Sense Mapping (gold DMRSes, gold Senses) '''
    rp = TextReport(args.output) if args.output else TextReport()
    rp.header("Pred-Sense mapping / strategy = {}".format(args.strat))
    if args.gold:
        sents = Document.from_file(args.gold)
        if args.patchsid:
            patch_gold_sid(sents)
    else:
        sents = read_gold_mrs()
        patch_gold_sid(sents)
    # ignore empty sentence
    empty_sents = [s for s in sents if not len(s)]
    not_empty_sents = [s for s in sents if len(s)]
    rp.print("MRS-Sents: {}".format(len(sents)))
    rp.print("MRS-Sents not empty: {}".format(len(not_empty_sents)))
    if args.ttl:
        doc = ttl.read(args.ttl, mode=args.ttl_format)
    else:
        # [XXX] using gold by default is bad ...
        doc = ttl.Document(name='gold', path='data').read()
    rp.print("TTL-Sents: {}".format(len(doc)))
    # pair up MRS sentences with their TTL counterparts by ident
    found_sents = 0
    for sent in not_empty_sents:
        if doc.get(sent.ident) is None:
            cli.logger.warning("Sentence {} could not be found".format(sent.ident))
        else:
            found_sents += 1
    rp.print("Matched: {}".format(found_sents))
    rp.print("Empty sentences: {}".format([s.ident for s in empty_sents]))
    # Now mapping is possible
    # ----------------------------------------
    ct = Counter()   # total
    cm = Counter()   # matched
    cnm = Counter()  # not matched
    cig = Counter()  # ignored
    sense_lemmas = dd(set)  # sense, lemma, map
    sense_sents = dd(set)   # not-matched senses to sentences
    lemma_sents = dd(set)   # not matched lemmas to sentences
    rp.print("Performing Pred-Sense Mapping")
    sents_to_map = not_empty_sents[:args.topk] if args.topk else not_empty_sents
    for sent in sents_to_map:
        sent.shallow = doc.get(sent.ident)
        # import_shallow yields (matched, not-matched, ignored) concept groups
        for m, nm, ig in import_shallow(sent, mode=args.strat, no_small_sense=args.noss, fix_token=args.fixtoken, no_nonsense=args.nononsense):
            for c, nid, pred in m:
                ct.count(c.tag)
                cm.count(c.tag)
            for c in ig:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cig.count(c.tag)
            for c in nm:
                sense_lemmas[c.tag].add(c.clemma)
                ct.count(c.tag)
                cnm.count(c.tag)
                sense_sents[c.tag].add(sent)
                lemma_sents[c.clemma].add(sent)
        # print("Sent #{} - Not matched: {}".format(sent.ident, nm))
        # print("          Matched    : {}".format(len(m)))
    rp.header("Not matched", level='h0')
    for sid, c in cnm.most_common():
        rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
    rp.header("Not matched (by lemma)", level='h0')
    for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
        rp.print("{}: {} | sents: {}".format(clemma, len(sents), [s.ident for s in sents]))
    if args.matched:
        rp.header("Total", level='h0')
        # NOTE(review): ct.summarise() is called without report=rp here, so it
        # presumably prints to stdout rather than the report -- confirm intent
        ct.summarise()
        rp.header("Ignored", level='h0')
        for sid, c in cig.most_common():
            rp.print("{}: {} | Lemmas: {}".format(sid, c, sense_lemmas[sid]))
        # show sense - sentences
        rp.header("Sense - Sentences", level='h0')
        for sid, c in cnm.most_common():
            sents = sense_sents[sid]
            rp.header("{} - {}".format(sid, sense_lemmas[sid]), level='h2')
            for sent in sents:
                ttl_sent = doc.get(sent.ident)
                rp.print(ttl_sent)
                for concept in ttl_sent.concepts:
                    if concept.tag == sid:
                        rp.print(' -> {}'.format(concept))
        rp.header("Lemma - Sentences", level='h0')
        for clemma, sents in sorted(lemma_sents.items(), key=lambda x: len(x[1]), reverse=True):
            rp.header("#{}".format(clemma,))
            for sent in sents:
                ttl_sent = doc.get(sent.ident)
                rp.print(ttl_sent)
                for concept in ttl_sent.concepts:
                    if concept.clemma == clemma:
                        rp.print(' -> {}'.format(concept))
            rp.print()
    # Show final numbers
    total_concepts = sum(x[1] for x in ct.most_common())
    total_matched = sum(x[1] for x in cm.most_common())
    total_notmatched = sum(x[1] for x in cnm.most_common())
    total_ignored = sum(x[1] for x in cig.most_common())
    rp.header("Summarise")
    rp.print("Total concepts: {}".format(total_concepts))
    rp.print("Matched: {}".format(total_matched))
    rp.print("Not matched: {}".format(total_notmatched))
    rp.print("Ignored: {}".format(total_ignored))
    if args.output:
        # echo the summary to stdout as well when writing to a file
        print("Total concepts: {}".format(total_concepts))
        print("Matched: {}".format(total_matched))
        print("Not matched: {}".format(total_notmatched))
        print("Ignored: {}".format(total_ignored))
        print("Output file: {}".format(args.output))
    print("Done!")
    return total_concepts, total_matched, total_notmatched, total_ignored