Example #1
0
def main(argv):
    """Rank pairs of items by the similarity of their feature sets.

    Command line: ``prog [-d] graph [feats ...]``.

    The first positional argument is opened and its classes are added to a
    ClassDB.  The remaining arguments are read through ``fileinput`` as a
    feature listing: a ``! ...`` line whose payload starts with ``'REF'``
    selects the current item, and each following ``+ ...`` line adds one
    feature to that item.  Every pair of items whose ``stripgeneric`` forms
    differ is scored with ``jaccard``; the better-scoring half of the pairs
    is printed together with ``db.cmp_fields`` for the pair.

    Args:
        argv: full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] graph [feats ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'd')
    except getopt.GetoptError:
        return usage()
    # -d may be repeated; the level is collected but nothing below reads it.
    debug = sum(1 for (k, _) in opts if k == '-d')
    if not args:
        return usage()

    path = args.pop(0)
    # Plain f-string.  Applying "% path" to the already formatted text raised
    # "TypeError: not all arguments converted during string formatting".
    print(f'Loading: {path!r}...')
    db = ClassDB()
    with open(path) as fp:
        for klass in load_klasses(fp):
            db.add_klass(klass)

    # item -> set of (data[0], data[2], data[3]) feature tuples.
    feats = {}
    current = None
    for line in fileinput.input(args):
        if line.startswith('! '):
            # NOTE(review): eval() executes whatever the input contains; only
            # feed this trusted files (ast.literal_eval may be enough -- confirm
            # against the producer of these listings).
            data = eval(line[2:])
            if data[0] == 'REF':
                current = feats.setdefault(data[1], set())
        elif line.startswith('+ '):
            data = eval(line[2:])
            # A feature line must follow a REF line.
            assert current is not None
            # data[1] is deliberately left out of the feature key.
            current.add((data[0], data[2], data[3]))

    items = list(feats)
    ranked = []
    for (i, item0) in enumerate(items):
        feats0 = feats[item0]
        base0 = stripgeneric(item0)
        for item1 in items[i + 1:]:
            # Skip pairs that share the same generic-stripped name.
            if base0 == stripgeneric(item1):
                continue
            ranked.append((jaccard(feats0, feats[item1]), item0, item1))
    ranked.sort(reverse=True)
    # Only the more similar half of the pairs is reported.
    for (sim, item0, item1) in ranked[:len(ranked) // 2]:
        score = db.cmp_fields(item0, item1)
        print(sim, stripid(item0), stripid(item1), score)
    return 0
Example #2
0
 def cmp_fields(self, ref1, ref2):
     """Compare two references by name words and by recorded field type.

     Returns a tuple ``(namesim, samehead, sametype)``: the Jaccard
     similarity of the two word sets, whether both word lists start with
     the same word, and whether ``self.fields`` holds equal entries for
     the two references.
     """
     (name_a, name_b) = (stripid(ref1), stripid(ref2))
     (words_a, words_b) = (splitwords(name_a), splitwords(name_b))
     word_overlap = jaccard(set(words_a), set(words_b))
     (type_a, type_b) = (self.fields.get(ref1), self.fields.get(ref2))
     heads_match = words_a[0] == words_b[0]
     types_match = type_a == type_b
     return (word_overlap, heads_match, types_match)
Example #3
0
 def showrec_default(rid, rec):
     """Print one line: record id, raw item, its current name, proposed name."""
     item = rec['ITEM']
     candidates = rec['CANDS']
     current = stripid(item)
     print(rid, item, current, getnewname(rec['WORDS'], candidates))
Example #4
0
 def learn(tid, item, fids):
     """Register the features of one item under every word of its name.

     ``fids`` maps a feature key to a ``(count, extra)`` pair; the count
     stored under key 0 is passed to ``nb.adddict`` along with the
     per-feature counts.  Always returns True.
     """
     name_words = splitwords(stripid(item))
     (count, _ignored) = fids[0]
     feats = {}
     for (feat, (fc, _extra)) in fids.items():
         feats[feat] = fc
     for word in name_words:
         nb.adddict(word, count, feats)
     return True
Example #5
0
def dump(vtxs, method):
    """Print every visible input edge of every node in *method*.

    Edges whose label starts with an underscore are skipped.  A blank line
    is printed after the last edge.
    """
    for node in method:
        vertex = vtxs[node]
        for (label, source, _) in vertex.inputs:
            if not label.startswith('_'):
                print(f(node),
                      stripid(node.ref or '') or '-',
                      label or '<-',
                      f(source.node))
    print()
Example #6
0
 def showrec(rid, rec):
     """Write the HTML for one pair record to ``out``.

     Emits the pair heading, the choice/comment controls, and one source
     listing per item.  When ``randomized`` is set, the record id and its
     similarity are also printed to stdout.
     """
     out.write(f'<h3 class=pair>Pair {rid}</h3>\n')
     controls = (
         f'<div class=cat><span id="{rid}" class=ui>'
         f'Choice: <select>{OPTIONS}</select> &nbsp; '
         'Comment: <input size="30" /></span></div>\n')
     out.write(controls)
     pairs = zip(rec['ITEMS'], rec['SRCS'])
     for (index, (item, srcs)) in enumerate(pairs):
         showsrc(index, stripid(item), srcs)
     if randomized:
         print(rid, rec['SIM'])
Example #7
0
def main(argv):
    """List typed references found in graphs, or word statistics of their names.

    Command line: ``prog [-d] [-n limit] [-W] [graph ...]``.

    Every node of every method in the given graphs contributes its ``ref``
    (when it starts with ``$`` or ``@`` and the node has a type).  Without
    ``-W`` the sorted ``ref, type`` pairs are printed.  With ``-W`` the
    names are split into words, tagged with ``postag``, and the most
    frequent words per tag are printed (at most *limit* per tag; a
    non-positive limit prints all of them).

    Args:
        argv: full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import getopt

    def usage():
        # The flag is the upper-case -W accepted by getopt below; the old
        # text advertised a lower-case -w that was rejected as unknown.
        print(f'usage: {argv[0]} [-d] [-n limit] [-W] [graph ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dWn:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    limit = 10
    wordstat = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-W':
            wordstat = True
        elif k == '-n':
            limit = int(v)

    # ref -> node type; a later node with the same ref overwrites the entry.
    refs = {}
    for path in args:
        for method in get_graphs(path):
            for node in method:
                ref = node.ref
                if ref is None or node.ntype is None:
                    continue
                if ref[0] not in '$@':
                    continue
                refs[ref] = node.ntype

    if not wordstat:
        for (ref, ntype) in sorted(refs.items()):
            print(ref, ntype)
        return 0

    # tag -> {word: occurrences}
    words = {}
    for ref in refs:
        name = stripid(ref)
        if name is None:
            continue
        for (pos, w) in postag(reversed(splitwords(name))):
            d = words.setdefault(pos, {})
            d[w] = d.get(w, 0) + 1
    print('counts', {pos: sum(d.values()) for (pos, d) in words.items()})
    print('words', {pos: len(d) for (pos, d) in words.items()})
    # Tags with the largest vocabulary come first.
    for (pos, d) in sorted(words.items(),
                           key=lambda x: len(x[1]),
                           reverse=True):
        print(pos)
        a = sorted(d.items(), key=lambda x: x[1], reverse=True)
        if 0 < limit:
            a = a[:limit]
        for (w, n) in a:
            print(f'  {n} {w}')
    return 0
Example #8
0
def main(argv):
    """Report synonym and homonym renames among the top-scoring records.

    Command line: ``prog [-d] [-n limit] [namecon ...]``.  For each input
    file the records are sorted by descending ``SCORE`` and the first
    *limit* of them are examined.  A proposed name shared by several old
    names is printed as a synonym; an old name with several proposed names
    is printed as a homonym.

    Returns 100 when the command line is invalid, otherwise None.
    """
    global debug
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] [-n limit] [namecon ...]')
        return 100

    def link(table, key, value):
        # Collect value into the set stored under key, creating it on demand.
        table.setdefault(key, set()).add(value)

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dn:')
    except getopt.GetoptError:
        return usage()

    limit = 20
    for (flag, value) in opts:
        if flag == '-n':
            limit = int(value)
        elif flag == '-d':
            debug += 1
    if not args:
        return usage()

    for path in args:
        with open(path) as fp:
            ranked = sorted(getrecs(fp),
                            key=lambda rec: rec['SCORE'],
                            reverse=True)

        olds_by_new = {}
        news_by_old = {}
        for rec in ranked[:limit]:
            candidates = rec['CANDS']
            old = stripid(rec['ITEM'])
            new = getnewname(rec['WORDS'], candidates)
            link(olds_by_new, new, old)
            link(news_by_old, old, new)

        print(path)
        for (new, olds) in olds_by_new.items():
            if 2 <= len(olds):
                print(f' synonym: {olds} -> {new}')
        for (old, news) in news_by_old.items():
            if 2 <= len(news):
                print(f' homonym: {news} <- {old}')
        print()
    return
Example #9
0
 def showrec_html(rid, rec):
     """Write the HTML report for one rename record to ``out``.

     Emits the heading with old name, new name and score, the source
     listing stored under key 0 of ``rec['SOURCE']``, and then the
     supports of every candidate word.
     """
     score = rec['SCORE']
     old = stripid(rec['ITEM'])
     new = getnewname(rec['WORDS'], rec['CANDS'])
     assert old != new
     heading = f'<h2>Rewrite {rid}: {old} &rarr; {new} ({score:.3f})</h2>\n'
     out.write(heading)
     out.write(f'<h3><code class=old><mark>{old}</mark></code></h3>')
     sources = dict(rec['SOURCE'])
     showsrc_html(sources[0], 'old')
     for (word, _, feats) in rec['SUPPORTS']:
         showsupports_html(rid, word, feats)
Example #10
0
 def showrec_plain(rid, rec):
     """Write a plain-text report for one rename record to ``out``.

     After the item header and the score/name/candidates line, the source
     stored under key 0 is shown, followed by every supporting word and,
     per feature, the 'E' and 'S' source listings.
     """
     item = rec['ITEM']
     shortname = stripid(item)
     out.write(f'*** {item!r}\n\n')
     (score, cands) = (rec['SCORE'], rec['CANDS'])
     out.write(f'{score} {shortname} {cands}\n\n')
     sources = dict(rec['SOURCE'])
     showsrc_plain(sources[0], ' ')
     for (word, _, feats) in rec['SUPPORTS']:
         out.write(f'* {word}\n')
         for (featinfo, evidence) in feats:
             (_, _, feat) = featinfo
             (srcs_s, _, srcs_e) = evidence
             out.write(f'+ {feat}\n')
             showsrc_plain(srcs_e, 'E')
             showsrc_plain(srcs_s, 'S')
Example #11
0
def main(argv):
    """Find similar words across feature databases.

    Command line: ``prog [-t threshold] featdb1 featdb2 ...``.  For each
    database, every word of every item name is associated with the
    features of that item; words that occur together in one name are
    remembered as related.  All (database index, word) vectors are put in
    a VSM and the pairs reported by ``findall`` are printed, except
    identical words and words already known to be related.

    Returns 0 on success, 100 when the command line is invalid.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-t threshold] featdb1 featdb2 ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 't:')
    except getopt.GetoptError:
        return usage()

    threshold = 0.5
    for (flag, value) in opts:
        if flag == '-t':
            threshold = float(value)

    # word -> words seen in the same name (duplicates are kept).
    relword = {}
    # One word -> {feature: count} table per database, in argument order.
    tables = []
    for (dbindex, path) in enumerate(args):
        print(f'Loading: {dbindex}: {path!r}...', file=sys.stderr)
        db = FeatDB(path)
        wordfeats = {}
        for (tid, item) in db.get_items():
            feats = db.get_feats(tid, resolve=True)
            words = splitwords(stripid(item))
            for (pos, w0) in enumerate(words):
                for w1 in words[pos + 1:]:
                    get(relword, w0, []).append(w1)
                    get(relword, w1, []).append(w0)
            for w in words:
                counts = get(wordfeats, w, {})
                for f in feats.keys():
                    if f not in counts:
                        counts[f] = 0
                    counts[f] += 1
        tables.append(wordfeats)

    sp = VSM()
    for (dbindex, wordfeats) in enumerate(tables):
        for (w, counts) in wordfeats.items():
            sp.add((dbindex, w), counts)
    sp.commit()

    hits = sp.findall(threshold=threshold, verbose=True)
    for (sim, (i0, w0), (i1, w1)) in hits:
        if (w0 == w1
                or (w1 in relword and w0 in relword[w1])
                or (w0 in relword and w1 in relword[w0])):
            continue
        print(sim, (i0, w0), (i1, w1))

    return 0
Example #12
0
 def showsupports_html(rid, w, feats):
     """Write one HTML block per supporting feature of word *w* to ``out``.

     Each block shows the supporting item's name and feature, its source
     listing, and a hidden, toggleable section with the matching source.
     """
     for (sid, (featinfo, evidence)) in enumerate(feats):
         (_, _, feat) = featinfo
         (match_srcs, item1, new_srcs) = evidence
         name1 = stripid(item1)
         out.write('<div class=cand>\n')
         heading = (
             f'<h3 class=support>Support ({sid}) for "<code>{w}</code>": '
             f'<code class=new><mark>{name1}</mark></code> &nbsp; '
             f'(<code>{feat}</code>)</h3>\n')
         out.write(heading)
         showsrc_html(new_srcs, 'new')
         # Element id that the [+] link toggles.
         div_id = f'{rid}_{w}_{sid}'
         toggle = (
             f'<a href="javascript:void(0)" onclick="toggle(\'{div_id}\')">'
             f'[+]</a> Show Proof<br><div id={div_id} hidden>\n')
         out.write(toggle)
         showsrc_html(match_srcs, 'match')
         out.write('</div></div>\n')
Example #13
0
 def getnamefeats(self, n):
     """Return the name-derived feature strings for node *n*.

     Call nodes yield one ``kind:word`` string per word of the method name
     parsed from the text before the first space of ``n.data``.  Reference
     and assignment nodes yield nothing for ``self.ref0`` itself or for
     refs starting with ``%``; otherwise they yield ``kind:word`` strings
     when ``self.namefeat`` is set and just the kind when it is not.  Any
     other node kind yields an empty list.
     """
     kind = n.kind
     if kind in CALLS:
         (data, _, _) = n.data.partition(' ')
         (_, name, _) = parsemethodname(data)
         return [f'{kind}:{w}' for w in splitwords(name)]
     if not (kind in REFS or kind in ASSIGNS):
         return []
     if n.ref is self.ref0 or n.ref.startswith('%'):
         return []
     if not self.namefeat:
         return [kind]
     return [f'{kind}:{w}' for w in splitwords(stripid(n.ref))]
Example #14
0
def doit(refs):
    """Group multi-word references by (word, type) and print the top groups.

    The last split word feeds the "prefix" table, the first one the
    "suffix" table and the ones in between the "midfix" table.
    NOTE(review): this presumably relies on splitwords returning the words
    in reverse order -- confirm against splitwords.
    """
    (prefix, midfix, suffix) = ({}, {}, {})
    for (ref, ntype) in refs.items():
        words = splitwords(stripid(ref))
        if 2 <= len(words):
            (head, inner, tail) = (words[0], words[1:-1], words[-1])
            get(prefix, (tail, ntype)).append(ref)
            for word in inner:
                get(midfix, (word, ntype)).append(ref)
            get(suffix, (head, ntype)).append(ref)
    print('total:', len(refs))
    for (label, table) in (('prefix:', prefix),
                           ('midfix:', midfix),
                           ('suffix:', suffix)):
        print(label, showtop(table))
    print()
Example #15
0
 def showrec_context(rid, rec):
     """Write the HTML review form for one rename record to ``out``.

     Emits the old -> new heading, the choice/comment controls, the source
     stored under key 0 and the supports of each candidate word, then
     prints the record id, score and rank to stdout.
     """
     old = stripid(rec['ITEM'])
     new = getnewname(rec['WORDS'], rec['CANDS'])
     assert old != new
     old_html = f'<code class=old><mark>{old}</mark></code>'
     new_html = f'<code class=new><mark>{new}</mark></code>'
     out.write(f'<h2>Rewrite {rid}: {old_html} &rarr; {new_html}</h2>\n')
     # NOTE(review): the literal </select> after showchoices(...) may be
     # redundant if showchoices already closes its own element -- confirm.
     controls = (
         f'<div class=cat><span id="{rid}" class=ui>'
         f'Choice: {showchoices(CONTEXT_CHOICES)}</select> &nbsp; '
         'Comment: <input size="30" /></span></div>\n')
     out.write(controls)
     out.write(f'<h3>{old_html}</h3>')
     sources = dict(rec['SOURCE'])
     showsrc_html(sources[0], 'old')
     for (word, _, feats) in rec['SUPPORTS']:
         showsupports_html(rid, word, feats)
     print(rid, rec['SCORE'], rec['RANK'])
Example #16
0
 def showrec_eval(rid, rec):
     """Write the blinded evaluation form for one rename record to ``out``.

     The proposed name, the old name and (when it differs from both) the
     default name are offered under shuffled keys.  The source stored
     under key 0 is shown first; other sources are shown only for the
     spans that do not overlap it.  The key assignment is printed to
     stdout so the answers can be decoded later.
     """
     item = rec['ITEM']
     score = rec['SCORE']
     base = rec.get('DEFAULT')
     old = stripid(item)
     new = getnewname(rec['WORDS'], rec['CANDS'])
     assert old != new

     # The default name is a third option only if it is a distinct name.
     has_base = base is not None and new != base and old != base
     names = [new, old, base] if has_base else [new, old]
     keys = ['a', 'b', 'c'] if has_base else ['a', 'b']
     random.shuffle(keys)

     out.write(f'<h2>Rewrite {rid} ({score:.3f})</h2>\n')
     choices = [('x', '???')]
     choices.extend(sorted(zip(keys, names)))
     controls = (
         f'<div class=cat><span id="{rid}" class=ui>'
         'Choice: <code class=old><mark>xxx</mark></code> &rarr; '
         f'{showchoices(choices)} &nbsp; '
         'Comment: <input size="30" /></span></div>\n')
     out.write(controls)

     sources = dict(rec['SOURCE'])
     showsrc_html(sources[0], 'old', old)
     primary = sources[0]
     for (d, others) in rec['SOURCE']:
         if d == 0:
             continue
         fresh = []
         for src1 in others:
             if any(overlap(src1, src0) for src0 in primary):
                 continue
             fresh.append(src1)
         if not fresh:
             continue
         title = 'Source' if d < 0 else 'Destination'
         out.write(f'<h3>{title}</h3>\n')
         showsrc_html(fresh, 'new')

     third = keys[2] if len(keys) == 3 else 'null'
     print(rid, item, old, new, base, keys[0], third)
Example #17
0
def getdefaultnames(types):
    """Map every variable to the most common stripped name used for its type.

    *types* maps a variable to its type.  For each type, the stripped
    variable names are counted and the most frequent one wins (the first
    one encountered on a tie).  The result maps each original variable to
    the winning name of its type.
    """
    # type -> {stripped name: occurrences}
    tally = {}
    for (var, typ) in types.items():
        bucket = tally.setdefault(typ, {})
        shortname = stripid(var)
        bucket[shortname] = bucket.get(shortname, 0) + 1
    # max() returns the first maximal key, i.e. the earliest-seen name.
    winner = {typ: max(bucket, key=bucket.get)
              for (typ, bucket) in tally.items()}
    return {var: winner[typ] for (var, typ) in types.items()}
Example #18
0
def main(argv):
    """Show the strongest features (with source excerpts) for given words.

    Command line:
    ``prog [-d] [-c encoding] [-n feats] srcdb featdb [word ...]``.

    For every requested word, the items whose name contains the word are
    collected per feature id.  Each feature is scored as
    ``exp(-|feat[0]|) * log(nallitems / numfeatitems) * len(items)`` and
    the *feats* best ones are printed together with the best-scoring item
    and its annotated source.

    Args:
        argv: full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import getopt

    def usage():
        # -c is accepted by getopt below, so it is listed here as well.
        print(f'usage: {argv[0]} [-d] [-c encoding] [-n feats] '
              'srcdb featdb [word ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:n:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    encoding = None
    ntop = 5
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-c':
            encoding = v
        elif k == '-n':
            ntop = int(v)
    # Both database paths are mandatory.
    if len(args) < 2:
        return usage()
    basepath = args.pop(0)
    dbpath = args.pop(0)

    srcdb = SourceDB(basepath, encoding)
    db = FeatDB(dbpath)

    # word -> {feature id: [item ids whose name contains the word]}
    word2fids = {w: {} for w in args}

    for (tid, item) in db:
        words = splitwords(stripid(item))
        fids = db.get_feats(tid)
        for w in words:
            if w not in word2fids:
                continue
            fid2items = word2fids[w]
            for fid in fids:
                items = fid2items.setdefault(fid, [])
                assert tid not in items
                items.append(tid)
            # The former "assert fid in fid2items" that followed this loop
            # read the loop variable after the loop: it raised
            # UnboundLocalError when the first matching item had no features
            # and was trivially true otherwise, so it is gone.

    nallitems = len(db.get_items())
    for (word, fid2items) in word2fids.items():
        if not fid2items:
            continue
        fscore = []
        # item id -> sum of the scores of its features
        iscore = {}
        # Feature id 0 is expected for every item that has features; its
        # item list gives the number of items containing the word.
        nitems = len(fid2items[0])
        for (fid, items) in fid2items.items():
            if fid == 0:
                continue
            df = math.log(nallitems / db.get_numfeatitems(fid))
            feat = db.get_feat(fid)
            score = math.exp(-abs(feat[0])) * df * len(items)
            fscore.append((score, fid, items))
            for item in items:
                iscore[item] = iscore.get(item, 0) + score
        print(f'*** word: {word!r}, items: {nitems}, feats: {len(fscore)}\n')
        fscore.sort(reverse=True)
        for (score, fid, items) in fscore[:ntop]:
            feat = db.get_feat(fid)
            print('+FEAT', feat, len(items), score)
            items.sort(key=lambda item: iscore[item], reverse=True)
            # Only the single best item is shown per feature.
            for item in items[:1]:
                print('+ITEM', db.get_item(item))
                feats = db.get_feats(item, resolve=True, source=True)
                (fc, srcs) = feats[feat]
                if not srcs:
                    continue
                annot = SourceAnnot(srcdb)
                for (path, start, end) in srcs:
                    annot.add(path, start, end)
                annot.show_text()

    return 0
Example #19
0
 def __str__(self):
     """Return the stripped form of ``self.ref``, or the raw ref if that is empty."""
     shortname = stripid(self.ref)
     if shortname:
         return shortname
     return self.ref
Example #20
0
 def __init__(self, name):
     """Store the stripped form of *name* and start with no links."""
     (self.name, self.linkto) = (stripid(name), [])
Example #21
0
 def predict(tid, item, fids):
     """Predict name words for one item and print a report when they disagree.

     The item's own contribution is taken out of ``nb`` before asking it
     for candidates and put back right afterwards.  Relies on names
     defined outside this function: ``nb``, ``db``, ``ratio``,
     ``defaultnames``, ``nallitems``, ``C`` and ``maxsupports``.

     Args:
         tid: id of the item in ``db``.
         item: the item reference; its stripped name is split into words.
         fids: mapping of feature id to a ``(count, extra)`` pair; the
             entry under key 0 supplies the item's overall count.

     Returns:
         True when the top candidate word already occurs in the item's
         name.  False when there are no candidates, or after printing the
         ``+ITEM`` / ``+WORDS`` / ``+CANDS`` / ``+DEFAULT`` / ``+SOURCE`` /
         ``+SCORE`` / ``+SUPPORTS`` report for a disagreeing prediction.
     """
     name = stripid(item)
     words = splitwords(name)
     (count, _) = fids[0]
     # Per-feature counts, leaving out the entry under key 0.
     feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
     # Use only prominent features that appear more than a certain threshold.
     # NOTE(review): max() raises ValueError when feats is empty, i.e. when
     # fids holds nothing but key 0 -- confirm that cannot happen.
     threshold = int(max(feats.values()) * ratio)
     f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
     # Take this item out of the model so it cannot vote for its own words...
     for w in words:
         nb.removedict(w, count, feats)
     cands = nb.getkeyfeats(f2)[:len(words)]
     # ...and put it back once the candidates have been obtained.
     for w in words:
         nb.adddict(w, count, feats)
     if not cands: return False
     cwords = [w for (_, w, _) in cands]
     topword = cwords[0]
     # The best candidate is already part of the name: nothing to report.
     if topword in words: return True
     print('+ITEM', json.dumps(item))
     print('+WORDS', json.dumps(words))
     print('+CANDS', json.dumps(cwords))
     if item in defaultnames:
         print('+DEFAULT', json.dumps(defaultnames[item]))
     fids0 = db.get_feats(tid, source=True)
     # Sources grouped by feat[0] of their feature (0 for feature id 0);
     # sources whose value is not 0, 1 or -1 are dropped.
     srcs0 = {0: [], 1: [], -1: []}
     for (fid, (_, srcs)) in fids0.items():
         if fid == 0:
             d = 0
         else:
             d = db.get_feat(fid)[0]
         if d in srcs0:
             srcs0[d].extend(srcs)
     # set() removes duplicate sources; their order in the output is not
     # preserved.
     print(
         '+SOURCE',
         json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
     supports = []
     for (_, w, a) in cands:
         # Find top N features for each word.
         fs = []
         # NOTE(review): a[0] is skipped -- presumably an aggregate entry
         # rather than a real feature; confirm against nb.getkeyfeats.
         for (fid, c) in a[1:]:
             feat = db.get_feat(fid)
             assert feat is not None
             # A rarer feature overall means stronger indication.
             df = math.log(nallitems / db.get_numfeatitems(fid))
             # A more prominent feature for this category means stronger indication.
             ff = c / nb.fcount[fid][None]
             # Discount a "distant" feature from the subject.
             ds = math.exp(-C * abs(feat[0]))
             fs.append((ds * df * ff, fid, feat))
         fs = sorted(fs, reverse=True)[:maxsupports]
         score = sum(s for (s, _, _) in fs)
         # Find the variables that contain the same feature.
         ss = []
         for (_, fid, _) in fs:
             # Stays None when no other item with word w has this feature.
             found = None
             (_, srcs0a) = fids0[0]
             (_, srcs0b) = fids0[fid]
             tids = db.get_featitems(fid)
             for tid1 in tids.keys():
                 if tid1 == tid: continue
                 item1 = db.get_item(tid1)
                 name1 = stripid(item1)
                 if w not in splitwords(name1): continue
                 fids1 = db.get_feats(tid1, source=True)
                 (_, srcs1a) = fids1[0]
                 (_, srcs1b) = fids1[fid]
                 # (this item's sources, supporting item, its sources)
                 found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                 # The first matching item is enough.
                 break
             ss.append(found)
         supports.append((w, score, list(zip(fs, ss))))
     print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
     print('+SUPPORTS', json.dumps(supports))
     print()
     return False