def main(argv):
    """Rank pairs of references by feature similarity and print the top half.

    Usage: ``prog [-d] graph [feats ...]``

    The *graph* file is loaded through ``load_klasses`` into a ``ClassDB``.
    Feature records are then read from the remaining files (standard input
    when none are given): a line starting with ``'! '`` whose evaluated
    payload begins with ``'REF'`` selects the current item, and each line
    starting with ``'+ '`` adds ``(data[0], data[2], data[3])`` to the
    current item's feature set.  Every pair of items whose ``stripgeneric``
    forms differ is scored with ``jaccard``; the higher-scoring half of the
    pairs is printed together with ``db.cmp_fields`` for the pair.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] graph [feats ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'd')
    except getopt.GetoptError:
        return usage()
    # -d is accepted for command-line compatibility; the counter is not
    # consulted anywhere in this function.
    debug = 0
    for (k, v) in opts:
        if k == '-d':
            debug += 1
    if not args:
        return usage()

    path = args.pop(0)
    # The f-string already interpolates the path.  The original additionally
    # applied "% path" to the result, which raises TypeError because the
    # formatted string contains no conversion specifier.
    print(f'Loading: {path!r}...')
    db = ClassDB()
    with open(path) as fp:
        for klass in load_klasses(fp):
            db.add_klass(klass)

    feats = {}
    a = None
    for line in fileinput.input(args):
        if line.startswith('! '):
            # SECURITY: eval() executes whatever expression the input line
            # contains.  Only feed this tool files you produced yourself.
            data = eval(line[2:])
            if data[0] == 'REF':
                a = feats.setdefault(data[1], set())
        elif line.startswith('+ '):
            data = eval(line[2:])  # SECURITY: see the note above.
            assert a is not None
            # data[1] is deliberately left out of the feature tuple.
            a.add((data[0], data[2], data[3]))

    items = list(feats)
    r = []
    for (i, item0) in enumerate(items):
        feats0 = feats[item0]
        base0 = stripgeneric(item0)
        for item1 in items[i + 1:]:
            # Skip pairs that are the same reference once stripgeneric() is
            # applied.
            if base0 == stripgeneric(item1):
                continue
            r.append((jaccard(feats0, feats[item1]), item0, item1))
    r.sort(reverse=True)
    # Only the higher-scoring half of all pairs is reported.
    for (sim, item0, item1) in r[:len(r) // 2]:
        score = db.cmp_fields(item0, item1)
        print(sim, stripid(item0), stripid(item1), score)
    return 0
def cmp_fields(self, ref1, ref2):
    """Compare two references by their names and by their ``self.fields`` entries.

    Returns a 3-tuple ``(namesim, samehead, sametype)``:
      * ``namesim``  - ``jaccard`` of the two word sets obtained from
        ``splitwords(stripid(ref))``;
      * ``samehead`` - whether the first word of both word lists is equal;
      * ``sametype`` - whether ``self.fields.get`` yields equal values for
        both references (two missing entries compare equal as ``None``).
    """
    (left, right) = (stripid(ref1), stripid(ref2))
    (lwords, rwords) = (splitwords(left), splitwords(right))
    name_overlap = jaccard(set(lwords), set(rwords))
    lookup = self.fields.get
    types_match = lookup(ref1) == lookup(ref2)
    heads_match = lwords[0] == rwords[0]
    return (name_overlap, heads_match, types_match)
def showrec_default(rid, rec):
    """Print one record on a single line: id, raw item, old name, new name.

    The old name is ``stripid(rec['ITEM'])``; the new name is whatever
    ``getnewname(rec['WORDS'], rec['CANDS'])`` returns.
    """
    raw = rec['ITEM']
    candidates = rec['CANDS']
    before = stripid(raw)
    after = getnewname(rec['WORDS'], candidates)
    print(rid, raw, before, after)
    return
def learn(tid, item, fids):
    """Add one item's features to the shared ``nb`` model under each of its words.

    ``fids`` maps a feature key to a pair whose first element is a count;
    the entry under key ``0`` supplies the count passed to ``nb.adddict``.
    ``tid`` is not used.  Always returns True.
    """
    parts = splitwords(stripid(item))
    (total, _) = fids[0]
    weights = {}
    for (key, entry) in fids.items():
        (fc, _) = entry
        weights[key] = fc
    for word in parts:
        nb.adddict(word, total, weights)
    return True
def dump(vtxs, method):
    """Print the input edges of every node in *method*, then a blank line.

    Each printed line holds: ``f(node)``, the stripped reference of the node
    (``'-'`` when empty), the link label (``'<-'`` when empty) and
    ``f(v1.node)`` for the source vertex.  Edges whose label starts with
    ``'_'`` are skipped.
    """
    for node in method:
        vertex = vtxs[node]
        for (label, source, _) in vertex.inputs:
            if not label.startswith('_'):
                # Evaluate the columns left to right, exactly as they are
                # printed.
                target = f(node)
                refname = stripid(node.ref or '') or '-'
                edge = label or '<-'
                origin = f(source.node)
                print(target, refname, edge, origin)
    print()
    return
def showrec(rid, rec):
    """Write the HTML for one pair record to ``out``.

    Emits a heading and a choice/comment form, then calls ``showsrc`` once
    per ``(item, srcs)`` pair taken from ``rec['ITEMS']`` and
    ``rec['SRCS']``.  When the module-level ``randomized`` flag is truthy,
    the record id and ``rec['SIM']`` are also printed to standard output.
    """
    heading = f'<h3 class=pair>Pair {rid}</h3>\n'
    out.write(heading)
    form = f'<div class=cat><span id="{rid}" class=ui>Choice: <select>{OPTIONS}</select> Comment: <input size="30" /></span></div>\n'
    out.write(form)
    pairs = zip(rec['ITEMS'], rec['SRCS'])
    for (index, pair) in enumerate(pairs):
        (item, srcs) = pair
        showsrc(index, stripid(item), srcs)
    if not randomized:
        return
    print(rid, rec['SIM'])
    return
def main(argv):
    """List typed ``$``/``@`` references found in graphs, or word statistics.

    Usage: ``prog [-d] [-n limit] [-W] [graph ...]``

    Every node of every method yielded by ``get_graphs(path)`` is inspected;
    nodes with a non-None ``ref`` starting with ``'$'`` or ``'@'`` and a
    non-None ``ntype`` are recorded as ``ref -> ntype``.

    Without ``-W`` the sorted ``(ref, ntype)`` pairs are printed.  With
    ``-W`` (``-w`` is accepted as an alias) the words of each stripped
    reference name are tagged with ``postag`` and per-tag word frequencies
    are printed, at most ``limit`` words per tag (no cap when ``limit`` is
    not positive).

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} '
              '[-d] [-n limit] [-W] '
              '[graph ...]')
        return 100

    try:
        # The usage text used to advertise "-w" while only "-W" was parsed,
        # so following the help produced a usage error.  Both are accepted.
        (opts, args) = getopt.getopt(argv[1:], 'dwWn:')
    except getopt.GetoptError:
        return usage()
    # -d is accepted for command-line compatibility; the counter is not
    # consulted anywhere in this function.
    debug = 0
    limit = 10
    wordstat = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k in ('-w', '-W'):
            wordstat = True
        elif k == '-n':
            limit = int(v)

    refs = {}
    for path in args:
        for method in get_graphs(path):
            for node in method:
                ref = node.ref
                if ref is None:
                    continue
                if node.ntype is None:
                    continue
                # startswith() also copes with an empty ref, where ref[0]
                # would raise IndexError.
                if not ref.startswith(('$', '@')):
                    continue
                refs[ref] = node.ntype

    if wordstat:
        words = {}
        for ref in refs:
            name = stripid(ref)
            if name is None:
                continue
            for (pos, w) in postag(reversed(splitwords(name))):
                d = words.setdefault(pos, {})
                d[w] = d.get(w, 0) + 1
        print('counts', {pos: sum(d.values()) for (pos, d) in words.items()})
        print('words', {pos: len(d) for (pos, d) in words.items()})
        # Tags with the largest vocabulary come first.
        for (pos, d) in sorted(words.items(), key=lambda x: len(x[1]), reverse=True):
            print(pos)
            a = sorted(d.items(), key=lambda x: x[1], reverse=True)
            if 0 < limit:
                a = a[:limit]
            for (w, n) in a:
                print(f' {n} {w}')
    else:
        for (ref, ntype) in sorted(refs.items()):
            print(ref, ntype)
    return 0
def main(argv):
    """Report synonym and homonym renames among the top-scored records per file.

    Usage: ``prog [-d] [-n limit] [namecon ...]``

    For each input file the records from ``getrecs`` are sorted by
    ``'SCORE'`` (highest first) and the first ``limit`` are examined.  A new
    name proposed for two or more old names is reported as a synonym; an old
    name mapped to two or more new names is reported as a homonym.

    Returns 100 on a usage error, otherwise None.
    """
    global debug
    import fileinput
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] [-n limit] [namecon ...]')
        return 100

    def link(table, key, value):
        # Record value in the set stored under key, creating it on demand.
        table.setdefault(key, set()).add(value)
        return

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dn:')
    except getopt.GetoptError:
        return usage()
    limit = 20
    for (flag, value) in opts:
        if flag == '-d':
            debug += 1
        elif flag == '-n':
            limit = int(value)
    if not args:
        return usage()

    for path in args:
        with open(path) as fp:
            ranked = sorted(getrecs(fp), key=lambda rec: rec['SCORE'], reverse=True)
        olds_by_new = {}
        news_by_old = {}
        for rec in ranked[:limit]:
            item = rec['ITEM']
            cands = rec['CANDS']
            old = stripid(item)
            new = getnewname(rec['WORDS'], cands)
            link(olds_by_new, new, old)
            link(news_by_old, old, new)
        print(path)
        for (new, olds) in olds_by_new.items():
            if 2 <= len(olds):
                print(f' synonym: {olds} -> {new}')
        for (old, news) in news_by_old.items():
            if 2 <= len(news):
                print(f' homonym: {news} <- {old}')
        print()
    return
def showrec_html(rid, rec):
    """Write one rewrite record to ``out`` as HTML.

    Emits a heading with the old name, the proposed new name and the score,
    shows the sources stored under key ``0`` of ``rec['SOURCE']``, then
    renders every entry of ``rec['SUPPORTS']`` via ``showsupports_html``.
    Asserts that the proposed name differs from the old one.
    """
    raw = rec['ITEM']
    score = rec['SCORE']
    old = stripid(raw)
    new = getnewname(rec['WORDS'], rec['CANDS'])
    assert old != new
    title = f'<h2>Rewrite {rid}: {old} → {new} ({score:.3f})</h2>\n'
    out.write(title)
    subject = f'<h3><code class=old><mark>{old}</mark></code></h3>'
    out.write(subject)
    by_distance = dict(rec['SOURCE'])
    showsrc_html(by_distance[0], 'old')
    for support in rec['SUPPORTS']:
        (word, _wscore, feats) = support
        showsupports_html(rid, word, feats)
    return
def showrec_plain(rid, rec):
    """Write one rewrite record to ``out`` as plain text.

    Prints the raw item, then score / stripped name / candidates, then the
    sources stored under key ``0`` of ``rec['SOURCE']``.  For each supporting
    word, every feature is listed followed by its two source groups
    (``srcs1`` tagged ``'E'``, then ``srcs0`` tagged ``'S'``).  ``rid`` is
    not used.
    """
    raw = rec['ITEM']
    short = stripid(raw)
    out.write(f'*** {raw!r}\n\n')
    score = rec['SCORE']
    cands = rec['CANDS']
    out.write(f'{score} {short} {cands}\n\n')
    by_distance = dict(rec['SOURCE'])
    showsrc_plain(by_distance[0], ' ')
    for (word, _wscore, feats) in rec['SUPPORTS']:
        out.write(f'* {word}\n')
        for (scored, evidence) in feats:
            (_fscore, _fid, feat) = scored
            (srcs0, _item1, srcs1) = evidence
            out.write(f'+ {feat}\n')
            showsrc_plain(srcs1, 'E')
            showsrc_plain(srcs0, 'S')
    return
def main(argv):
    """Find words from different feature databases that have similar features.

    Usage: ``prog [-t threshold] featdb1 featdb2 ...``

    For every database, each item's stripped name is split into words and
    the item's feature keys are counted per word.  Words that occur together
    in one name are remembered as related.  All ``(database index, word)``
    vectors are added to a ``VSM``; pairs reported by ``findall`` at or
    above *threshold* are printed unless the two words are identical or
    related.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid or no database
        was given.
    """
    import getopt

    def usage():
        print('usage: %s [-t threshold] featdb1 featdb2 ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 't:')
    except getopt.GetoptError:
        return usage()
    threshold = 0.5
    for (k, v) in opts:
        if k == '-t':
            threshold = float(v)
    # The usage text lists the databases as mandatory; without any there is
    # nothing to compare, so report the usage instead of running on nothing.
    if not args:
        return usage()

    relword = {}
    ws = []
    for (dbindex, path) in enumerate(args):
        print('Loading: %d: %r...' % (dbindex, path), file=sys.stderr)
        db = FeatDB(path)
        wordfeats = {}
        for (tid, item) in db.get_items():
            feats = db.get_feats(tid, resolve=True)
            name = stripid(item)
            words = splitwords(name)
            # Distinct loop variable: the original reused "i" here and
            # thereby overwrote the database index of the outer loop.
            for (j, w0) in enumerate(words):
                for w1 in words[j + 1:]:
                    get(relword, w0, []).append(w1)
                    get(relword, w1, []).append(w0)
            for w in words:
                fs = get(wordfeats, w, {})
                for f in feats:
                    fs[f] = fs.get(f, 0) + 1
        ws.append(wordfeats)

    sp = VSM()
    for (dbindex, wordfeats) in enumerate(ws):
        for (w, fs) in wordfeats.items():
            sp.add((dbindex, w), fs)
    sp.commit()
    for (sim, (i0, w0), (i1, w1)) in sp.findall(threshold=threshold, verbose=True):
        if w0 == w1:
            continue
        # Words seen together in one identifier are not interesting matches.
        if w1 in relword and w0 in relword[w1]:
            continue
        if w0 in relword and w1 in relword[w0]:
            continue
        print(sim, (i0, w0), (i1, w1))
    return 0
def showsupports_html(rid, w, feats):
    """Write the HTML for every supporting feature of word *w* to ``out``.

    Each entry of *feats* is ``((fscore, fid, feat), (srcs0, item1, srcs1))``.
    For each entry a ``<div class=cand>`` is written containing a heading,
    the sources ``srcs1`` and a hidden "proof" section holding ``srcs0``
    that is toggled through an element id built from *rid*, *w* and the
    entry index.
    """
    for (sid, entry) in enumerate(feats):
        ((_fscore, _fid, feat), (srcs0, item1, srcs1)) = entry
        name1 = stripid(item1)
        out.write('<div class=cand>\n')
        heading = f'<h3 class=support>Support ({sid}) for "<code>{w}</code>": <code class=new><mark>{name1}</mark></code> (<code>{feat}</code>)</h3>\n'
        out.write(heading)
        showsrc_html(srcs1, 'new')
        proof_id = f'{rid}_{w}_{sid}'
        toggle = f'<a href="javascript:void(0)" onclick="toggle(\'{proof_id}\')">[+]</a> Show Proof<br><div id={proof_id} hidden>\n'
        out.write(toggle)
        showsrc_html(srcs0, 'match')
        out.write('</div></div>\n')
    return
def getnamefeats(self, n):
    """Return the name-derived feature strings for node *n*.

    * For a node whose kind is in ``CALLS``: one ``'<kind>:<word>'`` per
      word of the method name parsed from the first space-separated token
      of ``n.data``.
    * For a node whose kind is in ``REFS`` or ``ASSIGNS``: an empty list
      when its ref is ``self.ref0`` or starts with ``'%'``; otherwise one
      ``'<kind>:<word>'`` per word of the stripped ref when
      ``self.namefeat`` is truthy, or just ``[kind]`` when it is not.
    * For any other node: an empty list.
    """
    kind = n.kind
    if kind in CALLS:
        head = n.data.partition(' ')[0]
        (_klass, name, _func) = parsemethodname(head)
        return [f'{kind}:{word}' for word in splitwords(name)]
    if not (kind in REFS or kind in ASSIGNS):
        return []
    ref = n.ref
    # NOTE(review): this is an identity test, not an equality test; confirm
    # that refs are shared objects, otherwise equal strings may slip through.
    if ref is self.ref0 or ref.startswith('%'):
        return []
    if not self.namefeat:
        return [kind]
    return [f'{kind}:{word}' for word in splitwords(stripid(ref))]
def doit(refs):
    """Group references by (word, type) per word position and print a summary.

    References whose stripped name splits into fewer than two words are
    ignored.  The last element of the word list is filed under "prefix",
    the first under "suffix", and everything in between under "midfix"
    (presumably ``splitwords`` yields the words in reverse order - TODO
    confirm).  The tables are rendered with ``showtop``.
    """
    leading = {}
    inner = {}
    trailing = {}
    for (ref, ntype) in refs.items():
        parts = splitwords(stripid(ref))
        if len(parts) < 2:
            continue
        get(leading, (parts[-1], ntype)).append(ref)
        for word in parts[1:-1]:
            get(inner, (word, ntype)).append(ref)
        get(trailing, (parts[0], ntype)).append(ref)
    print('total:', len(refs))
    for (label, table) in (('prefix:', leading), ('midfix:', inner), ('suffix:', trailing)):
        print(label, showtop(table))
    print()
    return
def showrec_context(rid, rec):
    """Write one rewrite record, with a choice form, to ``out`` as HTML.

    Emits the old -> new heading, a choice/comment form built from
    ``showchoices(CONTEXT_CHOICES)``, the sources stored under key ``0`` of
    ``rec['SOURCE']`` and all supports; finally prints the record id with
    ``rec['SCORE']`` and ``rec['RANK']`` to standard output.  Asserts that
    the proposed name differs from the old one.
    """
    raw = rec['ITEM']
    old = stripid(raw)
    new = getnewname(rec['WORDS'], rec['CANDS'])
    assert old != new
    title = f'<h2>Rewrite {rid}: <code class=old><mark>{old}</mark></code> → <code class=new><mark>{new}</mark></code></h2>\n'
    out.write(title)
    picker = showchoices(CONTEXT_CHOICES)
    # NOTE(review): the literal </select> below may duplicate a closing tag
    # that showchoices() already emits - confirm against showchoices().
    form = f'<div class=cat><span id="{rid}" class=ui>Choice: {picker}</select> Comment: <input size="30" /></span></div>\n'
    out.write(form)
    out.write(f'<h3><code class=old><mark>{old}</mark></code></h3>')
    by_distance = dict(rec['SOURCE'])
    showsrc_html(by_distance[0], 'old')
    for support in rec['SUPPORTS']:
        (word, _wscore, feats) = support
        showsupports_html(rid, word, feats)
    print(rid, rec['SCORE'], rec['RANK'])
    return
def showrec_eval(rid, rec):
    """Write one blind-evaluation record to ``out`` and print its answer key.

    The candidate names are the proposed name, the old name and, when
    present and different from both, ``rec['DEFAULT']``.  The keys
    ``'a'``, ``'b'`` (and ``'c'``) are shuffled before being paired with the
    names, so the choices appear under randomly assigned letters.  The
    sources under key ``0`` are shown first; the remaining source groups are
    shown with every entry that overlaps a key-``0`` source removed, under a
    "Source" heading for negative keys and "Destination" otherwise.

    The line printed to standard output ends with the key assigned to the
    proposed name and the key assigned to the default name (``'null'`` when
    no default was offered).
    """
    raw = rec['ITEM']
    score = rec['SCORE']
    base = rec.get('DEFAULT')
    cands = rec['CANDS']
    old = stripid(raw)
    new = getnewname(rec['WORDS'], cands)
    assert old != new

    names = [new, old]
    keys = ['a', 'b']
    offer_default = base is not None and new != base and old != base
    if offer_default:
        names.append(base)
        keys.append('c')
    random.shuffle(keys)

    out.write(f'<h2>Rewrite {rid} ({score:.3f})</h2>\n')
    choices = [('x', '???')]
    choices.extend(sorted(zip(keys, names)))
    picker = showchoices(choices)
    out.write(
        f'<div class=cat><span id="{rid}" class=ui>Choice: <code class=old><mark>xxx</mark></code> → {picker} Comment: <input size="30" /></span></div>\n'
    )

    by_distance = dict(rec['SOURCE'])
    showsrc_html(by_distance[0], 'old', old)
    own = by_distance[0]
    for (d, others) in rec['SOURCE']:
        if d == 0:
            continue
        # Drop everything that is already visible in the item's own sources.
        fresh = []
        for candidate in others:
            if any(overlap(candidate, src0) for src0 in own):
                continue
            fresh.append(candidate)
        if not fresh:
            continue
        heading = 'Source' if d < 0 else 'Destination'
        out.write(f'<h3>{heading}</h3>\n')
        showsrc_html(fresh, 'new')

    default_key = keys[2] if len(keys) == 3 else 'null'
    print(rid, raw, old, new, base, keys[0], default_key)
    return
def getdefaultnames(types):
    """Map every key of *types* to the most common stripped name of its type.

    *types* maps a reference to a type.  For each type, the stripped names
    (``stripid``) of all references carrying it are counted and the most
    frequent one wins; on a tie the name counted first is kept.  The result
    maps each original (unstripped) reference to the winning name of its
    type.
    """
    tally = {}
    for (ref, typ) in types.items():
        short = stripid(ref)
        counts = tally.setdefault(typ, {})
        counts[short] = counts.get(short, 0) + 1
    # max() returns the first maximal element, which matches a scan that
    # only replaces the current best on a strictly greater count.
    winners = {
        typ: max(counts.items(), key=lambda pair: pair[1])[0]
        for (typ, counts) in tally.items()
    }
    return {ref: winners[typ] for (ref, typ) in types.items()}
def main(argv):
    """Show the top-scoring features, with source excerpts, for given words.

    Usage: ``prog [-d] [-c encoding] [-n feats] srcdb featdb [word ...]``

    Every item of the feature database whose stripped name contains one of
    the requested words contributes its feature ids to that word.  Each
    feature except id 0 is scored as
    ``exp(-abs(feat[0])) * log(nallitems / numfeatitems) * len(items)``.
    For the ``ntop`` best features of each word, the feature is printed
    together with its highest-scoring item and that item's annotated source
    text.

    Args:
        argv: Full argument vector, program name first.

    Returns:
        0 on success, 100 when the command line is invalid.
    """
    import getopt

    def usage():
        # "-c encoding" is accepted by the parser below but was missing
        # from the usage text.
        print(f'usage: {argv[0]} [-d] [-c encoding] [-n feats] srcdb featdb [word ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:n:')
    except getopt.GetoptError:
        return usage()
    # -d is accepted for command-line compatibility; the counter is not
    # consulted anywhere in this function.
    debug = 0
    encoding = None
    ntop = 5
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-c':
            encoding = v
        elif k == '-n':
            ntop = int(v)
    if not args:
        return usage()
    basepath = args.pop(0)
    if not args:
        return usage()
    dbpath = args.pop(0)

    srcdb = SourceDB(basepath, encoding)
    db = FeatDB(dbpath)

    # word -> feature id -> list of item ids whose name contains the word.
    word2fids = {w: {} for w in args}
    for (tid, item) in db:
        name = stripid(item)
        words = splitwords(name)
        fids = db.get_feats(tid)
        for w in words:
            if w not in word2fids:
                continue
            fid2items = word2fids[w]
            for fid in fids:
                items = fid2items.setdefault(fid, [])
                assert tid not in items
                items.append(tid)

    nallitems = len(db.get_items())
    for (word, fid2items) in word2fids.items():
        if not fid2items:
            continue
        fscore = []
        iscore = {}
        # The list under feature id 0 is used as the item count of the word.
        nitems = len(fid2items[0])
        for (fid, items) in fid2items.items():
            if fid == 0:
                continue
            df = math.log(nallitems / db.get_numfeatitems(fid))
            feat = db.get_feat(fid)
            score = math.exp(-abs(feat[0])) * df * len(items)
            fscore.append((score, fid, items))
            for item in items:
                iscore[item] = iscore.get(item, 0) + score
        print(f'*** word: {word!r}, items: {nitems}, feats: {len(fscore)}\n')
        fscore.sort(reverse=True)
        for (score, fid, items) in fscore[:ntop]:
            feat = db.get_feat(fid)
            print('+FEAT', feat, len(items), score)
            # Show only the item with the highest accumulated score.
            items.sort(key=lambda item: iscore[item], reverse=True)
            for item in items[:1]:
                print('+ITEM', db.get_item(item))
                feats = db.get_feats(item, resolve=True, source=True)
                (fc, srcs) = feats[feat]
                if not srcs:
                    continue
                annot = SourceAnnot(srcdb)
                for src in srcs:
                    (path, start, end) = src
                    annot.add(path, start, end)
                annot.show_text()
    return 0
def __str__(self):
    """Return ``stripid(self.ref)``, or ``self.ref`` itself when that is falsy."""
    short = stripid(self.ref)
    if short:
        return short
    return self.ref
def __init__(self, name):
    """Store the stripped form of *name* and start with an empty ``linkto`` list."""
    (self.name, self.linkto) = (stripid(name), [])
    return
def predict(tid, item, fids):
    """Check an item's name against the ``nb`` model and report a mismatch.

    The item's own contribution is temporarily removed from ``nb`` so that
    it cannot vote for itself, candidate words are requested for the item's
    prominent features, and the contribution is then added back.

    When the best candidate word is not among the item's current words, a
    record is printed to standard output: ``+ITEM``, ``+WORDS``, ``+CANDS``,
    optionally ``+DEFAULT``, ``+SOURCE``, ``+SCORE`` and ``+SUPPORTS`` lines
    (JSON payloads) followed by a blank line.

    Relies on names defined outside this function: ``nb``, ``db``,
    ``ratio``, ``defaultnames``, ``nallitems``, ``C`` and ``maxsupports``.

    Args:
        tid: Id of the item in ``db``.
        item: The item itself; its stripped name is split into words.
        fids: Mapping of feature id to a pair whose first element is a
            count.  The entry under id ``0`` supplies the count passed to
            ``nb.removedict`` / ``nb.adddict``.

    Returns:
        True when the top candidate word already occurs in the name,
        False otherwise (no candidates, or a record was printed).
    """
    name = stripid(item)
    words = splitwords(name)
    (count, _) = fids[0]
    feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
    # Use only prominent features that appear more often than a threshold
    # relative to the item's most frequent feature.
    # NOTE(review): max() raises ValueError when the item has no feature
    # other than id 0 - confirm that callers never pass such an item.
    threshold = int(max(feats.values()) * ratio)
    f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
    # Take the item out of the model while querying it, then restore it.
    for w in words:
        nb.removedict(w, count, feats)
    cands = nb.getkeyfeats(f2)[:len(words)]
    for w in words:
        nb.adddict(w, count, feats)
    if not cands:
        return False
    cwords = [w for (_, w, _) in cands]
    topword = cwords[0]
    if topword in words:
        return True
    print('+ITEM', json.dumps(item))
    print('+WORDS', json.dumps(words))
    print('+CANDS', json.dumps(cwords))
    if item in defaultnames:
        print('+DEFAULT', json.dumps(defaultnames[item]))
    fids0 = db.get_feats(tid, source=True)
    # Collect the item's sources grouped by the first component of the
    # feature (0 for feature id 0); only the groups 0, 1 and -1 are kept.
    srcs0 = {0: [], 1: [], -1: []}
    for (fid, (_, srcs)) in fids0.items():
        if fid == 0:
            d = 0
        else:
            d = db.get_feat(fid)[0]
        if d in srcs0:
            srcs0[d].extend(srcs)
    print(
        '+SOURCE',
        json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
    supports = []
    for (_, w, a) in cands:
        # Find top N features for each word.
        fs = []
        for (fid, c) in a[1:]:
            feat = db.get_feat(fid)
            assert feat is not None
            # A rarer feature overall means stronger indication.
            df = math.log(nallitems / db.get_numfeatitems(fid))
            # A more prominent feature for this category means stronger indication.
            ff = c / nb.fcount[fid][None]
            # Discount a "distant" feature from the subject.
            ds = math.exp(-C * abs(feat[0]))
            fs.append((ds * df * ff, fid, feat))
        fs = sorted(fs, reverse=True)[:maxsupports]
        score = sum(s for (s, _, _) in fs)
        # Find another variable that contains the same feature and whose
        # name includes the candidate word.
        ss = []
        for (_, fid, _) in fs:
            # Stays None when no other matching item exists; None is then
            # recorded as the evidence for this feature.
            found = None
            (_, srcs0a) = fids0[0]
            (_, srcs0b) = fids0[fid]
            tids = db.get_featitems(fid)
            for tid1 in tids.keys():
                if tid1 == tid:
                    continue
                item1 = db.get_item(tid1)
                name1 = stripid(item1)
                if w not in splitwords(name1):
                    continue
                fids1 = db.get_feats(tid1, source=True)
                (_, srcs1a) = fids1[0]
                (_, srcs1b) = fids1[fid]
                found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                # The first matching item is enough.
                break
            ss.append(found)
        supports.append((w, score, list(zip(fs, ss))))
    print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
    print('+SUPPORTS', json.dumps(supports))
    print()
    return False