def cmp_fields(self, ref1, ref2):
    """Compare two field references by name and by recorded type.

    Both references are passed through ``stripid`` and ``splitwords``
    to obtain their word lists.

    Returns a 3-tuple ``(namesim, samehead, sametype)``:
      * namesim  -- ``jaccard`` of the two word sets,
      * samehead -- whether the first words of both lists are equal,
      * sametype -- whether ``self.fields`` maps both refs to the same
        value (two unknown refs both yield None and compare equal).
    """
    name_a, name_b = stripid(ref1), stripid(ref2)
    words_a, words_b = splitwords(name_a), splitwords(name_b)
    similarity = jaccard(set(words_a), set(words_b))
    type_a = self.fields.get(ref1)
    type_b = self.fields.get(ref2)
    # NOTE(review): indexing [0] assumes splitwords() never returns an
    # empty list for these refs -- confirm against the helper.
    heads_match = words_a[0] == words_b[0]
    types_match = type_a == type_b
    return (similarity, heads_match, types_match)
def cmp_methods(self, id1, id2):
    """Compare two methods looked up in ``self.methods``.

    Returns a 4-tuple ``(namesim, overriden, sameargs, sameretype)``:
      * namesim    -- ``jaccard`` of the word sets of the two bare names,
      * overriden  -- True if either method's name appears in the other's
        ``overrides`` collection,
      * sameargs   -- whether the argument parts produced by
        ``splitmethodname`` are equal,
      * sameretype -- whether the return-type parts are equal.
    """
    left = self.methods[id1]
    right = self.methods[id2]

    # Override relation is checked in both directions.
    if left.name in right.overrides:
        overridden = True
    else:
        overridden = right.name in left.overrides

    (lname, largs, lret) = splitmethodname(left.name)
    (rname, rargs, rret) = splitmethodname(right.name)
    args_match = largs == rargs
    ret_match = lret == rret

    lwords = splitwords(lname)
    rwords = splitwords(rname)
    similarity = jaccard(set(lwords), set(rwords))
    return (similarity, overridden, args_match, ret_match)
def getnamefeats(self, n):
    """Return the list of name-derived feature strings for node ``n``.

    * Call nodes (``n.kind in CALLS``): one ``kind:word`` entry per word
      of the method name taken from the first space-separated token of
      ``n.data``.
    * Reference/assignment nodes (``n.kind in REFS`` or ``ASSIGNS``):
      nothing for the node whose ref is ``self.ref0`` or whose ref starts
      with ``'%'``; otherwise one ``kind:word`` entry per word of the
      stripped ref when ``self.namefeat`` is set, or just ``[kind]``.
    * Any other node kind: an empty list.
    """
    if n.kind in CALLS:
        (head, _, _) = n.data.partition(' ')
        (method_name, _, _) = splitmethodname(head)
        return [f'{n.kind}:{w}' for w in splitwords(method_name)]

    if not (n.kind in REFS or n.kind in ASSIGNS):
        return []

    # NOTE(review): this is an identity test, not equality -- it only
    # matches when n.ref is the very same object as self.ref0.
    if n.ref is self.ref0:
        return []
    if n.ref.startswith('%'):
        return []
    if not self.namefeat:
        return [n.kind]
    return [f'{n.kind}:{w}' for w in splitwords(stripref(n.ref))]
def learn(tid, item, fids):
    """Register one item's features under every word of its name.

    ``fids`` maps a feature id to a pair whose first element is a count;
    the entry under key 0 supplies the count passed along with each word.
    Every entry of ``fids`` (including key 0) is forwarded to
    ``nb.adddict``.  ``tid`` is not used.  Always returns True.
    """
    item_words = splitwords(stripid(item))
    (item_count, _) = fids[0]
    feat_counts = {}
    for (feat, entry) in fids.items():
        (fc, _) = entry
        feat_counts[feat] = fc
    for word in item_words:
        nb.adddict(word, item_count, feat_counts)
    return True
def main(argv):
    """List typed references found in graph files, or word statistics.

    Options:
      -d        increase the debug level (accepted; not otherwise used here)
      -n limit  maximum number of words printed per tag (default 10;
                a value <= 0 disables the cut-off)
      -W        print per-tag word statistics instead of the ref/type list

    Every node of every method yielded by ``get_graphs(path)`` is
    examined; nodes whose ``ref`` starts with ``'$'`` or ``'@'`` and whose
    ``ntype`` is set are recorded as ``ref -> ntype``.

    Returns 0 on success, 100 after printing the usage message (unknown
    option, or a non-integer ``-n`` value).
    """
    import getopt

    def usage():
        # The accepted flag is upper-case -W; keep this in sync with the
        # getopt spec below.
        print(f'usage: {argv[0]} [-d] [-n limit] [-W] [graph ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dWn:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    limit = 10
    wordstat = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-W':
            wordstat = True
        elif k == '-n':
            try:
                limit = int(v)
            except ValueError:
                # A malformed limit is a usage error, not a traceback.
                return usage()

    # Collect ref -> ntype for every qualifying node.
    refs = {}
    for path in args:
        for method in get_graphs(path):
            for node in method:
                ref = node.ref
                # Skip missing (None) and empty refs, and untyped nodes.
                if not ref or node.ntype is None:
                    continue
                if ref[0] not in '$@':
                    continue
                refs[ref] = node.ntype

    if not wordstat:
        for (ref, ntype) in sorted(refs.items()):
            print(ref, ntype)
        return 0

    # words: tag -> {word -> occurrence count}
    words = {}
    for ref in refs:
        name = stripid(ref)
        if name is None:
            continue
        for (pos, w) in postag(reversed(splitwords(name))):
            d = words.setdefault(pos, {})
            d[w] = d.get(w, 0) + 1
    print('counts', {pos: sum(d.values()) for (pos, d) in words.items()})
    print('words', {pos: len(d) for (pos, d) in words.items()})
    # Tags with the most distinct words come first.
    for (pos, d) in sorted(words.items(), key=lambda x: len(x[1]), reverse=True):
        print(pos)
        a = sorted(d.items(), key=lambda x: x[1], reverse=True)
        if 0 < limit:
            a = a[:limit]
        for (w, n) in a:
            print(f' {n} {w}')
    return 0
def main(argv):
    """Print similar word pairs across feature databases.

    Options:
      -t threshold  similarity threshold handed to ``VSM.findall``
                    (default 0.5)

    For each database, every word of every item name is associated with
    a count of the features of the items it occurs in.  All
    ``(db index, word)`` vectors are added to one ``VSM``; reported pairs
    exclude identical words and words that co-occur in some item name.

    Returns 0 on success, 100 after printing usage for a bad option.
    """
    import getopt

    def usage():
        print('usage: %s [-t threshold] featdb1 featdb2 ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 't:')
    except getopt.GetoptError:
        return usage()

    threshold = 0.5
    for (opt, value) in opts:
        if opt == '-t':
            threshold = float(value)

    relword = {}       # word -> words seen together with it in one name
    per_db = []        # one {word -> {feature -> count}} per database
    for (dbindex, path) in enumerate(args):
        print('Loading: %d: %r...' % (dbindex, path), file=sys.stderr)
        db = FeatDB(path)
        wordfeats = {}
        for (tid, item) in db.get_items():
            feats = db.get_feats(tid, resolve=True)
            words = splitwords(stripid(item))
            # Record every unordered pair of words from the same name.
            for (pos, w0) in enumerate(words):
                for w1 in words[pos+1:]:
                    get(relword, w0, []).append(w1)
                    get(relword, w1, []).append(w0)
            for w in words:
                counts = get(wordfeats, w, {})
                for f in feats.keys():
                    if f in counts:
                        counts[f] += 1
                    else:
                        counts[f] = 1
        per_db.append(wordfeats)

    sp = VSM()
    for (dbindex, wordfeats) in enumerate(per_db):
        for (w, counts) in wordfeats.items():
            sp.add((dbindex, w), counts)
    sp.commit()

    def cooccur(a, b):
        # True when b was recorded as appearing alongside a.
        return a in relword and b in relword[a]

    for (sim, (i0, w0), (i1, w1)) in sp.findall(threshold=threshold, verbose=True):
        if w0 == w1 or cooccur(w1, w0) or cooccur(w0, w1):
            continue
        print(sim, (i0, w0), (i1, w1))
    return 0
def doit(refs):
    """Group refs by the position of each name word and print a summary.

    ``refs`` maps a ref to its type.  Names with fewer than two words are
    ignored.  For the rest, the ref is filed under ``(word, type)`` in:
      * prefix -- keyed by the last element of the split word list,
      * midfix -- keyed by each element strictly between first and last,
      * suffix -- keyed by the first element.
    (NOTE(review): this naming suggests splitwords() yields words in
    reverse order -- confirm against the helper.)

    Prints the total number of refs and ``showtop`` of each table,
    followed by a blank line.  Returns None.
    """
    prefix, midfix, suffix = {}, {}, {}
    for (ref, ntype) in refs.items():
        words = splitwords(stripid(ref))
        if 2 <= len(words):
            first = words[0]
            last = words[-1]
            get(prefix, (last, ntype)).append(ref)
            for middle in words[1:-1]:
                get(midfix, (middle, ntype)).append(ref)
            get(suffix, (first, ntype)).append(ref)
    print('total:', len(refs))
    for (label, table) in (('prefix:', prefix), ('midfix:', midfix), ('suffix:', suffix)):
        print(label, showtop(table))
    print()
def main(argv):
    """Print per-tag word statistics for the method names in graph files.

    Options:
      -d        increase the debug level (accepted; not otherwise used here)
      -n limit  maximum number of words printed per tag (default 10;
                a value <= 0 disables the cut-off)

    Each method name from ``get_graphs(path)`` is split with
    ``splitmethodname``/``splitwords``, the reversed word list is tagged
    with ``postag``, and occurrences are counted per (tag, word).

    Returns 0 on success, 100 after printing the usage message (unknown
    option, or a non-integer ``-n`` value).
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] [-n limit] [graph ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dn:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    limit = 10
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-n':
            try:
                limit = int(v)
            except ValueError:
                # A malformed limit is a usage error, not a traceback.
                return usage()

    # words: tag -> {word -> occurrence count}
    words = {}
    for path in args:
        for method in get_graphs(path):
            # Only the bare name is needed.  The other two parts are
            # discarded; do not bind them to `args`, which is the list
            # being iterated by the outer loop.
            (name, _, _) = splitmethodname(method.name)
            if name is None:
                continue
            for (pos, w) in postag(reversed(splitwords(name))):
                d = words.setdefault(pos, {})
                d[w] = d.get(w, 0) + 1

    print('counts', {pos: sum(d.values()) for (pos, d) in words.items()})
    print('words', {pos: len(d) for (pos, d) in words.items()})
    # Tags with the most distinct words come first.
    for (pos, d) in sorted(words.items(), key=lambda x: len(x[1]), reverse=True):
        print(pos)
        a = sorted(d.items(), key=lambda x: x[1], reverse=True)
        if 0 < limit:
            a = a[:limit]
        for (w, n) in a:
            print(f' {n} {w}')
    return 0
def main(argv):
    """Show the highest-scoring features for each requested word.

    Options:
      -d           increase the debug level (accepted; not otherwise used)
      -c encoding  encoding passed to ``SourceDB``
      -n feats     number of top features shown per word (default 5)

    Positional arguments: ``srcdb featdb [word ...]``.

    For every requested word, the items whose names contain the word are
    grouped by feature id.  Each feature other than id 0 is scored as
    ``exp(-abs(feat[0])) * log(nallitems / numfeatitems) * len(items)``;
    the top features are printed with their best-scoring item and its
    annotated source text.

    Returns 0 on success, 100 after printing the usage message (unknown
    option, non-integer ``-n``, or fewer than two positional arguments).
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] [-c encoding] [-n feats] '
              'srcdb featdb [word ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:n:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    encoding = None
    ntop = 5
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-c':
            encoding = v
        elif k == '-n':
            try:
                ntop = int(v)
            except ValueError:
                # A malformed count is a usage error, not a traceback.
                return usage()
    if len(args) < 2:
        return usage()
    basepath = args.pop(0)
    dbpath = args.pop(0)

    srcdb = SourceDB(basepath, encoding)
    db = FeatDB(dbpath)

    # word -> {fid -> [tid, ...]} for the requested words only.
    word2fids = {w: {} for w in args}
    for (tid, item) in db:
        name = stripid(item)
        words = splitwords(name)
        fids = db.get_feats(tid)
        for w in words:
            if w not in word2fids:
                continue
            fid2items = word2fids[w]
            for fid in fids:
                items = fid2items.setdefault(fid, [])
                assert tid not in items
                items.append(tid)

    nallitems = len(db.get_items())
    for (word, fid2items) in word2fids.items():
        if not fid2items:
            continue
        fscore = []     # (score, fid, items) per feature
        iscore = {}     # tid -> sum of the scores of its features
        # NOTE(review): presumably every item carries feature id 0, so
        # this is the number of items containing the word -- confirm.
        nitems = len(fid2items[0])
        for (fid, items) in fid2items.items():
            if fid == 0:
                continue
            df = math.log(nallitems / db.get_numfeatitems(fid))
            feat = db.get_feat(fid)
            score = math.exp(-abs(feat[0])) * df * len(items)
            fscore.append((score, fid, items))
            for item in items:
                iscore[item] = iscore.get(item, 0) + score
        print(f'*** word: {word!r}, items: {nitems}, feats: {len(fscore)}\n')
        fscore.sort(reverse=True)
        for (score, fid, items) in fscore[:ntop]:
            feat = db.get_feat(fid)
            print('+FEAT', feat, len(items), score)
            # Show only the single best-scoring item for this feature.
            items.sort(key=lambda item: iscore[item], reverse=True)
            for item in items[:1]:
                print('+ITEM', db.get_item(item))
                feats = db.get_feats(item, resolve=True, source=True)
                (_, srcs) = feats[feat]
                if not srcs:
                    continue
                annot = SourceAnnot(srcdb)
                for (path, start, end) in srcs:
                    annot.add(path, start, end)
                annot.show_text()
    return 0
def predict(tid, item, fids):
    """Predict the words of an item's name from its features.

    ``fids`` maps a feature id to a pair whose first element is a count;
    the entry under key 0 supplies the item's own count.  The item is
    temporarily removed from ``nb`` while candidates are computed, then
    added back.

    Returns True when the top candidate word is one of the item's own
    words.  Otherwise returns False; if there were candidates, a
    diagnostic record (+ITEM, +WORDS, +CANDS, optional +DEFAULT,
    +SOURCE, +SCORE, +SUPPORTS) is printed first.  An item with no
    feature other than id 0 returns False immediately.

    Uses ``nb``, ``db``, ``ratio``, ``defaultnames``, ``nallitems``,
    ``C`` and ``maxsupports`` from the enclosing scope.
    """
    name = stripid(item)
    words = splitwords(name)
    (count, _) = fids[0]
    feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
    if not feats:
        # Nothing to base a prediction on; max() below would raise
        # ValueError on an empty sequence.
        return False
    # Use only prominent features that appear at least as often as a
    # fraction (ratio) of the most frequent one.
    threshold = int(max(feats.values()) * ratio)
    f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
    # Take this item out of the model while querying it, then restore it.
    for w in words:
        nb.removedict(w, count, feats)
    cands = nb.getkeyfeats(f2)[:len(words)]
    for w in words:
        nb.adddict(w, count, feats)
    if not cands:
        return False
    cwords = [w for (_, w, _) in cands]
    topword = cwords[0]
    if topword in words:
        return True

    print('+ITEM', json.dumps(item))
    print('+WORDS', json.dumps(words))
    print('+CANDS', json.dumps(cwords))
    if item in defaultnames:
        print('+DEFAULT', json.dumps(defaultnames[item]))
    fids0 = db.get_feats(tid, source=True)
    # Sources grouped by feat[0] (0 for feature id 0); only the values
    # 0, 1 and -1 are kept.
    srcs0 = {0: [], 1: [], -1: []}
    for (fid, (_, srcs)) in fids0.items():
        if fid == 0:
            d = 0
        else:
            d = db.get_feat(fid)[0]
        if d in srcs0:
            srcs0[d].extend(srcs)
    print(
        '+SOURCE',
        json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))

    supports = []
    for (_, w, a) in cands:
        # Find top N features for each word.
        fs = []
        for (fid, c) in a[1:]:
            feat = db.get_feat(fid)
            assert feat is not None
            # A rarer feature overall means stronger indication.
            df = math.log(nallitems / db.get_numfeatitems(fid))
            # A more prominent feature for this category means stronger
            # indication.
            ff = c / nb.fcount[fid][None]
            # Discount a "distant" feature from the subject.
            ds = math.exp(-C * abs(feat[0]))
            fs.append((ds * df * ff, fid, feat))
        fs = sorted(fs, reverse=True)[:maxsupports]
        score = sum(s for (s, _, _) in fs)
        # Find another item that has the same feature and whose name
        # contains the candidate word.
        ss = []
        for (_, fid, _) in fs:
            found = None
            (_, srcs0a) = fids0[0]
            (_, srcs0b) = fids0[fid]
            tids = db.get_featitems(fid)
            for tid1 in tids.keys():
                if tid1 == tid:
                    continue
                item1 = db.get_item(tid1)
                name1 = stripid(item1)
                if w not in splitwords(name1):
                    continue
                fids1 = db.get_feats(tid1, source=True)
                (_, srcs1a) = fids1[0]
                (_, srcs1b) = fids1[fid]
                found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                # The first matching item is enough.
                break
            ss.append(found)
        supports.append((w, score, list(zip(fs, ss))))
    print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
    print('+SUPPORTS', json.dumps(supports))
    print()
    return False