Ejemplo n.º 1
0
 def cmp_fields(self, ref1, ref2):
     """Compare two field references by name and by recorded type.

     Returns a 3-tuple ``(namesim, samehead, sametype)``:

     * ``namesim``  -- ``jaccard()`` of the two names' word sets,
     * ``samehead`` -- whether the first element of each word list matches,
     * ``sametype`` -- whether ``self.fields`` holds equal values for both
       references (two references missing from ``self.fields`` both yield
       ``None`` and therefore compare equal).
     """
     (lhs, rhs) = [splitwords(name) for name in (stripid(ref1), stripid(ref2))]
     similarity = jaccard(set(lhs), set(rhs))
     lookup = self.fields.get
     (ftype1, ftype2) = (lookup(ref1), lookup(ref2))
     heads_match = lhs[0] == rhs[0]
     types_match = ftype1 == ftype2
     return (similarity, heads_match, types_match)
Ejemplo n.º 2
0
 def cmp_methods(self, id1, id2):
     """Compare the two methods stored under *id1* and *id2* in ``self.methods``.

     Returns a 4-tuple ``(namesim, overridden, sameargs, sameretype)``:

     * ``namesim``    -- ``jaccard()`` of the word sets of the two base names,
     * ``overridden`` -- whether either method's name is listed in the other's
       ``overrides``,
     * ``sameargs``   -- whether the argument parts from ``splitmethodname()``
       are equal,
     * ``sameretype`` -- whether the return-type parts are equal.
     """
     (first, second) = (self.methods[id1], self.methods[id2])
     if first.name in second.overrides:
         overridden = True
     else:
         overridden = second.name in first.overrides
     (base1, params1, ret1) = splitmethodname(first.name)
     (base2, params2, ret2) = splitmethodname(second.name)
     same_params = params1 == params2
     same_ret = ret1 == ret2
     (lhs, rhs) = (splitwords(base1), splitwords(base2))
     similarity = jaccard(set(lhs), set(rhs))
     return (similarity, overridden, same_params, same_ret)
Ejemplo n.º 3
0
 def getnamefeats(self, n):
     """Return the name-based feature strings for node *n*.

     * Call nodes (``n.kind in CALLS``): the text of ``n.data`` before its
       first space is passed to ``splitmethodname()``; one ``'<kind>:<word>'``
       feature is produced per word of the resulting name.
     * Reference / assignment nodes (``n.kind in REFS`` or ``ASSIGNS``):
       nothing is produced when the node refers to ``self.ref0`` or when its
       ref starts with ``'%'``.  Otherwise, if ``self.namefeat`` is set, one
       ``'<kind>:<word>'`` feature is produced per word of ``stripref(n.ref)``;
       if not, the bare kind is the only feature.
     * Any other kind: an empty list.
     """
     kind = n.kind
     if kind in CALLS:
         (data, _, _) = n.data.partition(' ')
         (name, _, _) = splitmethodname(data)
         return [f'{kind}:{w}' for w in splitwords(name)]
     if kind in REFS or kind in ASSIGNS:
         ref = n.ref
         # Compare by value, not identity: two equal ref strings are not
         # guaranteed to be the same object, so ``is`` could fail to
         # recognise the reference variable itself.
         if ref == self.ref0:
             return []
         # NOTE(review): '%'-prefixed refs are skipped; presumably these are
         # synthetic/temporary names -- confirm against the graph format.
         if ref.startswith('%'):
             return []
         if self.namefeat:
             return [f'{kind}:{w}' for w in splitwords(stripref(ref))]
         return [kind]
     return []
Ejemplo n.º 4
0
 def learn(tid, item, fids):
     """Register *item*'s features under every word of its name.

     *fids* maps a feature key to a pair whose first element is a count;
     the pair stored under key ``0`` supplies the count passed along with
     each word.  For each word of the item's name, ``nb.adddict()`` is
     called with that count and the full ``{feature: count}`` mapping
     (``nb`` is a free name resolved outside this function).  *tid* is
     not used.  Always returns ``True``.
     """
     words = splitwords(stripid(item))
     (count, _) = fids[0]
     feats = {}
     for (feat, (fc, _)) in fids.items():
         feats[feat] = fc
     for word in words:
         nb.adddict(word, count, feats)
     return True
Ejemplo n.º 5
0
def main(argv):
    """List the typed variable references found in graph files.

    Command line: ``[-d] [-n limit] [-W] [graph ...]``

    * ``-d``        increments a debug counter (not otherwise used here).
    * ``-n limit``  maximum number of words printed per tag in ``-W`` mode;
      a value of zero or less prints all of them (default 10).
    * ``-W``        print per-tag word statistics instead of the plain
      ``ref type`` listing.

    Every node whose ``ref`` starts with ``'$'`` or ``'@'`` and whose
    ``ntype`` is not ``None`` is collected (a later node with the same ref
    overwrites an earlier one).

    Returns 0 on success, or 100 after printing the usage line when the
    options cannot be parsed.
    """
    import getopt

    def usage():
        # The option letter is upper-case: getopt below only accepts -W.
        print(f'usage: {argv[0]} ' '[-d] [-n limit] [-W] ' '[graph ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dWn:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    limit = 10
    wordstat = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-W':
            wordstat = True
        elif k == '-n':
            try:
                limit = int(v)
            except ValueError:
                # A non-numeric limit is a usage error, not a traceback.
                return usage()

    refs = {}
    for path in args:
        for method in get_graphs(path):
            for node in method:
                ref = node.ref
                if ref is None or node.ntype is None:
                    continue
                # startswith() also copes with an empty ref, where ref[0]
                # would raise IndexError.
                if not ref.startswith(('$', '@')):
                    continue
                refs[ref] = node.ntype

    if not wordstat:
        for (ref, ntype) in sorted(refs.items()):
            print(ref, ntype)
        return 0

    # tag -> {word: number of refs whose name contains that word with that tag}
    words = {}
    for ref in refs:
        name = stripid(ref)
        if name is None:
            continue
        for (pos, w) in postag(reversed(splitwords(name))):
            d = words.setdefault(pos, {})
            d[w] = d.get(w, 0) + 1
    print('counts', {pos: sum(d.values()) for (pos, d) in words.items()})
    print('words', {pos: len(d) for (pos, d) in words.items()})
    # Tags with the most distinct words come first.
    for (pos, d) in sorted(words.items(),
                           key=lambda x: len(x[1]),
                           reverse=True):
        print(pos)
        a = sorted(d.items(), key=lambda x: x[1], reverse=True)
        if 0 < limit:
            a = a[:limit]
        for (w, n) in a:
            print(f'  {n} {w}')
    return 0
Ejemplo n.º 6
0
def main(argv):
    """Report similar words across feature databases.

    Command line: ``[-t threshold] featdb1 featdb2 ...``

    For every database, each word of each item's name accumulates a count
    per feature of that item.  The per-word feature vectors, keyed by
    ``(database index, word)``, are added to a ``VSM`` and every pair
    reported by ``findall()`` at the given threshold (default 0.5) is
    printed, except pairs of identical words and pairs of words that
    occur together in some item name.

    Returns 0 on success, or 100 after printing the usage line when the
    options cannot be parsed.
    """
    import getopt
    def usage():
        print('usage: %s [-t threshold] featdb1 featdb2 ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 't:')
    except getopt.GetoptError:
        return usage()

    threshold = 0.5
    for (k, v) in opts:
        if k == '-t':
            try:
                threshold = float(v)
            except ValueError:
                # A non-numeric threshold is a usage error, not a traceback.
                return usage()

    # word -> set of words that share an item name with it.  A set gives
    # O(1) membership tests below and does not accumulate duplicates.
    relword = {}
    ws = []
    for (dbindex, path) in enumerate(args):
        print('Loading: %d: %r...' % (dbindex, path), file=sys.stderr)
        db = FeatDB(path)
        wordfeats = {}
        for (tid, item) in db.get_items():
            feats = db.get_feats(tid, resolve=True)
            name = stripid(item)
            words = splitwords(name)
            # A separate index name keeps the database index intact.
            for (pos, w0) in enumerate(words):
                for w1 in words[pos+1:]:
                    relword.setdefault(w0, set()).add(w1)
                    relword.setdefault(w1, set()).add(w0)
            for w in words:
                fs = wordfeats.setdefault(w, {})
                for f in feats.keys():
                    fs[f] = fs.get(f, 0) + 1
        ws.append(wordfeats)

    sp = VSM()
    for (dbindex, wordfeats) in enumerate(ws):
        for (w, fs) in wordfeats.items():
            sp.add((dbindex, w), fs)
    sp.commit()

    for (sim, (i0, w0), (i1, w1)) in sp.findall(threshold=threshold, verbose=True):
        if w0 == w1: continue
        if w1 in relword and w0 in relword[w1]: continue
        if w0 in relword and w1 in relword[w0]: continue
        print(sim, (i0, w0), (i1, w1))

    return 0
Ejemplo n.º 7
0
def doit(refs):
    """Print how references group by the words at the ends of their names.

    *refs* maps a reference to its type.  Each name with at least two words
    is filed under ``(word, type)`` keys in three tables: the last element
    of the word list (reported as ``prefix``), every interior element
    (``midfix``) and the first element (``suffix``).  The size of *refs*
    and ``showtop()`` of each table are printed, followed by a blank line.

    NOTE(review): the prefix/suffix labels presumably reflect splitwords()
    returning the words last-to-first -- confirm against its definition.
    """
    by_last = {}
    by_inner = {}
    by_first = {}
    for (ref, ntype) in refs.items():
        words = splitwords(stripid(ref))
        if 2 <= len(words):
            (first, *inner, last) = words
            get(by_last, (last, ntype)).append(ref)
            for word in inner:
                get(by_inner, (word, ntype)).append(ref)
            get(by_first, (first, ntype)).append(ref)
    print('total:', len(refs))
    for (label, table) in (('prefix:', by_last),
                           ('midfix:', by_inner),
                           ('suffix:', by_first)):
        print(label, showtop(table))
    print()
Ejemplo n.º 8
0
def main(argv):
    """Print per-tag word statistics for the method names in graph files.

    Command line: ``[-d] [-n limit] [graph ...]``

    Each method name is split into words, the reversed word list is passed
    to ``postag()`` and every ``(tag, word)`` pair it yields is counted.
    The totals and distinct-word counts per tag are printed, followed by
    the most frequent words of each tag (at most ``limit`` per tag when
    ``limit`` is positive; default 10).

    Returns 0 on success, or 100 after printing the usage line when the
    options cannot be parsed.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} ' '[-d] [-n limit] ' '[graph ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dn:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    limit = 10
    for (opt, value) in opts:
        if opt == '-d':
            debug += 1
        elif opt == '-n':
            limit = int(value)

    # tag -> {word: occurrences}
    tagged = {}
    for path in args:
        for method in get_graphs(path):
            (name, _margs, _retype) = splitmethodname(method.name)
            if name is None:
                continue
            for (pos, word) in postag(reversed(splitwords(name))):
                tally = tagged.setdefault(pos, {})
                tally[word] = tally.get(word, 0) + 1

    print('counts', {pos: sum(tally.values()) for (pos, tally) in tagged.items()})
    print('words', {pos: len(tally) for (pos, tally) in tagged.items()})
    # Tags with the most distinct words come first.
    by_variety = sorted(tagged.items(), key=lambda kv: len(kv[1]), reverse=True)
    for (pos, tally) in by_variety:
        print(pos)
        ranked = sorted(tally.items(), key=lambda kv: kv[1], reverse=True)
        shown = ranked[:limit] if 0 < limit else ranked
        for (word, n) in shown:
            print(f'  {n} {word}')
    return 0
Ejemplo n.º 9
0
def main(argv):
    """Show, for each given word, the features that best characterise it.

    Command line: ``[-d] [-c encoding] [-n feats] srcdb featdb [word ...]``

    * ``-d``           increments a debug counter (not otherwise used here).
    * ``-c encoding``  encoding passed to ``SourceDB``.
    * ``-n feats``     number of top-scoring features shown per word
      (default 5).

    For every item in the feature database whose name contains one of the
    requested words, the item is recorded under each of its feature ids.
    Each feature (except id 0) is then scored as
    ``exp(-abs(feat[0])) * log(nallitems / numfeatitems) * len(items)``,
    and for the top features the highest-scoring item is printed together
    with its annotated source locations.

    Returns 0 on success, or 100 after printing the usage line when the
    arguments are missing or cannot be parsed.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-d] [-c encoding] [-n feats] srcdb featdb [word ...]')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dc:n:')
    except getopt.GetoptError:
        return usage()
    debug = 0
    encoding = None
    ntop = 5
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-c':
            encoding = v
        elif k == '-n':
            try:
                ntop = int(v)
            except ValueError:
                # A non-numeric count is a usage error, not a traceback.
                return usage()
    if not args: return usage()
    basepath = args.pop(0)
    if not args: return usage()
    dbpath = args.pop(0)

    srcdb = SourceDB(basepath, encoding)
    db = FeatDB(dbpath)

    # word -> {fid: [tid, ...]}
    word2fids = {w: {} for w in args}

    for (tid, item) in db:
        name = stripid(item)
        words = splitwords(name)
        fids = db.get_feats(tid)
        # A word may occur more than once in a name; visit each distinct
        # word once so the same tid is not filed twice under a feature.
        for w in dict.fromkeys(words):
            if w not in word2fids: continue
            fid2items = word2fids[w]
            for fid in fids:
                items = fid2items.setdefault(fid, [])
                assert tid not in items
                items.append(tid)

    nallitems = len(db.get_items())
    for (word, fid2items) in word2fids.items():
        if not fid2items: continue
        fscore = []
        iscore = {}
        # NOTE(review): feature id 0 is assumed to be present for every
        # item (it is skipped as a feature below) -- confirm against FeatDB.
        nitems = len(fid2items[0])
        for (fid, items) in fid2items.items():
            if fid == 0: continue
            df = math.log(nallitems / db.get_numfeatitems(fid))
            feat = db.get_feat(fid)
            score = math.exp(-abs(feat[0])) * df * len(items)
            fscore.append((score, fid, items))
            for item in items:
                iscore[item] = iscore.get(item, 0) + score
        print(f'*** word: {word!r}, items: {nitems}, feats: {len(fscore)}\n')
        fscore.sort(reverse=True)
        for (score, fid, items) in fscore[:ntop]:
            feat = db.get_feat(fid)
            print('+FEAT', feat, len(items), score)
            items.sort(key=lambda item: iscore[item], reverse=True)
            # Only the best-scoring item is shown for each feature.
            for item in items[:1]:
                print('+ITEM', db.get_item(item))
                feats = db.get_feats(item, resolve=True, source=True)
                (fc, srcs) = feats[feat]
                if not srcs: continue
                annot = SourceAnnot(srcdb)
                for src in srcs:
                    (path, start, end) = src
                    annot.add(path, start, end)
                annot.show_text()

    return 0
Ejemplo n.º 10
0
 def predict(tid, item, fids):
     """Check whether the classifier would suggest one of *item*'s own words.

     *fids* maps a feature id to a pair whose first element is a count; the
     entry under id 0 supplies the count passed to ``nb`` and is excluded
     from the feature set.

     Returns ``True`` when the top candidate word is already one of the
     words in the item's name.  Returns ``False`` when there are no
     candidates, or -- after printing a report made of ``+ITEM``,
     ``+WORDS``, ``+CANDS``, optional ``+DEFAULT``, ``+SOURCE``, ``+SCORE``
     and ``+SUPPORTS`` lines followed by a blank line -- when the top
     candidate is not among the item's words.

     Relies on names resolved outside this function: ``nb``, ``db``,
     ``ratio``, ``C``, ``maxsupports``, ``nallitems``, ``defaultnames``,
     ``json`` and ``math``.
     """
     name = stripid(item)
     words = splitwords(name)
     (count, _) = fids[0]
     feats = {fid: fc for (fid, (fc, _)) in fids.items() if fid != 0}
     # Use only prominent features: those whose count reaches the given
     # fraction (ratio) of the largest count.
     # NOTE(review): max() raises ValueError when the item has no feature
     # other than id 0 -- confirm callers never pass such an item.
     threshold = int(max(feats.values()) * ratio)
     f2 = [feat for (feat, fc) in feats.items() if threshold <= fc]
     # Take this item's own entries out of nb while querying it, then put
     # them back with the same arguments (presumably so the item cannot
     # support its own name).
     for w in words:
         nb.removedict(w, count, feats)
     # Keep at most as many candidates as the name has words.
     cands = nb.getkeyfeats(f2)[:len(words)]
     for w in words:
         nb.adddict(w, count, feats)
     if not cands: return False
     cwords = [w for (_, w, _) in cands]
     topword = cwords[0]
     if topword in words: return True
     print('+ITEM', json.dumps(item))
     print('+WORDS', json.dumps(words))
     print('+CANDS', json.dumps(cwords))
     if item in defaultnames:
         print('+DEFAULT', json.dumps(defaultnames[item]))
     fids0 = db.get_feats(tid, source=True)
     # Collect this item's source locations, bucketed by the first element
     # of each feature (0 for feature id 0); features whose first element
     # is not 0, 1 or -1 are left out.
     srcs0 = {0: [], 1: [], -1: []}
     for (fid, (_, srcs)) in fids0.items():
         if fid == 0:
             d = 0
         else:
             d = db.get_feat(fid)[0]
         if d in srcs0:
             srcs0[d].extend(srcs)
     print(
         '+SOURCE',
         json.dumps([(d, list(set(srcs))) for (d, srcs) in srcs0.items()]))
     supports = []
     for (_, w, a) in cands:
         # Find top N features for each word.
         fs = []
         # NOTE(review): the first element of `a` is skipped; presumably it
         # is not a regular (fid, count) feature entry -- confirm against
         # nb.getkeyfeats().
         for (fid, c) in a[1:]:
             feat = db.get_feat(fid)
             assert feat is not None
             # A rarer feature overall means stronger indication.
             df = math.log(nallitems / db.get_numfeatitems(fid))
             # A more prominent feature for this category means stronger indication.
             # NOTE(review): nb.fcount[fid][None] is presumably the total
             # count of the feature over all words -- confirm.
             ff = c / nb.fcount[fid][None]
             # Discount a "distant" feature from the subject.
             ds = math.exp(-C * abs(feat[0]))
             fs.append((ds * df * ff, fid, feat))
         fs = sorted(fs, reverse=True)[:maxsupports]
         score = sum(s for (s, _, _) in fs)
         # Find the variables that contain the same feature.
         ss = []
         for (_, fid, _) in fs:
             # Stays None when no other item supports this feature.
             found = None
             (_, srcs0a) = fids0[0]
             (_, srcs0b) = fids0[fid]
             tids = db.get_featitems(fid)
             for tid1 in tids.keys():
                 if tid1 == tid: continue
                 item1 = db.get_item(tid1)
                 name1 = stripid(item1)
                 if w not in splitwords(name1): continue
                 fids1 = db.get_feats(tid1, source=True)
                 (_, srcs1a) = fids1[0]
                 (_, srcs1b) = fids1[fid]
                 # Record the first other item that has this feature and
                 # whose name contains the candidate word.
                 found = (srcs0a + srcs0b, item1, srcs1a + srcs1b)
                 break
             ss.append(found)
         supports.append((w, score, list(zip(fs, ss))))
     print('+SCORE', json.dumps(sum(score for (_, score, _) in supports)))
     print('+SUPPORTS', json.dumps(supports))
     print()
     return False