def test_FastFuzzySearch():
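    """Compare a brute-force edit-distance scan with Fast2FuzzySearch on a
    sample of rockyou passwords for edit distances 0-2: both must return
    identical result sets; mean/std query times are printed at the end.
    """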
    import helper
    import os
    import time
    import random
    import numpy as np
    fname = os.path.expanduser('~/passwords/rockyou-withcount.txt.bz2')
    pws = list(
        set(
            unicode(pw)
            for pw, f in helper.get_line(helper.open_(fname), lim=10000)
            if len(pw) > 5))
    idxs = [random.randint(0, len(pws) - 1) for _ in range(10)]
    eds = [0, 1, 2]
    # print list(ffs.ffs[1].words_with_prefix(tw)
    #            for tw in ffs.ffs[2][2].query('clover')))
    # raise AssertionError
    normalt, fastt = [], []
    for ed in eds:
        s = time.time()
        ffs = Fast2FuzzySearch(pws)
        print "Creation time: {} microsec".format(ed, 1e6 * (time.time() - s))
        for id_ in idxs:
            s = time.time()
            res1 = set(pw for pw in pws if distance(pw, pws[id_]) <= ed)
            e = time.time()
            # print "\nNormal computation (ed={}) time: {:.3f} ms".format(ed, 1000*(e-s))
            normalt.append(1000 * (e - s))
            res2 = set(ffs.query(pws[id_], ed=ed))
            # print "FastFuzzy (ed={}) time: {:.3f} ms".format(ed, 1000*(time.time()-e))
            fastt.append(1000 * (time.time() - e))
            assert res1 == res2
    print "Naive approach:"
    print "Mean: {}\tstd:{}\n".format(np.mean(normalt), np.std(normalt))
    print "Fast string search approach:"
    print "Mean: {}\tstd:{}\n".format(np.mean(fastt), np.std(fastt))
Example #2
def buildpcfg(passwd_dictionary, start=0, end=-1):
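    """Build a PCFG rule set from a `<count> <password>` dictionary file:
    parse each frequent-enough password in lines [start, end] and add its
    rules, weighted by count, to a RuleSet. Returns the RuleSet when
    end > 0; otherwise saves the grammar bz2-compressed.
    """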
    #MIN_COUNT=1000
    R = RuleSet()
    # resource track
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary)):
        if n < start: continue
        if end >= 0 and n > end: break  # end == -1 means no upper limit
        if n > resource_tracker:
            l = check_resource(n)
            if not l:
                break
            else:
                resource_tracker += l
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue    # not ASCII, skip
        if c < MIN_COUNT:  # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print "Word frequency dropped to %d for %s" % (c, w), n
            break  # Careful!!!
        T = parse(w)
        R.update_set(T.rule_set(), with_freq=True, freq=c)
    if end > 0: return R
    R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
Example #3
    def load(self, filename):
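        """Load the grammar from JSON and record a '__total__' frequency
        per rule, then collect all W (word) terminals into an IntDAWG for
        fast lookup during string generation.
        """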
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        # read the keys/values of the grammar dict
        for k, v in self.G.items():
            if self.cal_cdf:
                # print_err("Calculating CDF!")
                # lf表示的是当前规则中的数量
                # 每一个规则都要把上一次的规则的数量加载其中(有点像是处理(5)-(0)就能求出从1~5的规则出现的次数
                lf = 0
                for l, f in v.items():
                    # v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())

        # collect every W string that occurs; used later during string generation
        self.Wlist = []
        for k, D in self.G.items():
            if k.startswith('W'):
                self.Wlist.extend([x for x in D])

        # set up a Date object to manage date rules
        self.date = Date()
        # build a DAWG of the words for generating/managing word rules
        self.Wlist = IntDAWG(self.Wlist)
Example #4
    def __init__(self, vault_fl, mp):
        self.pcfg = TrainedGrammar()  # default large trained PCFG
        domain_hash_map_fl = hny_config.STATIC_DOMAIN_HASH_LIST
        self.domain_hash_map = json.load(open_(domain_hash_map_fl))
        self.vault_fl = vault_fl
        self.mp = mp
        self.initialize_vault(mp)
        self.dte = DTE(self.pcfg.decode_grammar(self.H))
Example #5
def build_dawg(file_name, out_fl=None):
    """
    takes a file name as input and converts that into a dawg.DAWG
    """
    import os
    from helper import open_
    import dawg
    with open_(file_name) as f:
        L = (l.strip() for l in f)
        D = dawg.DAWG(L)
        if not out_fl:
            base, _ = os.path.splitext(file_name)
            out_fl = base + ".dawg"
        D.save(out_fl)
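
# A minimal usage sketch (hypothetical file name); dawg.DAWG.load() reloads
# the structure that build_dawg saved:
#   build_dawg('words.txt.bz2')    # writes words.txt.dawg next to the input
#   d = dawg.DAWG()
#   d.load('words.txt.dawg')
#   'word' in d                    # fast membership test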
Example #6
    def __init__(self):
        self.G = json.load(open_(VAULT_DIST_FILE),
                           object_pairs_hook=OrderedDict)
        # Add dummy entries for the new non-terminals now.
        # TODO: Learn them by vault analysis.
        # Weight lengths 1..use_ful heavily (100) and the rest lightly (5).
        use_ful = 5
        for k in ['W', 'D', 'Y', 'R', 'T']:
            self.G[k] = OrderedDict(zip((str(x) for x in range(1, MAX_ALLOWED + 1)),
                                        [100] * use_ful + [5] * (MAX_ALLOWED - use_ful)))

        for k, v in self.G.items():
            v['__total__'] = sum(v.values())
Example #7
def build_int_dawg(filename):
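    """Build an IntDAWG mapping each word to its frequency (plus a
    '__total__' entry), save it next to the input as <name>.dawg, and
    sanity-check a few entries. An optional leading '#' line declares
    the column format of the file.
    """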
    with open_(filename) as inpf:
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            w = [freq_style(f_line)]
        w.extend([freq_style(line) for line in inpf])
        w.append(('__total__', sum(f for _, f in w)))  # grand total of all frequencies
        int_dawg = IntDAWG(w)
        of = filename.split('.')[0] + '.dawg'
        with open(of, 'wb') as o:
            int_dawg.write(o)
        test_dawg(of, w[:10] + w[-10:])
Example #8
def buildpcfg(passwd_dictionary):
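    """Build a Grammar from a `<count> <password>` dictionary file by
    inserting every frequent-enough word with its count; aborts early if
    memory use approaches MEMLIMMIT, then saves the grammar bz2-compressed.
    """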
    G = Grammar()
    # resource track
    resource_tracker = 5240
    allowed_sym = re.compile(r'[ \-_]')
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg'
    for n, line in enumerate(open_(passwd_dictionary)):
        if n > resource_tracker:
            r = MEMLIMMIT*1024 - \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            print "Memory Usage:", (MEMLIMMIT - r / 1024.0), "Lineno:", n
            if r < 0:
                print '''
Hitting the memory limit of 1GB,
please increase the limit or use a smaller data set.
Lines processed, {0:d}
'''.format(n)
                break
            resource_tracker += r / 10 + 100
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue    # not ASCII, skip
        if c < MIN_COUNT:  # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print "Word frequency dropped to %d for %s" % (c, w), n
            break  # Careful!!!
        G.insert(w, c)
        # print t
        # root_fl.write("%s\t<>\t%s\n" % (' '.join(line), '~'.join(str((x,y)) for x,y in zip(W, Tag))))
        
    # TODO
    #push_DotStar_IntoGrammar( grammar );
    G.update_total_freq()
    G.save(bz2.BZ2File(out_grammar_fl, 'w'))
    #marisa_trie.Trie(Grammar.inversemap.keys()).save(out_trie_fl)
    return G
Example #9
    def load(self, filename):
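        """Load the grammar from JSON. Per rule, either turn the raw
        frequencies into a running CDF (when self.cal_cdf is set) or just
        record their sum; '__total__' holds the grand total either way.
        Finally index all W-terminals in an IntDAWG for fast lookup.
        """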
        self.G = json.load(open_(filename), object_pairs_hook=OrderedDict)
        for k, v in list(self.G.items()):
            if self.cal_cdf:
                print_err("Calculating CDF!")
                lf = 0
                for l, f in list(v.items()):
                    v[l] += lf
                    lf += f
                v['__total__'] = lf
            else:
                v['__total__'] = sum(v.values())

        # Create dawg/trie of the Wlist items for fast retrieval
        Wlist = [
            x for k, v in list(self.G.items()) for x in v if k.startswith('W')
        ]
        self.date = Date()
        self.Wdawg = IntDAWG(Wlist)
Example #10
def buildpcfg(passwd_dictionary,
              start=0,
              end=-1,
              outf=hny_config.TRAINED_GRAMMAR_FILE):
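    """Build a PCFG rule set from a `<count> <password>` dictionary file:
    parse each frequent-enough password in lines [start, end] and add its
    rules, weighted by count, to a RuleSet. Saves the grammar
    gzip-compressed to outf, or returns the RuleSet when outf is falsy
    (as the parallel builder does).
    """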
    # MIN_COUNT=1000
    R = RuleSet()
    # resource track
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary, 'r')):
        try:
            line = line.decode('utf-8')
        except UnicodeDecodeError:
            print("Cannot decode: {}".format(line))
            continue
        if n < start: continue
        if end >= 0 and n > end: break  # end == -1 means no upper limit
        if n > resource_tracker:
            l = check_resource(n)
            if not l:
                break
            else:
                resource_tracker += l
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        # skip non-ASCII entries
        if not isascii(w):
            continue
        if c < MIN_COUNT:  # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print("Word frequency dropped to %d for %s" % (c, w), n)
            break  # Careful!!!
        T = parse(w)
        R.update_set(T.rule_set(), with_freq=True, freq=c)

    if not outf:
        return R
    else:
        R.save(gzip.open(outf, 'wt'))
Example #11
def buildpcfg(passwd_dictionary):
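    """Build a Grammar from a `<count> <password>` dictionary file by
    inserting every frequent-enough word with its count; aborts early if
    memory use grows too large, then saves the grammar gzip-compressed.
    """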
    G = Grammar()
    # resource track
    resource_tracker = 5240   # Number of lines
    # allowed_sym = re.compile(r'[ \-_]')
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.gzip'
    for n, line in enumerate(open_(passwd_dictionary, 'rt')):
        if n > resource_tracker:
            r = memusage_toomuch(n)
            if r < 0:
                break
            resource_tracker += r / 10 + 100

        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        # try:
        #     w.decode('ascii')
        # except UnicodeDecodeError:
        #     continue  # not ascii hence return
        if c < MIN_COUNT:  # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print("Word frequency dropped to %d for %s" % (c, w), n)
            break  # Careful!!!
        G.insert(w, c)
        # print t
        # root_fl.write("%s\t<>\t%s\n" % (' '.join(line), '~'.join(str((x,y)) for x,y in zip(W, Tag))))

    # TODO
    # push_DotStar_IntoGrammar( grammar );
    G.update_total_freq()
    G.save(gzip.open(out_grammar_fl, 'wt'))
    # marisa_trie.Trie(Grammar.inversemap.keys()).save(out_trie_fl)
    return G
Example #12
def breakwordsintotokens(passwd_dictionary):
    """
    Takes a password list file and break every password into possible tokens,
    writes back to a output_file, named as <input_fl>_out.tar.gz,in csv format.
    """
    # make sure the grammar directory exists
    if not os.path.exists(GRAMMAR_DIR):
        os.mkdir(GRAMMAR_DIR)
    G_out_files = dict()
    for k, f in GrammarStructure().getTermFiles().items():
        G_out_files[k] = os.path.join(GRAMMAR_DIR, f)
    Arr = {}
    for k in G_out_files.keys():
        Arr[k] = dict()
    out_file_name = 'data/' + os.path.basename(passwd_dictionary).split('.')[0] + '_out.tar.gz'
    print passwd_dictionary, out_file_name
    output_file = open(out_file_name, 'wb')
    csv_writer = csv.writer(output_file, delimiter=',',
                            quotechar='"')
    T = Scanner()
    # G = Grammar(scanner=T)
    # resource track
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary)):
        if n > resource_tracker:
            r = MEMLIMMIT*1024 - resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            print "Memory Usage:", (MEMLIMMIT - r / 1024.0), "Lineno:", n
            if r < 0:
                print """
Hitting the memory limit of 1GB,
please increase the limit or use a smaller data set.
Lines processed, %d
""" % n
                break
            resource_tracker += r / 10 + 100
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue     # not ASCII, skip
        if c < MIN_COUNT:
            break
        # P is the patterns, W is the unmangled words, U is the original
        Tags, W, U = T.tokenize(w, True)
        # print t
        if 'password' in w:
            print Tags, W
        if Tags:
            for t, w in zip(Tags, W):
                try:
                    Arr[t][w] += c
                except KeyError:
                    try:
                        Arr[t][w] = c
                    except KeyError:
                        print "Something is wrong:", Tags, W
            csv_writer.writerow([c, w, str(Tags), str(W), str(U)])
        else:
            print 'Failed to Parse:', w
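    # For each tag, build a marisa trie over its words and write an aligned
    # frequency array (last slot holds the column total) out as a .py module.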
    for k, D in Arr.items():
        T = marisa_trie.Trie(D.keys())
        T.save(G_out_files[k] + '.tri')
        n = len(D.keys()) + 1
        A = [0 for i in xrange(n)]
        s = 0
        for w, c in D.items():
            i = T.key_id(unicode(w))
            try:
                A[i] = c
                s += c
            except IndexError:
                print "IndexError", w
        A[-1] = s
        with open(G_out_files[k] + '.py', 'w') as f:
            f.write('%s = [' % k)
            f.write(',\n'.join(['%d' % x for x in A]))
            f.write(']\n')
        # root_fl.write("%d,\'%s\t<>\t%s\n" % ( ' '.join(line), '~'.join(((t)))))
        
    # TODO
    #push_DotStar_IntoGrammar( grammar );
    output_file.close()
Example #13
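    # CLI dispatch: build the grammar (optionally in parallel), parse
    # individual words, or parse a slice of a password file.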
    if args.buildG:
        fname = args.buildG
        if args.parallel:
            parallel_buildpcfg(fname, 1e3)
        else:
            buildpcfg(fname, 0, 1e6)
    elif args.parse:
        for w in args.parse:
            T = parse(w)
            print("Parsing: {}\nParse-tree: {},\nSmallGrammar: {}".format(
                w, T.parse_tree(), T.rule_set()))

    elif args.parsef:
        fname = args.parsef
        R = RuleSet()
        with open_(fname) as f:
            for i, line in enumerate(f):
                if i < 5000: continue
                l = line.strip().split()
                w, c = ' '.join(l[1:]), int(l[0])
                if not isascii(w):
                    continue  # skip non-ASCII entries
                if not w or len(w.strip()) < 1:
                    continue
                T = parse(w)
                R.update_set(T.rule_set(), with_freq=True, freq=c)
                if i % 100 == 0: print(i)
                if i > 5200: break
        print(R)
Example #14
         for c in range(0, 10**6, load_each)]
    R = p.map(wraper_buildpcfg, a)
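    # merge the per-chunk rule sets returned by the worker pool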
    for r in R:
        Complete_grammar.update_set(r, with_freq=True)
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb'))


if __name__ == "__main__":
    if sys.argv[1] == '-buildG':
        print buildpcfg(sys.argv[2], 0, 100)
    elif sys.argv[1] == '-buildparallelG':
        parallel_buildpcfg(sys.argv[2])
    elif sys.argv[1] == '-file':
        R = RuleSet()
        with open_(sys.argv[2]) as f:
            for i, line in enumerate(f):
                if i < 5000: continue
                l = line.strip().split()
                w, c = ' '.join(l[1:]), int(l[0])
                try:
                    w.decode('ascii')
                except UnicodeDecodeError:
                    continue    # not ASCII, skip
                if not w or len(w.strip()) < 1:
                    continue
                T = parse(w)
                R.update_set(T.rule_set(), with_freq=True, freq=c)
                if i % 100 == 0: print i
                if i > 5200: break
        print R
    elif sys.argv[1] == '-parse':
Example #15
def breakwordsintotokens(passwd_dictionary):
    """
    Takes a password list file and break every password into possible tokens,
    writes back to a output_file, named as <input_fl>_out.tar.gz,in csv format.
    """
    # make sure the grammar directory exists
    if not os.path.exists(GRAMMAR_DIR):
        os.mkdir(GRAMMAR_DIR)
    G_out_files = dict()
    for k, f in list(GrammarStructure().getTermFiles().items()):
        G_out_files[k] = os.path.join(GRAMMAR_DIR, f)
    Arr = {}
    for k in list(G_out_files.keys()):
        Arr[k] = dict()
    out_file_name = 'data/' +\
                    os.path.basename(passwd_dictionary).split('.')[0] +\
                    '_out.tar.gz'
    print(passwd_dictionary, out_file_name)
    output_file = gzip.open(out_file_name, 'wt')
    csv_writer = csv.writer(output_file, delimiter=',',
                            quotechar='"')
    T = Scanner()
    # G = Grammar(scanner=T)
    # resource track
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary, 'rt')):
        if n > resource_tracker:
            r = memusage_toomuch(n)
            if r < 0:
                break
            resource_tracker += r / 10 + 100
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue  # skip lines without a leading count
        # try:
        #     w.decode('ascii')
        # except UnicodeDecodeError:
        #     continue  # not ascii hence return
        if c < MIN_COUNT:
            break
        # P is the patterns, W is the unmangled words, U is the original
        Tags, W, U = T.tokenize(w, True)
        # print t
        if 'password' in w:
            print(Tags, W)
        if Tags:
            for t, w in zip(Tags, W):
                try:
                    Arr[t][w] += c
                except KeyError:
                    try:
                        Arr[t][w] = c
                    except KeyError:
                        print("Something is wrong:", Tags, W)
            csv_writer.writerow([c, w, str(Tags), str(W), str(U)])
        else:
            print('Failed to Parse:', w)
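    # For each tag, build a marisa trie over its words and write an aligned
    # frequency array (last slot holds the column total) out as a .py module.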
    for k, D in list(Arr.items()):
        T = marisa_trie.Trie(list(D.keys()))
        T.save(G_out_files[k] + '.tri')
        n = len(list(D.keys())) + 1
        A = [0 for i in range(n)]
        s = 0
        for w, c in list(D.items()):
            i = T.key_id(str(w))
            try:
                A[i] = c
                s += c
            except IndexError:
                print("IndexError", w)
        A[-1] = s
        with open(G_out_files[k] + '.py', 'w') as f:
            f.write('%s = [' % k)
            f.write(',\n'.join(['%d' % x for x in A]))
            f.write(']\n')
            # root_fl.write("%d,\'%s\t<>\t%s\n" % ( ' '.join(line), '~'.join(((t)))))

    # TODO
    # push_DotStar_IntoGrammar( grammar );
    output_file.close()
Example #16
    def load(self, in_file):
        self.G = json.load(open_(in_file),
                           object_pairs_hook=OrderedDict)
Example #17
    def __init__(self):
        self.G = json.load(open_(VAULT_DIST_FILE),
                           object_pairs_hook=OrderedDict)
        for k, v in self.G.items():
            v['__total__'] = sum(v.values())