Example #1
0
def cal_stat(fds=[], fnames=[]):
    V = {}
    R = [[] for n in NT]
    if not fds:
        fds = [open_(f) for f in fnames]
    for fn in fds:
        print fn.name
        for k, v in json.load(fn, object_pairs_hook=OrderedDict).items():
            if v['length'] > 50: continue
            k = ','.join(str(x) for x in [ v["length"]]+ \
                             [max(v.get(nt,0),0) for nt in NT])
            if k not in V:
                for i, nt in enumerate(NT):
                    if v.get(nt, -1) > 0:
                        R[i].append(v[nt])
                V[k] = [max(v.get(nt, -1), 0) for nt in NT]

    G = defaultdict(dict)
    for i, nt in enumerate(NT):
        for f in R[i]:
            G[nt][f] = G[nt].get(f, 0) + 1

    # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))]
    print G.keys()
    for k, v in G.items():
        if len(v) < 30:
            for i in range(1, len(v) + 30):
                v[i] = 5 * v.get(i, 1)
    json.dump(G,
              open(GRAMMAR_DIR + 'vault_dist.cfg', 'wb'),
              indent=2,
              separators=(',', ':'),
              sort_keys=True)
Example #2
0
def cal_stat( fds=[], fnames=[] ):
    V = {}
    R = [[] for n in NT]
    if not fds:
        fds = [open_(f) for f in fnames]
    for fn in fds:
        print fn.name
        for k,v in json.load(fn,
                             object_pairs_hook=OrderedDict).items():
            if v['length']>50: continue
            k = ','.join(str(x) for x in [ v["length"]]+ \
                             [max(v.get(nt,0),0) for nt in NT])
            if k not in V:
                for i,nt in enumerate(NT):
                    if v.get(nt,-1) > 0:
                        R[i].append(v[nt])
                V[k] = [max(v.get(nt, -1),0) for nt in NT]
    
    G = defaultdict(dict)
    for i,nt in enumerate(NT):
        for f in R[i]:
            G[nt][f] = G[nt].get(f, 0) + 1

    # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))]
    print G.keys()
    for k,v in G.items():                              
        if len(v)<30:                 
            for i in range(1,len(v)+30):   
                v[i] = 5*v.get(i, 1) 
    json.dump(G, open(GRAMMAR_DIR+'vault_dist.cfg', 'wb'),
              indent=2, separators=(',',':'), sort_keys=True)
Example #3
0
def buildpcfg(passwd_dictionary, start=0, end=-1):
    #MIN_COUNT=1000
    R = RuleSet()
    # resource track
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary)):
        if n < start: continue
        if n > end: break
        if n > resource_tracker:
            l = check_resource(n)
            if not l:
                break
            else:
                resource_tracker += l
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue
            w, c = ' '.join(line), 1
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue  # not ascii hence return
        if c < MIN_COUNT:  # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print "Word frequency dropped to %d for %s" % (c, w), n
            break  # Careful!!!
        T = parse(w)
        R.update_set(T.rule_set(), with_freq=True, freq=c)
    if end > 0: return R
    R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
Example #4
0
def buildpcfg(passwd_dictionary, start=0, end=-1):
    #MIN_COUNT=1000
    R = RuleSet()
    # resource track
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary)):
        if n<start: continue
        if n>end: break
        if n>resource_tracker:
            l = check_resource(n)
            if not l:
                break
            else:
                resource_tracker += l
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue
            w, c = ' '.join(line), 1
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue    # not ascii hence return
        if c < MIN_COUNT : # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print "Word frequency dropped to %d for %s" % (c, w), n
            break  # Careful!!!
        T = parse(w)
        R.update_set(T.rule_set(), with_freq=True, freq=c)
    if end>0: return R
    R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
Example #5
0
 def __init__(self, vault_fl, mp):
     """Set up the vault state for `vault_fl` using master password `mp`.

     Loads the domain hash map from hny_config.STATIC_DOMAIN_HASH_LIST,
     stores both arguments on the instance, calls initialize_vault(mp) and
     finally builds self.dte from the grammar decoded out of self.H.
     """
     self.pcfg = DTE_large()
     # Read the JSON map inside a context manager so the handle returned by
     # open_() is released right away instead of lingering until collection.
     with open_(hny_config.STATIC_DOMAIN_HASH_LIST) as hash_map_fl:
         self.domain_hash_map = json.load(hash_map_fl)
     self.vault_fl = vault_fl
     self.mp = mp
     self.initialize_vault(mp)
     # NOTE(review): self.H is not assigned here; presumably
     # initialize_vault() sets it, so this line must stay after that call.
     self.dte = DTE(self.pcfg.decode_grammar(self.H))
Example #6
0
 def __init__(self, vault_fl, mp):
     """Set up the vault state for `vault_fl` using master password `mp`.

     Loads the domain hash map from hny_config.STATIC_DOMAIN_HASH_LIST,
     stores both arguments on the instance, calls initialize_vault(mp) and
     finally builds self.dte from the grammar decoded out of self.H.
     """
     self.pcfg = DTE_large()
     # Read the JSON map inside a context manager so the handle returned by
     # open_() is released right away instead of lingering until collection.
     with open_(hny_config.STATIC_DOMAIN_HASH_LIST) as hash_map_fl:
         self.domain_hash_map = json.load(hash_map_fl)
     self.vault_fl = vault_fl
     self.mp = mp
     self.initialize_vault(mp)
     # NOTE(review): self.H is not assigned here; presumably
     # initialize_vault() sets it, so this line must stay after that call.
     self.dte = DTE(self.pcfg.decode_grammar(self.H))
Example #7
0
 def load(self, filename):
     """Load the JSON tables in `filename` into self.G and index the W keys.

     Each top-level value is a mapping of entry -> count.  When
     self.cal_cdf is true the counts are replaced in place by running
     (cumulative) sums in file order; in both modes a '__total__' entry
     holding the sum of the original counts is added to every table.
     Afterwards self.date is set to a fresh Date() and self.Wdawg to an
     IntDAWG built from the entries of every table whose name starts
     with 'W'.
     """
     # Context manager: the handle from open_() is closed once parsed.
     with open_(filename) as grammar_fl:
         self.G = json.load(grammar_fl, object_pairs_hook=OrderedDict)
     for k, v in self.G.items():
         if self.cal_cdf:
             print_err("Calculating CDF!")
             lf = 0
             # Iterate over a snapshot because the values are rewritten
             # while walking the table.
             for l, f in list(v.items()):
                 v[l] += lf
                 lf += f
             v['__total__'] = lf
         else:
             v['__total__'] = sum(v.values())
     # Filter on the table name before descending into its entries, so
     # non-W tables are never walked.  Result and order are unchanged.
     Wlist = [x for k, v in self.G.items() if k.startswith('W') for x in v]
     self.date = Date()
     self.Wdawg = IntDAWG(Wlist)
def build_int_dawg(filename):
    """Build an IntDAWG from the entries of `filename` and write it to disk.

    Every line is converted with a parser function: get_f_w_freq by
    default, or the one returned by get_file_data_format() when the first
    line starts with '#' (that header line itself is not parsed as data).
    A ('__total__', sum_freq) entry is appended, the DAWG is written next to
    the input as '<name>.dawg' and then checked with test_dawg() against the
    first and last ten entries.
    """
    # Local import keeps this block self-contained.
    import os

    with open_(filename) as inpf:
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            w = [freq_style(f_line)]
        w.extend(freq_style(line) for line in inpf)
        # NOTE(review): sum_freq is not defined in this function; presumably
        # a module-level total maintained elsewhere -- confirm.
        w.append(('__total__', sum_freq))
        int_dawg = IntDAWG(w)
        # Strip the extension from the file name only: splitting the whole
        # path at the first '.' breaks on paths such as './dir/words.txt'.
        head, tail = os.path.split(filename)
        of = os.path.join(head, tail.split('.')[0] + '.dawg')
        with open(of, 'wb') as o:
            int_dawg.write(o)
        test_dawg(of, w[:10] + w[-10:])
def build_int_dawg(filename):
    """Build an IntDAWG from the entries of `filename` and write it to disk.

    Every line is converted with a parser function: get_f_w_freq by
    default, or the one returned by get_file_data_format() when the first
    line starts with '#' (that header line itself is not parsed as data).
    A ('__total__', sum_freq) entry is appended, the DAWG is written next to
    the input as '<name>.dawg' and then checked with test_dawg() against the
    first and last ten entries.
    """
    # Local import keeps this block self-contained.
    import os

    with open_(filename) as inpf:
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            w = [freq_style(f_line)]
        w.extend(freq_style(line) for line in inpf)
        # NOTE(review): sum_freq is not defined in this function; presumably
        # a module-level total maintained elsewhere -- confirm.
        w.append(('__total__', sum_freq))
        int_dawg = IntDAWG(w)
        # Strip the extension from the file name only: splitting the whole
        # path at the first '.' breaks on paths such as './dir/words.txt'.
        head, tail = os.path.split(filename)
        of = os.path.join(head, tail.split('.')[0] + '.dawg')
        with open(of, 'wb') as o:
            int_dawg.write(o)
        test_dawg(of, w[:10] + w[-10:])
Example #10
0
 def load(self, filename):
     """Load the JSON tables in `filename` into self.G and index the W keys.

     Each top-level value is a mapping of entry -> count.  When
     self.cal_cdf is true the counts are replaced in place by running
     (cumulative) sums in file order; in both modes a '__total__' entry
     holding the sum of the original counts is added to every table.
     Afterwards self.date is set to a fresh Date() and self.Wdawg to an
     IntDAWG built from the entries of every table whose name starts
     with 'W'.
     """
     # Context manager: the handle from open_() is closed once parsed.
     with open_(filename) as grammar_fl:
         self.G = json.load(grammar_fl,
                            object_pairs_hook=OrderedDict)
     for k,v in self.G.items():
         if self.cal_cdf:
             print_err("Calculating CDF!")
             lf = 0
             # Iterate over a snapshot because the values are rewritten
             # while walking the table.
             for l,f in list(v.items()):
                 v[l] += lf
                 lf += f
             v['__total__'] = lf
         else:
             v['__total__'] = sum(v.values())
     # Filter on the table name before descending into its entries, so
     # non-W tables are never walked.  Result and order are unchanged.
     Wlist = [x
              for k,v in self.G.items()
              if k.startswith('W')
              for x in v]
     self.date = Date()
     self.Wdawg = IntDAWG(Wlist)
Example #11
0
def buildpcfg(passwd_dictionary):
    G = Grammar()
    # resource track
    resource_tracker = 5240
    allowed_sym = re.compile(r'[ \-_]')
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg'
    for n, line in enumerate(open_(passwd_dictionary)):
        if n>resource_tracker:
            r = MEMLIMMIT*1024 - \
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss;
            print "Memory Usage:", (MEMLIMMIT - r/1024.0), "Lineno:", n
            if r < 0:
                print '''
Hitting the memory limit of 1GB,
please increase the limit or use smaller data set.
Lines processed, {0:d}
'''.format(n)
                break;
            resource_tracker += r/10+100;
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue
            w, c = ' '.join(line), 1
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue    # not ascii hence return
        if c < MIN_COUNT : # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0):
            print "Word frequency dropped to %d for %s" % (c, w), n
            break  # Careful!!!
        G.insert(w, c)
        # print t
        # root_fl.write("%s\t<>\t%s\n" % (' '.join(line), '~'.join(str((x,y)) for x,y in zip(W, Tag))))
        
    # TODO
    #push_DotStar_IntoGrammar( grammar );
    G.update_total_freq()
    G.save(bz2.BZ2File(out_grammar_fl, 'w'))
    #marisa_trie.Trie(Grammar.inversemap.keys()).save(out_trie_fl)
    return G
Example #12
0
 def load(self, in_file):
     """Parse `in_file` as JSON into self.G, preserving key order."""
     # Context manager: the handle from open_() is closed once parsed.
     with open_(in_file) as grammar_fl:
         self.G = json.load(grammar_fl,
                            object_pairs_hook=OrderedDict)
Example #13
0
         for c in range(0, 10**6, load_each)]
    R = p.map(wraper_buildpcfg, a)
    for r in R:
        Complete_grammar.update_set(r, with_freq=True)
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb'))


if __name__ == "__main__":
    if sys.argv[1] == '-buildG':
        print buildpcfg(sys.argv[2], 0, 100)
    elif sys.argv[1] == '-buildparallelG':
        parallel_buildpcfg(sys.argv[2])
    elif sys.argv[1] == '-file':
        R = RuleSet()
        with open_(sys.argv[2]) as f:
            for i, line in enumerate(f):
                if i < 5000: continue
                l = line.strip().split()
                w, c = ' '.join(l[1:]), int(l[0])
                try:
                    w.decode('ascii')
                except UnicodeDecodeError:
                    continue  # not ascii hence return
                if not w or len(w.strip()) < 1:
                    continue
                T = parse(w)
                R.update_set(T.rule_set(), with_freq=True, freq=c)
                if i % 100 == 0: print i
                if i > 5200: break
        print R
Example #14
0
def breakwordsintotokens(passwd_dictionary):
    """
    Takes a password list file and break every password into possible tokens,
    writes back to a output_file, named as <input_fl>_out.tar.gz,in csv format.
    """
    # for the direcotry
    if not os.path.exists(GRAMMAR_DIR):
        os.mkdir(GRAMMAR_DIR)
    G_out_files = dict()
    for k, f in GrammarStructure().getTermFiles().items():
        G_out_files[k] = os.path.join(GRAMMAR_DIR, f)
    Arr = {}
    for k in G_out_files.keys():
        Arr[k] = dict()
    out_file_name = 'data/'+os.path.basename(passwd_dictionary).split('.')[0]+'_out.tar.gz'
    print passwd_dictionary, out_file_name
    output_file = open(out_file_name, 'wb')
    csv_writer = csv.writer(output_file, delimiter=',',
                            quotechar='"')
    T = Scanner()
    # G = Grammar(scanner=T)
    # resource track
    resource_tracker = 5240
    for n, line in enumerate(open_(passwd_dictionary)):
        if n>resource_tracker:
            r = MEMLIMMIT*1024 - resource.getrusage(resource.RUSAGE_SELF).ru_maxrss;
            print "Memory Usage:", (MEMLIMMIT - r/1024.0), "Lineno:", n
            if r < 0:
                print """
Hitting the memory limit of 1GB,
please increase the limit or use smaller data set.
Lines processed, %d
""" % n
                break
            resource_tracker += r/10+100
        # if n%1000==0: print n;
        line = line.strip().split()
        if len(line) > 1 and line[0].isdigit():
            w, c = ' '.join(line[1:]), int(line[0])
        else:
            continue
            w, c = ' '.join(line), 1
        try:
            w.decode('ascii')
        except UnicodeDecodeError:
            continue     # not ascii hence return
        if c < MIN_COUNT:
            break
        # P is the patterns, W is the unmangled words, U is the original
        Tags, W, U  = T.tokenize(w, True) 
        # print t
        if 'password' in w:
            print Tags, W
        if Tags:
            for t,w in zip(Tags, W):
                try:
                    Arr[t][w] += c
                except KeyError:
                    try: Arr[t][w] = c
                    except KeyError:
                        print "Something is wrong:", Tags, W
            csv_writer.writerow([c, w, str(Tags), str(W), str(U)])
        else:
            print 'Failed to Parse:', w
    for k, D in Arr.items():
        T = marisa_trie.Trie(D.keys())
        T.save(G_out_files[k] + '.tri')
        n = len(D.keys())+1
        A = [0 for i in xrange(n)]
        s = 0
        for w,c in D.items():
            i = T.key_id(unicode(w))
            try: 
                A[i] =  c
                s += c
            except IndexError: 
                print "IndexError", w
        A[-1] = s
        with open(G_out_files[k] + '.py', 'w') as f:
            f.write('%s = [' % k)
            f.write(',\n'.join(['%d' % x for x in A]))
            f.write(']\n')
        # root_fl.write("%d,\'%s\t<>\t%s\n" % ( ' '.join(line), '~'.join(((t)))))
        
    # TODO
    #push_DotStar_IntoGrammar( grammar );
    output_file.close()
Example #15
0
         for c in range(0, 10**6, load_each)]
    R = p.map(wraper_buildpcfg, a)
    for r in R:
        Complete_grammar.update_set(r, with_freq=True)
    out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2'
    Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb'))


if __name__ == "__main__":
    if sys.argv[1] == '-buildG':
        print buildpcfg(sys.argv[2], 0, 100)
    elif sys.argv[1] == '-buildparallelG':
        parallel_buildpcfg(sys.argv[2])
    elif sys.argv[1]=='-file':
        R = RuleSet()
        with open_(sys.argv[2]) as f:
            for i, line in enumerate(f):
                if i<5000: continue
                l = line.strip().split()
                w, c = ' '.join(l[1:]), int(l[0])
                try: w.decode('ascii')
                except UnicodeDecodeError:
                    continue    # not ascii hence return
                if not w or len(w.strip())<1:
                    continue
                T = parse(w)
                R.update_set(T.rule_set(), with_freq=True, freq=c)
                if i%100==0: print i
                if i>5200: break
        print R
    elif sys.argv[1] == '-parse':
Example #16
0
 def __init__(self):
     """Load the JSON tables from VAULT_DIST_FILE into self.G.

     Every table additionally receives a '__total__' entry holding the sum
     of its values.
     """
     # Context manager: the handle from open_() is closed once parsed.
     with open_(VAULT_DIST_FILE) as dist_fl:
         self.G = json.load(dist_fl,
                            object_pairs_hook=OrderedDict)
     for v in self.G.values():
         v['__total__'] = sum(v.values())