def cal_stat(fds=[], fnames=[]): V = {} R = [[] for n in NT] if not fds: fds = [open_(f) for f in fnames] for fn in fds: print fn.name for k, v in json.load(fn, object_pairs_hook=OrderedDict).items(): if v['length'] > 50: continue k = ','.join(str(x) for x in [ v["length"]]+ \ [max(v.get(nt,0),0) for nt in NT]) if k not in V: for i, nt in enumerate(NT): if v.get(nt, -1) > 0: R[i].append(v[nt]) V[k] = [max(v.get(nt, -1), 0) for nt in NT] G = defaultdict(dict) for i, nt in enumerate(NT): for f in R[i]: G[nt][f] = G[nt].get(f, 0) + 1 # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))] print G.keys() for k, v in G.items(): if len(v) < 30: for i in range(1, len(v) + 30): v[i] = 5 * v.get(i, 1) json.dump(G, open(GRAMMAR_DIR + 'vault_dist.cfg', 'wb'), indent=2, separators=(',', ':'), sort_keys=True)
def cal_stat( fds=[], fnames=[] ): V = {} R = [[] for n in NT] if not fds: fds = [open_(f) for f in fnames] for fn in fds: print fn.name for k,v in json.load(fn, object_pairs_hook=OrderedDict).items(): if v['length']>50: continue k = ','.join(str(x) for x in [ v["length"]]+ \ [max(v.get(nt,0),0) for nt in NT]) if k not in V: for i,nt in enumerate(NT): if v.get(nt,-1) > 0: R[i].append(v[nt]) V[k] = [max(v.get(nt, -1),0) for nt in NT] G = defaultdict(dict) for i,nt in enumerate(NT): for f in R[i]: G[nt][f] = G[nt].get(f, 0) + 1 # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))] print G.keys() for k,v in G.items(): if len(v)<30: for i in range(1,len(v)+30): v[i] = 5*v.get(i, 1) json.dump(G, open(GRAMMAR_DIR+'vault_dist.cfg', 'wb'), indent=2, separators=(',',':'), sort_keys=True)
def buildpcfg(passwd_dictionary, start=0, end=-1): #MIN_COUNT=1000 R = RuleSet() # resource track out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' resource_tracker = 5240 for n, line in enumerate(open_(passwd_dictionary)): if n < start: continue if n > end: break if n > resource_tracker: l = check_resource(n) if not l: break else: resource_tracker += l # if n%1000==0: print n; line = line.strip().split() if len(line) > 1 and line[0].isdigit(): w, c = ' '.join(line[1:]), int(line[0]) else: continue w, c = ' '.join(line), 1 try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if c < MIN_COUNT: # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0): print "Word frequency dropped to %d for %s" % (c, w), n break # Careful!!! T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if end > 0: return R R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def buildpcfg(passwd_dictionary, start=0, end=-1): #MIN_COUNT=1000 R = RuleSet() # resource track out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' resource_tracker = 5240 for n, line in enumerate(open_(passwd_dictionary)): if n<start: continue if n>end: break if n>resource_tracker: l = check_resource(n) if not l: break else: resource_tracker += l # if n%1000==0: print n; line = line.strip().split() if len(line) > 1 and line[0].isdigit(): w, c = ' '.join(line[1:]), int(line[0]) else: continue w, c = ' '.join(line), 1 try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if c < MIN_COUNT : # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0): print "Word frequency dropped to %d for %s" % (c, w), n break # Careful!!! T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if end>0: return R R.save(bz2.BZ2File(out_grammar_fl, 'wb'))
def __init__(self, vault_fl, mp):
    """
    Set up the vault object.

    vault_fl -- stored on the instance as self.vault_fl.
    mp       -- stored as self.mp and passed to initialize_vault().

    Loads the domain hash map from hny_config.STATIC_DOMAIN_HASH_LIST, then
    builds self.dte from the grammar decoded out of self.H.
    """
    self.pcfg = DTE_large()
    domain_hash_map_fl = hny_config.STATIC_DOMAIN_HASH_LIST
    # Close the handle as soon as the JSON is parsed instead of leaking it.
    with open_(domain_hash_map_fl) as hash_map_f:
        self.domain_hash_map = json.load(hash_map_f)
    self.vault_fl = vault_fl
    self.mp = mp
    # self.H is not assigned in this method; presumably initialize_vault()
    # sets it -- confirm before reordering these calls.
    self.initialize_vault(mp)
    self.dte = DTE(self.pcfg.decode_grammar(self.H))
def load(self, filename):
    """
    Load the grammar stored as JSON in filename into self.G.

    Each value of the top-level object is a mapping of item -> frequency.
    A '__total__' entry holding the sum of the frequencies is added to every
    mapping.  When self.cal_cdf is true the frequencies are first replaced
    by their running (cumulative) sums, in file order.

    Also sets self.date and builds self.Wdawg from the keys of every
    mapping whose name starts with 'W'.
    """
    # Close the handle as soon as the JSON is parsed instead of leaking it.
    with open_(filename) as grammar_f:
        self.G = json.load(grammar_f, object_pairs_hook=OrderedDict)

    for v in self.G.values():
        if self.cal_cdf:
            print_err("Calculating CDF!")
            running = 0
            # items() is a list snapshot on Python 2, and only values of
            # existing keys are rewritten, so updating in place is safe.
            for item, freq in v.items():
                running += freq
                v[item] = running
            v['__total__'] = running
        else:
            v['__total__'] = sum(v.values())

    # Filter on the rule name first so non-'W' mappings are never walked.
    # The '__total__' keys added above are part of this list, as before.
    Wlist = [x for k, v in self.G.items() if k.startswith('W') for x in v]
    self.date = Date()
    self.Wdawg = IntDAWG(Wlist)
def build_int_dawg(filename):
    """
    Build an IntDAWG from the word/frequency file filename and write it next
    to the input as '<name>.dawg', where <name> is the input file's base
    name up to its first dot.

    If the first line starts with '#', the words after the '#' token are
    passed to get_file_data_format() to choose the per-line parser;
    otherwise get_f_w_freq is used and the first line is parsed as data.

    Returns None.  The written file is checked with test_dawg() against the
    first and last ten entries.
    """
    import os

    with open_(filename) as inpf:
        freq_style = get_f_w_freq
        f_line = inpf.readline()
        w = []
        if f_line.startswith('#'):
            words = f_line.strip().split()
            freq_style = get_file_data_format(words[1:])
        else:
            w = [freq_style(f_line)]
        w.extend([freq_style(line) for line in inpf])

    # NOTE(review): sum_freq is not defined in this function; presumably a
    # module-level total maintained by the line parsers -- confirm.  It is
    # read only after every line has been parsed, as in the original.
    w.append(('__total__', sum_freq))
    int_dawg = IntDAWG(w)

    # Strip the extension from the base name only.  Splitting the whole path
    # on '.' broke inputs such as './data/words.txt' or any path with a
    # dotted directory name.
    dir_name, base_name = os.path.split(filename)
    of = os.path.join(dir_name, base_name.split('.')[0] + '.dawg')
    with open(of, 'wb') as o:
        int_dawg.write(o)
    test_dawg(of, w[:10] + w[-10:])
def load(self, filename):
    """
    Load the grammar stored as JSON in filename into self.G.

    Each value of the top-level object is a mapping of item -> frequency.
    A '__total__' entry holding the sum of the frequencies is added to every
    mapping.  When self.cal_cdf is true the frequencies are first replaced
    by their running (cumulative) sums, in file order.

    Also sets self.date and builds self.Wdawg from the keys of every
    mapping whose name starts with 'W'.
    """
    # Close the handle as soon as the JSON is parsed instead of leaking it.
    with open_(filename) as grammar_f:
        self.G = json.load(grammar_f, object_pairs_hook=OrderedDict)

    for v in self.G.values():
        if self.cal_cdf:
            print_err("Calculating CDF!")
            running = 0
            # items() is a list snapshot on Python 2, and only values of
            # existing keys are rewritten, so updating in place is safe.
            for item, freq in v.items():
                running += freq
                v[item] = running
            v['__total__'] = running
        else:
            v['__total__'] = sum(v.values())

    # Filter on the rule name first so non-'W' mappings are never walked.
    # The '__total__' keys added above are part of this list, as before.
    Wlist = [x for k, v in self.G.items() if k.startswith('W') for x in v]
    self.date = Date()
    self.Wdawg = IntDAWG(Wlist)
def buildpcfg(passwd_dictionary): G = Grammar() # resource track resource_tracker = 5240 allowed_sym = re.compile(r'[ \-_]') out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg' for n, line in enumerate(open_(passwd_dictionary)): if n>resource_tracker: r = MEMLIMMIT*1024 - \ resource.getrusage(resource.RUSAGE_SELF).ru_maxrss; print "Memory Usage:", (MEMLIMMIT - r/1024.0), "Lineno:", n if r < 0: print ''' Hitting the memory limit of 1GB, please increase the limit or use smaller data set. Lines processed, {0:d} '''.format(n) break; resource_tracker += r/10+100; # if n%1000==0: print n; line = line.strip().split() if len(line) > 1 and line[0].isdigit(): w, c = ' '.join(line[1:]), int(line[0]) else: continue w, c = ' '.join(line), 1 try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if c < MIN_COUNT : # or (len(w) > 2 and not w[:-2].isalnum() and len(re.findall(allowed_sym, w)) == 0): print "Word frequency dropped to %d for %s" % (c, w), n break # Careful!!! G.insert(w, c) # print t # root_fl.write("%s\t<>\t%s\n" % (' '.join(line), '~'.join(str((x,y)) for x,y in zip(W, Tag)))) # TODO #push_DotStar_IntoGrammar( grammar ); G.update_total_freq() G.save(bz2.BZ2File(out_grammar_fl, 'w')) #marisa_trie.Trie(Grammar.inversemap.keys()).save(out_trie_fl) return G
def load(self, in_file):
    """
    Parse the JSON file in_file into self.G, keeping the key order of the
    file (objects are loaded as OrderedDict).
    """
    # Close the handle as soon as the JSON is parsed instead of leaking it.
    with open_(in_file) as grammar_f:
        self.G = json.load(grammar_f, object_pairs_hook=OrderedDict)
for c in range(0, 10**6, load_each)] R = p.map(wraper_buildpcfg, a) for r in R: Complete_grammar.update_set(r, with_freq=True) out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb')) if __name__ == "__main__": if sys.argv[1] == '-buildG': print buildpcfg(sys.argv[2], 0, 100) elif sys.argv[1] == '-buildparallelG': parallel_buildpcfg(sys.argv[2]) elif sys.argv[1] == '-file': R = RuleSet() with open_(sys.argv[2]) as f: for i, line in enumerate(f): if i < 5000: continue l = line.strip().split() w, c = ' '.join(l[1:]), int(l[0]) try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if not w or len(w.strip()) < 1: continue T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if i % 100 == 0: print i if i > 5200: break print R
def breakwordsintotokens(passwd_dictionary): """ Takes a password list file and break every password into possible tokens, writes back to a output_file, named as <input_fl>_out.tar.gz,in csv format. """ # for the direcotry if not os.path.exists(GRAMMAR_DIR): os.mkdir(GRAMMAR_DIR) G_out_files = dict() for k, f in GrammarStructure().getTermFiles().items(): G_out_files[k] = os.path.join(GRAMMAR_DIR, f) Arr = {} for k in G_out_files.keys(): Arr[k] = dict() out_file_name = 'data/'+os.path.basename(passwd_dictionary).split('.')[0]+'_out.tar.gz' print passwd_dictionary, out_file_name output_file = open(out_file_name, 'wb') csv_writer = csv.writer(output_file, delimiter=',', quotechar='"') T = Scanner() # G = Grammar(scanner=T) # resource track resource_tracker = 5240 for n, line in enumerate(open_(passwd_dictionary)): if n>resource_tracker: r = MEMLIMMIT*1024 - resource.getrusage(resource.RUSAGE_SELF).ru_maxrss; print "Memory Usage:", (MEMLIMMIT - r/1024.0), "Lineno:", n if r < 0: print """ Hitting the memory limit of 1GB, please increase the limit or use smaller data set. 
Lines processed, %d """ % n break resource_tracker += r/10+100 # if n%1000==0: print n; line = line.strip().split() if len(line) > 1 and line[0].isdigit(): w, c = ' '.join(line[1:]), int(line[0]) else: continue w, c = ' '.join(line), 1 try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if c < MIN_COUNT: break # P is the patterns, W is the unmangled words, U is the original Tags, W, U = T.tokenize(w, True) # print t if 'password' in w: print Tags, W if Tags: for t,w in zip(Tags, W): try: Arr[t][w] += c except KeyError: try: Arr[t][w] = c except KeyError: print "Something is wrong:", Tags, W csv_writer.writerow([c, w, str(Tags), str(W), str(U)]) else: print 'Failed to Parse:', w for k, D in Arr.items(): T = marisa_trie.Trie(D.keys()) T.save(G_out_files[k] + '.tri') n = len(D.keys())+1 A = [0 for i in xrange(n)] s = 0 for w,c in D.items(): i = T.key_id(unicode(w)) try: A[i] = c s += c except IndexError: print "IndexError", w A[-1] = s with open(G_out_files[k] + '.py', 'w') as f: f.write('%s = [' % k) f.write(',\n'.join(['%d' % x for x in A])) f.write(']\n') # root_fl.write("%d,\'%s\t<>\t%s\n" % ( ' '.join(line), '~'.join(((t))))) # TODO #push_DotStar_IntoGrammar( grammar ); output_file.close()
for c in range(0, 10**6, load_each)] R = p.map(wraper_buildpcfg, a) for r in R: Complete_grammar.update_set(r, with_freq=True) out_grammar_fl = GRAMMAR_DIR + '/grammar.cfg.bz2' Complete_grammar.save(bz2.BZ2File(out_grammar_fl, 'wb')) if __name__ == "__main__": if sys.argv[1] == '-buildG': print buildpcfg(sys.argv[2], 0, 100) elif sys.argv[1] == '-buildparallelG': parallel_buildpcfg(sys.argv[2]) elif sys.argv[1]=='-file': R = RuleSet() with open_(sys.argv[2]) as f: for i, line in enumerate(f): if i<5000: continue l = line.strip().split() w, c = ' '.join(l[1:]), int(l[0]) try: w.decode('ascii') except UnicodeDecodeError: continue # not ascii hence return if not w or len(w.strip())<1: continue T = parse(w) R.update_set(T.rule_set(), with_freq=True, freq=c) if i%100==0: print i if i>5200: break print R elif sys.argv[1] == '-parse':
def __init__(self):
    """
    Load the distribution stored as JSON in VAULT_DIST_FILE into self.G and
    add a '__total__' entry (sum of the values) to each of its mappings.
    """
    # Close the handle as soon as the JSON is parsed instead of leaking it.
    with open_(VAULT_DIST_FILE) as dist_f:
        self.G = json.load(dist_f, object_pairs_hook=OrderedDict)
    for v in self.G.values():
        v['__total__'] = sum(v.values())