def create_pcfg(vault_leak, password_leak=None):
    """Learn the password grammar and the vault distribution from a vault leak.

    Args:
        vault_leak: Path to a JSON file that decodes to a mapping; each value
            is a sized, iterable collection of hashable entries (presumably
            the passwords of one vault -- confirm against the data files).
        password_leak: Optional path handed to ``parallel_buildpcfg``. When it
            is falsy, a frequency file is generated at ``PW_TMP_FILE`` from
            ``vault_leak`` (one ``"<count>\\t<entry>"`` line per distinct
            entry, most frequent first) and that path is used instead.

    Side effects:
        Prints summary statistics, may write ``PW_TMP_FILE``, and writes
        ``GRAMMAR_DIR + 'vault_dist.cfg'``.

    Raises:
        ValueError: if the vault mapping is empty (``max``/``min`` of an
            empty sequence).
    """
    # learn the grammar
    with open(vault_leak) as vault_f:
        vault_d = json.load(vault_f)
    vault_sizes = [len(x) for x in vault_d.values()]
    # Single-argument prints give the same output as the original Python 2
    # print statements (including the double space after the first label).
    print("# of vaults:  %d" % len(vault_d))
    print("max size of vault: %d" % max(vault_sizes))
    # Fixed label: this line reports the minimum, not the maximum.
    print("min size of vault: %d" % min(vault_sizes))

    if not password_leak:
        D = defaultdict(int)
        for v in vault_d.values():
            # Vaults with more than 40 entries are left out of the counts.
            if len(v) > 40:
                continue
            for x in v:
                D[x] += 1
        password_leak = PW_TMP_FILE
        by_freq = sorted(D.items(), key=lambda x: x[1], reverse=True)
        with open(password_leak, 'w') as pw_f:
            pw_f.write(
                '\n'.join('%d\t%s' % (freq, pw) for pw, freq in by_freq)
            )
        print("Password file created")
    parallel_buildpcfg(password_leak)

    # learn the vault distribution
    tg = TrainedGrammar()
    G = cal_size_subG(tg, vault_leak)
    # cal_stat is fed an open file object, so G is round-tripped through an
    # anonymous temporary file; always release it, even if cal_stat fails.
    tmp_f = os.tmpfile()
    try:
        json.dump(G, tmp_f)
        tmp_f.seek(0)
        cal_stat(fds=[tmp_f])
    finally:
        tmp_f.close()
    # s = [sum(R[i])/float(len(R[i])) for i in range(len(NT))]
    print(G.keys())
    for v in G.values():
        if len(v) < 30:
            # Pad sparse tables: every index in 1 .. len(v) + 29 (len(v)
            # taken before padding) becomes 5x its current value, with
            # missing indices treated as 1.
            # NOTE(review): assumes the inner keys are ints -- confirm
            # against cal_size_subG.
            for i in range(1, len(v) + 30):
                v[i] = 5 * v.get(i, 1)
    with open(GRAMMAR_DIR + 'vault_dist.cfg', 'wb') as out_f:
        json.dump(G, out_f, indent=2, separators=(',', ':'), sort_keys=True)


if __name__ == "__main__":
    # Command-line entry point; sys.argv[1] selects the mode.
    if sys.argv[1] == '-process':
        # Print the cal_size_subG result for the file in sys.argv[2] as JSON.
        tg = TrainedGrammar()
        print(json.dumps(cal_size_subG(tg, sys.argv[2]), indent=2))
    elif sys.argv[1] == '-stat':
        # give the vaultcleaned files,
        cal_stat(fnames=sys.argv[2:])
    elif sys.argv[1] == '-default':
        tg = TrainedGrammar()
        # The [:1] slice keeps only the 'joe' data set.
        files = [
            "data_vault/%s_vaultcleaned.json" % x
            for x in ['joe', 'weir'][:1]
        ]
        G = {}
        for f in files:
            G.update(cal_size_subG(tg, f))
        # NOTE(review): nothing visible here reads or closes this temporary
        # file; its consumer presumably follows outside this view.
        f = os.tmpfile()
        json.dump(G, f)
        f.seek(0)
def __init__(self, grammar=None, cal_cdf=False):
    """Store the grammar to use on ``self.G``.

    Args:
        grammar: Grammar object to use. Any falsy value (including the
            default ``None``) makes the instance build its own
            ``TrainedGrammar``.
        cal_cdf: Passed through to ``TrainedGrammar(cal_cdf=...)`` when a
            grammar has to be constructed; unused otherwise.
    """
    # ``or`` keeps the original truthiness test: a falsy grammar is replaced.
    self.G = grammar or TrainedGrammar(cal_cdf=cal_cdf)