def compute_secloss(guess_file, attpwf, chlpwf, q=100):
    """Print the security loss of a stored guess list against a challenge set.

    The guesses are read from `guess_file`, a JSON list of 2-element
    entries whose first element is the guessed word.  Three success
    rates, all normalised by `chlpwm.totalf()`, are printed:

    - the "normal" success: challenge frequency of the first `q`
      passwords iterated from the attacker's password file,
    - the worst case success: weight of every challenge password that is
      either a guess or a `KB.word_to_typos` neighbour of a guess,
    - `fuzzlambda_q`: weight of those passwords whose
      `get_topk_typos(rpw, NH_SIZE)` list intersects the guess set.

    The last printed value is `fuzzlambda_q - lambda_q`.

    Args:
        guess_file: path of the JSON file holding the guesses.
        attpwf: password file handed to `Passwords` for the attacker.
        chlpwf: password file handed to `Passwords` for the challenge.
        q: kept for interface compatibility only; it is overwritten with
            the number of guesses found in `guess_file`.

    Returns:
        None.  All results are printed.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)

    # Read the guesses inside a context manager so the file handle is
    # released immediately instead of being left to the garbage collector.
    with open(guess_file) as guess_fp:
        guesses = [w for w, _ in json.load(guess_fp)]
    guess_set = set(guesses)

    # The caller supplied `q` is deliberately replaced by the real budget.
    q = len(guesses)
    print("Found {} guesses".format(q))

    # Normalisation constant shared by every success rate below.
    total_f = float(chlpwm.totalf())

    lambda_q = sum(
        chlpwm.pw2freq(pw) for _id, pw, f in attpwm.iterpws(q)
    ) / total_f
    print("Normal succces: {}".format(lambda_q))

    # Every challenge password reachable from some guess, plus the
    # guesses themselves.
    union_ball = {
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    } | guess_set
    print("Worst case success rate = {}"
          .format(sum(chlpwm.pw2freq(w) for w in union_ball) / total_f))

    # A password counts as cracked when at least one of its top typos is
    # among the guesses.
    fuzzlambda_q = sum(
        chlpwm.pw2freq(rpw)
        for rpw in union_ball
        if guess_set.intersection(get_topk_typos(rpw, NH_SIZE))
    ) / total_f

    print("fuzzlambda_q: ", fuzzlambda_q)
    print("Secloss:", fuzzlambda_q - lambda_q)
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Reads the typo trie file and the neighborhood map created by
    `create_pw_nh_graph` function.

    NOTE(review): a second definition of `read_pw_nh_graph` with the same
    logic appears later in this file and replaces this one when the
    module is loaded; one of the two should be deleted.

    Args:
        fname: password file handed to `Passwords`.
        q: intended pruning parameter (see below).  It is currently
            unused because the pruning step is commented out.
        _N: when positive, overrides the module level `N`.

    Returns: (M, A, typo_trie, pwm)
      M is the rpw -> Neighborhood information
        - M[i][0] is the rpw_id, of i-th most probable password
        - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
        - every 0 entry is rewritten to -1 before M is returned
      A is the weight of the balls of all the typos we collected
        - A[i] = Total sum of frequencies of all the rpw in the ball
                 of i-th password in trie. (see typo_trie)
      typo_trie is a mapping from typo_id to typos, so, to retrieve
      the i-th typo in A[i], use typo_trie.restore_key(i).
      typo_trie is not required for computing the total success of
      an attacker.
      pwm is the `Passwords` object built from `fname`.

    q: Prune the typo list based on q value, so that don't worry
       about typos that are very low in the tail, for example, a
       typo with total ball weight < 10*q-th most probable typo, is
       most likely useless. Where assume the average ball size is 10.

    Side effects: rebinds the module level `N` to
    `min(N, len(pwm))` (after applying `_N`).
    """
    # N = 1000
    global N
    if _N > 0:
        N = _N
    typodir = '{}/typodir'.format(pwd)
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
        .format(typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
        .format(typodir, pwm.fbasename, 0, N)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)

    # Close the .npz archive as soon as the array has been extracted.
    with np.load(rpw_nh_graph) as npz:
        M = npz['M']

    ## Extra fix ##
    # Zero entries are turned into the -1 sentinel that the loop below
    # skips.
    M[M == 0] = -1

    A = np.zeros(len(typo_trie))
    for row in M:
        if row[0] <= 0:
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(row[0]))
        # Credit the password's frequency to every valid id in its row.
        A[row[row >= 0]] += p_rpw
    print("Done creating the 'A' array. Size={}".format(A.shape))

    # # Prune the typos, Not all typos are useful, any typo with
    # # frequency less than i_th most probable password will never be
    # # queried.
    # b = (M>0).sum() / float(A.shape[0])  # average ball size
    # print("Average ball size: {}".format(b))
    # bq_th_pw_f = pwm.id2freq(M[int(b*q)][0])
    # useful_typos = (A>=bq_th_pw_f)
    # print("Useful typos (> {}): {}/{}".format(
    #     bq_th_pw_f, useful_typos.sum(), A.shape[0]
    # ))
    return M, A, typo_trie, pwm
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Reads the typo trie file and the neighborhood map created by
    `create_pw_nh_graph` function.

    Args:
        fname: password file handed to `Passwords`.
        q: intended pruning parameter (see below).  It is currently
            unused because the pruning step is commented out.
        _N: when positive, overrides the module level `N`.

    Returns: (M, A, typo_trie, pwm)
      M is the rpw -> Neighborhood information
        - M[i][0] is the rpw_id, of i-th most probable password
        - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
        - every 0 entry is rewritten to -1 before M is returned
      A is the weight of the balls of all the typos we collected
        - A[i] = Total sum of frequencies of all the rpw in the ball
                 of i-th password in trie. (see typo_trie)
      typo_trie is a mapping from typo_id to typos, so, to retrieve
      the i-th typo in A[i], use typo_trie.restore_key(i).
      typo_trie is not required for computing the total success of
      an attacker.
      pwm is the `Passwords` object built from `fname`.

    q: Prune the typo list based on q value, so that don't worry
       about typos that are very low in the tail, for example, a
       typo with total ball weight < 10*q-th most probable typo, is
       most likely useless. Where assume the average ball size is 10.

    Side effects: rebinds the module level `N` to
    `min(N, len(pwm))` (after applying `_N`).
    """
    # N = 1000
    global N
    if _N > 0:
        N = _N
    typodir = '{}/typodir'.format(pwd)
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    name_parts = (typodir, pwm.fbasename, 0, N)
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(*name_parts)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(*name_parts)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)

    # The archive handle is only needed while the array is extracted;
    # the context manager closes it right afterwards.
    with np.load(rpw_nh_graph) as archive:
        M = archive['M']

    ## Extra fix ##
    # Zero entries become the -1 sentinel that is filtered out below.
    M[M == 0] = -1

    A = np.zeros(len(typo_trie))
    for nh_row in M:
        rpw_id = nh_row[0]
        if rpw_id <= 0:
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(rpw_id))
        # Add the password's frequency to the ball weight of every valid
        # id stored in its row.
        A[nh_row[nh_row >= 0]] += p_rpw
    print("Done creating the 'A' array. Size={}".format(A.shape))

    # # Prune the typos, Not all typos are useful, any typo with
    # # frequency less than i_th most probable password will never be
    # # queried.
    # b = (M>0).sum() / float(A.shape[0])  # average ball size
    # print("Average ball size: {}".format(b))
    # bq_th_pw_f = pwm.id2freq(M[int(b*q)][0])
    # useful_typos = (A>=bq_th_pw_f)
    # print("Useful typos (> {}): {}/{}".format(
    #     bq_th_pw_f, useful_typos.sum(), A.shape[0]
    # ))
    return M, A, typo_trie, pwm
def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
    """Write the security loss for guess budgets 10, 100, 1000, ... to CSV.

    The guesses are read from `guess_file` (a JSON list of 2-element
    entries whose first element is the guessed word).  For every budget
    the function accumulates the challenge frequency of the passwords
    whose `get_topk_typos(rpw, NH_SIZE)` list contains one of the guesses
    made so far, and writes one row `q, lambda_q, secloss` to the file
    `guess_file.csv` in the current directory (a fixed name, not derived
    from the `guess_file` argument).

    Args:
        guess_file: path of the JSON file holding the guesses.
        attpwf: unused; kept for interface compatibility.
        chlpwf: password file handed to `Passwords` for the challenge.
        q: kept for interface compatibility only; it is overwritten with
            the number of guesses found in `guess_file`.

    Returns:
        None.

    NOTE(review): when the number of guesses is not a power of ten, the
    last row is still labelled with the next power of ten although the
    guess slice stops at the end of the list.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)

    # Release the guess file handle as soon as the JSON is parsed.
    with open(guess_file) as guess_fp:
        guesses = [w for w, _ in json.load(guess_fp)]
    # guess -> index of that guess in the guess list.
    guess_set = dict((g, i) for i, g in enumerate(guesses))
    q = len(guesses)

    union_ball = list(set([
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    ]))
    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])

    # M[i, j] is the guess index of the j-th top typo of union_ball[i],
    # or -1 when that typo is not a guess.
    M = np.full((len(union_ball), NH_SIZE+1), -1, dtype=np.int32)
    for i, rpw in enumerate(union_ball):
        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
            M[i, j] = guess_set.get(tpw, -1)
    # -1 is the "no guess" sentinel, so index 0 is a valid hit and must be
    # counted as well.
    print("Useful typos:", (M >= 0).sum())

    total_f = float(chlpwm.totalf())
    lambda_topk_q = []
    last_suc = 0
    start = 0  # first guess not yet accounted for
    tq = 1
    while tq < q:
        stop = tq * 10
        # Start from `start` (0 on the first round) so that the very first
        # guess is part of the 10-guess bucket.
        for g in guesses[start:stop]:
            t = guess_set[g]
            hit_rows = (M == t).any(axis=1)
            last_suc += freqs[hit_rows].sum() / total_f
            # A password can only be cracked once.
            freqs[hit_rows] = 0
        lambda_topk_q.append((stop, last_suc))
        print(lambda_topk_q[-1])
        start = stop
        tq = stop

    with open('guess_file.csv', 'wb') as f:
        csvf = csv.writer(f)
        # One header cell per column; splitting on whitespace would emit
        # the whole header as a single quoted field.
        csvf.writerow(['q', 'lambda_q', 'secloss'])
        for tq, succ in lambda_topk_q:
            lambda_q = chlpwm.sumvalues(tq) / total_f
            csvf.writerow([tq, lambda_q, succ - lambda_q])
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False,
                                   offline=False):
    """
    Computes the Neighborhood based on sampling from the typo
    distribution.

    The function greedily picks `q` guesses: in every round it takes the
    typo id with the largest remaining weight in `A`, records it together
    with its weight divided by `pwm.totalf()`, and then lowers the
    weights in `A` for the rows listed in `B[gi]`.  The guesses are
    finally dumped as JSON below `guesses/`.

    NOTE(review): a second definition of `compute_guesses_using_typodist`
    with the same logic appears later in this file and replaces this one
    when the module is loaded; one of the two should be deleted.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.
        nh_size: neighborhood size passed to `_get_typos_for_typodist`
            and used as divisor in the offline weight update.
        topk: selects the "TOPKTypo" instead of the "TYPODIST" process
            name and is forwarded to `_get_typos_for_typodist`.
        offline: selects the offline weight update rule.

    Returns:
        None.

    Side effects: rebinds the module level `proc_name` and `N`, may
    write the typo trie and the `.npz` neighborhood file, and writes the
    guess JSON file.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    if topk:
        name_template = "TOPKTypo-{}-{}-{}"
    else:
        name_template = "TYPODIST-{}-{}-{}"
    proc_name = name_template.format(
        MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, ('off' if offline else 'on'))

    typodir = '{}/typodir'.format(pwd)
    # Built once; the password file used to be parsed twice.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
        .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
        .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    killed = np.ones(M.shape[0], dtype=bool)
    total_f = float(pwm.totalf())  # loop invariant normaliser
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = False if not offline else True
        e = (typo_trie.restore_key(gi), A[gi] / total_f)
        assert offline or (e not in guesses), \
            "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            # Single pre-formatted argument: prints the same text whether
            # `print` is the Python 2 statement or a function.
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]),
                [typo_trie.restore_key(c) for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"
                      .format(typo_trie.restore_key(row[0]), f,
                              typo_trie.restore_key(gi)))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f / float(nh_size)
            else:
                # Every id in this row loses the weight of the password
                # that has just been covered.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()
        ))

    # Sanity check
    # NOTE(review): the comparison has no abs() and the two sides look
    # like they may use different units (normalised vs raw frequency) --
    # confirm that this assertion can actually fail.
    killed_ids = set(
        itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0]))
        for i in killed_ids
    )
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"
          .format(proc_name, 100*pwm.sumvalues(q)/total_f))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
        .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False,
                                   offline=False):
    """
    Computes the Neighborhood based on sampling from the typo
    distribution.

    The function greedily picks `q` guesses: in every round it takes the
    typo id with the largest remaining weight in `A`, records it together
    with its weight divided by `pwm.totalf()`, and then lowers the
    weights in `A` for the rows listed in `B[gi]`.  The guesses are
    finally dumped as JSON below `guesses/`.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.
        nh_size: neighborhood size passed to `_get_typos_for_typodist`
            and used as divisor in the offline weight update.
        topk: selects the "TOPKTypo" instead of the "TYPODIST" process
            name and is forwarded to `_get_typos_for_typodist`.
        offline: selects the offline weight update rule.

    Returns:
        None.

    Side effects: rebinds the module level `proc_name` and `N`, may
    write the typo trie and the `.npz` neighborhood file, and writes the
    guess JSON file.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    prefix = "TOPKTypo" if topk else "TYPODIST"
    proc_name = (prefix + "-{}-{}-{}").format(
        MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, ('off' if offline else 'on'))

    typodir = '{}/typodir'.format(pwd)
    # The password file is parsed a single time (it used to be loaded
    # twice with identical arguments).
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    name_parts = (typodir, pwm.fbasename, N, proc_name)
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(*name_parts)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(*name_parts)
    cached = os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph)
    if cached:
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    killed = np.ones(M.shape[0], dtype=bool)
    total_f = float(pwm.totalf())  # hoisted: constant across rounds
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = False if not offline else True
        e = (typo_trie.restore_key(gi), A[gi] / total_f)
        assert offline or (e not in guesses), \
            "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            # One pre-formatted argument, so the output is the same under
            # the Python 2 print statement and the print function.
            first_ball = [typo_trie.restore_key(c) for c in M[killed_gi, 0]]
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]), first_ball))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"
                      .format(typo_trie.restore_key(row[0]), f,
                              typo_trie.restore_key(gi)))
                continue
            if not offline:
                # Every id in this row loses the weight of the password
                # that has just been covered.
                A[row] -= f
            elif gi == row[0]:
                killed[ri] = False
                A[gi] = 0
            else:
                A[gi] -= f / float(nh_size)
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))

    # Sanity check
    # NOTE(review): the comparison has no abs() and the two sides look
    # like they may use different units (normalised vs raw frequency) --
    # confirm that this assertion can actually fail.
    killed_ids = set(
        itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0])) for i in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"
          .format(proc_name, 100*pwm.sumvalues(q)/total_f))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
        .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)