Example #1
0
def compute_secloss(guess_file, attpwf, chlpwf, q=100):
    """Print the security loss of a guess list against a challenge leak.

    Reads the guesses from `guess_file` (JSON: a list of pairs whose first
    element is the guessed word) and prints, each divided by
    `chlpwm.totalf()`:
      - "Normal succces": the summed challenge frequency of the passwords
        yielded by `attpwm.iterpws(q)`;
      - "Worst case success rate": the summed challenge frequency of the
        union ball, i.e. every string from `KB.word_to_typos(guess)` whose
        `chlpwm.pw2id` is non-negative, plus the guesses themselves;
      - "fuzzlambda_q": the summed challenge frequency of the union-ball
        entries for which `get_topk_typos(rpw, NH_SIZE)` shares at least
        one element with the guesses;
      - "Secloss": fuzzlambda_q minus the normal success.

    guess_file: path of the JSON guess file.
    attpwf: attacker password file, loaded through `Passwords`.
    chlpwf: challenge password file, loaded through `Passwords`.
    q: ignored -- it is overwritten with the number of guesses found in
       `guess_file`. Kept in the signature for backward compatibility.

    Returns: None; the results are only printed.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
    # Use a context manager so the guess file is closed deterministically.
    with open(guess_file) as guess_fp:
        guesses = [w for w, _ in json.load(guess_fp)]
    guess_set = set(guesses)
    q = len(guesses)
    print("Found {} guesses".format(q))

    # Every rate below is normalised by the same total; compute it once.
    total_f = float(chlpwm.totalf())

    lambda_q = sum(
        chlpwm.pw2freq(pw) for _id, pw, f in attpwm.iterpws(q)
    ) / total_f
    print("Normal succces: {}".format(lambda_q))

    # Typos of the guesses that are known to the challenge set, together
    # with the guesses themselves.
    union_ball = {
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    } | guess_set

    print("Worst case success rate = {}".format(
        sum(chlpwm.pw2freq(w) for w in union_ball) / total_f))

    # A password counts as cracked when at least one of its top typos is
    # among the guesses.
    fuzzlambda_q = sum(
        chlpwm.pw2freq(rpw)
        for rpw in union_ball
        if not guess_set.isdisjoint(get_topk_typos(rpw, NH_SIZE))
    ) / total_f

    print("fuzzlambda_q: ", fuzzlambda_q)
    print("Secloss:", fuzzlambda_q - lambda_q)
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Reads the typo trie file and the neighborhood map created by
    `create_pw_nh_graph` function.

    fname: password file, loaded through `Passwords`.
    q: intended for pruning typos that sit very low in the tail; it is not
       used by the current implementation.
    _N: when positive, overrides the module-level `N` before the file
        names are built.

    Side effect: the module-level `N` is rebound to
    min(N, len(passwords)).

    Returns: (M, A, typo_trie, pwm)
    M is the rpw -> Neighborhood information
      - M[i][0] is the rpw_id, of i-th most probable password
      - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
      - entries equal to 0 in the stored array are rewritten to -1
    A is the weight of the balls of all the typos we collected
      - A[i] = Total sum of frequencies of all the rpw in the ball
               of i-th password in trie. (see typo_trie)
    typo_trie is a mapping from typo_id to typos, so, to retrieve
    the i-th typo in A[i], use typo_trie.restore_key(i).
    typo_trie is not required for computing the total success of
    an attacker.
    pwm is the `Passwords` object built from `fname`.
    """
    global N
    if _N > 0:
        N = _N
    typodir = '{}/typodir'.format(pwd)
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, 0, N)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)
    # The archive object holds an open file; close it once M is extracted.
    with np.load(rpw_nh_graph) as npz:
        M = npz['M']
    ## Extra fix ##
    # Zero entries are turned into -1 so the ">= 0" filter below drops them.
    M[M == 0] = -1
    A = np.zeros(len(typo_trie))
    for row in M:
        rpw_id = row[0]
        if rpw_id <= 0:
            # No real password recorded in this row.
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(rpw_id))
        # Credit the password's frequency to every valid id in its row.
        A[row[row >= 0]] += p_rpw

    print("Done creating the 'A' array. Size={}".format(A.shape))
    return M, A, typo_trie, pwm
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Load the typo trie and the neighborhood matrix written by
    `create_pw_nh_graph`, and compute the ball weight of every typo.

    Args:
      fname: password file handed to `Passwords`.
      q: meant to drive pruning of low-weight typos; the current body does
         not read it.
      _N: if > 0, replaces the module-level `N` before file names are
          derived from it.

    Side effect: rebinds the module-level `N` to min(N, len(passwords)).

    Returns: (M, A, typo_trie, pwm)
      M is the rpw -> Neighborhood information
        - M[i][0] is the rpw_id, of i-th most probable password
        - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
        - zeros found in the stored array are replaced by -1
      A is the weight of the balls of all the typos we collected
        - A[i] = Total sum of frequencies of all the rpw in the ball
                 of i-th password in trie. (see typo_trie)
      typo_trie is a mapping from typo_id to typos; to retrieve the i-th
        typo in A[i], use typo_trie.restore_key(i). It is not required
        for computing the total success of an attacker.
      pwm is the `Passwords` object created from `fname`.
    """
    global N
    if _N > 0:
        N = _N
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))

    typodir = '{}/typodir'.format(pwd)
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(
        typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(
        typodir, pwm.fbasename, 0, N)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)

    # np.load on an .npz keeps the file open until the archive is closed.
    with np.load(rpw_nh_graph) as archive:
        M = archive['M']
    ## Extra fix ##
    # Treat 0 as "no entry": rewrite it to -1 so it is filtered out below.
    M[M == 0] = -1

    A = np.zeros(len(typo_trie))
    for nh_row in M:
        if nh_row[0] <= 0:
            # Row without a real password id; nothing to add.
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(nh_row[0]))
        valid_ids = nh_row[nh_row >= 0]
        # Add the password's frequency to each valid id of its row.
        A[valid_ids] += p_rpw

    print("Done creating the 'A' array. Size={}".format(A.shape))
    return M, A, typo_trie, pwm
Example #4
0
def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
    """Write the security loss for growing guess budgets to 'guess_file.csv'.

    The guesses are read from `guess_file` (JSON: a list of pairs whose
    first element is the guessed word). For budgets of 10, 100, 1000, ...
    guesses the function accumulates the challenge frequency of every
    union-ball password that has one of the guesses among
    `get_topk_typos(rpw, NH_SIZE)`; each password is counted once, for the
    earliest guess that hits it. One CSV row `q, lambda_q, secloss` is
    written per budget, where `lambda_q` is
    `chlpwm.sumvalues(budget) / chlpwm.totalf()` and `secloss` is the
    accumulated fuzzy success minus `lambda_q`.

    guess_file: path of the JSON guess file.
    attpwf: attacker password file, loaded through `Passwords`.
    chlpwf: challenge password file, loaded through `Passwords`.
    q: ignored -- it is overwritten with the number of guesses found in
       `guess_file`. Kept in the signature for backward compatibility.

    Returns: None; progress is printed and the result goes to the CSV file.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    # attpwm is not read below; the construction is kept to preserve the
    # original behavior of loading both files.
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)

    # Use a context manager so the guess file is closed deterministically.
    with open(guess_file) as guess_fp:
        guesses = [w for w, _ in json.load(guess_fp)]
    # guess -> position in the guess list.
    guess_set = dict((g, i) for i, g in enumerate(guesses))

    q = len(guesses)
    union_ball = list({
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    })

    total_f = float(chlpwm.totalf())
    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])
    # M[i, j] is the guess position of the j-th top typo of union_ball[i],
    # or -1 when that typo is not a guess.
    M = np.full((len(union_ball), NH_SIZE+1), -1, dtype=np.int32)
    for i, rpw in enumerate(union_ball):
        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
            M[i, j] = guess_set.get(tpw, -1)
    # Position 0 is a valid guess, so count ">= 0" rather than "> 0".
    print("Useful typos:", (M >= 0).sum())

    lambda_topk_q = []
    succ = 0.0
    start = 0
    budget = 10
    # Each round handles guesses[start:budget]. The first round starts at
    # index 0 so that the very first guess is evaluated too.
    while start < q:
        for g in guesses[start:budget]:
            t = guess_set[g]
            hit = (M == t).any(axis=1)
            succ += freqs[hit].sum() / total_f
            # Zero the frequency so a password is never counted twice.
            freqs[hit] = 0
        lambda_topk_q.append((budget, succ))
        print(lambda_topk_q[-1])
        start = budget
        budget *= 10

    # NOTE: binary mode is what the Python 2 csv module expects; under
    # Python 3 this would have to be open(..., 'w', newline='').
    with open('guess_file.csv', 'wb') as f:
        csvf = csv.writer(f)
        # Three header cells; a bare .split() would keep them as one.
        csvf.writerow(['q', 'lambda_q', 'secloss'])
        for tq, succ_tq in lambda_topk_q:
            lambda_q = chlpwm.sumvalues(tq) / total_f
            csvf.writerow([tq, lambda_q, succ_tq - lambda_q])
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    Greedily selects `q` guesses: at every step the typo id with the
    largest weight in `A` is chosen, recorded together with
    `A[id] / pwm.totalf()`, and the weights are reduced for the rows of `M`
    that contain it. The guesses are finally dumped as JSON to
    'guesses/<fbasename>_guesses_<q>_typodist_<nh_size>_<proc_name>.json'.

    fname: password file, loaded through `Passwords`.
    q: number of guesses to generate.
    nh_size: neighborhood size; passed to `_get_typos_for_typodist` and
        used as the divisor of the offline weight update.
    topk: selects the "TOPKTypo" process name instead of "TYPODIST" and is
        passed to `_get_typos_for_typodist`.
    offline: selects the offline weight update and disables the
        "guess not already chosen" assertion.

    Side effects: rebinds the module-level `proc_name` and `N`; when the
    cached trie / graph files are missing they are created under
    '<pwd>/typodir'.

    Returns: None.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    name_template = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_template.format(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                                     ('off' if offline else 'on'))

    # Build the Passwords object once (it used to be constructed twice).
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    # One flag per row of M; it starts True and is cleared in the loop.
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi]/float(pwm.totalf()))
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            # Function-call form of print: valid on Python 2 and Python 3.
            print("gi={}, {} -> {} ({}), "
                  .format(gi, e[0], len(B[gi]),
                          [typo_trie.restore_key(c)
                           for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"\
                      .format(typo_trie.restore_key(row[0]), f, typo_trie.restore_key(gi)))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f/float(nh_size)
            else:
                # NOTE(review): `row` is used as an index array as-is; any
                # negative id in it would index from the end of A --
                # TODO confirm M has no negative entries on this path.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name,
            len(guesses), guesses[-1][0],
            guesses[-1][1]*100, len(killed_gi), M.shape[0]-killed.sum()
        ))
    # Sanity check
    # NOTE(review): fuzzlambda_q sums values already divided by
    # pwm.totalf(), killed_pws_weight sums raw pw2freq values, and the
    # difference is not wrapped in abs() -- verify that this check is as
    # strict as intended.
    killed_ids = set(itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0]))
        for i in killed_ids
    )
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"\
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"\
          .format(proc_name, 100*pwm.sumvalues(q)/float(pwm.totalf())))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)
def compute_guesses_using_typodist(fname,
                                   q,
                                   nh_size=5,
                                   topk=False,
                                   offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    Picks `q` guesses greedily. Each round takes the typo id with the
    maximum weight in `A`, stores `(typo, A[id] / pwm.totalf())` as the
    guess, and lowers the weights for the rows of `M` listed in `B[id]`.
    The resulting list is written as JSON to
    'guesses/<fbasename>_guesses_<q>_typodist_<nh_size>_<proc_name>.json'.

    Args:
      fname: password file, loaded through `Passwords`.
      q: how many guesses to produce.
      nh_size: neighborhood size; forwarded to `_get_typos_for_typodist`
        and used as the divisor in the offline weight update.
      topk: chooses the "TOPKTypo" process name over "TYPODIST" and is
        forwarded to `_get_typos_for_typodist`.
      offline: switches to the offline weight update and skips the
        "guess not already chosen" assertion.

    Side effects: rebinds the module-level `proc_name` and `N`; creates
    the cached trie / graph files under '<pwd>/typodir' when they do not
    exist yet.

    Returns: None.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    prefix = "TOPKTypo" if topk else "TYPODIST"
    proc_name = (prefix + "-{}-{}-{}").format(
        MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, ('off' if offline else 'on'))

    # A single Passwords instance is enough; the second construction that
    # used to follow was redundant.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(
        typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(
        typodir, pwm.fbasename, N, proc_name)
    have_cache = (os.path.exists(tpw_trie_fname)
                  and os.path.exists(rpw_nh_graph))
    if have_cache:
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    # One flag per row of M; initially True, cleared inside the loop.
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi] / float(pwm.totalf()))
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            # print() call instead of the Python 2 print statement, so the
            # function parses on Python 3 as well.
            first_ball = [typo_trie.restore_key(c) for c in M[killed_gi, 0]]
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]), first_ball))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"\
                      .format(typo_trie.restore_key(row[0]), f, typo_trie.restore_key(gi)))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f / float(nh_size)
            else:
                # NOTE(review): the whole row is used as an index array;
                # a negative id in it would address A from the end --
                # TODO confirm rows carry no negative entries here.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))
    # Sanity check
    # NOTE(review): fuzzlambda_q is a sum of values normalised by
    # pwm.totalf() whereas killed_pws_weight is a sum of raw pw2freq
    # values, and no abs() is applied -- verify the intended strictness.
    killed_ids = set(
        itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0])) for i in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"\
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"\
          .format(proc_name, 100*pwm.sumvalues(q)/float(pwm.totalf())))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)