def greedy_maxcoverage_heap(fname, q, **kwargs):
    """Greedily select up to `q` guesses whose balls cover the most weight.

    The passwords of `fname` are visited in the order `Passwords` iterates
    them; entries shorter than 6 characters are skipped.  On every step the
    heap of candidates is drained lazily: a candidate whose stored weight
    is still up to date and is at least `f * ballsize` becomes a guess and
    the frequencies inside its ball (`getball`) are zeroed; an out-of-date
    candidate is put back with its refreshed weight.  Afterwards every
    string produced by `apply_edits` for the current password (minus the
    guesses already made) is pushed with the current weight of its ball.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The list of chosen guesses.  The same list is written to
        'guess_<q>.json' in the current directory.
    """
    global pwm
    # `pwm` is a module global assigned before the pool is created,
    # presumably so that `getball` running in the workers can read it
    # -- TODO confirm.
    pwm = Passwords(fname)
    # float() keeps the ratios below from collapsing to 0 under Python 2
    # integer division (other routines in this file do the same).
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    l = 1  # heap size that triggers the next progress line
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            neighbors = list(
                set(apply_edits(rpw.encode('ascii', errors='ignore'))) - done)
            # NOTE(review): entries are re-inserted while iterating; this
            # presumably relies on sorted_iter() popping what it yields.
            for tpw, w in subset_heap.sorted_iter():
                w = -w
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w == nw:
                    if w >= f * ballsize:  # correct value
                        print("Guess({}/{}): {} weight: {}"
                              .format(len(guess_list), q, tpw, w / total_f))
                        done.add(tpw)
                        guess_list.append(tpw)
                        pwfreq[ball] = 0
                        if len(guess_list) >= q:
                            break
                    else:  # The ball weight is still small
                        subset_heap[tpw] = -nw
                        break
                else:
                    # Stored weight is stale: refresh it and keep scanning.
                    subset_heap[tpw] = -nw
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Exponential moving average of the largest ball seen per step.
            ballsize = ballsize * 0.9 + b_max * 0.1

            if len(subset_heap) > l:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                l = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(time.time() - st, i, rpw, f))
            if len(guess_list) >= q:
                break
        normal_succ = pwm.sumvalues(q=q) / total_f
        guessed_pws = np.unique(np.concatenate(pool.map(getball, guess_list)))
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Reads the typo trie file and the neighborhood map created by
    `create_pw_nh_graph` function.

    Returns: (M, A, typo_trie, pwm)
    M is the rpw -> Neighborhood information
      - M[i][0] is the rpw_id, of i-th most probable password
      - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
      - entries equal to 0 are rewritten to -1 before M is returned
    A is the weight of the balls of all the typos we collected
      - A[i] = Total sum of frequencies of all the rpw in the ball
               of i-th password in trie. (see typo_trie)
    typo_trie is a mapping from typo_id to typos, so, to retrieve
    the i-th typo in A[i], use typo_trie.restore_key(i).
    typo_trie is not required for computing the total success of
    an attacker.
    pwm is the `Passwords` object loaded from `fname`.

    q: intended to prune typos that are very low in the tail.  The
       pruning code is commented out below, so `q` currently has no
       effect.
    _N: when positive, overrides the module-level `N`.  In every case
       the global `N` is then clamped to the number of passwords.
    """
    # N = 1000
    global N
    if _N > 0:
        N = _N
    typodir = '{}/typodir'.format(pwd)
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, 0, N)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)
    M = np.load(rpw_nh_graph)['M']
    ## Extra fix ##
    # 0 is treated as padding from here on, exactly like -1.
    M[M == 0] = -1
    A = np.zeros(len(typo_trie))
    for i in xrange(M.shape[0]):
        if M[i, 0] <= 0:
            # Row without a real password id: nothing to add.
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(M[i, 0]))
        # Credit the password's frequency to every typo id in its row.
        A[M[i, M[i] >= 0]] += p_rpw

    print("Done creating the 'A' array. Size={}".format(A.shape))
    # # Prune the typos, Not all typos are useful, any typo with
    # # frequency less than i_th most probable password will never be
    # # queried.
    # b = (M>0).sum() / float(A.shape[0])   # average ball size
    # print("Average ball size: {}".format(b))
    # bq_th_pw_f = pwm.id2freq(M[int(b*q)][0])
    # useful_typos = (A>=bq_th_pw_f)
    # print("Useful typos (> {}): {}/{}".format(
    #     bq_th_pw_f, useful_typos.sum(), A.shape[0]
    # ))
    return M, A, typo_trie, pwm
def read_pw_nh_graph(fname, q=-1, _N=-1):
    """Reads the typo trie file and the neighborhood map created by
    `create_pw_nh_graph` function.

    Returns: (M, A, typo_trie, pwm)
    M is the rpw -> Neighborhood information
      - M[i][0] is the rpw_id, of i-th most probable password
      - M[i][1:] is the neighborhood, truncated to MAX_NH_SIZE (500)
      - entries equal to 0 are rewritten to -1 before M is returned
    A is the weight of the balls of all the typos we collected
      - A[i] = Total sum of frequencies of all the rpw in the ball
               of i-th password in trie. (see typo_trie)
    typo_trie is a mapping from typo_id to typos, so, to retrieve
    the i-th typo in A[i], use typo_trie.restore_key(i).
    typo_trie is not required for computing the total success of
    an attacker.
    pwm is the `Passwords` object loaded from `fname`.

    q: intended to prune typos that are very low in the tail.  The
       pruning code is commented out below, so `q` currently has no
       effect.
    _N: when positive, overrides the module-level `N`.  In every case
       the global `N` is then clamped to the number of passwords.
    """
    # N = 1000
    global N
    if _N > 0:
        N = _N
    typodir = '{}/typodir'.format(pwd)
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, 0, N)

    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)
    M = np.load(rpw_nh_graph)['M']
    ## Extra fix ##
    # 0 is treated as padding from here on, exactly like -1.
    M[M == 0] = -1
    A = np.zeros(len(typo_trie))
    for i in xrange(M.shape[0]):
        if M[i, 0] <= 0:
            # Row without a real password id: nothing to add.
            continue
        p_rpw = pwm.pw2freq(typo_trie.restore_key(M[i, 0]))
        # Credit the password's frequency to every typo id in its row.
        A[M[i, M[i] >= 0]] += p_rpw

    print("Done creating the 'A' array. Size={}".format(A.shape))
    # # Prune the typos, Not all typos are useful, any typo with
    # # frequency less than i_th most probable password will never be
    # # queried.
    # b = (M>0).sum() / float(A.shape[0])   # average ball size
    # print("Average ball size: {}".format(b))
    # bq_th_pw_f = pwm.id2freq(M[int(b*q)][0])
    # useful_typos = (A>=bq_th_pw_f)
    # print("Useful typos (> {}): {}/{}".format(
    #     bq_th_pw_f, useful_typos.sum(), A.shape[0]
    # ))
    return M, A, typo_trie, pwm
def verify(fname):
    pwm = Passwords(fname)
    typodir = '{}/typodir'.format(pwd)
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(typodir, pwm.fbasename, 0, N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(typodir, pwm.fbasename, 0, N)
    print tpw_trie_fname, rpw_nh_graph
    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)
    M = np.load(rpw_nh_graph)['M']
    for i, (pwid, f) in enumerate(pwm):
        if random.randint(0, 10000)<=1:
            continue
        if i>=N: break
        rpw = str(pwm.id2pw(pwid).encode('ascii', errors='ignore'))
        nh = get_nh(rpw)
        assert rpw == typo_trie.restore_key(M[i, 0]), \
            "{} <--> {}".format(pwm.id2pw(pwid), typo_trie.restore_key(M[i, 0]))
        nh_t = [typo_trie.restore_key(c) for c in M[i] if c>=0]
        assert nh == nh_t, ">>> i: {}\nNH-NH_t={}\nNH_t-NH={},\nlen(nh)={}"\
            .format(i, set(nh)-set(nh_t), set(nh_t)-set(nh), len(nh))
        if (i%100==0):
            print "Done {}".format(i)
Example #5
0
def compute_black_list_succ(fname, b, q, sketch_size):
    """Computes the offline success rate of an attacker who has access to
    the sketch and wants to make q (int) queries per password.

    b is either a number or a set. If b is a number then this specify
    black listing top b passwords.  fname is the attacker's password
    model. In case b is a number, then the black list is chosen from
    the top b passwords of the attacker's model, which sounds iffy,
    but that implies that the attacker has complete knowledge of the
    real password distribution.

    The first n = q * 2**sketch_size passwords are read from the model.
    A black-listed password has its frequency spread evenly over
    2**sketch_size slots, any other password keeps a single slot, and
    the n heaviest slots divided by the total frequency are returned.

    NOTE(review): the original tested `nfarr.shape[0] < n`, which can
    never hold because the array has n * 2**sketch_size entries, so the
    top-n selection was unreachable and the black list had no effect on
    the result.  The comparison is now `>`.

    Raises StopIteration if the model holds fewer than n passwords.
    """
    pwf = Passwords(fname)
    n_sketches = 2**sketch_size
    n = q * n_sketches
    pwarr, farr = ['' for _ in range(n)], [0 for _ in range(n)]
    pwiter = pwf.iterpws()
    for i in range(n):
        pwarr[i], farr[i] = next(pwiter)
    if isinstance(b, int):
        b = pwarr[:b]
    if not isinstance(b, set):
        b = set(b)
    # Worst case every password is black-listed and needs n_sketches slots,
    # so the write position j can never run past the end of the array.
    nfarr = np.zeros(n * n_sketches)
    j = 0
    for pw, freq in zip(pwarr, farr):
        if pw in b:
            nfarr[j:j+n_sketches] = float(freq)/n_sketches
            j += n_sketches
        else:
            nfarr[j] = freq
            j += 1
    print("{} {}".format(nfarr.shape, n))
    if nfarr.shape[0] > n:
        # Negate so that partition puts the n largest slots first.
        return -np.partition(-nfarr, n)[:n].sum()/pwf.totalf()
    else:
        return nfarr.sum()/pwf.totalf()
def verify(fname):
    pwm = Passwords(fname)
    typodir = '{}/typodir'.format(pwd)
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'.format(typodir, pwm.fbasename, 0,
                                                     N)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'.format(
        typodir, pwm.fbasename, 0, N)
    print tpw_trie_fname, rpw_nh_graph
    typo_trie = marisa_trie.Trie()
    typo_trie.load(tpw_trie_fname)
    M = np.load(rpw_nh_graph)['M']
    for i, (pwid, f) in enumerate(pwm):
        if random.randint(0, 10000) <= 1:
            continue
        if i >= N: break
        rpw = str(pwm.id2pw(pwid).encode('ascii', errors='ignore'))
        nh = get_nh(rpw)
        assert rpw == typo_trie.restore_key(M[i, 0]), \
            "{} <--> {}".format(pwm.id2pw(pwid), typo_trie.restore_key(M[i, 0]))
        nh_t = [typo_trie.restore_key(c) for c in M[i] if c >= 0]
        assert nh == nh_t, ">>> i: {}\nNH-NH_t={}\nNH_t-NH={},\nlen(nh)={}"\
            .format(i, set(nh)-set(nh_t), set(nh_t)-set(nh), len(nh))
        if (i % 100 == 0):
            print "Done {}".format(i)
def create_pw_nh_graph(fname):
    """Build the neighborhood graph for `fname` in chunks, then join them.

    The range [0, N) is cut into pieces of SPLIT passwords that are handed
    to `create_part_pw_nh_graph` through a process pool.  For N below 1e5
    the parts are joined in a single call; otherwise they are first joined
    in groups of 100 parts (in parallel) and those groups are joined in a
    final call.
    """
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    split = SPLIT
    # N = 1000
    pool = multiprocessing.Pool()
    try:
        # Create with split 1000
        args = [(pwm, i, i + split) for i in xrange(0, N, split)]
        pool.map(create_part_pw_nh_graph, args)
        print("Done creating all the parts")
        # `join_pw_nh_graphs` is driven by pool.map below, so it receives
        # one tuple argument; the direct calls must pass a tuple as well.
        # (The small-N branch used to pass four positional arguments.)
        if N < 1e5:
            join_pw_nh_graphs((pwm, split, 0, N))
        else:
            group = split * 100
            args1 = [(pwm, split, i, i + group)
                     for i in xrange(0, N, group)]
            pool.map(join_pw_nh_graphs, args1)
            join_pw_nh_graphs((pwm, group, 0, N))
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
Example #8
0
def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
    """Tabulate the security loss of a guess list for q = 10, 100, ...

    The guesses are read from the JSON file `guess_file` (a list of
    pairs whose first item is the guess).  For every challenge password
    reachable from a guess through `KB.word_to_typos`, the guesses found
    among its `get_topk_typos` are recorded.  The guesses are then
    consumed in order; after each power-of-ten prefix the cumulative
    fraction of challenge weight hit so far is stored.  The rows
    (q, lambda_q, secloss) are written to 'guess_file.csv'.

    Args:
        guess_file: path of the JSON guess list.
        attpwf: attacker password file; not used by this routine, kept
            for signature compatibility.
        chlpwf: challenge password file handed to `Passwords`.
        q: ignored; it is replaced by the number of guesses read.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)

    with open(guess_file) as gf:
        guesses = [w for w, _ in json.load(gf)]
    # guess -> position in the guess list
    guess_set = dict((g, i) for i, g in enumerate(guesses))

    q = len(guesses)
    total_f = float(chlpwm.totalf())
    union_ball = list(set([
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw)>=0
    ]))

    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])
    # M[i, j] = guess index of the j-th typo of union_ball[i], -1 if the
    # typo is not a guess.
    M = np.full((len(union_ball), NH_SIZE+1), -1, dtype=np.int32)
    for i, rpw in enumerate(union_ball):
        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
            M[i, j] = guess_set.get(tpw, -1)
    # Index 0 is a valid guess, so count >= 0 rather than > 0.
    print("Useful typos:", (M>=0).sum())
    lambda_topk_q = []
    last_suc = 0
    start, tq = 0, 1
    while tq<q:
        stop = tq*10
        # Start at `start` (0 on the first round) so that the very first
        # guess is included; slicing from `tq` skipped guesses[0].
        for g in guesses[start:stop]:
            t = guess_set[g]
            hit = (M==t).any(axis=1)
            last_suc += freqs[hit].sum()/total_f
            # A password is counted once: zero it after the first hit.
            freqs[hit] = 0
        lambda_topk_q.append((stop, last_suc))
        print(lambda_topk_q[-1])
        start = stop
        tq = stop

    with open('guess_file.csv', 'wb') as f:
        csvf = csv.writer(f)
        # Split on commas so the header is three columns, not one.
        csvf.writerow('q,lambda_q,secloss'.split(','))
        for tq, succ in lambda_topk_q:
            lambda_q = chlpwm.sumvalues(tq)/total_f
            csvf.writerow([tq, lambda_q, succ-lambda_q])
def greedy_maxcoverage_heap(fname, q, **kwargs):
    """Greedily select up to `q` guesses whose balls cover the most weight.

    The passwords of `fname` are visited in the order `Passwords` iterates
    them; entries shorter than 6 characters are skipped.  On every step the
    heap of candidates is drained lazily: a candidate whose stored weight
    is still up to date and is at least `f * ballsize` becomes a guess and
    the frequencies inside its ball (`getball`) are zeroed; an out-of-date
    candidate is put back with its refreshed weight.  Afterwards every
    string produced by `apply_edits` for the current password (minus the
    guesses already made) is pushed with the current weight of its ball.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        The list of chosen guesses.  The same list is written to
        'guess_<q>.json' in the current directory.
    """
    global pwm
    # `pwm` is a module global assigned before the pool is created,
    # presumably so that `getball` running in the workers can read it
    # -- TODO confirm.
    pwm = Passwords(fname)
    # float() keeps the ratios below from collapsing to 0 under Python 2
    # integer division (other routines in this file do the same).
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    l = 1  # heap size that triggers the next progress line
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            neighbors = list(
                set(apply_edits(rpw.encode('ascii', errors='ignore'))) - done)
            # NOTE(review): entries are re-inserted while iterating; this
            # presumably relies on sorted_iter() popping what it yields.
            for tpw, w in subset_heap.sorted_iter():
                w = -w
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w == nw:
                    if w >= f * ballsize:  # correct value
                        print("Guess({}/{}): {} weight: {}"
                              .format(len(guess_list), q, tpw, w / total_f))
                        done.add(tpw)
                        guess_list.append(tpw)
                        pwfreq[ball] = 0
                        if len(guess_list) >= q:
                            break
                    else:  # The ball weight is still small
                        subset_heap[tpw] = -nw
                        break
                else:
                    # Stored weight is stale: refresh it and keep scanning.
                    subset_heap[tpw] = -nw
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Exponential moving average of the largest ball seen per step.
            ballsize = ballsize * 0.9 + b_max * 0.1

            if len(subset_heap) > l:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                l = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(time.time() - st, i, rpw, f))
            if len(guess_list) >= q:
                break
        normal_succ = pwm.sumvalues(q=q) / total_f
        guessed_pws = np.unique(np.concatenate(pool.map(getball, guess_list)))
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def approx_guesses(fname, q):
    """Greedy guess selection where only real passwords are candidates.

    Same lazy heap loop as `greedy_maxcoverage_heap`, except that the only
    candidate pushed for each visited password is the password itself
    (no `apply_edits` expansion).  Passwords shorter than 6 characters
    are skipped.  A candidate whose stored ball weight is up to date and
    at least `f * ballsize` becomes a guess and the frequencies in its
    ball (`getball`) are zeroed.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.

    Returns:
        The list of chosen guesses.  The same list is written to
        'approx_guess_<q>.json' in the current directory.
    """
    global pwm
    # `pwm` is a module global, presumably read by `getball`
    # -- TODO confirm.
    pwm = Passwords(fname)
    # float() keeps the ratios below from collapsing to 0 under Python 2
    # integer division (other routines in this file do the same).
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    l = 1  # heap size that triggers the next progress line
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        # NOTE(review): entries are re-inserted while iterating; this
        # presumably relies on sorted_iter() popping what it yields.
        for tpw, w in subset_heap.sorted_iter():
            w = -w
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w == nw:
                if w >= f * ballsize:  # correct value
                    print("Guess({}/{}): {} weight: {}"
                          .format(len(guess_list), q, tpw, w / total_f))
                    guess_list.append(tpw)
                    pwfreq[ball] = 0
                    if len(guess_list) >= q:
                        break
                else:  # The ball weight is still small
                    subset_heap[tpw] = -nw
                    break
            else:
                # Stored weight is stale: refresh it and keep scanning.
                subset_heap[tpw] = -nw
        # The password itself is the only new candidate of this step.
        ball = getball(rpw)
        ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
        subset_heap[rpw] = -pwfreq[ball].sum()

        if len(subset_heap) > l:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize))
            l = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(time.time() - st, i, rpw,
                                                   f))
        if len(guess_list) >= q:
            break
    normal_succ = pwm.sumvalues(q=q) / total_f
    pool = multiprocessing.Pool(7)
    try:
        guessed_pws = np.unique(np.concatenate(pool.map(getball, guess_list)))
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    # Text mode, like the sibling greedy routine: json.dump writes str.
    with open('approx_guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    The typo trie and neighborhood matrix are read from the cache under
    '<pwd>/typodir' when both files exist; otherwise they are built with
    `_get_typos_for_typodist` and saved there.  Then `q` guesses are
    picked one at a time: the typo id with the largest remaining weight
    in `A` is taken, and the weights are reduced for every row listed in
    `B` for that id.  The guesses, as (typo, weight / total frequency)
    pairs, are written to 'guesses/<basename>_guesses_<q>_typodist_...json'.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to pick.
        nh_size: neighborhood size passed to `_get_typos_for_typodist`;
            also the divisor of the offline weight update.
        topk: selects the "TOPKTypo" run name instead of "TYPODIST" and is
            forwarded to `_get_typos_for_typodist`.
        offline: switches to the offline weight-update rule.

    Side effects: sets the globals `proc_name` and `N`.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    name_fmt = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_fmt.format(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                                ('off' if offline else 'on'))

    typodir = '{}/typodir'.format(pwd)
    # Loaded once; the original built the same object twice in a row.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    # killed[r] is True while row r of M has not been marked as covered.
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses)<q:
        gi = A.argmax() # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        # Online: the rows are covered right away.  Offline: they stay
        # True here and are cleared below only on an exact match.
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi]/float(pwm.totalf()))
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), "
                  .format(gi, e[0], len(B[gi]),
                          [typo_trie.restore_key(c)
                           for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f<=0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"\
                      .format(typo_trie.restore_key(row[0]), f, typo_trie.restore_key(gi)))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f/float(nh_size)
            else:
                # The password is covered: remove its weight from every
                # typo id in its row.
                # NOTE(review): if `row` contains -1 padding this also
                # decrements A[-1] -- confirm how M is padded here.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name,
            len(guesses), guesses[-1][0],
            guesses[-1][1]*100, len(killed_gi), M.shape[0]-killed.sum()
        ))
    # Sanity check
    killed_ids = set(itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0]))
        for i in killed_ids
    )
    fuzzlambda_q = sum(g[1] for g in guesses)
    # NOTE(review): fuzzlambda_q is normalised by totalf() while
    # killed_pws_weight looks like a raw frequency sum, and the check is
    # one-sided -- confirm what this assertion is meant to verify.
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"\
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"\
          .format(proc_name, 100*pwm.sumvalues(q)/float(pwm.totalf())))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)
def approx_guesses(fname, q):
    """Greedy guess selection where only real passwords are candidates.

    Same lazy heap loop as `greedy_maxcoverage_heap`, except that the only
    candidate pushed for each visited password is the password itself
    (no `apply_edits` expansion).  Passwords shorter than 6 characters
    are skipped.  A candidate whose stored ball weight is up to date and
    at least `f * ballsize` becomes a guess and the frequencies in its
    ball (`getball`) are zeroed.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to produce.

    Returns:
        The list of chosen guesses.  The same list is written to
        'approx_guess_<q>.json' in the current directory.
    """
    global pwm
    # `pwm` is a module global, presumably read by `getball`
    # -- TODO confirm.
    pwm = Passwords(fname)
    # float() keeps the ratios below from collapsing to 0 under Python 2
    # integer division (other routines in this file do the same).
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    l = 1  # heap size that triggers the next progress line
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        # NOTE(review): entries are re-inserted while iterating; this
        # presumably relies on sorted_iter() popping what it yields.
        for tpw, w in subset_heap.sorted_iter():
            w = -w
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w == nw:
                if w >= f * ballsize:  # correct value
                    print("Guess({}/{}): {} weight: {}"
                          .format(len(guess_list), q, tpw, w / total_f))
                    guess_list.append(tpw)
                    pwfreq[ball] = 0
                    if len(guess_list) >= q:
                        break
                else:  # The ball weight is still small
                    subset_heap[tpw] = -nw
                    break
            else:
                # Stored weight is stale: refresh it and keep scanning.
                subset_heap[tpw] = -nw
        # The password itself is the only new candidate of this step.
        ball = getball(rpw)
        ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
        subset_heap[rpw] = -pwfreq[ball].sum()

        if len(subset_heap) > l:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize))
            l = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(time.time() - st, i, rpw,
                                                   f))
        if len(guess_list) >= q:
            break
    normal_succ = pwm.sumvalues(q=q) / total_f
    pool = multiprocessing.Pool(7)
    try:
        guessed_pws = np.unique(np.concatenate(pool.map(getball, guess_list)))
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    # Text mode, like the sibling greedy routine: json.dump writes str.
    with open('approx_guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def compute_guesses_using_typodist(fname,
                                   q,
                                   nh_size=5,
                                   topk=False,
                                   offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    The typo trie and neighborhood matrix are read from the cache under
    '<pwd>/typodir' when both files exist; otherwise they are built with
    `_get_typos_for_typodist` and saved there.  Then `q` guesses are
    picked one at a time: the typo id with the largest remaining weight
    in `A` is taken, and the weights are reduced for every row listed in
    `B` for that id.  The guesses, as (typo, weight / total frequency)
    pairs, are written to 'guesses/<basename>_guesses_<q>_typodist_...json'.

    Args:
        fname: password file handed to `Passwords`.
        q: number of guesses to pick.
        nh_size: neighborhood size passed to `_get_typos_for_typodist`;
            also the divisor of the offline weight update.
        topk: selects the "TOPKTypo" run name instead of "TYPODIST" and is
            forwarded to `_get_typos_for_typodist`.
        offline: switches to the offline weight-update rule.

    Side effects: sets the globals `proc_name` and `N`.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    name_fmt = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_fmt.format(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                                ('off' if offline else 'on'))

    typodir = '{}/typodir'.format(pwd)
    # Loaded once; the original built the same object twice in a row.
    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    guesses = []
    # killed[r] is True while row r of M has not been marked as covered.
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        # Online: the rows are covered right away.  Offline: they stay
        # True here and are cleared below only on an exact match.
        killed[killed_gi] = bool(offline)
        e = (typo_trie.restore_key(gi), A[gi] / float(pwm.totalf()))
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), "
                  .format(gi, e[0], len(B[gi]),
                          [typo_trie.restore_key(c)
                           for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            f = pwm.pw2freq(typo_trie.restore_key(row[0]))
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"\
                      .format(typo_trie.restore_key(row[0]), f, typo_trie.restore_key(gi)))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f / float(nh_size)
            else:
                # The password is covered: remove its weight from every
                # typo id in its row.
                # NOTE(review): if `row` contains -1 padding this also
                # decrements A[-1] -- confirm how M is padded here.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))
    # Sanity check
    killed_ids = set(
        itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[i, 0])) for i in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    # NOTE(review): fuzzlambda_q is normalised by totalf() while
    # killed_pws_weight looks like a raw frequency sum, and the check is
    # one-sided -- confirm what this assertion is meant to verify.
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"\
          .format(proc_name, 100*fuzzlambda_q))
    print("({}): Total normal success: {}"\
          .format(proc_name, 100*pwm.sumvalues(q)/float(pwm.totalf())))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)
Example #14
0
def compute_secloss(guess_file, attpwf, chlpwf, q=100):
    """Print the security loss of a guess list against a challenge set.

    The guesses are read from the JSON file `guess_file` (a list of pairs
    whose first item is the guess).  Three figures are printed, each as a
    fraction of the challenge set's total frequency:
      - the normal success: challenge weight of the attacker model's
        first q passwords;
      - a worst-case success: weight of every challenge password that is
        a guess or is returned by `KB.word_to_typos` for a guess;
      - the fuzzy success: weight of those passwords that have at least
        one guess among their `get_topk_typos`.
    The last line printed is fuzzy success minus normal success.

    Args:
        guess_file: path of the JSON guess list.
        attpwf: attacker password file handed to `Passwords`.
        chlpwf: challenge password file handed to `Passwords`.
        q: ignored; it is replaced by the number of guesses read.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)
    attpwm = Passwords(attpwf, max_pass_len=25, min_pass_len=5)
    # Close the guess file deterministically instead of leaking the handle.
    with open(guess_file) as gf:
        guesses = [w for w, _ in json.load(gf)]
    guess_set = set(guesses)
    q = len(guesses)
    total_f = float(chlpwm.totalf())
    print("Found {} guesses".format(q))
    lambda_q = sum(chlpwm.pw2freq(pw) for _id, pw, f
                   in attpwm.iterpws(q))/total_f
    print("Normal succces: {}".format(lambda_q))
    union_ball = set([
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw)>=0
    ]) | guess_set

    print("Worst case success rate = {}"\
          .format(sum(chlpwm.pw2freq(w) for w in union_ball)/total_f))

    # global N
    # N = 10000
    # M, A, typo_trie, _ = read_pw_nh_graph(chlpwf, N)
    # Mprime = np.zeros((M.shape[0], NH_SIZE+1))
    # B = [[] for _ in guesses]
    # # for g in xrange(M.shape[0]):
    # M = Mprime
    # fuzzlambda_q = 0.0
    # guess_key_ids = [get_trie_id(typo_trie, g) for g in guess_set]
    # killed = []

    # for rpw in union_ball:
    #     try:
    #         rpwid = typo_trie.key_id(unicode(rpw))
    #         for g in guess_key_ids:
    #             if (M[M[:, 0] == rpwid] == g).any:
    #                 killed.append(rpw)
    #     except KeyError:
    #         continue
    # fuzzlambda_q = sum([chlpwm.pw2freq(w) for w in killed])/chlpwm.totalf()
    # for rpw in union_ball:
    #     a = set(get_topk_typos(rpw, NH_SIZE+1)) & guess_set
    #     if a:
    #         print rpw, chlpwm.pw2freq(rpw)

    # A password counts as cracked when any of its top typos is a guess.
    fuzzlambda_q = sum(
        chlpwm.pw2freq(rpw)
        for rpw in union_ball
        if not guess_set.isdisjoint(get_topk_typos(rpw, NH_SIZE))
    )/total_f
    # print("fuzzlambda_q:", fuzzlambda_q),

    # lambda_topk_q = sum(
    #     chlpwm.pw2freq(rpw)
    #     for rpw in union_ball
    #     if len(set(get_typodist_nh(rpw, NH_SIZE)) & guess_set)>0
    # )/chlpwm.totalf()
    print("fuzzlambda_q: ", fuzzlambda_q)
    print("Secloss:", fuzzlambda_q - lambda_q)