def approx_guesses(fname, q):
    """
    Greedily build a list of up to `q` guesses ranked by ball weight.

    Every password of length >= 6 yielded by ``Passwords(fname)`` is inserted
    into a priority dictionary keyed by the negated sum of the remaining
    frequencies in its ball (``getball``).  Before each insertion the queue is
    re-examined lazily: an entry whose stored weight is still current and is
    at least ``f * ballsize`` is accepted as a guess and the frequencies in
    its ball are zeroed.  Here ``f`` is the frequency of the password being
    visited and ``ballsize`` is a running average of the ball sizes seen.

    Side effects: rebinds the module-global ``pwm``, prints progress, and
    writes the guesses to ``approx_guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: maximum number of guesses to produce.

    Returns:
        The list of selected guesses (possibly shorter than ``q``).
    """
    global pwm  # rebound here; presumably read by getball() -- confirm
    pwm = Passwords(fname)
    # float() keeps the success ratios from being floor-divided on Python 2.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    next_report = 1
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        neighbors = [rpw]
        # NOTE(review): entries are re-inserted while iterating, which only
        # makes sense if sorted_iter() pops entries as it yields them --
        # confirm against priority_dict.
        for tpw, w in subset_heap.sorted_iter():
            w = -w
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w == nw:
                if w >= f * ballsize:  # correct value
                    print("Guess({}/{}): {} weight: {}".format(
                        len(guess_list), q, tpw, w / total_f))
                    guess_list.append(tpw)
                    pwfreq[ball] = 0
                    if len(guess_list) >= q:
                        break
                else:  # The ball weight is still small
                    subset_heap[tpw] = -nw
                    break
            else:
                # Stored weight was stale: put it back with the fresh value.
                subset_heap[tpw] = -nw
        for tpw in neighbors:
            ball = getball(tpw)
            ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
            subset_heap[tpw] = -pwfreq[ball].sum()

        if len(subset_heap) > next_report:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize))
            next_report = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(time.time() - st, i, rpw,
                                                   f))
        if len(guess_list) >= q:
            break
    normal_succ = pwm.sumvalues(q=q) / total_f
    if guess_list:
        pool = multiprocessing.Pool(7)
        try:
            balls = pool.map(getball, guess_list)
        finally:
            # Always release the worker processes.
            pool.close()
            pool.join()
        guessed_pws = np.unique(np.concatenate(balls))
        fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    else:
        # np.concatenate() rejects an empty sequence; nothing was guessed.
        fuzzy_succ = 0.0
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    # Text mode: json.dump() writes str, which a binary handle rejects on Py3.
    with open('approx_guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def greedy_maxcoverage_heap(fname, q, **kwargs):
    """
    Greedily select up to `q` guesses using a lazily updated priority queue.

    For every password of length >= 6 yielded by ``Passwords(fname)``, the
    candidates returned by ``apply_edits`` (minus guesses already chosen) are
    inserted into a priority dictionary keyed by the negated sum of the
    remaining frequencies in their ball (``getball``).  Before the insertion,
    queued entries are re-evaluated: one whose stored weight is still current
    and is at least ``f * ballsize`` is accepted as a guess and the
    frequencies in its ball are zeroed.

    Side effects: rebinds the module-global ``pwm``, prints progress, and
    writes the guesses to ``guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: maximum number of guesses to produce.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        The list of selected guesses (possibly shorter than ``q``).
    """
    global pwm  # rebound here; presumably read by getball() -- confirm
    pwm = Passwords(fname)
    # float() keeps the success ratios from being floor-divided on Python 2.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    next_report = 1
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            # Materialised once so names and balls below are paired reliably.
            neighbors = list(
                set(apply_edits(rpw.encode('ascii', errors='ignore'))) - done)
            # NOTE(review): entries are re-inserted while iterating, which
            # only makes sense if sorted_iter() pops entries as it yields
            # them -- confirm against priority_dict.
            for tpw, w in subset_heap.sorted_iter():
                w = -w
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w == nw:
                    if w >= f * ballsize:  # correct value
                        print("Guess({}/{}): {} weight: {}".format(
                            len(guess_list), q, tpw, w / total_f))
                        done.add(tpw)
                        guess_list.append(tpw)
                        pwfreq[ball] = 0
                        if len(guess_list) >= q:
                            break
                    else:  # The ball weight is still small
                        subset_heap[tpw] = -nw
                        break
                else:
                    # Stored weight was stale: put it back, refreshed.
                    subset_heap[tpw] = -nw
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Running average of the largest ball seen per password.
            ballsize = ballsize * 0.9 + b_max * 0.1

            if len(subset_heap) > next_report:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                next_report = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(time.time() - st, i, rpw,
                                                  f))
            if len(guess_list) >= q:
                break
        normal_succ = pwm.sumvalues(q=q) / total_f
        if guess_list:
            guessed_pws = np.unique(
                np.concatenate(pool.map(getball, guess_list)))
            fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
        else:
            # np.concatenate() rejects an empty sequence; nothing guessed.
            fuzzy_succ = 0.0
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def greedy_maxcoverage_heap(fname, q, **kwargs):
    """
    Greedily select up to `q` guesses using a lazily updated priority queue.

    For every password of length >= 6 yielded by ``Passwords(fname)``, the
    candidates returned by ``apply_edits`` (minus guesses already chosen) are
    inserted into a priority dictionary keyed by the negated sum of the
    remaining frequencies in their ball (``getball``).  Before the insertion,
    queued entries are re-evaluated: one whose stored weight is still current
    and is at least ``f * ballsize`` is accepted as a guess and the
    frequencies in its ball are zeroed.

    Side effects: rebinds the module-global ``pwm``, prints progress, and
    writes the guesses to ``guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: maximum number of guesses to produce.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        The list of selected guesses (possibly shorter than ``q``).
    """
    global pwm  # rebound here; presumably read by getball() -- confirm
    pwm = Passwords(fname)
    # float() keeps the success ratios from being floor-divided on Python 2.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 2000  # I don't care any bigger ball
    done = set()
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    next_report = 1
    st = time.time()
    pool = multiprocessing.Pool(5)
    try:
        for i, (pwid, f) in enumerate(pwm):
            rpw = pwm.id2pw(pwid)
            if len(rpw) < 6:
                continue
            # Materialised once so names and balls below are paired reliably.
            neighbors = list(
                set(apply_edits(rpw.encode('ascii', errors='ignore'))) - done)
            # NOTE(review): entries are re-inserted while iterating, which
            # only makes sense if sorted_iter() pops entries as it yields
            # them -- confirm against priority_dict.
            for tpw, w in subset_heap.sorted_iter():
                w = -w
                ball = getball(tpw)
                nw = pwfreq[ball].sum()
                if w == nw:
                    if w >= f * ballsize:  # correct value
                        print("Guess({}/{}): {} weight: {}".format(
                            len(guess_list), q, tpw, w / total_f))
                        done.add(tpw)
                        guess_list.append(tpw)
                        pwfreq[ball] = 0
                        if len(guess_list) >= q:
                            break
                    else:  # The ball weight is still small
                        subset_heap[tpw] = -nw
                        break
                else:
                    # Stored weight was stale: put it back, refreshed.
                    subset_heap[tpw] = -nw
            b_max = 0
            for tpw, ball in zip(neighbors, pool.map(getball, neighbors)):
                subset_heap[tpw] = -pwfreq[ball].sum()
                b_max = max(b_max, ball.shape[0])
            # Running average of the largest ball seen per password.
            ballsize = ballsize * 0.9 + b_max * 0.1

            if len(subset_heap) > next_report:
                print(">< ({}) : Heap size: {} ballsize: {}".format(
                    time.time() - st, len(subset_heap), ballsize))
                next_report = len(subset_heap) * 2
            if i % 10 == 0:
                print("({}) : {}: {} ({})".format(time.time() - st, i, rpw,
                                                  f))
            if len(guess_list) >= q:
                break
        normal_succ = pwm.sumvalues(q=q) / total_f
        if guess_list:
            guessed_pws = np.unique(
                np.concatenate(pool.map(getball, guess_list)))
            fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
        else:
            # np.concatenate() rejects an empty sequence; nothing guessed.
            fuzzy_succ = 0.0
    finally:
        # Always release the worker processes, even on error.
        pool.close()
        pool.join()
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    with open('guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
# Example #4
# 0
def compute_secloss_with_varying_q(guess_file, attpwf, chlpwf, q=100):
    """
    Tabulate cumulative guessing success for the top 10, 100, 1000, ... guesses.

    Loads ``(guess, weight)`` pairs from the JSON file `guess_file`, collects
    every typo of every guess (``KB.word_to_typos``) that exists in the
    challenge password set, and records, for each such password, which
    guesses appear among its ``get_topk_typos`` entries.  The guesses are then
    consumed in batches ending at 10, 100, 1000, ...; a password counts as
    covered by the first guess that appears in its row, and its frequency is
    zeroed so it is not counted twice.

    Writes ``guess_file.csv`` (literal file name) with columns ``q``,
    ``lambda_q`` (``chlpwm.sumvalues(q) / totalf``) and ``secloss`` (the
    cumulative success minus ``lambda_q``).

    Args:
        guess_file: path to the JSON list of ``[guess, weight]`` pairs.
        attpwf: unused; kept so existing callers keep working.
        chlpwf: challenge password file handed to ``Passwords``.
        q: ignored; the number of guesses in `guess_file` is used instead.

    Returns:
        None.
    """
    chlpwm = Passwords(chlpwf, max_pass_len=25, min_pass_len=5)

    with open(guess_file) as gf:
        guesses = [w for w, _ in json.load(gf)]
    # guess -> rank; -1 is reserved below as the "not a guess" marker.
    guess_set = dict((g, i) for i, g in enumerate(guesses))

    q = len(guesses)
    union_ball = list(set(
        rpw
        for w in guesses
        for rpw in KB.word_to_typos(str(w))
        if chlpwm.pw2id(rpw) >= 0
    ))

    total_f = float(chlpwm.totalf())
    freqs = np.array([chlpwm.pw2freq(w) for w in union_ball])
    M = np.full((len(union_ball), NH_SIZE + 1), -1, dtype=np.int32)
    for i, rpw in enumerate(union_ball):
        for j, tpw in enumerate(get_topk_typos(rpw, NH_SIZE)):
            M[i, j] = guess_set.get(tpw, -1)
    # Rank 0 is a valid guess, so count everything that is not the -1 filler.
    print("Useful typos: {}".format((M >= 0).sum()))

    tq = 1
    start = 0  # first batch must begin at the top-ranked guess
    last_suc = 0.0
    lambda_topk_q = []
    while start < q:
        for g in guesses[start:tq * 10]:
            t = guess_set[g]
            covered = (M == t).any(axis=1)
            last_suc += freqs[covered].sum() / total_f
            freqs[covered] = 0  # do not credit the same password twice
        lambda_topk_q.append((tq * 10, last_suc))
        print(lambda_topk_q[-1])
        start = tq * 10
        tq *= 10

    # Binary mode is what the Python 2 csv module expects; on Python 3 this
    # would need mode 'w' with newline=''.
    with open('guess_file.csv', 'wb') as f:
        csvf = csv.writer(f)
        csvf.writerow(['q', 'lambda_q', 'secloss'])
        for tq, succ in lambda_topk_q:
            lambda_q = chlpwm.sumvalues(tq) / total_f
            csvf.writerow([tq, lambda_q, succ - lambda_q])
def compute_guesses_using_typodist(fname, q, nh_size=5, topk=False, offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    Loads (or builds and caches under ``<pwd>/typodir``) the structures
    ``M``, ``B``, ``A`` and ``typo_trie``, then repeatedly takes the typo id
    with the largest value in ``A`` as the next guess until `q` guesses have
    been chosen.  After each pick the weights in ``A`` are reduced for the
    rows listed in ``B[gi]``; the update rule differs between online and
    offline mode.

    Side effects: rebinds the module-globals ``proc_name`` and ``N``, prints
    progress, and writes the guesses to a JSON file under ``guesses/``.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses to select.
        nh_size: neighbourhood size passed to the typo generator; also the
            divisor of the offline weight update.
        topk: selects the "TOPKTypo" naming and is forwarded to
            ``_get_typos_for_typodist``.
        offline: use the offline update rule instead of the online one.

    Returns:
        None.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    name_fmt = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_fmt.format(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                                ('off' if offline else 'on'))

    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    total_f = float(pwm.totalf())
    guesses = []
    # NOTE(review): despite the name, False marks a row as killed (the
    # progress line reports M.shape[0] - killed.sum()).
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        guess_pw = typo_trie.restore_key(gi)
        e = (guess_pw, A[gi] / total_f)
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]),
                [typo_trie.restore_key(c) for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            rpw = typo_trie.restore_key(row[0])  # column 0 holds the real pw
            f = pwm.pw2freq(rpw)
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"
                      .format(rpw, f, guess_pw))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f / float(nh_size)
            else:
                # Every typo in this row loses the weight of the real pw.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name,
            len(guesses), guesses[-1][0],
            guesses[-1][1] * 100, len(killed_gi), M.shape[0] - killed.sum()
        ))
    # Sanity check
    # NOTE(review): fuzzlambda_q is a normalised fraction while
    # killed_pws_weight is a raw frequency sum, and there is no abs(), so
    # this assertion is very unlikely to fire -- confirm the intended check.
    killed_ids = set(itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[ri, 0]))
        for ri in killed_ids
    )
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"
          .format(proc_name, 100 * fuzzlambda_q))
    print("({}): Total normal success: {}"
          .format(proc_name, 100 * pwm.sumvalues(q) / total_f))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)
def approx_guesses(fname, q):
    """
    Greedily build a list of up to `q` guesses ranked by ball weight.

    Every password of length >= 6 yielded by ``Passwords(fname)`` is inserted
    into a priority dictionary keyed by the negated sum of the remaining
    frequencies in its ball (``getball``).  Before each insertion the queue is
    re-examined lazily: an entry whose stored weight is still current and is
    at least ``f * ballsize`` is accepted as a guess and the frequencies in
    its ball are zeroed.  Here ``f`` is the frequency of the password being
    visited and ``ballsize`` is a running average of the ball sizes seen.

    Side effects: rebinds the module-global ``pwm``, prints progress, and
    writes the guesses to ``approx_guess_<q>.json`` in the working directory.

    Args:
        fname: password file handed to ``Passwords``.
        q: maximum number of guesses to produce.

    Returns:
        The list of selected guesses (possibly shorter than ``q``).
    """
    global pwm  # rebound here; presumably read by getball() -- confirm
    pwm = Passwords(fname)
    # float() keeps the success ratios from being floor-divided on Python 2.
    total_f = float(pwm.totalf())
    subset_heap = priority_dict()
    guess_list = []
    ballsize = 1000  # I don't care any bigger ball
    pwfreq = np.copy(pwm.values())  # deep copy of the frequencies
    next_report = 1
    st = time.time()
    for i, (pwid, f) in enumerate(pwm):
        rpw = pwm.id2pw(pwid)
        if len(rpw) < 6:
            continue
        neighbors = [rpw]
        # NOTE(review): entries are re-inserted while iterating, which only
        # makes sense if sorted_iter() pops entries as it yields them --
        # confirm against priority_dict.
        for tpw, w in subset_heap.sorted_iter():
            w = -w
            ball = getball(tpw)
            nw = pwfreq[ball].sum()
            if w == nw:
                if w >= f * ballsize:  # correct value
                    print("Guess({}/{}): {} weight: {}".format(
                        len(guess_list), q, tpw, w / total_f))
                    guess_list.append(tpw)
                    pwfreq[ball] = 0
                    if len(guess_list) >= q:
                        break
                else:  # The ball weight is still small
                    subset_heap[tpw] = -nw
                    break
            else:
                # Stored weight was stale: put it back with the fresh value.
                subset_heap[tpw] = -nw
        for tpw in neighbors:
            ball = getball(tpw)
            ballsize = ballsize * 0.9 + ball.shape[0] * 0.1
            subset_heap[tpw] = -pwfreq[ball].sum()

        if len(subset_heap) > next_report:
            print(">> ({}) : Heap size: {} ballsize: {}".format(
                time.time() - st, len(subset_heap), ballsize
            ))
            next_report = len(subset_heap) * 2
        if i % 30 == 0:
            print(">> ({}) : {}: {!r} ({})".format(time.time() - st, i, rpw, f))
        if len(guess_list) >= q:
            break
    normal_succ = pwm.sumvalues(q=q) / total_f
    if guess_list:
        pool = multiprocessing.Pool(7)
        try:
            balls = pool.map(getball, guess_list)
        finally:
            # Always release the worker processes.
            pool.close()
            pool.join()
        guessed_pws = np.unique(np.concatenate(balls))
        fuzzy_succ = pwm.values()[guessed_pws].sum() / total_f
    else:
        # np.concatenate() rejects an empty sequence; nothing was guessed.
        fuzzy_succ = 0.0
    print("normal succ: {}, fuzzy succ: {}".format(normal_succ, fuzzy_succ))
    # Text mode: json.dump() writes str, which a binary handle rejects on Py3.
    with open('approx_guess_{}.json'.format(q), 'w') as f:
        json.dump(guess_list, f)
    return guess_list
def compute_guesses_using_typodist(fname,
                                   q,
                                   nh_size=5,
                                   topk=False,
                                   offline=False):
    """
    Computes the Neighborhood based on sampling from the typo distribution.

    Loads (or builds and caches under ``<pwd>/typodir``) the structures
    ``M``, ``B``, ``A`` and ``typo_trie``, then repeatedly takes the typo id
    with the largest value in ``A`` as the next guess until `q` guesses have
    been chosen.  After each pick the weights in ``A`` are reduced for the
    rows listed in ``B[gi]``; the update rule differs between online and
    offline mode.

    Side effects: rebinds the module-globals ``proc_name`` and ``N``, prints
    progress, and writes the guesses to a JSON file under ``guesses/``.

    Args:
        fname: password file handed to ``Passwords``.
        q: number of guesses to select.
        nh_size: neighbourhood size passed to the typo generator; also the
            divisor of the offline weight update.
        topk: selects the "TOPKTypo" naming and is forwarded to
            ``_get_typos_for_typodist``.
        offline: use the offline update rule instead of the online one.

    Returns:
        None.
    """
    # Re-create the neighborhood, it should be small
    global proc_name, N
    print(N, MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF, nh_size)
    name_fmt = "TOPKTypo-{}-{}-{}" if topk else "TYPODIST-{}-{}-{}"
    proc_name = name_fmt.format(MIN_ENTROPY_CUTOFF, REL_ENT_CUTOFF,
                                ('off' if offline else 'on'))

    pwm = Passwords(fname, max_pass_len=25, min_pass_len=5)
    typodir = '{}/typodir'.format(pwd)
    N = min(N, len(pwm))
    tpw_trie_fname = '{}/{}__{}_{}_typo.trie'\
                     .format(typodir, pwm.fbasename, N, proc_name)
    rpw_nh_graph = '{}/{}__{}_{}_rpw_nh_graph.npz'\
                   .format(typodir, pwm.fbasename, N, proc_name)
    if os.path.exists(tpw_trie_fname) and os.path.exists(rpw_nh_graph):
        M, B, A, typo_trie = _read_typos(pwm, N, proc_name)
    else:
        M, B, A, typo_trie = _get_typos_for_typodist(pwm, q, nh_size, topk)
        np.savez_compressed(rpw_nh_graph, M=M)
        typo_trie.save(tpw_trie_fname)

    total_f = float(pwm.totalf())
    guesses = []
    # NOTE(review): despite the name, False marks a row as killed (the
    # progress line reports M.shape[0] - killed.sum()).
    killed = np.ones(M.shape[0], dtype=bool)
    while len(guesses) < q:
        gi = A.argmax()  # tpwid of the i-th guess
        # Set of rows where gi exists
        killed_gi = B[gi]
        killed[killed_gi] = bool(offline)
        guess_pw = typo_trie.restore_key(gi)
        e = (guess_pw, A[gi] / total_f)
        assert offline or (e not in guesses), "Guesses={}, e={}, killed_gi={}, M[killed_gi]={}"\
            .format(guesses, e, gi, M[killed_gi])
        if not guesses:
            print("gi={}, {} -> {} ({}), ".format(
                gi, e[0], len(B[gi]),
                [typo_trie.restore_key(c) for c in M[killed_gi, 0]]))
        guesses.append(e)
        for ri in killed_gi:
            row = M[ri]
            rpw = typo_trie.restore_key(row[0])  # column 0 holds the real pw
            f = pwm.pw2freq(rpw)
            if f <= 0:
                print("RPW freq is zero! rpw={}, f={}, guess={}"
                      .format(rpw, f, guess_pw))
                continue
            if offline:
                if gi == row[0]:
                    killed[ri] = False
                    A[gi] = 0
                else:
                    A[gi] -= f / float(nh_size)
            else:
                # Every typo in this row loses the weight of the real pw.
                A[row] -= f
        print("({}): {}> {:30s}: {:.3e} (killed={}/{})".format(
            proc_name, len(guesses), guesses[-1][0], guesses[-1][1] * 100,
            len(killed_gi), M.shape[0] - killed.sum()))
    # Sanity check
    # NOTE(review): fuzzlambda_q is a normalised fraction while
    # killed_pws_weight is a raw frequency sum, and there is no abs(), so
    # this assertion is very unlikely to fire -- confirm the intended check.
    killed_ids = set(
        itertools.chain(*[B[typo_trie.key_id(t)] for t, _ in guesses]))
    killed_pws_weight = sum(
        pwm.pw2freq(typo_trie.restore_key(M[ri, 0])) for ri in killed_ids)
    fuzzlambda_q = sum(g[1] for g in guesses)
    assert (fuzzlambda_q - killed_pws_weight) < 1e-10, "{} -- {}"\
        .format(fuzzlambda_q, killed_pws_weight)
    print("({}): Total fuzzy success: {}"
          .format(proc_name, 100 * fuzzlambda_q))
    print("({}): Total normal success: {}"
          .format(proc_name, 100 * pwm.sumvalues(q) / total_f))
    guess_f = 'guesses/{}_guesses_{}_typodist_{}_{}.json'\
              .format(pwm.fbasename, q, nh_size, proc_name)
    print("Saving the guesses:", guess_f)
    with open(guess_f, 'w') as f:
        json.dump(guesses, f, indent=4)