Esempio n. 1
0
def backwords_counter(nwords_list: TextIO, splitter: str, start_chr: str, end_chr: str,
                      start4words: int, step4words: int, threshold: int, max_gram: int):
    """
    Build a variable-order n-gram model with back-off from a training file.

    Each line is turned into a list of sections by ``parse_line`` and wrapped
    with ``start_chr`` / ``end_chr``. The empty prefix ``()`` stores the
    unigram distribution (``start_chr`` excluded). For every order from 2 up
    to the effective ``max_gram``, transitions seen fewer than ``threshold``
    times are dropped and their probability mass is redistributed according
    to the distribution of the prefix with its first section removed.

    :param nwords_list: training file, one entry per line
    :param splitter: forwarded to ``parse_line``
    :param start_chr: marker put in front of every section list
    :param end_chr: marker appended to every section list
    :param start4words: forwarded to ``parse_line``
    :param step4words: forwarded to ``parse_line``
    :param threshold: minimum count for a prefix / transition to be kept
    :param max_gram: upper bound of the n-gram order (values below 2 are treated as 2)
    :return: ``(nwords_float_dict, words)``: prefix tuple -> {transition: probability},
             and section -> number of appearances
    """
    zero = tuple()
    words: Dict[str, int] = defaultdict(int)
    unigram_cnt: Dict[str, int] = defaultdict(int)
    # section_dict[length][sections] = number of lines producing exactly `sections`
    section_dict = defaultdict(lambda: defaultdict(int))
    line_num = wc_l(nwords_list)
    for line in tqdm(nwords_list, total=line_num, desc="Reading: "):  # type: str
        line = line.strip("\r\n")
        sections = [start_chr, *parse_line(line, splitter, start4words, step4words), end_chr]
        for sec in sections:
            words[sec] += 1
            # the start marker is never a transition target
            if sec != start_chr:
                unigram_cnt[sec] += 1
        section_dict[len(sections)][tuple(sections)] += 1

    zero_sum = sum(unigram_cnt.values())
    nwords_float_dict = {zero: {trans: cnt / zero_sum for trans, cnt in unigram_cnt.items()}}

    min_gram = 2
    len_list = [_l for _l, s in section_dict.items() if sum(s.values()) >= threshold]
    if not len_list:
        # max() on an empty list would raise a bare ValueError; report it instead
        print("no section length reaches the threshold, fail to model the password dataset",
              file=sys.stderr)
        sys.exit(-1)
    max_gram = min(max(len_list), max(2, max_gram))
    if max_gram == 1:
        print(f"max gram is {max_gram}, fail to model the password dataset", file=sys.stderr)
        sys.exit(-1)

    for n in tqdm(range(min_gram, max_gram + 1), desc="Counting: "):
        prefix_words_num = n - 1
        nwords_dict = defaultdict(lambda: defaultdict(int))
        for sec_len, sections_cnt in section_dict.items():
            if n > sec_len:
                continue
            for sections, cnt in sections_cnt.items():
                for i in range(sec_len - prefix_words_num):
                    grams = tuple(sections[i:i + prefix_words_num])
                    transition = sections[i + prefix_words_num]
                    nwords_dict[grams][transition] += cnt
        for prefix, trans_cnt in nwords_dict.items():
            total = sum(trans_cnt.values())
            if total < threshold:
                continue
            trans_prob = {trans: cnt / total for trans, cnt in trans_cnt.items() if cnt >= threshold}
            missing = 1 - sum(trans_prob.values())
            if missing == 1:
                # no transition survived the threshold, the prefix is useless
                continue
            if missing > 0:
                # back off: hand the dropped mass to the shorter prefix's distribution
                parent_prefix = prefix[1:]
                for trans, p in nwords_float_dict[parent_prefix].items():
                    trans_prob[trans] = trans_prob.get(trans, 0) + p * missing
            nwords_float_dict[prefix] = trans_prob
    return nwords_float_dict, words
Esempio n. 2
0
def nwords_counter(nwords_list: TextIO,
                   n: int = 4,
                   end_chr: str = "\x03",
                   threshold: int = 10):
    """
    Train a model whose context is the last ``n - 1`` characters and whose
    transitions are words (or single characters for rare words).

    Each line is tab separated: column 0 is the password, and every second
    column starting at index 1 is a segment of it. Segments seen at least
    ``threshold`` times are kept as units; all others are broken into
    characters.

    The input file is closed by this function once it has been read.

    :param nwords_list: training file, one tab-separated entry per line
    :param n: order of the model; the context keeps ``n - 1`` characters
    :param end_chr: terminator appended to every password
    :param threshold: minimum appearances for a segment to be kept as a unit
    :return: ``(nwords_float_dict, words)``: context -> {transition: probability},
             and segment -> number of appearances
    :raises Exception: if the segments do not concatenate to the password,
             or the password plus terminator is shorter than 4 characters
    """
    nwords_dict: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    prefix_words = n - 1
    line_num = wc_l(nwords_list)
    section_dict = defaultdict(int)
    words: Dict[str, int] = defaultdict(int)

    for line in tqdm(nwords_list, total=line_num,
                     desc="Parsing: "):  # type: str
        line = line.strip("\r\n")
        items = line.split("\t")
        pwd = items[0] + end_chr
        raw_sections = items[1::2]
        raw_sections.append(end_chr)
        start = 0
        sections = []
        # only the segment lengths are used; the text itself is cut from pwd
        for sec in raw_sections:
            word = pwd[start:start + len(sec)]
            sections.append(word)
            start += len(sec)
            words[word] += 1
        if "".join(sections) != pwd or len(pwd) < 4:
            raise Exception("error1")
        section_dict[tuple(sections)] += 1
    needed = {k for k, v in words.items() if v >= threshold}
    nwords_list.close()
    for sections, cnt in tqdm(section_dict.items(), desc="Counting: "):
        n_sections = []
        for sec in sections:
            if sec in needed:
                n_sections.append(sec)
            else:
                # rare segment: fall back to its characters
                n_sections.extend(sec)
        prev_chrs = ""
        for sec in n_sections:
            nwords_dict[prev_chrs][sec] += cnt
            # s[-0:] is the whole string, so a zero-length context must be
            # handled explicitly or it would grow without bound
            prev_chrs = f"{prev_chrs}{sec}"[-prefix_words:] if prefix_words > 0 else ""
    del section_dict
    nwords_float_dict: Dict[str, Dict[str, float]] = {}
    for prefix, ends in tqdm(nwords_dict.items(), "Converting: "):
        total = sum(ends.values())
        nwords_float_dict[prefix] = {e: v / total for e, v in ends.items()}
    del nwords_dict
    return nwords_float_dict, words
Esempio n. 3
0
 def parse_file(self, testing_set: TextIO, using_component: bool = False) -> \
         List[Tuple[Union[str, List[str]], int, float]]:
     """
     Score every distinct line of a test set with ``self.calc_ml2p``.

     :param testing_set: test set, one password per line
     :param using_component: when True, the first tuple element is the second
         value returned by ``calc_ml2p`` instead of the password itself
     :return: list of (pwd or components, appearance, minus log prob),
         sorted by minus log prob in ascending order
     """
     total_lines = wc_l(testing_set)
     # distinct line -> number of times it appears in the test set
     occurrences = defaultdict(int)
     for raw in tqdm(testing_set, desc="Reading: ", total=total_lines):
         occurrences[raw.strip("\r\n")] += 1

     scored: List[Tuple[Union[str, List[str]], int, float]] = []
     for password, count in tqdm(occurrences.items(), desc="Scoring: "):  # type: str, int
         ml2p, parts = self.calc_ml2p(password)
         label = parts if using_component else password
         scored.append((label, count, ml2p))
     scored.sort(key=lambda entry: entry[2])
     return scored
Esempio n. 4
0
def nwords_counter(nwords_list: TextIO,
                   n: int,
                   splitter: str,
                   end_chr: str,
                   start4words: int,
                   skip4words: int,
                   start_chr: str = '\x00'):
    """
    Train a fixed-order n-gram model over the sections of each line.

    Every line is split by ``parse_line``, padded in front with ``n - 1``
    copies of ``start_chr`` and terminated with ``end_chr``. Each window of
    ``n - 1`` sections is a prefix, and the section that follows it is the
    transition. The input file is closed once it has been read.

    :return: ``(nwords_float_dict, words)``: prefix tuple -> {transition: probability},
             and section -> number of appearances (padding and terminator included)
    """
    context_len = n - 1
    total_lines = wc_l(nwords_list)
    vocab: Dict[str, int] = defaultdict(int)
    seq_counts = defaultdict(int)
    for raw in tqdm(nwords_list, total=total_lines,
                    desc="Reading: "):  # type: str
        raw = raw.strip("\r\n")
        seq = [start_chr] * (n - 1)
        seq.extend(parse_line(raw, splitter, start4words, skip4words))
        seq.append(end_chr)
        seq_counts[tuple(seq)] += 1
        for token in seq:
            vocab[token] += 1
    nwords_list.close()

    counts: Dict[Tuple, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for seq, freq in tqdm(seq_counts.items(), desc="Counting: "):
        # `pos` is the index of the transition; the context is what precedes it
        for pos in range(context_len, len(seq)):
            context = tuple(seq[pos - context_len:pos])
            counts[context][seq[pos]] += freq
    del seq_counts

    probabilities: Dict[Tuple, Dict[str, float]] = {}
    for context, followers in tqdm(counts.items(), "Converting: "):
        denominator = sum(followers.values())
        probabilities[context] = {
            follower: (hits / denominator) for follower, hits in followers.items()
        }
    del counts
    return probabilities, vocab
Esempio n. 5
0
def backwords_counter(nwords_list: TextIO, splitter: str, start_chr: str, end_chr: str,
                      start4words: int, step4words: int, max_gram: int, threshold: int,
                      nwords_dict: Dict[Tuple, Dict[str, int]] = None,
                      words: Dict[str, int] = None):
    """
    Count n-gram transitions, optionally on top of an existing model.

    Each entry is split by ``parse_line`` and wrapped with ``start_chr`` /
    ``end_chr``. The empty prefix ``()`` holds unigram counts (``start_chr``
    excluded); longer prefixes hold the counts of the section following them.
    The supplied ``nwords_dict`` and ``words`` are updated in place.

    :param nwords_list: opened file or a list of lines
    :param splitter: forwarded to ``parse_line``
    :param start_chr: marker put in front of every section list
    :param end_chr: marker appended to every section list
    :param start4words: forwarded to ``parse_line``
    :param step4words: forwarded to ``parse_line``
    :param max_gram: upper bound of the n-gram order
    :param threshold: a prefix not yet in the model is only added when at
        least one of its transitions reaches this count
    :param nwords_dict: existing counts to extend; when None a fresh model
        AND a fresh vocabulary are started (a passed ``words`` is ignored)
    :param words: existing section counts; when None while ``nwords_dict`` is
        given, an empty vocabulary is started
    :return: ``(nwords_dict, words)``
    """
    if nwords_dict is None:
        nwords_dict = {}
        words = {}
    elif words is None:
        # a model without a vocabulary used to crash below on `sec not in words`
        words = {}
    zero = tuple()
    if isinstance(nwords_list, list):
        line_num = len(nwords_list)
    else:
        line_num = wc_l(nwords_list)
    if line_num == 0:
        print("No passwords for training, early return!", file=sys.stderr)
        return nwords_dict, words
    # section_dict[length][sections] = number of entries producing `sections`
    section_dict = defaultdict(lambda: defaultdict(int))
    actual_max_gram = 2
    for line in tqdm(nwords_list, total=line_num, desc="Reading: "):  # type: str
        line = line.strip("\r\n")
        sections = [start_chr]
        sections.extend(parse_line(line, splitter, start4words, step4words))
        sections.append(end_chr)
        for sec in sections:
            if sec not in words:
                words[sec] = 0
            words[sec] += 1
            if sec not in {start_chr}:
                if zero not in nwords_dict:
                    nwords_dict[zero] = {}
                if sec not in nwords_dict[zero]:
                    nwords_dict[zero][sec] = 0
                nwords_dict[zero][sec] += 1

        section_dict[len(sections)][tuple(sections)] += 1
        if len(sections) > actual_max_gram:
            actual_max_gram = len(sections)

    for n in tqdm(range(2, min(max_gram, actual_max_gram) + 1), desc="N-Gram: "):
        order = n - 1
        tmp_nwords_dict: Dict[Tuple, Dict[str, int]] = {}
        for sec_len, sec_len_dict in section_dict.items():
            if sec_len < n:
                continue
            for sec, cnt in sec_len_dict.items():
                for i in range(0, sec_len - order):
                    prefix = sec[i:i + order]
                    transition = sec[i + order]

                    if prefix not in tmp_nwords_dict:
                        tmp_nwords_dict[prefix] = {}
                    if transition not in tmp_nwords_dict[prefix]:
                        tmp_nwords_dict[prefix][transition] = 0
                    tmp_nwords_dict[prefix][transition] += cnt
        if len(tmp_nwords_dict) == 0:
            break
        # NOTE: Here I assume that we only supply the cracked passwords as
        #       secondary training file. According to that assumption, the
        #       model will first remove transitions whose appearance is less
        #       than threshold, so the cracked passwords will never contain
        #       the removed transitions. As a result, we can remove these
        #       transitions early to save memory.
        for prefix, transitions in tmp_nwords_dict.items():
            if prefix not in nwords_dict:
                # new prefix: keep it only if some transition is frequent enough
                if any(cnt >= threshold for cnt in transitions.values()):
                    nwords_dict[prefix] = transitions
                continue
            origin = nwords_dict[prefix]
            for trans, v in transitions.items():
                if trans not in origin:
                    origin[trans] = 0
                origin[trans] += v
    return nwords_dict, words
Esempio n. 6
0
def wrapper():
    """
    Command-line entry point of the iterative ("secondary") training attack.

    Steps, in order:
      1. parse the command line and turn ``--strategy`` into one
         ``(guess_threshold, hits_threshold)`` pair per round;
      2. call ``secondary_cracker`` once per round, feeding each round's
         returned model / training data into the next one;
      3. run ``backwords_counter`` a final time and pickle the result to
         ``final_model.pickle``;
      4. sample the final model, score the testing file and write
         ``iter_result.txt``, ``sectional_result.txt`` and ``config.json``
         into the ``--save`` folder.

    Invalid ``--strategy`` values are reported on stderr and the function
    returns without doing any work.
    """
    cli = argparse.ArgumentParser('Backwords secondary main')
    cli.add_argument("-i",
                     "--training",
                     dest="training",
                     type=argparse.FileType('r'),
                     required=True,
                     help="The training file, each password a line")
    cli.add_argument("-t",
                     "--testing",
                     dest="testing",
                     type=argparse.FileType('r'),
                     required=True,
                     help="The testing file, each password a line")
    cli.add_argument("-s",
                     "--save",
                     dest="save",
                     required=True,
                     type=str,
                     help='A folder, results will be saved in this folder')
    cli.add_argument(
        "--strategy",
        dest="strategy",
        required=True,
        type=str,
        nargs="+",
        # choices=['guesses', 'hits', 'samples'],
        help=
        '`guesses <guesses1> <guesses2> ...` means guess number thresholds, '
        '`hits <cracked1> <cracked2>` means cracked passwords, '
        '`auto_hits <factor> <base> <termination>` means auto generate '
        '<cracked1 = factor * base> <cracked2> <cracked2 = factor ** 2 * base>, '
        '`samples <rounds>` means the number of iterations of '
        'Monte Carlo simulation')
    cli.add_argument("--size",
                     dest="size",
                     type=int,
                     required=False,
                     default=100000,
                     help="sample size")
    cli.add_argument(
        "--secondary-sample",
        dest="secondary_sample",
        type=int,
        required=False,
        default=10000000000,
        help="use some of the cracked passwords for secondary training.")
    cli.add_argument(
        "--splitter",
        dest="splitter",
        type=str,
        required=False,
        default="empty",
        help="how to divide different columns from the input file, "
        "set it \"empty\" to represent \'\', \"space\" for \' \', \"tab\" for \'\t\'"
    )
    cli.add_argument(
        "--start4word",
        dest="start4words",
        type=int,
        required=False,
        default=0,
        help=
        "start index for words, to fit as much as formats of input. An entry per line. "
        "We get an array of words by splitting the entry. "
        "\"start4word\" is the index of the first word in the array")
    cli.add_argument(
        "--skip4word",
        dest="skip4words",
        type=int,
        required=False,
        default=1,
        help="there may be other elements between words, such as tags. "
        "Set skip4word larger than 1 to skip unwanted elements.")
    cli.add_argument("--max-gram",
                     dest="max_gram",
                     required=False,
                     type=int,
                     default=256,
                     help="max gram")
    cli.add_argument(
        "--threshold",
        dest="threshold",
        required=False,
        type=int,
        default=10,
        help="grams whose frequencies less than the threshold will be ignored")
    cli.add_argument(
        "--max-iter",
        dest="max_iter",
        required=False,
        default=10**20,
        type=int,
        help=
        "max iteration when calculating the maximum probability of a password")
    args = cli.parse_args()
    strategy_value = args.strategy
    strategy = strategy_value[0]
    permits = {'guesses', 'hits', 'samples', 'auto_hits'}
    if strategy not in permits:
        print(f"strategy should be one of `{', '.join(permits)}`",
              file=sys.stderr)
        return
    if len(strategy_value) < 2:
        print("strategy should have at least 2 values", file=sys.stderr)
        return

    using_sample_attack, signs = False, []
    # a threshold equal to the upper bound means "not limited by this criterion"
    upper_bound, hits_upper_bound = 10**14, 10**14
    # one (guess number threshold, hits threshold) pair per round
    func_thresholds = []
    if strategy == 'guesses':
        print("using guesses", file=sys.stderr)
        for v in (int(raw) for raw in strategy_value[1:]):
            func_thresholds.append((v, hits_upper_bound))
            signs.append(f"guesses-{v:,}")
    elif strategy == 'hits':
        print("using hits", file=sys.stderr)
        for v in (int(raw) for raw in strategy_value[1:]):
            func_thresholds.append((upper_bound, v))
            signs.append(f"hits-{v:,}")
    elif strategy == 'auto_hits':
        print("using auto_hits", file=sys.stderr)
        if len(strategy_value) < 4:
            print("auto_hits requires `<factor> <base> <termination>`",
                  file=sys.stderr)
            return
        factor, base, termination = (int(raw) for raw in strategy_value[1:4])
        if factor <= 1 or termination <= 0:
            # log(factor) would be zero / log(termination) undefined
            print("auto_hits requires factor > 1 and termination > 0",
                  file=sys.stderr)
            return
        end = math.ceil(
            math.log(termination / max(base, 1)) / math.log(factor))
        for v in range(1, end):
            nv = (factor**v) * base
            func_thresholds.append((upper_bound, nv))
            signs.append(f"auto_hits-{v:,}")
    else:
        print("using samples", file=sys.stderr)
        v = int(strategy_value[1])
        func_thresholds = [(upper_bound, hits_upper_bound) for _ in range(v)]
        signs = [f"samples-{args.size}" for _ in range(v)]
        using_sample_attack = True
    rounds = len(func_thresholds)
    splitter_map = {'empty': '', 'space': ' ', 'tab': '\t'}
    if args.splitter.lower() in splitter_map:
        args.splitter = splitter_map[args.splitter.lower()]
    start_chr, end_chr, training_list = '\x03', '\x00', [args.training.name]
    config = {
        'start_chr': start_chr,
        'end_chr': end_chr,
        'max_gram': args.max_gram,
        'threshold': args.threshold,
        'training_list': training_list
    }
    backwords, words = None, None
    training = args.training
    # also creates missing parent folders and tolerates an existing folder
    os.makedirs(args.save, exist_ok=True)
    already_cracked = set()

    print(f"We will have {rounds} rounds", file=sys.stderr, end=', ')
    # cums[k] collects the (pwd, prob, num, guess number) tuples of round k
    cums: List[List[Tuple[str, float, int, int]]] = []
    max_guess_numbers = []
    for idx in range(rounds):
        func_threshold = func_thresholds[idx]
        print(f"The {idx}-th iteration", file=sys.stderr)
        cum = []
        backwords, words, config, training, max_gn = secondary_cracker(
            backwords,
            words,
            config=config,
            func_threshold=func_threshold,
            training=training,
            splitter=args.splitter,
            start4words=args.start4words,
            skip4words=args.skip4words,
            max_gram=args.max_gram,
            size=args.size,
            max_iter=args.max_iter,
            testing=args.testing,
            save=args.save,
            secondary_sample=args.secondary_sample,
            already_cracked=already_cracked,
            cum=cum,
            threshold=args.threshold,
            sign=signs[idx],
            using_sample_attack=using_sample_attack,
            tag=f"iter-{idx}",
        )
        cums.append(cum)
        max_guess_numbers.append(max_gn)
        if max_gn >= upper_bound:
            print(
                f"Too large guess number reached: {max_gn}, the training process is terminated",
                file=sys.stderr)
            break
    # final training pass on whatever the last round handed back
    backwords, words = backwords_counter(training,
                                         splitter=args.splitter,
                                         start_chr=start_chr,
                                         end_chr=end_chr,
                                         start4words=args.start4words,
                                         step4words=args.skip4words,
                                         max_gram=args.max_gram,
                                         nwords_dict=backwords,
                                         words=words,
                                         threshold=args.threshold)
    f_final_model = os.path.join(args.save, "final_model.pickle")
    with open(f_final_model, 'wb') as fout_final_model:
        pickle.dump((backwords, words, config), file=fout_final_model)
    print("Training phase done.", file=sys.stderr)
    backword_mc = BackWordsSecondaryMonteCarlo((backwords, words, config),
                                               max_iter=args.max_iter)
    ml2p_list = backword_mc.sample(size=args.size)
    mc = MonteCarloLib(ml2p_list)
    scored_testing = backword_mc.parse_file(args.testing)
    gc = mc.ml2p_iter2gc(minus_log_prob_iter=scored_testing)
    # note that this is the cracked passwords obtained according to the final model
    f_iter_result = os.path.join(args.save, "iter_result.txt")
    with open(f_iter_result, 'w') as fout_iter_result:
        cum = []
        for pwd, prob, num, gn, cracked, ratio in gc:
            fout_iter_result.write(
                f"{pwd}\t{prob:.8f}\t{num}\t{gn}\t{cracked}\t{ratio:5.2f}\n")
            if pwd not in already_cracked:
                cum.append((pwd, prob, num, gn))
        cums.append(cum)
    # note that this is the union of all intermediate results
    # each guess matters in this result file
    f_sectional_result = os.path.join(args.save, "sectional_result.txt")
    with open(f_sectional_result, "w") as fout_sectional_result:
        _cracked = 0
        _total = wc_l(args.testing)
        # each round's guess numbers are shifted by the previous round's maximum
        for gnt, cum in zip([0, *max_guess_numbers], cums):
            for (_pwd, _prob, _n, _gn) in cum:
                _cracked += _n
                _ratio = _cracked / _total * 100
                fout_sectional_result.write(
                    f"{_pwd}\t{_prob:.8f}\t{_n}\t{_gn + gnt}\t{_cracked}\t{_ratio:5.2f}\n"
                )
    f_config = os.path.join(args.save, "config.json")
    with open(f_config, 'w') as fout_config:
        json.dump(config, fp=fout_config, indent=2)
    args.testing.close()