import os
import random
import shutil
from glob import glob
from pathlib import Path
from random import shuffle
from time import time

from tqdm import tqdm

# Project-specific helpers used below (read_lines, LineParser, Processor, parallel_function,
# chunks, get_query_manager, QueryAPI, find_files, alpha_num_lookup, get_output_file,
# write_line_to_file) are assumed to be imported from elsewhere in this project.


def statistics(input_dir):
    # Collect every file under input_dir (except .tar.gz archives) and report, per file,
    # how many lines LineParser can and cannot parse.
    files = glob(os.path.expanduser(input_dir) + '/**/*.*', recursive=True)
    files = [f for f in files if not f.endswith('.tar.gz')]
    shuffle(files)
    parsed_lines = 0
    with open('stats.txt', 'w') as w:
        w.write(','.join(['success', 'failure', 'failure_rate', 'filename']) + '\n')
        for filename in files:
            success = 0
            failure = 0
            lines = read_lines(filename)
            parsed_lines += len(lines)
            for line in lines:
                result = LineParser.parseline(line)
                if result is not None:
                    success += 1
                else:
                    failure += 1
            try:
                failure_rate = failure / (success + failure) * 100
            except ZeroDivisionError:
                failure_rate = 0.0
            w.write(','.join(
                [str(success), str(failure), f'{failure_rate:.3f}', filename]) + '\n')
            print(f'Parsed lines = {parsed_lines:,}.')
            print(f'filename = {filename}, success = {success:,}, '
                  f'failure = {failure:,}, failure rate = {failure_rate:.3f}%')
def parse_to_files(input_dir, success_filename, failure_filename, cython_acceleration):
    files = [p for p in Path(input_dir).glob('**/*') if p.is_file()]
    shuffle(files)

    # Restart from where we stopped: skip files already recorded in the progress file.
    progress_filename = 'progress.txt'
    if os.path.exists(progress_filename):
        already_processed = set(read_lines(progress_filename))
    else:
        already_processed = set()
    files = [f for f in files if str(f) not in already_processed]
    shuffle(files)
    # files.sort(key=lambda f: os.stat(f).st_size)  # smallest to largest.

    success_total = 0
    failure_total = 0
    num_threads = os.cpu_count()
    start_time = time()
    with open(progress_filename, 'a+') as p:
        with open(failure_filename, 'a+', encoding='utf8') as f:
            with open(success_filename, 'a+', encoding='utf8') as w:
                large_files_processor = Processor(
                    w=w, f=f, cython_acceleration=cython_acceleration)
                small_files_processor = Processor(
                    cython_acceleration=cython_acceleration)
                with tqdm(files) as bar:
                    for chunk in chunks(files, num_threads):
                        # Any file larger than 300 MB in this chunk?
                        any_large_files = any(
                            c.stat().st_size / 1e6 > 300 for c in chunk)
                        if any_large_files:
                            # Single thread, and write directly to the output files.
                            for chunky in chunk:
                                large_files_processor.process(chunky)
                        else:
                            # As many threads as we want.
                            results = parallel_function(
                                small_files_processor.process, chunk, len(chunk))
                            for single_result in results:
                                success_list, failure_list = single_result
                                for single_success in success_list:
                                    w.write(single_success)
                                for single_failure in failure_list:
                                    f.write(single_failure)
                                success_total += len(success_list)
                                failure_total += len(failure_list)
                        bar.update(len(chunk))
                        bar.set_description(
                            f'success = {success_total:,} failure = {failure_total:,}')
                        # Record the processed files so a later run can resume.
                        for single_chunk in chunk:
                            p.write(str(single_chunk) + '\n')
    print(f'Time elapsed {time() - start_time:.3f} seconds.')
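# `chunks` above is assumed to be a project helper that yields `files` in groups of
# `num_threads` items so each group can be parsed in parallel. A minimal sketch of that
# assumed behavior is given below; the name `_chunks_sketch` is hypothetical and purely
# illustrative, the project's real helper may differ.
def _chunks_sketch(items, size):
    # Yield successive `size`-sized slices of `items` (illustrative only).
    for i in range(0, len(items), size):
        yield items[i:i + size]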
def test(file: str, dataset: str):
    # Query every line of `file` (assumed to be an email) against the dataset and report
    # the fraction of lines for which at least one password was found.
    qm = get_query_manager(dataset)
    cracked_count = 0
    lines = read_lines(file)
    for line in lines:
        results = qm.perform_query(line.strip(), 'email')
        if len(results) > 0:
            passwords = [p.split(':', 1)[-1] for p in results]
            print('FOUND:', line, ':'.join(passwords))
            cracked_count += 1
    print(f'Cracked rate: {cracked_count / len(lines)}')
def sort(path):
    path = str(Path(path).expanduser())
    paths = [
        Path(a) for a in glob(path + '/**/*', recursive=True)
        if Path(a).is_file() and Path(a).suffix != '.sorted'
    ]
    for file_path in tqdm(paths, desc='Unique > Sorting > Replacing existing files'):
        # Write the deduplicated, sorted lines to a temporary .sorted file,
        # then replace the original file with it.
        output_path = file_path.with_suffix('.sorted')
        lines = read_lines(file_path)
        unique_sorted_lines = sorted(set(lines))
        with open(output_path, 'w', encoding='utf8') as w:
            w.write('\n'.join(unique_sorted_lines) + '\n')
        shutil.move(output_path, file_path)
def evaluate(old, new):
    hit_count = 0
    total_count = 0
    files = [p for p in Path(old).expanduser().glob('**/*') if p.is_file()]
    for _ in range(1000):
        lines = read_lines(random.choice(files))
        shuffle(lines)
        random_lines = lines[0:10]
        for random_line in random_lines:
            password_list = QueryAPI.query(new, random_line.split(':')[0], 'email')
            total_count += 1
            if len(password_list) == 0:
                print(
                    f'OLD: {random_line}, NEW: miss, HIT_RATE = {hit_count / total_count:.3f}.'
                )
            else:
                hit_count += 1
                print(
                    f'OLD: {random_line}, NEW: hit, HIT_RATE = {hit_count / total_count:.3f}.'
                )
def clean(path):
    files = list(find_files(Path(path).expanduser()).values())
    shuffle(files)
    alpha = alpha_num_lookup()
    num_lines_moved = 0
    files_handlers = {}  # Keeps output files open: watch the ulimit (max open file handles) here!
    with tqdm(files, desc='clean') as bar:
        for file in bar:
            lines_to_delete = []
            lines = read_lines(file)
            for line in lines:
                line_split = line.split(':')
                # The file this record should live in, according to its key.
                ground_truth_file = get_output_file(alpha, line_split[0])
                key = str(file).replace(path, '')
                if key.startswith('/'):
                    key = key[1:]
                if ground_truth_file != key:
                    # The line is in the wrong file: append it to the right one
                    # and schedule it for deletion from the current file.
                    output_file = Path(path) / ground_truth_file
                    # print(f'MOVE: {line_split[0]} -> {output_file}.')
                    write_line_to_file(files_handlers, line_split, output_file,
                                       buffer_scale=10)
                    lines_to_delete.append(line)
                    num_lines_moved += 1
                    if num_lines_moved % 100_000 == 0:
                        truncated_file = str(file).replace(path, '')
                        truncated_output_file = str(output_file).replace(path, '')
                        bar.set_description(
                            f'clean. {num_lines_moved:,} moved. '
                            f'move: {line_split[0]}: {truncated_file}'
                            f' -> {truncated_output_file}')
            # Rewrite the current file only if lines were moved away (or deduplicated).
            remaining_lines = sorted(set(lines) - set(lines_to_delete))
            if len(lines) != len(remaining_lines):
                with open(str(file), 'w', encoding='utf8') as w:
                    w.write('\n'.join(remaining_lines) + '\n')
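# Minimal usage sketch, assuming this module is run directly. The directories and output
# filenames below are hypothetical placeholders, not values from the original project.
if __name__ == '__main__':
    example_input_dir = '~/data/password_dumps'  # hypothetical input directory.
    statistics(example_input_dir)
    parse_to_files(
        input_dir=str(Path(example_input_dir).expanduser()),
        success_filename='parsed_success.txt',  # hypothetical output file.
        failure_filename='parsed_failure.txt',  # hypothetical output file.
        cython_acceleration=False,
    )
    sort('~/data/parsed')  # hypothetical directory of parsed output files.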