from glob import glob
import os
from pathlib import Path
from random import shuffle
from time import time

from tqdm import tqdm

# read_lines, LineParser, chunks, parallel_function and Processor come from
# the surrounding project.


def statistics(input_dir):
    # Gather every file under input_dir, skipping compressed archives.
    files = glob(os.path.expanduser(input_dir) + '/**/*.*', recursive=True)
    files = [f for f in files if not f.endswith('.tar.gz')]
    shuffle(files)
    parsed_lines = 0
    with open('stats.txt', 'w') as w:
        w.write(','.join(['success', 'failure', 'failure_rate', 'filename']) +
                '\n')
        for filename in files:
            success = 0
            failure = 0
            lines = read_lines(filename)
            parsed_lines += len(lines)
            for line in lines:
                result = LineParser.parseline(line)
                if result is not None:
                    success += 1
                else:
                    failure += 1
            try:
                failure_rate = failure / (success + failure) * 100
            except ZeroDivisionError:
                failure_rate = 0.0
            w.write(','.join([str(success), str(failure),
                              f'{failure_rate:.3f}', filename]) + '\n')
            print(f'Parsed lines so far = {parsed_lines:,}.')
            print(f'filename = {filename}, success = {success:,}, '
                  f'failure = {failure:,}, failure rate = {failure_rate:.3f}%')
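

# The examples on this page call a read_lines() helper that is not shown
# here. This is a minimal sketch of what the call sites suggest it does
# (return the lines of a text file without trailing newlines); the actual
# project implementation may differ.
def read_lines(filename):
    with open(filename, encoding='utf8', errors='ignore') as r:
        return [line.rstrip('\n') for line in r]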


def parse_to_files(input_dir, success_filename, failure_filename,
                   cython_acceleration):
    files = [p for p in Path(input_dir).glob('**/*') if p.is_file()]
    # Restart from where we stopped: skip any file already listed in the
    # progress file from a previous run.
    progress_filename = 'progress.txt'
    if os.path.exists(progress_filename):
        already_processed = set(read_lines(progress_filename))
    else:
        already_processed = set()
    files = [f for f in files if str(f) not in already_processed]
    shuffle(files)
    # files.sort(key=lambda f: os.stat(f).st_size)  # smallest to largest

    success_total = 0
    failure_total = 0
    num_threads = os.cpu_count()
    start_time = time()
    with open(progress_filename, 'a+') as p:
        with open(failure_filename, 'a+', encoding='utf8') as f:
            with open(success_filename, 'a+', encoding='utf8') as w:
                large_files_processor = Processor(
                    w=w, f=f, cython_acceleration=cython_acceleration)
                small_files_processor = Processor(
                    cython_acceleration=cython_acceleration)
                with tqdm(files) as bar:
                    for chunk in chunks(files, num_threads):
                        # Is any file in this chunk larger than 300 MB?
                        any_large_files = any(
                            c.stat().st_size / 1e6 > 300 for c in chunk)
                        if any_large_files:
                            # single thread and write directly to the files.
                            for chunky in chunk:
                                large_files_processor.process(chunky)
                        else:
                            # Small files: process the whole chunk in
                            # parallel, one thread per file.
                            results = parallel_function(
                                small_files_processor.process, chunk,
                                len(chunk))

                            for single_result in results:
                                success_list, failure_list = single_result

                                for single_success in success_list:
                                    w.write(single_success)
                                for single_failure in failure_list:
                                    f.write(single_failure)

                                success_total += len(success_list)
                                failure_total += len(failure_list)

                        bar.update(len(chunk))
                        bar.set_description(
                            f'success = {success_total:,} failure = {failure_total:,}'
                        )
                        # Record processed files so an interrupted run can
                        # resume where it stopped.
                        for single_chunk in chunk:
                            p.write(str(single_chunk) + '\n')
    print(f'Time elapsed {time() - start_time:.3f} seconds.')
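
The code above also relies on two small helpers that are not shown on this
page: chunks(), which splits a list into fixed-size slices, and
parallel_function(), which maps a function over items concurrently. The
sketch below is an assumption reconstructed from the call sites, not the
project's actual implementation.

from multiprocessing.pool import ThreadPool


def chunks(lst, n):
    # Yield successive n-sized slices of lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


def parallel_function(f, sequence, num_threads=None):
    # Run f over every item of sequence on a thread pool (threads are a
    # reasonable fit since the work is I/O-heavy) and return the results.
    with ThreadPool(processes=num_threads) as pool:
        return pool.map(f, sequence)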
Example #3
import random
from pathlib import Path
from random import shuffle

# QueryAPI comes from the surrounding project; read_lines is as above.


def evaluate(old, new):
    hit_count = 0
    total_count = 0
    files = [p for p in Path(old).expanduser().glob('**/*') if p.is_file()]
    # Spot-check: sample 1,000 random files and, from each, query 10 random
    # lines against the new layout.
    for _ in range(1000):
        lines = read_lines(random.choice(files))
        shuffle(lines)
        random_lines = lines[0:10]
        for random_line in random_lines:
            password_list = QueryAPI.query(new,
                                           random_line.split(':')[0],
                                           'email')
            total_count += 1
            if len(password_list) == 0:
                print(
                    f'OLD: {random_line}, NEW: miss, HIT_RATE = {hit_count / total_count:.3f}.'
                )
            else:
                hit_count += 1
                print(
                    f'OLD: {random_line}, NEW: hit, HIT_RATE = {hit_count / total_count:.3f}.'
                )
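
QueryAPI is likewise project-specific. Judging from the call site,
QueryAPI.query(root, value, 'email') returns the list of passwords stored
for that email under the new directory layout. The class below is a
hypothetical reconstruction that reuses the sharding helpers from Example
#4; the real implementation may differ.

class QueryAPI:

    @staticmethod
    def query(root, email, kind='email'):
        # Hypothetical: find the shard file that would hold this email and
        # return every password recorded for it (empty list on a miss).
        shard = Path(root).expanduser() / get_output_file(alpha_num_lookup(), email)
        if not shard.exists():
            return []
        pairs = [line.split(':', 1) for line in read_lines(str(shard))]
        return [p[1] for p in pairs if len(p) == 2 and p[0] == email]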
Example #4
from pathlib import Path
from random import shuffle

from tqdm import tqdm

# find_files, alpha_num_lookup, get_output_file and write_line_to_file are
# project-specific helpers; read_lines is as above.


def clean(path):
    files = list(find_files(Path(path).expanduser()).values())
    shuffle(files)
    alpha = alpha_num_lookup()
    num_files_moved = 0
    files_handlers = {}  # mind the ulimit on open file descriptors here!
    with tqdm(files, desc='clean') as bar:
        for file in bar:
            lines_to_delete = []
            lines = read_lines(file)
            for line in lines:
                line_split = line.split(':')
                # Which file should this line live in, given its key?
                ground_truth_file = get_output_file(alpha, line_split[0])
                key = str(file).replace(path, '')
                if key.startswith('/'):
                    key = key[1:]
                if ground_truth_file != key:
                    # The line is in the wrong file: route it to the right one.
                    output_file = Path(path) / ground_truth_file
                    # print(f'MOVE: {line_split[0]} -> {output_file}.')
                    write_line_to_file(files_handlers,
                                       line_split,
                                       output_file,
                                       buffer_scale=10)
                    lines_to_delete.append(line)
                    num_files_moved += 1
                    if num_files_moved % 100_000 == 0:
                        truncated_file = str(file).replace(path, '')
                        truncated_output_file = str(output_file).replace(
                            path, '')
                        bar.set_description(
                            f'clean. {num_files_moved:,} moved. '
                            f'move: {line_split[0]}: {truncated_file}'
                            f' -> {truncated_output_file}')
            # Rewrite the file without the moved lines (sorting also dedupes).
            lines2 = sorted(set(lines) - set(lines_to_delete))
            if len(lines) != len(lines2):
                with open(str(file), 'w', encoding='utf8') as w:
                    w.write('\n'.join(lines2) + '\n')
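
write_line_to_file() and its buffer_scale argument are also part of the
surrounding project. Below is a minimal sketch of the buffering idea it
appears to implement, assuming files_handlers caches a list of pending lines
per output file and flushes a batch once it grows large enough; a real
implementation would also flush whatever remains when the run finishes.

def write_line_to_file(files_handlers, line_split, output_file,
                       buffer_scale=1):
    # Hypothetical: buffer lines per destination file and append them in
    # batches, so we do not open and close a file for every single line.
    buffer = files_handlers.setdefault(str(output_file), [])
    buffer.append(':'.join(line_split))
    if len(buffer) >= 1_000 * buffer_scale:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        with open(output_file, 'a', encoding='utf8') as w:
            w.write('\n'.join(buffer) + '\n')
        buffer.clear()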