def reorder_vocabulary(vocabulary_path: str) -> None:
    """Sort the vocabulary file at *vocabulary_path* in place.

    The file is a sequence of sections, each optionally introduced by a
    ``#`` header line.  Within every section the non-header lines are
    sorted alphabetically while header lines stay at the top of their
    section.  Sections are written back separated by one blank line,
    with no trailing blank line after the last section.
    """
    # Explicit encoding: the platform default would make results
    # machine-dependent for non-ASCII vocabulary entries.
    with open(vocabulary_path, 'r', encoding='utf-8') as file_handler:
        raw_lines = file_handler.readlines()

    sections = _split_into_sections(raw_lines)

    sorted_sections: List[List[str]] = [
        _sort_section(section, is_last=section_num == len(sections))
        for section_num, section in enumerate(sections, 1)
    ]

    with open(vocabulary_path, 'w', encoding='utf-8') as file_handler:
        # writelines accepts any iterable of strings, so flatten lazily.
        file_handler.writelines(
            line for section in sorted_sections for line in section)


def _split_into_sections(raw_lines: List[str]) -> List[List[str]]:
    """Group stripped, non-empty lines into sections delimited by ``#`` headers."""
    sections: List[List[str]] = []
    current_section: List[str] = []
    for line in raw_lines:
        processed_line = line.strip()
        if not processed_line:
            continue  # blank lines are separators, not content
        # A header opens a new section, unless it is the very first
        # content line (then it simply heads the first section).
        if processed_line.startswith('#') and current_section:
            sections.append(current_section)
            current_section = []
        current_section.append(processed_line)
    if current_section:
        sections.append(current_section)
    return sections


def _sort_section(section: List[str], is_last: bool) -> List[str]:
    """Return newline-terminated lines: headers first, the rest sorted.

    A blank-line separator is appended unless this is the last section.
    """
    headers = [f'{line}\n' for line in section if line.startswith('#')]
    words = sorted(f'{line}\n' for line in section if not line.startswith('#'))
    separator = [] if is_last else ['\n']
    return headers + words + separator
def load_obscene_words(db_path: str) -> Set[str]:
    """Load the set of obscene words from the SQLite database at *db_path*.

    Reads the ``word`` column of the table named by the module-level
    constant ``OBSCENE_BASE_TABLE_NAME``.

    Raises:
        sqlite3.OperationalError: if the database or table is missing.
    """
    connection = sqlite3.connect(db_path)
    try:
        cursor = connection.cursor()
        # NOTE: the table name is a module constant, not user input, and
        # identifiers cannot be bound as query parameters anyway, so the
        # f-string interpolation here is acceptable.
        rows = cursor.execute(
            f'SELECT word FROM {OBSCENE_BASE_TABLE_NAME}',
        ).fetchall()
    finally:
        # The original leaked the connection; always release it, even
        # when the query raises.
        connection.close()
    # Each row is a 1-tuple ``(word,)`` — unpack directly into the set.
    return {word for (word,) in rows}
def extract_all_constants_from_path(
    path: str,
    exclude: List[str],
    process_dots: bool,
    processes_amount: int,
    verbosity: int = 0,
) -> List[str]:
    """Extract every string constant found under *path*.

    *path* may be a single file or a directory (searched recursively,
    skipping anything matched by *exclude*).  Files are grouped by
    extension, each group handled by the extractors registered for that
    extension, and the per-file work is fanned out over
    *processes_amount* worker processes.

    Args:
        path: file or directory to scan.
        exclude: path fragments to skip during directory traversal.
        process_dots: when False, hidden files/directories are skipped.
        processes_amount: size of the multiprocessing pool.
        verbosity: passed through to the extractor workers.

    Returns:
        Deduplicated list of extracted string constants
        (order is unspecified because of the intermediate set).
    """
    extractors = [
        (extract_from_python_src, ['py', 'pyi']),
        (extract_from_markdown, ['md']),
        (extract_from_html, ['html']),
        (extract_from_js, ['js', 'ts', 'tsx']),
        (extract_from_po, ['po']),
    ]
    extension_to_extractor_mapping: DefaultDict[
        str, List[Callable]] = collections.defaultdict(list)
    for extractor, extensions in extractors:
        for extension in extensions:
            extension_to_extractor_mapping[extension].append(extractor)

    string_constants: List[str] = []
    for extension, extension_extractors in extension_to_extractor_mapping.items():
        if os.path.isdir(path):
            all_files = get_all_filepathes_recursively(path, exclude, extension)
        else:
            all_files = [path] if path.endswith(extension) else []
        if not process_dots:
            # Drop hidden files and anything inside a hidden directory.
            all_files = [
                f for f in all_files
                if '/.' not in f and not f.startswith('.')
            ]
        if not all_files:
            continue
        chunk_size = math.ceil(len(all_files) / processes_amount)
        # BUGFIX: the original created a fresh Pool per extension and
        # never closed it, leaking worker processes.  The context
        # manager terminates the pool when the mapping is done.
        with multiprocessing.Pool(processes_amount) as pool:
            new_strings = pool.map(
                functools.partial(
                    extract_all_constants_from_files,
                    extractors=extension_extractors,
                    verbosity=verbosity,
                ),
                chunks(all_files, chunk_size),
            )
        string_constants += flat(new_strings)
    return list(set(string_constants))
def test_flat():
    """flat should concatenate nested lists into one flat list, in order."""
    nested_numbers = [[1, 2], [3, 4], [5, 6]]
    nested_letters = [['a', 'b'], ['c', 'd']]
    assert flat(nested_numbers) == [1, 2, 3, 4, 5, 6]
    assert flat(nested_letters) == ['a', 'b', 'c', 'd']