import numpy as np
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context
import pycuda.gpuarray as gpuarray


# cuda_block_search (kernel factory), encode_strings and searched_words
# are defined elsewhere in the project.
def search_cuda_fast(search_filename):
    BLOCK_SIZE = 100000000
    GRID_SIZE = 5000
    block_search = cuda_block_search()
    patterns, pattern_starts = encode_strings(
        [bytes(string, 'utf-8') for string in searched_words()])
    patterns_gpu = gpuarray.to_gpu(patterns)
    pattern_starts_gpu = gpuarray.to_gpu(pattern_starts)
    # Must be at least as large as the highest block id used in the kernel.
    matches = np.zeros(GRID_SIZE, dtype=np.int32)
    matches_gpu = gpuarray.to_gpu(matches)
    keyword_count = len(searched_words())
    with open(search_filename, "rb") as search_file:
        while True:
            block = search_file.read(BLOCK_SIZE)
            if not block:
                break
            # Only search up to the last complete line in this block.
            last_newline = block.rfind(b'\n')
            final_block_gpu = gpuarray.to_gpu(
                np.frombuffer(block, dtype=np.ubyte))
            if len(block) == BLOCK_SIZE:
                # Rewind so the partial final line is re-read with the
                # next block.
                search_file.seek(last_newline - len(block), 1)
            block_search(final_block_gpu, np.int32(last_newline),
                         patterns_gpu, pattern_starts_gpu,
                         np.int32(keyword_count),
                         np.int32(BLOCK_SIZE // GRID_SIZE), matches_gpu,
                         block=(keyword_count, 1, 1), grid=(GRID_SIZE, 1))
    result = gpuarray.sum(matches_gpu).get()
    print("File done")
    return result
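encode_strings is shared by both GPU variants but isn't shown. A minimal sketch of what it plausibly does, assuming the kernel expects the patterns packed into one flat byte buffer plus int32 start offsets with a trailing sentinel (so pattern i spans pattern_starts[i] to pattern_starts[i + 1]):

import numpy as np

def encode_strings(byte_strings):
    # Offsets of each pattern in the flattened buffer; the extra trailing
    # entry lets the kernel compute the length of the last pattern.
    pattern_starts = np.zeros(len(byte_strings) + 1, dtype=np.int32)
    for i, s in enumerate(byte_strings):
        pattern_starts[i + 1] = pattern_starts[i] + len(s)
    patterns = np.frombuffer(b''.join(byte_strings), dtype=np.ubyte)
    return patterns, pattern_starts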
def search_file(search_filename):
    keywords = searched_words()
    matches = 0
    with open(search_filename, "r") as search_file:
        for line in tqdm.tqdm(search_file):
            matches += int(is_match(line, keywords))
    return matches
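searched_words and is_match are defined elsewhere; under the plain-substring reading this baseline suggests, they could look roughly like the following (the keyword list is a placeholder):

def searched_words():
    # Placeholder list; the real keywords live elsewhere in the project.
    return ["cuda", "gpu", "kernel"]

def is_match(line, keywords):
    # True if any keyword occurs as a substring of the line.
    return any(keyword in line for keyword in keywords)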
def search_file(search_filename):
    keywords = searched_words()
    regex = re.compile('|'.join(keywords), re.UNICODE)
    matches = 0
    with open(search_filename, "r") as search_file:
        for line in tqdm.tqdm(search_file):
            matches += int(is_match_regex(line, regex))
    return matches
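is_match_regex is presumably a thin wrapper over the compiled alternation, something like the sketch below. If the keywords could ever contain regex metacharacters, joining re.escape(keyword) instead of the raw strings would be the safer way to build the pattern.

def is_match_regex(line, regex):
    # search() returns None when no keyword occurs anywhere in the line.
    return regex.search(line) is not None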
import numpy as np
from numba import cuda


# block_search (a @cuda.jit kernel), encode_strings and searched_words
# are defined elsewhere in the project.
def search_cuda_fast(search_filename):
    BLOCK_SIZE = 100000000
    GRID_SIZE = 5000
    patterns, pattern_starts = encode_strings(
        [bytes(string, 'utf-8') for string in searched_words()])
    pattern_gpu = cuda.to_device(patterns)
    pattern_starts_gpu = cuda.to_device(pattern_starts)
    matches = np.zeros(GRID_SIZE, dtype=np.int32)
    matches_gpu = cuda.to_device(matches)
    keyword_count = len(searched_words())
    with open(search_filename, "rb") as search_file:
        while True:
            block = search_file.read(BLOCK_SIZE)
            if not block:
                break
            # Only search up to the last complete line in this block.
            last_newline = block.rfind(b'\n')
            final_block_gpu = cuda.to_device(
                np.frombuffer(block, dtype=np.ubyte))
            if len(block) == BLOCK_SIZE:
                # Rewind so the partial final line is re-read with the
                # next block.
                search_file.seek(last_newline - len(block), 1)
            # Scalar arguments are passed to the kernel as one-element
            # int32 device arrays.
            block_search[GRID_SIZE, keyword_count](
                final_block_gpu,
                cuda.to_device(np.array([last_newline], dtype=np.int32)),
                pattern_gpu, pattern_starts_gpu,
                cuda.to_device(np.array([keyword_count], dtype=np.int32)),
                cuda.to_device(
                    np.array([BLOCK_SIZE // GRID_SIZE], dtype=np.int32)),
                matches_gpu)
    result = matches_gpu.copy_to_host().sum()
    print("File done")
    return result
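The kernel itself isn't shown. A simplified sketch of a block_search compatible with this launch configuration: each of the GRID_SIZE blocks scans one chunk of the buffer and each thread checks one pattern, with the scalars arriving as one-element arrays as above and the pattern layout following the encode_strings sketch earlier. Unlike the line-based CPU versions, this sketch counts raw keyword occurrences.

from numba import cuda

@cuda.jit
def block_search(data, last_newline, patterns, pattern_starts,
                 keyword_count, chunk_size, matches):
    chunk = cuda.blockIdx.x        # which slice of the buffer to scan
    pattern_id = cuda.threadIdx.x  # which keyword this thread checks
    if pattern_id >= keyword_count[0]:
        return
    start = chunk * chunk_size[0]
    end = min(start + chunk_size[0], last_newline[0])
    p_begin = pattern_starts[pattern_id]
    p_len = pattern_starts[pattern_id + 1] - p_begin
    for pos in range(start, end):
        if pos + p_len > data.shape[0]:
            break
        hit = True
        for k in range(p_len):
            if data[pos + k] != patterns[p_begin + k]:
                hit = False
                break
        if hit:
            # Threads within a block share matches[chunk].
            cuda.atomic.add(matches, chunk, 1)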
from flashtext import KeywordProcessor


def search_file(search_filename):
    keywords = searched_words()
    flashtext_processor = KeywordProcessor()
    for keyword in keywords:
        flashtext_processor.add_keyword(keyword)
    matches = 0
    with open(search_filename, "r") as search_file:
        for line in tqdm.tqdm(search_file):
            matches += int(is_match_flashtext(line, flashtext_processor))
    return matches
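is_match_flashtext is likewise not shown; a plausible one-liner over flashtext's extract_keywords:

def is_match_flashtext(line, flashtext_processor):
    # extract_keywords returns the keywords found in the line;
    # an empty list means no match.
    return bool(flashtext_processor.extract_keywords(line))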
def searching_worker(text_queue, file_count):
    global files_read
    global global_matches
    keywords = searched_words()
    matches = 0
    # Keep draining the queue while some files are still being read.
    while files_read != file_count:
        while True:
            try:
                batch = text_queue.get(True, 5.0)
            except queue.Empty:
                break
            for text in batch:
                if is_match(text, keywords):
                    matches += 1
    with lock:
        global_matches += matches
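The worker relies on module-level shared state (text_queue, files_read, global_matches, lock) that isn't shown; a hypothetical driver wiring it together could look like this. Note that in CPython the GIL limits what threads buy for CPU-bound matching, which is presumably why the process-based variant below exists.

import queue
import threading

# Shared state assumed by searching_worker; the reading side (not shown)
# fills text_queue with batches of lines and increments files_read.
text_queue = queue.Queue()
files_read = 0
global_matches = 0
lock = threading.Lock()

def run_threaded_search(file_count, worker_count=4):
    workers = [threading.Thread(target=searching_worker,
                                args=(text_queue, file_count))
               for _ in range(worker_count)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return global_matches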
def searching_process(text_queue, matches_shared, files_read, file_count):
    keywords = searched_words()
    matches = 0
    while files_read.value != file_count:
        while True:
            try:
                batch = text_queue.get(True, 5.0)
            except queue.Empty:
                break
            for text in batch:
                if is_match(text, keywords):
                    matches += 1
    # Fold the local count into the shared counter once, at the end.
    with matches_shared.get_lock():
        matches_shared.value += matches
    print("Searching process finished")
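A hypothetical driver for the process-based version, with the queue and counters coming from multiprocessing instead of module globals (the reading side is again elided):

import multiprocessing
import queue  # queue.Empty is what multiprocessing.Queue.get raises

def run_multiprocess_search(file_count, process_count=4):
    text_queue = multiprocessing.Queue()
    matches_shared = multiprocessing.Value('i', 0)
    files_read = multiprocessing.Value('i', 0)
    searchers = [multiprocessing.Process(target=searching_process,
                                         args=(text_queue, matches_shared,
                                               files_read, file_count))
                 for _ in range(process_count)]
    for proc in searchers:
        proc.start()
    # ... reading processes (not shown) fill text_queue and bump files_read ...
    for proc in searchers:
        proc.join()
    return matches_shared.value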