import concurrent.futures
import logging
import os

# FO (assumed here to alias the project's FileOperations module) and the
# Crawler class are expected to be imported/defined elsewhere in this module.


def crawl_multithread(path_rules, search_rules=None, result_rules=None, output=None, threads=None):
    '''Multithreaded version of :func:`Crawler.crawl` that submits one worker
    task per matched file to a thread pool of at most ``threads`` workers.'''
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file
    futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
        for dir_path, subdir_list, file_list in os.walk(path_rules["start"]):
            current_depth = dir_path.count(os.path.sep) - root_depth
            if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
                for fname in file_list:
                    full_path = os.path.join(dir_path, fname)
                    if os.path.isfile(full_path) \
                            and ("file" not in path_rules
                                 or FO.validate_string(full_path,
                                                       path_rules["file"].get("include"),
                                                       path_rules["file"].get("exclude"))):
                        futures[executor.submit(FO.validate_file, full_path,
                                                search_rules, result_rules)] = full_path
            # Prune excluded subdirectories in place so os.walk skips them.
            # The slice assignment replaces the original pattern of calling
            # subdirList.remove() inside a loop over subdirList, which skips
            # elements because the list is mutated while being iterated.
            subdir_list[:] = [
                subdir for subdir in subdir_list
                if FO.validate_string(os.path.join(dir_path, subdir),
                                      path_rules.get("include"),
                                      path_rules.get("exclude")) is not False
            ]
        for future in concurrent.futures.as_completed(futures):
            file_path = futures[future]
            try:
                result_by_file[file_path] = future.result()
            except Exception as exc:
                logging.debug('%r generated an exception: %s', file_path, exc)
    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
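# Usage sketch (hedged): the rule layout follows the docstrings in this module,
# but the concrete paths, regexes, the internal shape of search_rules, and the
# output filename are illustrative assumptions rather than project values. The
# crawl functions are assumed to be exposed as static methods of Crawler, as
# the :func:`Crawler.crawl` reference above suggests.
rules = {
    "start": "./test/test_inputs",       # root directory of the walk
    "max_depth": 3,                      # do not descend past this depth
    "file": {"include": [r"\.txt$"]},    # only validate .txt files
}
results = Crawler.crawl_multithread(rules,
                                    search_rules={"include": ["Google"]},
                                    output="crawl_results.json",
                                    threads=4)
for path, matched in results.items():
    print(path, matched)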
def test_search_string(self):
    # A context manager guarantees the file handle is closed even if an
    # assertion below fails.
    with open("./test/test_inputs/test_search_string.txt", "r") as f:
        content = f.read()
    self.assertTrue(FileOperations.validate_string(content, ["tes"]))
    self.assertTrue(FileOperations.validate_string(content, ["Google", "Analytics"]))
    # "Analytecs" is deliberately misspelled: all include patterns must match.
    self.assertFalse(FileOperations.validate_string(content, ["Google", "Analytecs"]))
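# Hypothetical companion test (hedged): the crawler code in this repo treats a
# False return from validate_string(value, include, exclude) as "excluded", so,
# assuming the third argument holds exclude patterns symmetric to the include
# list, a case like the following would exercise that path. It is a sketch,
# not an existing test in the suite.
def test_search_string_exclude(self):
    with open("./test/test_inputs/test_search_string.txt", "r") as f:
        content = f.read()
    # A matching exclude pattern is expected to make validation fail.
    self.assertFalse(FileOperations.validate_string(content, None, ["tes"]))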
def crawl(path_rules, search_rules=None, result_rules=None, output=None):
    '''Crawl directories starting at ``path_rules["start"]`` until the
    ``path_rules["max_depth"]`` depth is reached. Each directory is checked
    against ``path_rules`` and each file against ``path_rules["file"]``;
    ``search_rules`` is executed on files that pass both checks.

    :param path_rules (dict): regex rules on file paths used to include or
        exclude files/directories in the crawl
    :param search_rules (Optional[dict]): regex rules to search for patterns in files
    :param result_rules (Optional[dict]): regex rules to extract data from matched files
    :param output (Optional[str]): if given, results are also saved via
        :func:`Crawler.save_crawler_data`
    :return (dict of str: dict): a dictionary keyed by file path whose values
        are the results of applying ``search_rules`` to that file
    '''
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file
    root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
    for dir_path, subdir_list, file_list in os.walk(path_rules["start"]):
        current_depth = dir_path.count(os.path.sep) - root_depth
        if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
            for fname in file_list:
                full_path = os.path.join(dir_path, fname)
                if os.path.isfile(full_path) \
                        and ("file" not in path_rules
                             or FO.validate_string(full_path,
                                                   path_rules["file"].get("include"),
                                                   path_rules["file"].get("exclude"))):
                    result_by_file[full_path] = FO.validate_file(full_path,
                                                                 search_rules,
                                                                 result_rules)
        # Prune excluded subdirectories in place (slice assignment) so os.walk
        # skips them; removing items while iterating over the same list would
        # silently skip entries.
        subdir_list[:] = [
            subdir for subdir in subdir_list
            if FO.validate_string(os.path.join(dir_path, subdir),
                                  path_rules.get("include"),
                                  path_rules.get("exclude")) is not False
        ]
    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
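# Usage sketch (hedged): directory-level "exclude" pruning as read from the
# loop above; the start path and regexes are illustrative assumptions, and the
# call goes through Crawler on the assumption that crawl is a static method.
rules = {
    "start": "/var/log",
    "exclude": [r"archive"],             # prune subdirectories matching this
    "file": {"include": [r"\.log$"]},    # only .log files are validated
}
results = Crawler.crawl(rules)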