def crawl(path_rules, search_rules=None, result_rules=None, output=None):
    '''
    Crawl directories starting at ```path_rules["start"]``` until
    ```path_rules["max_depth"]``` depth is reached (the start directory itself
    is depth 1). Every regular file accepted by ```path_rules["file"]``` is
    handed to ```FO.validate_file``` together with ```search_rules``` and
    ```result_rules```; sub-directories rejected by the top-level
    ```include```/```exclude``` rules are not descended into.

    :param path_rules(dict): regex rules on file paths to exclude or include
        files/directories in the crawler. Keys read here: "start" (required),
        "max_depth", "include", "exclude" and "file" (a dict with its own
        "include"/"exclude")
    :param search_rules(Optional[dict]): regex rules to search pattern in files
    :param result_rules(Optional[dict]): regex rules to extract data from
        files matched
    :param output(Optional): when not None, the collected results are passed
        to ```Crawler.save_crawler_data``` along with this value
    :return (dict of str: dict): a dictionary with path of files being keys
        and values being the result of ```FO.validate_file``` on said file;
        empty when ```path_rules``` is None or has no "start" key
    '''
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file

    sep = os.path.sep
    start = path_rules["start"]
    limit_depth = "max_depth" in path_rules
    check_files = "file" in path_rules

    # Offset chosen so that the start directory itself is at depth 1.
    root_depth = start.rstrip(sep).count(sep) - 1

    for dir_path, subdirs, files in os.walk(start):
        # Strip a trailing separator here as well: os.walk yields the start
        # path verbatim, so "dir/" would otherwise count one level deeper
        # than "dir".
        current_depth = dir_path.rstrip(sep).count(sep) - root_depth

        if limit_depth and current_depth > path_rules["max_depth"]:
            # Everything below is deeper still; stop os.walk from descending.
            subdirs[:] = []
            continue

        for fname in files:
            full_path = os.path.join(dir_path, fname)
            if not os.path.isfile(full_path):
                continue
            if check_files and not FO.validate_string(
                    full_path,
                    path_rules["file"].get("include"),
                    path_rules["file"].get("exclude")):
                continue
            result_by_file[full_path] = FO.validate_file(
                full_path, search_rules, result_rules)

        # Prune rejected sub-directories in place so os.walk skips them.
        # Build the filtered list first instead of calling remove() while
        # iterating, which skips the entry following each removed one.
        subdirs[:] = [
            subdir for subdir in subdirs
            if FO.validate_string(os.path.join(dir_path, subdir),
                                  path_rules.get("include"),
                                  path_rules.get("exclude")) is not False
        ]

    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
def test_validate_file(self):
    '''
    Run ```FileOperations.validate_file``` on the sample input file with one
    search rule and two kinds of result rules, then check the extracted
    "registered" value and the reported "DATE UPDATED" timestamp.
    '''
    validate_result = FileOperations.validate_file(
        "./test/test_inputs/test_search_string.txt",
        ["Google Analytics"],
        {
            # Raw string: \w and \s are regex classes, not string escapes.
            "CONTENT": {"registered": r"'([\w\s]+)'"},
            "built-in": ["DATE UPDATED"],
        },
    )
    self.assertEqual(validate_result["registered"][1], "Site Catalyst")
    # time.strptime without a format parses the ctime-style default
    # ("%a %b %d %H:%M:%S %Y"); the parsed time must lie before "now".
    self.assertGreater(time.localtime(),
                       time.strptime(validate_result["DATE UPDATED"]))