Example #1
import os

def crawl(path_rules, search_rules=None, result_rules=None, output=None):
    '''
    Crawl directories starting at path_rules["start"] until the path_rules["max_depth"] depth is reached.
    Each directory is checked against path_rules and each file against path_rules["file"]; search_rules
    are only executed on entries that pass those checks.
    :param path_rules (dict): regex rules on file paths used to include or exclude files/directories from the crawl
    :param search_rules (Optional[dict]): regex rules used to search for patterns in files
    :param result_rules (Optional[dict]): regex rules used to extract data from matched files
    :return (dict of str: dict): a dictionary keyed by file path whose values are the results of
        applying search_rules to that file
    '''
    # FO (file-operation helpers) and Crawler are assumed to be imported from the surrounding project.
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file
    root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
    for dir_path, subdirList, fileList in os.walk(path_rules["start"]):
        current_depth = dir_path.count(os.path.sep) - root_depth
        if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
            for fname in fileList:
                full_path = os.path.join(dir_path, fname)
                if os.path.isfile(full_path) \
                        and ("file" not in path_rules
                             or FO.validate_string(full_path,
                                                   path_rules["file"].get("include"),
                                                   path_rules["file"].get("exclude"))):
                    result_by_file[full_path] = FO.validate_file(full_path, search_rules, result_rules)
            # Prune excluded subdirectories in place so os.walk does not descend into them.
            # Rebuilding the list with a slice assignment avoids removing items while iterating over it.
            subdirList[:] = [subdir for subdir in subdirList
                             if FO.validate_string(os.path.join(dir_path, subdir),
                                                   path_rules.get("include"),
                                                   path_rules.get("exclude")) is not False]
    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
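
For reference, a call to crawl might look like the sketch below. The directory, regex patterns, and output file name are hypothetical, the list form of search_rules mirrors the validate_file call in Example #2, and the exact pattern format FO.validate_string expects is an assumption.

# Hypothetical usage sketch: crawl "./src" no deeper than three levels, skip ".git"
# directories, only inspect .py files, and record which of them mention "TODO".
path_rules = {
    "start": "./src",
    "max_depth": 3,
    "exclude": r"\.git",
    "file": {"include": r"\.py$"},
}
results = crawl(path_rules, search_rules=["TODO"], output="crawl_report.json")
for path, matches in results.items():
    print(path, matches)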
Example #2
def test_validate_file(self):
    # Search for "Google Analytics" in the test file; the result rules extract quoted names
    # from the content plus the built-in "DATE UPDATED" timestamp.
    validate_result = FileOperations.validate_file(
        "./test/test_inputs/test_search_string.txt",
        ["Google Analytics"],
        {"CONTENT": {"registered": r"'([\w\s]+)'"},
         "built-in": ["DATE UPDATED"]})
    self.assertEqual(validate_result["registered"][1], "Site Catalyst")
    # time.strptime without an explicit format expects the default "%a %b %d %H:%M:%S %Y" layout,
    # so this asserts that the recorded "DATE UPDATED" value lies in the past.
    self.assertGreater(time.localtime(), time.strptime(validate_result["DATE UPDATED"]))
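
Both examples depend on the project's FileOperations helpers (imported as FO in Example #1), whose implementation is not shown here. A minimal sketch of what its validate_string include/exclude check could look like, assuming single regex patterns and a pass-unless-excluded rule, is:

import re

def validate_string(value, include=None, exclude=None):
    # Plausible sketch only; the real FileOperations.validate_string may differ.
    # A value passes when it matches the include pattern (if one is given)
    # and does not match the exclude pattern.
    if include is not None and not re.search(include, value):
        return False
    if exclude is not None and re.search(exclude, value):
        return False
    return True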