Example #1
 def crawl_multithread(path_rules, search_rules=None, result_rules=None, output=None, threads=None):
     '''Multithreaded version of :func:`Crawler.crawl`, submitting one worker task per file to a thread pool.'''
     result_by_file = {}
     if path_rules is None or "start" not in path_rules:
         return result_by_file
     futures = {}
     with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
         root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
         for dir_path, subdirList, fileList in os.walk(path_rules["start"]):
             current_depth = dir_path.count(os.path.sep) - root_depth
             if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
                 for fname in fileList:
                     full_path = os.path.join(dir_path, fname)
                     if os.path.isfile(full_path) and (
                             "file" not in path_rules
                             or FO.validate_string(full_path, path_rules["file"].get("include"), path_rules["file"].get("exclude"))):
                         # Submit one validation task per matching file, keyed by its path.
                         futures[executor.submit(FO.validate_file, full_path, search_rules, result_rules)] = full_path
                 # Iterate over a copy: removing entries from subdirList while looping over it skips elements.
                 for subdir in list(subdirList):
                     subdir_full_path = os.path.join(dir_path, subdir)
                     if FO.validate_string(subdir_full_path, path_rules.get("include"), path_rules.get("exclude")) is False:
                         subdirList.remove(subdir)  # prune in place so os.walk does not descend into it
     for future in concurrent.futures.as_completed(futures):
         file_result = futures[future]
         try:
             result_by_file[file_result] = future.result()
         except Exception as exc:
             logging.debug('%r generated an exception: %s',file_result, exc)
     if output is not None:
         Crawler.save_crawler_data(result_by_file, output)
     return result_by_file
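
A minimal calling sketch for this example, assuming crawl_multithread is exposed on Crawler as the docstring suggests. The rule keys mirror the ones the crawler actually reads ("start", "max_depth", "exclude", and the nested "file" block); the concrete regex values and thread count are hypothetical, and search_rules/result_rules are left as None because their schema is not shown in this snippet.

    # Hypothetical invocation; regex values are illustrative only.
    path_rules = {
        "start": "/var/www",              # required: root directory to walk
        "max_depth": 3,                   # optional: stop processing past this depth
        "exclude": [r"\.git"],            # directories matching these regexes are pruned
        "file": {
            "include": [r"\.js$"],        # only files matching these regexes are validated
            "exclude": [r"\.min\.js$"],
        },
    }
    results = Crawler.crawl_multithread(path_rules, threads=8)  # search_rules/result_rules omitted
    for path, matched in results.items():
        print(path, matched)
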
Example #2
 def test_search_string(self):
     with open("./test/test_inputs/test_search_string.txt", "r") as f:
         content = f.read()
     self.assertTrue(FileOperations.validate_string(content, ["tes"]))
     self.assertTrue(FileOperations.validate_string(content, ["Google", "Analytics"]))
     self.assertFalse(FileOperations.validate_string(content, ["Google", "Analytecs"]))
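
Taken together, these assertions suggest that FileOperations.validate_string returns True only when every pattern in the include list is found in the string (the single misspelled pattern "Analytecs" is enough to make the last call fail). A rough behavioural sketch under that assumption, not the project's actual implementation, with the exclude handling likewise guessed:

    import re

    def validate_string_sketch(text, include=None, exclude=None):
        # Assumed semantics inferred from the test: every include regex must match
        # somewhere in the text, and no exclude regex may match.
        if include and not all(re.search(pattern, text) for pattern in include):
            return False
        if exclude and any(re.search(pattern, text) for pattern in exclude):
            return False
        return True
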
Example #3
 def crawl(path_rules, search_rules=None, result_rules=None, output=None):
     '''
     Crawl directories starting at ``path_rules["start"]`` until the ``path_rules["max_depth"]`` depth is reached.
     Each directory is validated against ``path_rules`` and each file against ``path_rules["file"]``;
     ``search_rules`` is executed only on files that pass validation.

     :param path_rules(dict): regex rules on file paths to include or exclude files/directories from the crawl
     :param search_rules(Optional[dict]): regex rules to search for patterns in files
     :param result_rules(Optional[dict]): regex rules to extract data from matched files
     :param output(Optional): if provided, the results are also passed to :func:`Crawler.save_crawler_data`
     :return (dict of str: dict): a dictionary keyed by file path, whose values are the ``search_rules`` results for that file
     '''
     result_by_file = {}
     if path_rules is None or "start" not in path_rules:
         return result_by_file
     root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
     for dir_path, subdirList, fileList in os.walk(path_rules["start"]):
         current_depth = dir_path.count(os.path.sep) - root_depth
         if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
             for fname in fileList:
                 full_path = os.path.join(dir_path, fname)
                 if os.path.isfile(full_path) and (
                         "file" not in path_rules
                         or FO.validate_string(full_path, path_rules["file"].get("include"), path_rules["file"].get("exclude"))):
                     result_by_file[full_path] = FO.validate_file(full_path, search_rules, result_rules)
             # Iterate over a copy: removing entries from subdirList while looping over it skips elements.
             for subdir in list(subdirList):
                 subdir_full_path = os.path.join(dir_path, subdir)
                 if FO.validate_string(subdir_full_path, path_rules.get("include"), path_rules.get("exclude")) is False:
                     subdirList.remove(subdir)  # prune in place so os.walk does not descend into it
     if output is not None:
         Crawler.save_crawler_data(result_by_file, output)
     return result_by_file
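
As with the threaded variant above, a hedged calling sketch: the input directory and output path are hypothetical, search_rules/result_rules are left as None because their schema is not shown here, and the format Crawler.save_crawler_data writes is not specified in this snippet.

    # Hypothetical single-threaded run that also persists the results.
    results = Crawler.crawl(
        path_rules={"start": "./test/test_inputs", "max_depth": 2},
        search_rules=None,                 # pattern schema not shown in this example
        result_rules=None,
        output="crawl_results.json",       # handed to Crawler.save_crawler_data; format unspecified
    )
    print(len(results), "files crawled")
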