Example #1
def test_lalibre(self):
    test_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data', 'lalibre')
    to_process = sorted(glob(os.path.join(test_dir, '*.har')))
    crawled_tree = CrawledTree(to_process)
    crawled_tree.find_parents()
    crawled_tree.join_trees()
    crawled_tree.to_json()
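This example uses the older har2tree API, where find_parents() and join_trees() had to be called explicitly after construction. In more recent releases (compare Examples #10 and #11 below) the constructor takes a capture UUID and builds the tree in one step, so the equivalent call would look roughly like this, assuming a recent har2tree version:

import uuid
crawled_tree = CrawledTree(to_process, str(uuid.uuid4()))
crawled_tree.to_json()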
Example #2
    def cache_tree(self, capture_uuid: str) -> None:
        '''Generate the pickle, add capture in the indexes'''
        capture_dir = self.lookup_capture_dir(capture_uuid)
        if not capture_dir:
            raise MissingUUID(
                f'Unable to find UUID {capture_uuid} in the cache')

        with open((capture_dir / 'uuid'), 'r') as f:
            uuid = f.read()
        har_files = sorted(capture_dir.glob('*.har'))
        # NOTE: We only index the public captures
        index = True
        try:
            ct = CrawledTree(har_files, uuid)
            self.resolve_dns(ct)
            # Getting the cache triggers an update of said cache; we want that to happen here.
            cache = self.capture_cache(capture_uuid)
            if self.is_public_instance:
                if cache.get('no_index') is not None:
                    index = False
            if index:
                self.indexing.index_cookies_capture(ct)
                self.indexing.index_body_hashes_capture(ct)
                self.indexing.index_url_capture(ct)
                categories = list(self.categories_capture(capture_uuid).keys())
                self.indexing.index_categories_capture(capture_uuid,
                                                       categories)
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

        with (capture_dir / 'tree.pickle').open('wb') as _p:
            pickle.dump(ct, _p)
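cache_tree() writes the finished tree to capture_dir / 'tree.pickle' so later lookups can skip the expensive rebuild. A minimal read-side counterpart could look like the sketch below; load_cached_tree is a hypothetical helper, not part of the code above:

def load_cached_tree(capture_dir: Path) -> CrawledTree:
    # Hypothetical helper: unpickle the tree that cache_tree() dumped above.
    with (capture_dir / 'tree.pickle').open('rb') as _p:
        return pickle.load(_p)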
Example #3
def tree(self):
    if not HAVE_ETE:
        self.log('error', 'Missing dependency: git+https://github.com/viper-framework/har2tree.git')
        return
    har_files = self.all_reports[self.reportid]['har']
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    tree_file = os.path.join(self.scraper_store, self.all_reports[self.reportid]['isots'], "tree.pdf")
    ct.dump_test(tree_file)
    self.log('success', 'Tree dump created: {}'.format(tree_file))
Example #4
def load_tree(self, capture_dir: Path) -> Tuple[str, str, str, str, Dict[str, str]]:
    har_files = sorted(capture_dir.glob('*.har'))
    pickle_file = capture_dir / 'tree.pickle'
    try:
        meta = {}
        if (capture_dir / 'meta').exists():
            # NOTE: Legacy check; the meta file should normally be present
            with open((capture_dir / 'meta'), 'r') as f:
                meta = json.load(f)
        ct = self._load_pickle(pickle_file)
        if not ct:
            with open((capture_dir / 'uuid'), 'r') as f:
                uuid = f.read()
            ct = CrawledTree(har_files, uuid)
            with pickle_file.open('wb') as _p:
                pickle.dump(ct, _p)
        return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
    except Har2TreeError as e:
        raise NoValidHarFile(e.message)
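This variant depends on a _load_pickle helper that the snippet does not show. A plausible implementation returns the unpickled tree when the cache file exists and None otherwise, so the caller rebuilds on a cache miss; a sketch under that assumption:

from typing import Optional

def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
    # Hypothetical sketch: return the cached tree, or None so the caller rebuilds it.
    if pickle_file.exists():
        with pickle_file.open('rb') as _p:
            return pickle.load(_p)
    return None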
Example #5
def load_tree(report_dir):
    session.clear()
    har_files = sorted(report_dir.glob('*.har'))
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
    pickle.dump(ct, temp)
    temp.close()
    session["tree"] = temp.name
    return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
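Only the temporary file's path goes into the Flask session; the pickled tree itself would be far too large for the default cookie-backed session. A later request can load the tree back along these lines (get_tree_from_session is a hypothetical counterpart, not part of the snippet):

def get_tree_from_session():
    # Hypothetical counterpart: unpickle the tree whose path was stored in session["tree"].
    with open(session["tree"], 'rb') as f:
        return pickle.load(f)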
Example #6
    @classmethod
    def setUpClass(cls) -> None:
        setattr(cls, 'test_dir',
                Path(os.path.abspath(os.path.dirname(__file__))) / 'capture_samples')

        # Iterate over the capture_samples folder and make a CrawledTree out of each subfolder
        for x in cls.test_dir.iterdir():
            if x.is_dir():
                folder_name = x.name  # portable replacement for str(x).split('/')[-1]
                tree_name = f'{folder_name}_ct'
                har = cls.test_dir / folder_name / '0.har'
                setattr(cls, tree_name, CrawledTree([har], str(uuid.uuid4())))
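Each subfolder of capture_samples thus becomes a class attribute named '<folder>_ct'. Given the http_redirect sample used in Example #12, a test method could then use the prebuilt tree like this (a sketch; the test name is hypothetical):

def test_http_redirect(self):
    # Uses the tree that setUpClass built for the http_redirect sample folder.
    self.http_redirect_ct.to_json()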
Example #7
def load_tree(report_dir):
    if session.get('tree'):
        # TODO delete file
        pass
    session.clear()
    har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
    pickle.dump(ct, temp)
    temp.close()
    session["tree"] = temp.name
    return ct.jsonify(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
Example #8
    def _create_pickle(self, capture_dir: Path) -> CrawledTree:
        with (capture_dir / 'uuid').open() as f:
            uuid = f.read().strip()

        lock_file = capture_dir / 'lock'
        if try_make_file(lock_file):
            # Lock created, we can process
            with lock_file.open('w') as f:
                f.write(datetime.now().isoformat())
        else:
            # The pickle is being created somewhere else, wait until it's done.
            while lock_file.exists():
                time.sleep(5)
            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)

        har_files = sorted(capture_dir.glob('*.har'))
        pickle_file = capture_dir / 'tree.pickle'
        try:
            tree = CrawledTree(har_files, uuid)
            self.__resolve_dns(tree)
            if self.contextualizer:
                self.contextualizer.contextualize_tree(tree)
        except Har2TreeError as e:
            raise NoValidHarFile(e)
        except RecursionError as e:
            raise NoValidHarFile(
                f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.'
            )
        else:
            with pickle_file.open('wb') as _p:
                # Some pickles require a pretty high recursion limit; this kind of fixes it.
                # If the capture is really broken (generally a refresh to self), the capture
                # is discarded in the RecursionError above.
                default_recursion_limit = sys.getrecursionlimit()
                sys.setrecursionlimit(int(default_recursion_limit * 1.1))
                try:
                    pickle.dump(tree, _p)
                except RecursionError as e:
                    raise NoValidHarFile(
                        f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.'
                    )
                finally:
                    # Restore the limit even if dumping failed, so the raised limit does not leak.
                    sys.setrecursionlimit(default_recursion_limit)
        finally:
            lock_file.unlink(missing_ok=True)
        return tree
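The lock file is created through a try_make_file helper: only the first caller succeeds, and every other process waits for the lock to disappear before loading the finished pickle. One plausible implementation relies on the atomic exist_ok=False behaviour of Path.touch; a sketch, assuming the helper is defined along these lines:

def try_make_file(filename: Path) -> bool:
    # Atomically create the file; exactly one process can win this race.
    try:
        filename.touch(exist_ok=False)
        return True
    except FileExistsError:
        return False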
Example #9
def load_tree(self, report_dir: Path):
    har_files = sorted(report_dir.glob('*.har'))
    try:
        meta = {}
        if (report_dir / 'meta').exists():
            with open((report_dir / 'meta'), 'r') as f:
                meta = json.load(f)
        ct = CrawledTree(har_files)
        ct.find_parents()
        ct.join_trees()
        temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
        pickle.dump(ct, temp)
        temp.close()
        return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
    except Har2TreeError as e:
        raise NoValidHarFile(e.message)
Example #10
def test_wired(self):
    test_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                            'data', 'wired')
    to_process = sorted(glob(os.path.join(test_dir, '*.har')))
    crawled_tree = CrawledTree(to_process)
    crawled_tree.to_json()
Example #11
def test_wired(self) -> None:
    test_dir = Path(os.path.abspath(
        os.path.dirname(__file__))) / 'data' / 'wired'
    har_to_process: Iterable[Path] = sorted(test_dir.glob('*.har'))
    crawled_tree = CrawledTree(har_to_process, str(uuid.uuid4()))
    crawled_tree.to_json()
Example #12
from pathlib import Path
import uuid
from har2tree import CrawledTree
har_path = Path() / 'tests' / 'capture_samples' / 'http_redirect' / '0.har'
my_first_crawled_tree = CrawledTree([har_path], str(uuid.uuid4()))
my_first_crawled_tree.root_hartree.rendered_node.show()
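rendered_node.show() opens the interactive ete3 tree browser, which needs a GUI. On a headless machine, rendering to an image file may be more practical; a sketch, assuming the node exposes ete3's TreeNode API as in the har2tree versions where show() is available:

my_first_crawled_tree.root_hartree.rendered_node.render('tree.png')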