Example #1
from queue import Queue
from threading import Lock
from urllib.parse import urlsplit

# Scraper is the project's own helper class; its actual import path is not
# shown in this example.


class Spider:
    def __init__(self, url, domain, limit, limit_param, result_file_name,
                 max_threads, sema, verbose):
        """
        Create instance of Spider.

        :param url: website url
        :param domain: domain of website
        :param limit: crawling limit type ("depth" or "count")
        :param limit_param: limit parameter (max depth or max number of pages)
        :param result_file_name: file to store results in
        :param max_threads: maximum number of threads per process
        :param sema: semaphore (used for release action)
        :param verbose: verbosity of Spider
        """
        # Note: the locks below are needed because worker threads update
        # shared state in parallel.
        self._emails_file_path = result_file_name

        self._max_threads = max_threads

        self._sema = sema  # a semaphore

        # the starting URL must be encoded, just like every crawled link
        self._url = Scraper.create_http_link(urlsplit(url))
        self._domain = domain

        # set limit properties
        self.limit = limit
        self.limit_param = limit_param

        # number of pages scanned so far
        self._count = 0
        self._count_lock = Lock()  # protects self._count

        # create links-to-visit queue
        self._to_visit = Queue()
        self._to_visit.put(self._url)
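        # Worker threads (up to self._max_threads) are expected to pull links
        # from this queue and push newly discovered ones back; queue.Queue is
        # itself thread-safe, so it needs no extra lock.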

        self._scraper = Scraper(self._domain, url)  # the spider's link scraper
        # emails already found; a plain list suffices because emails are short
        # strings, so linear membership checks stay cheap (no hash set needed)
        self._emails = []
        self._email_lock = Lock()  # protects the emails list and file writes

        self.verbose = verbose
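
A minimal construction sketch (not part of the original example): the argument
values are illustrative, and the Semaphore(0) pattern is an assumption that the
caller blocks on acquire() until the spider releases the semaphore when done.

from threading import Semaphore

sema = Semaphore(0)  # the spider is expected to release this when it finishes

spider = Spider(
    url="https://example.com",
    domain="example.com",
    limit="depth",              # or "count"
    limit_param=3,              # max depth (or max pages when limit == "count")
    result_file_name="emails.txt",
    max_threads=8,
    sema=sema,
    verbose=True,
)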