from queue import Queue
from threading import Lock
from urllib.parse import urlsplit

# Assumption: Scraper is a project-local class; adjust the import path to
# match the module it actually lives in.
from scraper import Scraper


def __init__(self, url, domain, limit, limit_param, result_file_name, max_threads, sema, verbose):
    """
    Create an instance of Spider.

    :param url: website URL
    :param domain: domain of the website
    :param limit: crawling limit type ("depth" or "count")
    :param limit_param: limit parameter (max depth or max number of pages)
    :param result_file_name: file to store results in
    :param max_threads: maximum number of threads per process
    :param sema: semaphore (used for the release action)
    :param verbose: verbosity of the Spider
    """
    # Note: the locks are necessary because these variables are updated by
    # parallel workers.
    self._emails_file_path = result_file_name
    self._max_threads = max_threads
    self._sema = sema  # a semaphore

    # The starting URL should also be encoded.
    self._url = Scraper.create_http_link(urlsplit(url))
    self._domain = domain

    # Set the limit properties.
    self.limit = limit
    self.limit_param = limit_param

    # Count of pages scanned so far, guarded by its own lock.
    self._count = 0
    self._count_lock = Lock()

    # Queue of links left to visit, seeded with the starting URL.
    self._to_visit = Queue()
    self._to_visit.put(self._url)

    self._scraper = Scraper(self._domain, url)  # the spider's link scraper

    # Emails already found so far. A plain list suffices: emails are usually
    # short, so there is no need for a hash-based structure.
    self._emails = []
    self._email_lock = Lock()  # lock protecting the emails file

    self.verbose = verbose
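
# --- Usage sketch -----------------------------------------------------------
# A minimal sketch of constructing a Spider, assuming this __init__ belongs to
# the project's Spider class (not shown in this excerpt). The argument values
# below are illustrative placeholders, not values taken from the project.
if __name__ == "__main__":
    from threading import Semaphore

    sema = Semaphore(4)  # semaphore handed to the spider for its release action
    spider = Spider(
        url="http://example.com/",
        domain="example.com",
        limit="count",            # stop after a fixed number of pages...
        limit_param=100,          # ...here, 100 pages
        result_file_name="emails.txt",
        max_threads=8,
        sema=sema,
        verbose=True,
    )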