def _request_file(url, base_url):
    # Issue a header-only request for the file and retry on curl errors.
    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            curl = HttpDirectory._curl_handle()
            raw_headers = BytesIO()
            curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
            curl.setopt(pycurl.HEADERFUNCTION, raw_headers.write)
            curl.perform()

            stripped_url = url[len(base_url) - 1:]
            headers = HttpDirectory._parse_dict_header(
                raw_headers.getvalue().decode("utf-8", errors="ignore"))
            raw_headers.close()

            path, name = os.path.split(stripped_url)
            date = headers.get("Last-Modified", "1970-01-01")
            curl.close()
            # Build a File entry from the response headers.
            return File(
                path=unquote(path).strip("/"),
                name=unquote(name),
                size=int(headers.get("Content-Length", -1)),
                mtime=int(parse_date(date).timestamp()),
                is_dir=False
            )
        except pycurl.error:
            retries -= 1

    logger.debug("TimeoutError - _request_file")
    raise TimeoutError
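# --- Illustrative sketch, not part of the original module --------------------
# _request_file() relies on two helpers referenced via the HttpDirectory class:
# _curl_handle(), which builds a configured pycurl handle, and
# _parse_dict_header(), which turns the raw header bytes into a dict. Minimal
# stand-ins could look like the functions below; the timeout values, the NOBODY
# option, and the header-splitting details are assumptions, not the project's
# actual implementation, and the helpers are shown as plain functions for
# brevity.

import pycurl


def _curl_handle():
    # Header-only request with short, fixed timeouts (values are assumptions).
    curl = pycurl.Curl()
    curl.setopt(pycurl.CONNECTTIMEOUT, 30)
    curl.setopt(pycurl.TIMEOUT, 30)
    curl.setopt(pycurl.NOBODY, 1)  # fetch headers only, discard the body
    return curl


def _parse_dict_header(raw: str) -> dict:
    # Split "Name: value" lines into a dict, skipping the status line.
    headers = {}
    for line in raw.splitlines():
        if ":" in line:
            name, _, value = line.partition(":")
            headers[name.strip()] = value.strip()
    return headers
# ------------------------------------------------------------------------------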
def _process_listings(self, url: str, in_q: Queue, files_q: Queue):

    directory = RemoteDirectoryFactory.get_directory(url)
    timeout_retries = 20  # If any worker thread reaches 20 retries, the whole queue is emptied

    while directory:
        try:
            path = in_q.get(timeout=2000)
        except Empty:
            logger.debug("in_q is Empty")
            directory.close()
            break

        if path is None:
            break

        try:
            path_id, listing = directory.list_dir(path)
            if len(listing) > 0 and path_id not in self.crawled_paths:
                self.crawled_paths.add(path_id)

                # Queue subdirectories for listing, emit files to the output queue.
                for f in listing:
                    if f.is_dir:
                        in_q.put(urljoin(f.path, f.name))
                    else:
                        files_q.put(f)
                logger.debug("LISTED " + urljoin(self.url, path))
        except TooManyConnectionsError:
            logger.debug("Too many connections, this thread will be killed and path resubmitted")
            # Kill worker and resubmit listing task
            directory.close()
            in_q.put(path)
            # TODO: If all workers are killed the queue will never get processed and
            # TODO: the crawler will be stuck forever
            break
        except TimeoutError:
            logger.error("Directory listing timed out, " + str(timeout_retries) + " retries left")
            if timeout_retries > 0:
                timeout_retries -= 1
                in_q.put(path)
            else:
                logger.error("Dropping website " + url)
                self.status_code = "Timeout during website listing"
                directory.close()

                logger.debug("Emptying queue")
                while True:
                    try:
                        in_q.get_nowait()
                        in_q.task_done()
                    except Empty:
                        break
                logger.debug("Emptied queue")
                break
        finally:
            in_q.task_done()
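# --- Illustrative sketch, not part of the original module --------------------
# _process_listings() is written to run in several worker threads sharing the
# same input and output queues: None acts as a poison pill, and in_q.join()
# returns once every submitted path has been task_done()'d. The wiring below is
# an assumption about how a caller might drive it (crawl_site, worker_count and
# the "" root-path seed are hypothetical), not the crawler's actual start-up
# code.

from queue import Queue
from threading import Thread


def crawl_site(crawler, url, worker_count=4):
    in_q = Queue()
    files_q = Queue()

    workers = [Thread(target=crawler._process_listings, args=(url, in_q, files_q))
               for _ in range(worker_count)]
    for w in workers:
        w.start()

    in_q.put("")        # seed with the root path
    in_q.join()         # wait until every queued path has been listed

    for _ in workers:   # poison pills so idle workers exit cleanly
        in_q.put(None)
    for w in workers:
        w.join()

    return files_q
# ------------------------------------------------------------------------------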
def _fetch_body(self, url: str):
    # Download the page body into an in-memory buffer and retry on curl errors.
    retries = HttpDirectory.MAX_RETRIES
    while retries > 0:
        try:
            content = BytesIO()
            self.curl.setopt(pycurl.URL, url.encode("utf-8", errors="ignore"))
            self.curl.setopt(pycurl.WRITEDATA, content)
            self.curl.perform()

            return content.getvalue().decode("utf-8", errors="ignore")
        except pycurl.error:
            # Drop the (possibly broken) connection before retrying.
            self.close()
            retries -= 1

    logger.debug("TimeoutError - _fetch_body")
    raise TimeoutError
def stop_when_connected(self):
    failed_attempts = 0
    while failed_attempts < self.max_attempts:
        try:
            self._connect()
            logger.debug("New FTP connection @ " + self.base_url)
            return True
        except ftputil.error.FTPError as e:
            # 530 (not logged in) and 421 (service not available) won't be fixed by retrying.
            if e.errno == 530 or e.errno == 421:
                break
            failed_attempts += 1
            logger.debug("Connection error; reconnecting... " + e.strerror + " " + str(e.errno))
            time.sleep(2)
    return False
def error(self, message):
    logger.debug("HTML Parser error: " + message)
def close(self):
    if self.ftp:
        self.ftp.close()
        self.ftp = None
    logger.debug("Closing FtpRemoteDirectory for " + self.base_url)
def reconnect(self):
    if self.ftp:
        self.ftp.close()
        success = self.stop_when_connected()
        logger.debug("Reconnecting to FTP server " + self.base_url + (" (OK)" if success else " (ERR)"))