def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
             progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
             not_allow_redirects=False, proxies=None, delay=0, limit=1000, to_file=None, user_agent=None,
             cookies=None, headers=None):
    if not max_workers and not delay:
        # No rate limiting: default the pool to five threads per CPU core.
        max_workers = (multiprocessing.cpu_count() or 1) * 5
    elif not max_workers and delay:
        # With a per-request delay, use one worker per proxy
        # ([None] stands for "no proxy": a single direct worker).
        max_workers = len(proxies or [None])
    super(Crawler, self).__init__(max_workers)
    self.domains = set()
    self.results = Queue()
    self.index_of_processors = []
    self.proxies = proxies
    self.delay = delay
    self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
    self.processing = {}
    self.processed = {}
    self.add_lock = Lock()
    self.spinner = random_spinner()
    self.start_dt = datetime.datetime.now()
    self.interesting_extensions = interesting_extensions or []
    self.interesting_files = interesting_files or []
    self.closing = False
    self.std = std or None  # normalise a falsy std handle to None
    self.progress_enabled = progress_enabled
    self.timeout = timeout
    self.not_follow_subdomains = not_follow_subdomains
    self.depth = depth
    self.exclude_sources = exclude_sources
    # Sources gets the crawler's callbacks so discovered URLs and messages
    # flow back into the crawl.
    self.sources = Sources(self.add_url, self.add_message, exclude_sources)
    self.not_allow_redirects = not_allow_redirects
    self.limit = limit
    self.current_processed_count = 0
    self.to_file = to_file
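# A standalone sketch of the max_workers defaults chosen above, assuming the
# same semantics: without a delay the pool scales with CPU count; with a
# per-request delay there is one worker per proxy so each proxy can honour it.
# default_max_workers is an illustrative helper, not part of the class.
import multiprocessing

def default_max_workers(delay=0, proxies=None):
    if not delay:
        return (multiprocessing.cpu_count() or 1) * 5
    return len(proxies or [None])  # [None] means "no proxy": one direct worker

assert default_max_workers(delay=1, proxies=['http://p1', 'http://p2']) == 2
assert default_max_workers(delay=1) == 1  # no proxies -> a single direct worker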
def __init__(self, processors, sessions, std=None, max_workers=None, progress_enabled=True, timeout=10):
    super(UrlsInfo, self).__init__(max_workers)
    self.lock = Lock()
    self.processors = processors
    self.sessions = sessions
    self.std = std
    self.spinner = random_spinner()
    self.progress_enabled = progress_enabled
    self.timeout = timeout
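# Hedged sketch of the pattern self.lock and self.std suggest: worker threads
# serialising their progress output through one shared Lock. The names here
# (report, line) are illustrative, not the real UrlsInfo API.
import sys
from threading import Lock, Thread

lock = Lock()

def report(line, std=sys.stdout):
    with lock:  # one writer at a time, so lines never interleave
        std.write(line + '\n')

threads = [Thread(target=report, args=('url %d done' % i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()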
def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, echo=None,
             progress_enabled=True):
    self.domains = set()
    self.results = Queue()
    self.sessions = Sessions()
    self.processing = {}
    self.processed = {}
    self.max_workers = max_workers
    self.add_lock = Lock()
    # This variant manages its own thread pool instead of inheriting one
    # via super().__init__ as the other constructors do.
    self.executor = ThreadPoolExecutor(max_workers=self.max_workers)
    self.spinner = random_spinner()
    self.start_dt = datetime.datetime.now()
    self.interesting_extensions = interesting_extensions or []
    self.interesting_files = interesting_files or []
    self.closing = False
    self.echo = echo or (lambda x: x)  # default echo is a no-op identity function
    self.progress_enabled = progress_enabled
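# Minimal sketch of the self-managed ThreadPoolExecutor pattern used above.
# process_url is a hypothetical stand-in for the real per-URL work.
from concurrent.futures import ThreadPoolExecutor

def process_url(url):
    return url.lower()

executor = ThreadPoolExecutor(max_workers=4)  # None lets the library pick a default
futures = [executor.submit(process_url, u) for u in ['http://A/', 'http://B/']]
print([f.result() for f in futures])  # ['http://a/', 'http://b/']
executor.shutdown()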
def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
             progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
             not_allow_redirects=False):
    super(Crawler, self).__init__(max_workers)
    self.domains = set()
    self.results = Queue()
    self.index_of_processors = []
    self.sessions = Sessions()
    self.processing = {}
    self.processed = {}
    self.add_lock = Lock()
    self.spinner = random_spinner()
    self.start_dt = datetime.datetime.now()
    self.interesting_extensions = interesting_extensions or []
    self.interesting_files = interesting_files or []
    self.closing = False
    self.std = std or None
    self.progress_enabled = progress_enabled
    self.timeout = timeout
    self.not_follow_subdomains = not_follow_subdomains
    self.depth = depth
    self.sources = Sources(self.add_url, exclude_sources)
    self.not_allow_redirects = not_allow_redirects
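# Illustrative sketch of the callback wiring above: Sources receives the
# crawler's add_url as a plain callable, decoupling URL discovery from the
# crawl loop. SourcesSketch is hypothetical, not the real Sources class.
class SourcesSketch(object):
    def __init__(self, add_url, exclude_sources=()):
        self.add_url = add_url
        self.exclude_sources = exclude_sources

    def found(self, url):
        self.add_url(url)  # hand every discovered URL back to the crawler

discovered = []
sources = SourcesSketch(discovered.append, exclude_sources=('virustotal',))
sources.found('http://example.com/admin/')
print(discovered)  # ['http://example.com/admin/']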
def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
             progress_enabled=True, timeout=10):
    super(Crawler, self).__init__(max_workers)
    self.domains = set()
    self.results = Queue()
    self.index_of_processors = []
    self.sessions = Sessions()
    self.processing = {}
    self.processed = {}
    self.add_lock = Lock()
    self.spinner = random_spinner()
    self.start_dt = datetime.datetime.now()
    self.interesting_extensions = interesting_extensions or []
    self.interesting_files = interesting_files or []
    self.closing = False
    self.std = std or None
    self.progress_enabled = progress_enabled
    self.timeout = timeout
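# Sketch of the producer/consumer pattern implied by self.results above:
# worker threads put findings on a Queue and a single consumer drains it,
# so result handling never blocks the crawl. All names are illustrative.
from queue import Queue
from threading import Thread

results = Queue()

def worker(url):
    results.put('directory listing at %s' % url)  # stand-in finding

threads = [Thread(target=worker, args=('http://example.com/%d/' % i,)) for i in range(3)]
for t in threads:
    t.start()
for t in threads:
    t.join()
while not results.empty():
    print(results.get())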