Beispiel #1
0
 def __init__(self,
              max_workers=None,
              interesting_extensions=None,
              interesting_files=None,
              std=None,
              progress_enabled=True,
              timeout=10,
              depth=3,
              not_follow_subdomains=False,
              exclude_sources=(),
              not_allow_redirects=False,
              proxies=None,
              delay=0,
              limit=1000,
              to_file=None,
              user_agent=None,
              cookies=None,
              headers=None):
     """Set up the crawler's configuration, bookkeeping state and helpers.

     ``max_workers`` is forwarded to the parent initializer. When it is
     falsy a default is chosen: five times the CPU count if ``delay`` is
     falsy, otherwise the number of entries in ``proxies`` (1 when no
     proxies are given).

     ``proxies``, ``delay``, ``user_agent``, ``cookies`` and ``headers``
     are handed to ``Sessions``; ``exclude_sources`` is handed to
     ``Sources`` together with the ``add_url`` and ``add_message`` bound
     methods. Falsy ``interesting_extensions`` / ``interesting_files``
     become empty lists and a falsy ``std`` becomes ``None``. The
     remaining arguments are stored unchanged under the same names.
     """
     if not max_workers:
         if delay:
             # With a delay configured, use one worker per proxy
             # (a single worker when there are no proxies).
             max_workers = len(proxies or [None])
         else:
             max_workers = (multiprocessing.cpu_count() or 1) * 5
     super(Crawler, self).__init__(max_workers)

     # Options copied from the arguments.
     self.proxies = proxies
     self.delay = delay
     self.timeout = timeout
     self.depth = depth
     self.limit = limit
     self.to_file = to_file
     self.progress_enabled = progress_enabled
     self.not_follow_subdomains = not_follow_subdomains
     self.not_allow_redirects = not_allow_redirects
     self.exclude_sources = exclude_sources
     self.std = std if std else None
     self.interesting_extensions = interesting_extensions if interesting_extensions else []
     self.interesting_files = interesting_files if interesting_files else []

     # Empty bookkeeping containers and counters.
     self.domains = set()
     self.index_of_processors = []
     self.processing, self.processed = {}, {}
     self.current_processed_count = 0
     self.closing = False

     # Helper objects; created in the same relative order as before.
     self.results = Queue()
     self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
     self.sources = Sources(self.add_url, self.add_message, exclude_sources)
Beispiel #2
0
 def __init__(self,
              processors,
              sessions,
              std=None,
              max_workers=None,
              progress_enabled=True,
              timeout=10):
     """Store the collaborators and options used by this instance.

     ``max_workers`` is forwarded to the parent initializer. All other
     arguments are kept unchanged on attributes of the same name. A new
     ``Lock`` and a spinner from ``random_spinner()`` are created as well.
     """
     super(UrlsInfo, self).__init__(max_workers)

     # Objects and options supplied by the caller.
     self.processors = processors
     self.sessions = sessions
     self.std = std
     self.progress_enabled = progress_enabled
     self.timeout = timeout

     # Helpers created here.
     self.lock = Lock()
     self.spinner = random_spinner()
Beispiel #3
0
 def __init__(self,
              max_workers=None,
              interesting_extensions=None,
              interesting_files=None,
              echo=None,
              progress_enabled=True):
     """Prepare an instance with its own thread pool and empty state.

     ``max_workers`` is stored and passed to ``ThreadPoolExecutor``.
     Falsy ``interesting_extensions`` / ``interesting_files`` become
     empty lists. A falsy ``echo`` is replaced by a function that
     returns its argument unchanged. ``progress_enabled`` is stored
     as-is.
     """
     # Options derived from the arguments.
     self.max_workers = max_workers
     self.progress_enabled = progress_enabled
     self.interesting_extensions = interesting_extensions if interesting_extensions else []
     self.interesting_files = interesting_files if interesting_files else []
     self.echo = echo if echo else (lambda x: x)

     # Empty bookkeeping containers and flags.
     self.domains = set()
     self.processing, self.processed = {}, {}
     self.closing = False

     # Helper objects; created in the same relative order as before.
     self.results = Queue()
     self.sessions = Sessions()
     self.add_lock = Lock()
     self.executor = ThreadPoolExecutor(max_workers=self.max_workers)
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
Beispiel #4
0
 def __init__(self,
              max_workers=None,
              interesting_extensions=None,
              interesting_files=None,
              std=None,
              progress_enabled=True,
              timeout=10,
              depth=3,
              not_follow_subdomains=False,
              exclude_sources=(),
              not_allow_redirects=False):
     """Set up the crawler's configuration, bookkeeping state and helpers.

     ``max_workers`` is forwarded unchanged to the parent initializer.
     ``exclude_sources`` is only handed to ``Sources`` (together with
     the ``add_url`` bound method); it is not stored on the instance.
     Falsy ``interesting_extensions`` / ``interesting_files`` become
     empty lists and a falsy ``std`` becomes ``None``. The remaining
     arguments are stored unchanged under the same names.
     """
     super(Crawler, self).__init__(max_workers)

     # Options copied from the arguments.
     self.timeout = timeout
     self.depth = depth
     self.progress_enabled = progress_enabled
     self.not_follow_subdomains = not_follow_subdomains
     self.not_allow_redirects = not_allow_redirects
     self.std = std if std else None
     self.interesting_extensions = interesting_extensions if interesting_extensions else []
     self.interesting_files = interesting_files if interesting_files else []

     # Empty bookkeeping containers and flags.
     self.domains = set()
     self.index_of_processors = []
     self.processing, self.processed = {}, {}
     self.closing = False

     # Helper objects; created in the same relative order as before.
     self.results = Queue()
     self.sessions = Sessions()
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
     self.sources = Sources(self.add_url, exclude_sources)
Beispiel #5
0
 def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None,
              std=None, progress_enabled=True, timeout=10):
     """Set up the crawler's configuration, bookkeeping state and helpers.

     ``max_workers`` is forwarded unchanged to the parent initializer.
     Falsy ``interesting_extensions`` / ``interesting_files`` become
     empty lists and a falsy ``std`` becomes ``None``.
     ``progress_enabled`` and ``timeout`` are stored as-is.
     """
     super(Crawler, self).__init__(max_workers)

     # Options derived from the arguments.
     self.timeout = timeout
     self.progress_enabled = progress_enabled
     self.std = std if std else None
     self.interesting_extensions = interesting_extensions if interesting_extensions else []
     self.interesting_files = interesting_files if interesting_files else []

     # Empty bookkeeping containers and flags.
     self.domains = set()
     self.index_of_processors = []
     self.processing, self.processed = {}, {}
     self.closing = False

     # Helper objects; created in the same relative order as before.
     self.results = Queue()
     self.sessions = Sessions()
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()