Example #1
 def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
              progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
              not_allow_redirects=False, proxies=None, delay=0, limit=1000, to_file=None, user_agent=None,
              cookies=None, headers=None):
     if not max_workers and not delay:
         max_workers = (multiprocessing.cpu_count() or 1) * 5
     elif not max_workers and delay:
         max_workers = len(proxies or [None])
     super(Crawler, self).__init__(max_workers)
     self.domains = set()
     self.results = Queue()
     self.index_of_processors = []
     self.proxies = proxies
     self.delay = delay
     self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
     self.processing = {}
     self.processed = {}
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
     self.interesting_extensions = interesting_extensions or []
     self.interesting_files = interesting_files or []
     self.closing = False
     self.std = std or None
     self.progress_enabled = progress_enabled
     self.timeout = timeout
     self.not_follow_subdomains = not_follow_subdomains
     self.depth = depth
     self.exclude_sources = exclude_sources
     self.sources = Sources(self.add_url, self.add_message, exclude_sources)
     self.not_allow_redirects = not_allow_redirects
     self.limit = limit
     self.current_processed_count = 0
     self.to_file = to_file
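
The only non-trivial logic in the constructor above is the worker-count default at the top. The helper below is a standalone sketch, not part of dirhunt, that reproduces that branch so it can be exercised in isolation:

import multiprocessing

# Standalone sketch (hypothetical helper) mirroring the default worker-count logic above.
def default_max_workers(max_workers=None, delay=0, proxies=None):
    if not max_workers and not delay:
        # No throttling: five workers per CPU core.
        return (multiprocessing.cpu_count() or 1) * 5
    if not max_workers and delay:
        # Throttling enabled: one worker per proxy (or a single direct connection).
        return len(proxies or [None])
    return max_workers

print(default_max_workers())  # e.g. 20 on a 4-core machine
print(default_max_workers(delay=1, proxies=['http://p1:8080', 'http://p2:8080']))  # 2
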
Example #2
 def __init__(self, proxies=None, delay=0):
     self.availables = Queue()
     self.proxies_lists = RandomProxies()
     self.delay = delay
     self.sessions = self.create_sessions(proxies or [None])
     for session in self.sessions:
         self.availables.put(session)
Example #3
 def __init__(self,
              proxies=None,
              delay=0,
              user_agent=None,
              cookies=None,
              headers=None):
     self.availables = Queue()
     self.proxies_lists = RandomProxies()
     self.delay = delay
     self.user_agent = user_agent
     self.cookies = cookies or {}
     self.headers = headers or {}
     self.sessions = self.create_sessions(proxies or [None])
     for session in self.sessions:
         self.availables.put(session)
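
This longer Sessions constructor also carries per-request metadata (user agent, cookies, headers). A minimal construction sketch, with an assumed dirhunt.sessions import path and made-up parameter values:

from dirhunt.sessions import Sessions  # assumed import path

# Illustrative values only: one proxy, half-second reuse delay, custom request metadata.
sessions = Sessions(
    proxies=['http://127.0.0.1:8080'],
    delay=0.5,
    user_agent='Mozilla/5.0 (compatible; dirhunt)',
    cookies={'session': 'abc123'},
    headers={'Accept-Language': 'en'},
)
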
Example #4
class Sessions(object):
    def __init__(self, proxies=None, delay=0):
        self.availables = Queue()
        self.proxies_lists = RandomProxies()
        self.delay = delay
        self.sessions = self.create_sessions(proxies or [None])
        for session in self.sessions:
            self.availables.put(session)

    def add_available(self, session):
        if self.delay:
            threading.Timer(self.delay, self.availables.put, [session]).start()
        else:
            self.availables.put(session)

    def create_sessions(self, proxies):
        return [Session(self, proxy) for proxy in proxies]

    def get_random_session(self):
        return random.choice(self.sessions)

    def get_session(self):
        if not self.delay and self.availables.empty():
            return self.get_random_session()
        return self.availables.get()
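
A minimal usage sketch for the Sessions pool above (the dirhunt.sessions import path is an assumption): sessions are created up front, borrowed with get_session(), and handed back with add_available(), which re-queues each one after the configured delay.

from dirhunt.sessions import Sessions  # assumed import path

# Rotate two proxies, waiting one second before a session may be reused.
sessions = Sessions(proxies=['http://127.0.0.1:8080', 'http://127.0.0.1:8081'], delay=1)

session = sessions.get_session()   # blocks until a session is available when delay > 0
# ... perform a request with `session` here ...
sessions.add_available(session)    # back in the pool after `delay` seconds
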
Example #5
 def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
              progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
              not_allow_redirects=False):
     super(Crawler, self).__init__(max_workers)
     self.domains = set()
     self.results = Queue()
     self.index_of_processors = []
     self.sessions = Sessions()
     self.processing = {}
     self.processed = {}
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
     self.interesting_extensions = interesting_extensions or []
     self.interesting_files = interesting_files or []
     self.closing = False
     self.std = std or None
     self.progress_enabled = progress_enabled
     self.timeout = timeout
     self.not_follow_subdomains = not_follow_subdomains
     self.depth = depth
     self.sources = Sources(self.add_url, exclude_sources)
     self.not_allow_redirects = not_allow_redirects
Example #6
 def __init__(self,
              max_workers=None,
              interesting_extensions=None,
              interesting_files=None,
              std=None,
              progress_enabled=True,
              timeout=10):
     super(Crawler, self).__init__(max_workers)
     self.domains = set()
     self.results = Queue()
     self.index_of_processors = []
     self.sessions = Sessions()
     self.processing = {}
     self.processed = {}
     self.add_lock = Lock()
     self.spinner = random_spinner()
     self.start_dt = datetime.datetime.now()
     self.interesting_extensions = interesting_extensions or []
     self.interesting_files = interesting_files or []
     self.closing = False
     self.std = std or None
     self.progress_enabled = progress_enabled
     self.timeout = timeout
Example #7
class Crawler(ThreadPoolExecutor):
    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
                 progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
                 not_allow_redirects=False):
        super(Crawler, self).__init__(max_workers)
        self.domains = set()
        self.results = Queue()
        self.index_of_processors = []
        self.sessions = Sessions()
        self.processing = {}
        self.processed = {}
        self.add_lock = Lock()
        self.spinner = random_spinner()
        self.start_dt = datetime.datetime.now()
        self.interesting_extensions = interesting_extensions or []
        self.interesting_files = interesting_files or []
        self.closing = False
        self.std = std or None
        self.progress_enabled = progress_enabled
        self.timeout = timeout
        self.not_follow_subdomains = not_follow_subdomains
        self.depth = depth
        self.sources = Sources(self.add_url, exclude_sources)
        self.not_allow_redirects = not_allow_redirects

    def add_init_urls(self, *urls):
        """Add urls to queue.
        """
        for crawler_url in urls:
            if not isinstance(crawler_url, CrawlerUrl):
                crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
            self.add_domain(crawler_url.url.only_domain)
            self.add_url(crawler_url)

    def in_domains(self, domain):
        if self.not_follow_subdomains and domain not in self.domains:
            return False
        initial_domain = domain
        while True:
            if domain in self.domains:
                if initial_domain != domain:
                    # subdomain
                    self.add_domain(initial_domain)
                return True
            parts = domain.split('.')
            if len(parts) <= 2:
                return False
            domain = '.'.join(parts[1:])

    def add_domain(self, domain):
        if domain in self.domains:
            return
        self.domains.add(domain)
        self.sources.add_domain(domain)

    def add_url(self, crawler_url, force=False):
        """Add url to queue"""
        if not isinstance(crawler_url, CrawlerUrl):
            crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
        self.add_lock.acquire()
        url = crawler_url.url
        if not url.is_valid() or not url.only_domain or not self.in_domains(url.only_domain):
            self.add_lock.release()
            return
        if url.url in self.processing or url.url in self.processed:
            self.add_lock.release()
            return self.processing.get(url.url) or self.processed.get(url.url)

        fn = reraise_with_stack(crawler_url.start)
        if self.closing:
            self.add_lock.release()
            return
        if force:
            future = ThreadPoolExecutor(max_workers=1).submit(fn)
        else:
            future = self.submit(fn)
        self.processing[url.url] = future
        self.add_lock.release()
        return future

    def echo(self, body):
        if self.std is None:
            return
        # TODO: strip ANSI escape codes when std is not a tty
        self.std.write(str(body))
        self.std.write('\n')

    def erase(self):
        if self.std is None or not self.std.isatty():
            return
        CURSOR_UP_ONE = '\x1b[1A'
        ERASE_LINE = '\x1b[2K'
        # This can be improved. In the future we may want to select stdout/stderr with a CLI option
        # fn = sys.stderr.write if sys.stderr.isatty() else sys.stdout.write
        self.std.write(CURSOR_UP_ONE + ERASE_LINE)

    def print_progress(self, finished=False):
        if not self.progress_enabled:
            # Progress output is disabled
            return
        self.echo('{} {} {}'.format(
            next(self.spinner),
            'Finished after' if finished else 'Started',
            (humanize.naturaldelta if finished else humanize.naturaltime)(datetime.datetime.now() - self.start_dt),
        ))

    def print_results(self, exclude=None, include=None):
        exclude = exclude or set()
        self.echo('Starting...')
        while True:
            result = None
            try:
                result = self.results.get(timeout=.5)
            except queue.Empty:
                pass
            self.erase()
            if result and result.maybe_directory() and not (result.crawler_url.flags & exclude) \
                    and (not include or (include & result.crawler_url.flags)):
                self.echo(result)
            self.print_progress()
            if self.sources.finished() and not self.processing:
                # Crawl finished
                self.erase()
                self.print_progress(True)
                return

    def print_urls_info(self):
        if not self.index_of_processors:
            self.echo(r'No interesting files detected ¯\_(ツ)_/¯')
            return
        self.echo('━' * get_terminal_size()[0])
        UrlsInfo(self.index_of_processors, self.sessions, self.std, self._max_workers, self.progress_enabled,
                 self.timeout).start()

    def restart(self):
        try:
            self.add_lock.release()
        except (ThreadError, RuntimeError):
            pass

    def close(self):
        self.closing = True
        self.shutdown(False)
        atexit.unregister(_python_exit)
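
A minimal usage sketch for the Crawler above (the import path and target URL are illustrative assumptions): construct it, seed it with one or more start URLs, stream results to stdout, print the summary, and shut it down.

import sys

from dirhunt.crawler import Crawler  # assumed import path

# Flag PHP sources and exposed .git metadata as interesting; crawl three levels deep.
crawler = Crawler(interesting_extensions=['php'], interesting_files=['.git/HEAD'],
                  std=sys.stdout, timeout=10, depth=3)
crawler.add_init_urls('https://example.com/')
crawler.print_results()     # blocks until all sources and queued URLs are processed
crawler.print_urls_info()   # summary of the interesting findings, if any
crawler.close()
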
Example #8
class Crawler(ThreadPoolExecutor):
    urls_info = None

    def __init__(self, max_workers=None, interesting_extensions=None, interesting_files=None, std=None,
                 progress_enabled=True, timeout=10, depth=3, not_follow_subdomains=False, exclude_sources=(),
                 not_allow_redirects=False, proxies=None, delay=0, limit=1000, to_file=None, user_agent=None,
                 cookies=None, headers=None):
        if not max_workers and not delay:
            max_workers = (multiprocessing.cpu_count() or 1) * 5
        elif not max_workers and delay:
            max_workers = len(proxies or [None])
        super(Crawler, self).__init__(max_workers)
        self.domains = set()
        self.results = Queue()
        self.index_of_processors = []
        self.proxies = proxies
        self.delay = delay
        self.sessions = Sessions(proxies, delay, user_agent, cookies, headers)
        self.processing = {}
        self.processed = {}
        self.add_lock = Lock()
        self.spinner = random_spinner()
        self.start_dt = datetime.datetime.now()
        self.interesting_extensions = interesting_extensions or []
        self.interesting_files = interesting_files or []
        self.closing = False
        self.std = std or None
        self.progress_enabled = progress_enabled
        self.timeout = timeout
        self.not_follow_subdomains = not_follow_subdomains
        self.depth = depth
        self.exclude_sources = exclude_sources
        self.sources = Sources(self.add_url, self.add_message, exclude_sources)
        self.not_allow_redirects = not_allow_redirects
        self.limit = limit
        self.current_processed_count = 0
        self.to_file = to_file

    def add_init_urls(self, *urls):
        """Add urls to queue.
        """
        for crawler_url in urls:
            if not isinstance(crawler_url, CrawlerUrl):
                crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
            self.add_domain(crawler_url.url.only_domain)
            self.add_url(crawler_url)

    def in_domains(self, domain):
        if self.not_follow_subdomains and domain not in self.domains:
            return False
        initial_domain = domain
        while True:
            if domain in self.domains:
                if initial_domain != domain:
                    # subdomain
                    self.add_domain(initial_domain)
                return True
            parts = domain.split('.')
            if len(parts) <= 2:
                return False
            domain = '.'.join(parts[1:])

    def add_domain(self, domain):
        if domain in self.domains:
            return
        self.domains.add(domain)
        self.sources.add_domain(domain)

    def add_url(self, crawler_url, force=False):
        """Add url to queue"""
        if not isinstance(crawler_url, CrawlerUrl):
            crawler_url = CrawlerUrl(self, crawler_url, depth=self.depth, timeout=self.timeout)
        self.add_lock.acquire()
        url = crawler_url.url
        if not url.is_valid() or not url.only_domain or not self.in_domains(url.only_domain):
            self.add_lock.release()
            return
        if url.url in self.processing or url.url in self.processed:
            self.add_lock.release()
            return self.processing.get(url.url) or self.processed.get(url.url)

        fn = reraise_with_stack(crawler_url.start)
        if self.closing:
            self.add_lock.release()
            return
        if force:
            future = ThreadPoolExecutor(max_workers=1).submit(fn)
        else:
            future = self.submit(fn)
        self.processing[url.url] = future
        self.add_lock.release()
        return future

    def add_message(self, body):
        from dirhunt.processors import Message
        self.results.put(Message(body))

    def echo(self, body):
        if self.std is None:
            return
        # TODO: strip ANSI escape codes when std is not a tty
        self.std.write(str(body))
        self.std.write('\n')

    def erase(self):
        if self.std is None or not self.std.isatty():
            return
        CURSOR_UP_ONE = '\x1b[1A'
        ERASE_LINE = '\x1b[2K'
        # This can be improved. In the future we may want to select stdout/stderr with a CLI option
        # fn = sys.stderr.write if sys.stderr.isatty() else sys.stdout.write
        self.std.write(CURSOR_UP_ONE + ERASE_LINE)

    def print_progress(self, finished=False):
        if not self.progress_enabled:
            # Progress output is disabled
            return
        self.echo('{} {} {}'.format(
            next(self.spinner),
            'Finished after' if finished else 'Started',
            (humanize.naturaldelta if finished else humanize.naturaltime)(datetime.datetime.now() - self.start_dt),
        ))

    def print_results(self, exclude=None, include=None):
        exclude = exclude or set()
        self.echo('Starting...')
        while True:
            result = None
            try:
                result = self.results.get(timeout=.5)
            except queue.Empty:
                pass
            self.erase()
            if result and result.maybe_directory() and not (result.crawler_url.flags & exclude) \
                    and (not include or (include & result.crawler_url.flags)):
                self.echo(result)
            self.print_progress()
            if (self.sources.finished() and not self.processing) or \
                    (self.limit and self.current_processed_count >= self.limit):
                # Crawl finished or URL limit reached
                if self.limit and self.current_processed_count >= self.limit:
                    # Force shutdown
                    self.closing = True
                    self.shutdown()
                self.erase()
                self.print_progress(True)
                return

    def print_urls_info(self):
        if not self.index_of_processors:
            self.echo(r'No interesting files detected ¯\_(ツ)_/¯')
            return
        self.echo('━' * get_terminal_size()[0])
        self.urls_info = UrlsInfo(self.index_of_processors, self.sessions, self.std, self._max_workers,
                                  self.progress_enabled, self.timeout, bool(self.to_file))
        self.urls_info.start()

    def restart(self):
        try:
            self.add_lock.release()
        except (ThreadError, RuntimeError):
            pass

    def options(self):
        return {
            'interesting_extensions': self.interesting_extensions,
            'interesting_files': self.interesting_files,
            'timeout': self.timeout,
            'depth': self.depth,
            'not_follow_subdomains': self.not_follow_subdomains,
            'exclude_sources': self.exclude_sources,
            'not_allow_redirects': self.not_allow_redirects,
            'proxies': self.proxies,
            'delay': self.delay,
            'limit': self.limit,
        }

    @property
    def options_file(self):
        checksum = sha256(json.dumps(self.options(), sort_keys=True).encode('utf-8')).hexdigest()
        return os.path.join(resume_dir, checksum)

    def get_resume_file(self):
        return self.to_file or self.options_file

    def close(self, create_resume=False):
        self.closing = True
        self.shutdown(False)
        if create_resume:
            self.create_report(self.get_resume_file())
        unregister(_python_exit)

    def create_report(self, to_file):
        """Write to a file a report with current json() state. This file can be read
        to continue an analysis."""
        to_file = os.path.abspath(to_file)
        directory = os.path.dirname(to_file)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data = self.json()
        with open(to_file, 'w') as f:
            json.dump(data, f, cls=JsonReportEncoder, indent=4, sort_keys=True)

    def resume(self, path):
        with open(path) as f:
            resume_data = json.load(f)
        file_version = resume_data.get('version')
        if file_version != __version__:
            raise IncompatibleVersionError(
                'Analysis file incompatible with the current version of dirhunt. '
                'Dirhunt version: {}. File version: {}'.format(__version__, file_version)
            )
        for data in resume_data['processed']:
            crawler_url_data = data['crawler_url']
            url = crawler_url_data['url']['address']
            crawler_url = CrawlerUrl(self, url, crawler_url_data['depth'], None, crawler_url_data['exists'],
                                     crawler_url_data['type'])
            crawler_url.flags = set(crawler_url_data['flags'])
            crawler_url.processor_data = data
            self.processed[url] = crawler_url
            self.echo(data['line'])
        for url in resume_data['processing']:
            self.add_url(url)

    def json(self):
        urls_infos = self.urls_info.urls_info if self.urls_info else []
        urls_infos = [urls_info.json() for urls_info in urls_infos]
        return {
            'version': __version__,
            'current_processed_count': self.current_processed_count,
            'domains': self.domains,
            'index_of_processors': self.index_of_processors,
            'processing': list(self.processing.keys()),
            'processed': list(filter(bool, [processed.processor_data for processed in self.processed.values()])),
            'urls_infos': urls_infos,
        }
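
The report/resume plumbing in this last variant can be driven with the methods shown above; the sketch below assumes the same dirhunt.crawler import path and uses a made-up target URL and file name.

import sys

from dirhunt.crawler import Crawler  # assumed import path

crawler = Crawler(std=sys.stdout, limit=1000, to_file='scan.json')
crawler.add_init_urls('https://example.com/')
crawler.print_results()
crawler.close(create_resume=True)   # create_report() writes the json() state to get_resume_file()

# Later: rebuild a crawler and restore the processed/pending URLs from the saved report.
restored = Crawler(std=sys.stdout, limit=1000, to_file='scan.json')
restored.add_init_urls('https://example.com/')
restored.resume(restored.get_resume_file())
restored.print_results()
restored.close()
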