class html_comments(GrepPlugin):
    """
    Extract and analyze HTML comments.

    :author: Andres Riancho ([email protected])
    """

    # Matches an opening tag, any content on the same line and a closing tag.
    # The opening tag name needs at least one letter and the closing tag name
    # may have any length; the previous pattern only accepted one-letter
    # closing tags such as </a> or </p>, so <div>...</div> went undetected.
    HTML_RE = re.compile(r'<[a-zA-Z]+.*?>.*?</[a-zA-Z][a-zA-Z0-9]*>')

    # Lower-case words whose presence in an HTML comment is worth reporting.
    # The comment is lower-cased before the lookup, see _interesting_word().
    INTERESTING_WORDS = (
        # In English
        'user', 'pass', 'xxx', 'fix', 'bug', 'broken', 'oops', 'hack',
        'caution', 'todo', 'note', 'warning', '!!!', '???', 'shit',
        'password', 'passwd', 'pwd', 'secret', 'stupid',

        # In Spanish (plus a few generic terms)
        'tonto', 'porqueria', 'cuidado', 'usuario', u'contraseña',
        'puta', 'email', 'security', 'captcha', 'pinga', 'cojones',

        # some in Portuguese
        # ("transferência" was previously misspelled and could never match)
        'banco', 'bradesco', 'itau', 'visa', 'bancoreal', u'transferência',
        u'depósito', u'cartão', u'crédito', 'dados pessoais'
    )

    # Every word is padded with one space on each side before it is handed to
    # multi_in, so the search is for " word " and not for the bare word.
    _multi_in = multi_in([' %s ' % w for w in INTERESTING_WORDS])

    def __init__(self):
        """
        Initialize the base plugin and the structures used to track comments.
        """
        GrepPlugin.__init__(self)

        # Keys of the findings which were already sent to the knowledge base
        self._already_reported = ScalableBloomFilter()

        # comment -> [(url, response id), ...], read back in end()
        self._comments = DiskDict(table_prefix='html_comments')

    def grep(self, request, response):
        """
        Plugin entry point: analyze every comment found in the response.

        Nothing is done for responses which are not text or HTML, nor when
        no document parser can be obtained for the response.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        if not response.is_text_or_html():
            return

        try:
            doc_parser = parser_cache.dpc.get_document_parser_for(response)
        except BaseFrameworkException:
            return

        for comment in doc_parser.get_comments():
            # Skipping the comments for which request.sent() is true fixes:
            # audit.ssi + grep.html_comments + web app with XSS = false positive
            if request.sent(comment):
                continue

            # Each (comment, URL) pair is analyzed only once
            if not self._is_new(comment, response):
                continue

            self._interesting_word(comment, request, response)
            self._html_in_comment(comment, request, response)

    def _interesting_word(self, comment, request, response):
        """
        Report the interesting words which appear in an HTML comment.

        The comment is lower-cased before it is queried, and each
        (word, URL) pair is reported at most once.

        :param comment: The comment text
        :param request: The HTTP request object
        :param response: The HTTP response object
        :return: None
        """
        lowered = comment.lower()

        for word in self._multi_in.query(lowered):
            url = response.get_url()
            report_key = (word, url)

            if report_key in self._already_reported:
                continue

            desc = ('A comment with the string "%s" was found in: "%s".'
                    ' This could be interesting.') % (word, url)

            info = Info.from_fr('Interesting HTML comment', desc, response.id,
                                self.get_name(), request)
            info.add_to_highlight(word)

            kb.kb.append(self, 'interesting_comments', info)
            om.out.information(info.get_desc())

            self._already_reported.add(report_key)

    def _html_in_comment(self, comment, request, response):
        """
        Find HTML code in HTML comments

        When HTML_RE matches the comment an Info object is appended to the
        knowledge base and its description is sent to the output manager.

        :param comment: The comment text
        :param request: The HTTP request object
        :param response: The HTTP response object
        :return: None
        """
        html_in_comment = self.HTML_RE.search(comment)

        if html_in_comment is None:
            return

        # Build the de-duplication key from the untouched comment and use the
        # very same key for the lookup and for the insert. Previously the
        # lookup used the raw comment while the insert used the shortened
        # text, so the two keys did not agree.
        report_key = (comment, response.get_url())

        if report_key in self._already_reported:
            return

        # There is HTML code in the comment; shorten it for the description
        summary = comment.strip()
        summary = summary.replace('\n', '')
        summary = summary.replace('\r', '')
        summary = summary[:40]

        desc = ('A comment with the string "%s" was found in: "%s".'
                ' This could be interesting.')
        desc %= (summary, response.get_url())

        i = Info.from_fr('HTML comment contains HTML code', desc, response.id,
                         self.get_name(), request)
        i.set_uri(response.get_uri())
        i.add_to_highlight(html_in_comment.group(0))

        kb.kb.append(self, 'html_comment_hides_html', i)
        om.out.information(i.get_desc())
        self._already_reported.add(report_key)

    def _is_new(self, comment, response):
        """
        Record where a comment was seen and tell if the pair is new.

        The check and the update of self._comments are performed while
        holding self._plugin_lock in order to avoid duplicates.

        :param comment: The comment text
        :param response: The HTTP response object
        :return: True if the comment had not been stored for the response
                 URL yet, False otherwise.
        """
        with self._plugin_lock:

            #pylint: disable=E1103
            url = response.get_url()
            seen_at = self._comments.get(comment, None)

            # First time this comment shows up anywhere
            if seen_at is None:
                self._comments[comment] = [(url, response.id)]
                return True

            # Known comment, was it already found in this URL?
            if any(saved_url == url for saved_url, _ in seen_at):
                return False

            # Known comment, new URL: store the updated list back
            seen_at.append((url, response.id))
            self._comments[comment] = seen_at
            return True
            #pylint: enable=E1103

    def end(self):
        """
        Called when the plugin wont be used anymore: print every stored
        comment with the URLs where it was found and cleanup the storage.

        :return: None
        """
        for comment, url_request_id_lst in self._comments.iteritems():

            # Collapse every whitespace run into a single space
            stick_comment = ' '.join(comment.split())
            extra_bytes = len(stick_comment) - 40

            if extra_bytes > 0:
                msg = ('A comment with the string "%s..." (and %s more bytes)'
                       ' was found on these URL(s):')
                args = (stick_comment[:40], str(extra_bytes))
                om.out.information(msg % args)
            else:
                msg = 'A comment containing "%s" was found on these URL(s):'
                om.out.information(msg % stick_comment)

            # One line per URL, printed in sorted order
            locations = sorted('- %s (request with id: %s)' % (url, request_id)
                               for url, request_id in url_request_id_lst)

            for location in locations:
                om.out.information(location)

        self._comments.cleanup()

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        return """
Exemple #2
0
class html_comments(GrepPlugin):
    """
    Extract and analyze HTML comments.

    :author: Andres Riancho ([email protected])
    """

    # Matches an opening tag, any content on the same line and a closing tag.
    # The opening tag name needs at least one letter and the closing tag name
    # may have any length; the previous pattern only accepted one-letter
    # closing tags such as </a> or </p>, so <div>...</div> went undetected.
    HTML_RE = re.compile(r'<[a-zA-Z]+.*?>.*?</[a-zA-Z][a-zA-Z0-9]*>')

    # Lower-case words whose presence in an HTML comment is worth reporting.
    # The comment is lower-cased before the lookup, see _interesting_word().
    INTERESTING_WORDS = (
        # In English
        'user',
        'pass',
        'xxx',
        'fix',
        'bug',
        'broken',
        'oops',
        'hack',
        'caution',
        'todo',
        'note',
        'warning',
        '!!!',
        '???',
        'shit',
        'password',
        'passwd',
        'pwd',
        'secret',
        'stupid',

        # In Spanish (plus a few generic terms)
        'tonto',
        'porqueria',
        'cuidado',
        'usuario',
        u'contraseña',
        'puta',
        'email',
        'security',
        'captcha',
        'pinga',
        'cojones',

        # some in Portuguese
        'banco',
        'bradesco',
        'itau',
        'visa',
        'bancoreal',
        # "transferência" was previously misspelled and could never match
        u'transferência',
        u'depósito',
        u'cartão',
        u'crédito',
        'dados pessoais')

    # Every word is padded with one space on each side before it is handed to
    # multi_in, so the search is for " word " and not for the bare word.
    _multi_in = multi_in([' %s ' % w for w in INTERESTING_WORDS])

    def __init__(self):
        """
        Initialize the base plugin and the structures used to track comments.
        """
        GrepPlugin.__init__(self)

        # Keys of the findings which were already sent to the knowledge base
        self._already_reported = ScalableBloomFilter()

        # comment -> [(url, response id), ...], read back in end()
        self._comments = DiskDict(table_prefix='html_comments')

    def grep(self, request, response):
        """
        Plugin entry point: analyze every comment found in the response.

        Nothing is done for responses which are not text or HTML, nor when
        no document parser can be obtained for the response.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        if not response.is_text_or_html():
            return

        try:
            doc_parser = parser_cache.dpc.get_document_parser_for(response)
        except BaseFrameworkException:
            return

        for comment in doc_parser.get_comments():
            # Skipping the comments for which request.sent() is true fixes:
            # audit.ssi + grep.html_comments + web app with XSS = false positive
            if request.sent(comment):
                continue

            # Each (comment, URL) pair is analyzed only once
            if not self._is_new(comment, response):
                continue

            self._interesting_word(comment, request, response)
            self._html_in_comment(comment, request, response)

    def _interesting_word(self, comment, request, response):
        """
        Store a finding for the interesting words found in an HTML comment.

        The comment is lower-cased before it is queried, and each
        (word, URL) pair is stored in the knowledge base at most once.

        :param comment: The comment text
        :param request: The HTTP request object
        :param response: The HTTP response object
        :return: None
        """
        lowered = comment.lower()

        for word in self._multi_in.query(lowered):
            url = response.get_url()
            report_key = (word, url)

            if report_key in self._already_reported:
                continue

            desc = ('A comment with the string "%s" was found in: "%s".'
                    ' This could be interesting.') % (word, url)

            vuln = Vuln.from_fr('Interesting HTML comment',
                                desc, severity.INFORMATION, response.id,
                                self.get_name(), request)
            vuln.add_to_highlight(word)

            kb.kb.append(self, 'interesting_comments', vuln)

            self._already_reported.add(report_key)

    def _html_in_comment(self, comment, request, response):
        """
        Find HTML code in HTML comments

        When HTML_RE matches the comment a Vuln object with INFORMATION
        severity is sent to the output manager and appended to the knowledge
        base.

        :param comment: The comment text
        :param request: The HTTP request object
        :param response: The HTTP response object
        :return: None
        """
        html_in_comment = self.HTML_RE.search(comment)

        if html_in_comment is None:
            return

        # Build the de-duplication key from the untouched comment and use the
        # very same key for the lookup and for the insert. Previously the
        # lookup used the raw comment while the insert used the shortened
        # text, so the two keys did not agree.
        report_key = (comment, response.get_url())

        if report_key in self._already_reported:
            return

        # There is HTML code in the comment; shorten it for the description
        summary = comment.strip()
        summary = summary.replace('\n', '')
        summary = summary.replace('\r', '')
        summary = summary[:40]

        desc = ('A comment with the string "%s" was found in: "%s".'
                ' This could be interesting.')
        desc %= (summary, response.get_url())

        v = Vuln.from_fr('HTML comment contains HTML code',
                         desc, severity.INFORMATION, response.id,
                         self.get_name(), request)
        v.set_uri(response.get_uri())
        v.add_to_highlight(html_in_comment.group(0))

        om.out.vulnerability(v.get_desc(), severity=severity.INFORMATION)
        kb.kb.append(self, 'html_comment_hides_html', v)
        self._already_reported.add(report_key)

    def _is_new(self, comment, response):
        """
        Record where a comment was seen and tell if the pair is new.

        The check and the update of self._comments are performed while
        holding self._plugin_lock in order to avoid duplicates.

        :param comment: The comment text
        :param response: The HTTP response object
        :return: True if the comment had not been stored for the response
                 URL yet, False otherwise.
        """
        with self._plugin_lock:

            #pylint: disable=E1103
            url = response.get_url()
            seen_at = self._comments.get(comment, None)

            # First time this comment shows up anywhere
            if seen_at is None:
                self._comments[comment] = [(url, response.id)]
                return True

            # Known comment, was it already found in this URL?
            if any(saved_url == url for saved_url, _ in seen_at):
                return False

            # Known comment, new URL: store the updated list back
            seen_at.append((url, response.id))
            self._comments[comment] = seen_at
            return True
            #pylint: enable=E1103

    def end(self):
        """
        Called when the plugin wont be used anymore: print every stored
        comment with the URLs where it was found and cleanup the storage.

        :return: None
        """
        for comment, url_request_id_lst in self._comments.iteritems():

            # Collapse every whitespace run into a single space
            stick_comment = ' '.join(comment.split())
            extra_bytes = len(stick_comment) - 40

            if extra_bytes > 0:
                msg = ('A comment with the string "%s..." (and %s more bytes)'
                       ' was found on these URL(s):')
                args = (stick_comment[:40], str(extra_bytes))
                om.out.vulnerability(msg % args, severity=severity.INFORMATION)
            else:
                msg = 'A comment containing "%s" was found on these URL(s):'
                om.out.vulnerability(msg % stick_comment,
                                     severity=severity.INFORMATION)

            # One line per URL, printed in sorted order
            locations = sorted('- %s (request with id: %s)' % (url, request_id)
                               for url, request_id in url_request_id_lst)

            for location in locations:
                om.out.vulnerability(location, severity=severity.INFORMATION)

        self._comments.cleanup()

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        return """
Exemple #3
0
class CachedDiskDict(object):
    """
    This data structure keeps the `max_in_memory` most frequently accessed
    keys in memory and stores the rest on disk.

    It is ideal for situations where a DiskDict is frequently accessed,
    fast read / writes are required, and items can take considerable amounts
    of memory.
    """
    def __init__(self, max_in_memory=50, table_prefix=None):
        """
        :param max_in_memory: The max number of items to keep in memory,
                              must be greater than zero
        :param table_prefix: Optional text which becomes part of the prefix
                             passed to the underlying DiskDict
        """
        assert max_in_memory > 0, 'In-memory items must be > 0'

        self._max_in_memory = max_in_memory

        # Values kept in memory and how many times each key was accessed
        self._in_memory = dict()
        self._access_count = Counter()

        # Everything that is not kept in memory lives here
        disk_prefix = self._get_table_prefix(table_prefix)
        self._disk_dict = DiskDict(table_prefix=disk_prefix)

    def cleanup(self):
        self._disk_dict.cleanup()

    def _get_table_prefix(self, table_prefix):
        """
        Build the prefix used for the underlying DiskDict.

        :param table_prefix: Caller supplied prefix, or None
        :return: 'cached_disk_dict_' followed by the caller prefix (when one
                 was given) and the result of rand_alpha(16).
        """
        suffix = rand_alpha(16)

        if table_prefix is None:
            return 'cached_disk_dict_%s' % suffix

        return 'cached_disk_dict_%s_%s' % (table_prefix, suffix)

    def get(self, key, default=-456):
        try:
            return self[key]
        except KeyError:
            if default is not -456:
                return default

        raise KeyError()

    def __getitem__(self, key):
        try:
            value = self._in_memory[key]
        except KeyError:
            # This will raise KeyError if k is not found, and that is OK
            # because we don't need to increase the access count when the
            # key doesn't exist
            value = self._disk_dict[key]

        self._increase_access_count(key)
        return value

    def _get_keys_for_memory(self):
        """
        :return: Generate the names of the keys that should be kept in memory.
                 For example, if `max_in_memory` is set to 2 and:

                    _in_memory: {1: None, 2: None}
                    _access_count: {1: 10, 2: 20, 3: 5}
                    _disk_dict: {3: None}

                Then the method will generate [1, 2].
        """
        return [k for k, v in self._access_count.most_common(self._max_in_memory)]

    def _increase_access_count(self, key):
        self._access_count.update([key])

        keys_for_memory = self._get_keys_for_memory()

        self._move_key_to_disk_if_needed(keys_for_memory)
        self._move_key_to_memory_if_needed(key, keys_for_memory)

    def _move_key_to_disk_if_needed(self, keys_for_memory):
        """
        Analyzes the current access count for the last accessed key and
        checks if any if the keys in memory should be moved to disk.

        :param keys_for_memory: The keys that should be in memory
        :return: The name of the key that was moved to disk, or None if
                 all the keys are still in memory.
        """
        for key in self._in_memory:

            if key in keys_for_memory:
                continue

            try:
                value = self._in_memory.pop(key)
            except KeyError:
                return
            else:
                self._disk_dict[key] = value
                return key

    def _move_key_to_memory_if_needed(self, key, keys_for_memory):
        """
        Analyzes the current access count for the last accessed key and
        checks if any if the keys in disk should be moved to memory.

        :param key: The key that was last accessed
        :param keys_for_memory: The keys that should be in memory
        :return: The name of the key that was moved to memory, or None if
                 all the keys are still on disk.
        """
        # The key is already in memory, nothing to do here
        if key in self._in_memory:
            return

        # The key must not be in memory, nothing to do here
        if key not in keys_for_memory:
            return

        try:
            value = self._disk_dict.pop(key)
        except KeyError:
            return
        else:
            self._in_memory[key] = value
            return key

    def __setitem__(self, key, value):
        if key in self._in_memory:
            self._in_memory[key] = value

        elif len(self._in_memory) < self._max_in_memory:
            self._in_memory[key] = value

        else:
            self._disk_dict[key] = value

        self._increase_access_count(key)

    def __delitem__(self, key):
        try:
            del self._in_memory[key]
        except KeyError:
            # This will raise KeyError if k is not found, and that is OK
            # because we don't need to increase the access count when the
            # key doesn't exist
            del self._disk_dict[key]

        try:
            del self._access_count[key]
        except KeyError:
            # Another thread removed this key
            pass

    def __contains__(self, key):
        if key in self._in_memory:
            self._increase_access_count(key)
            return True

        if key in self._disk_dict:
            self._increase_access_count(key)
            return True

        return False

    def __iter__(self):
        """
        Decided not to increase the access count when iterating through the
        items. In most cases the iteration will be performed on all items,
        thus increasing the access count +1 for each, which will leave all
        access counts +1, forcing no movements between memory and disk.
        """
        for key in self._in_memory:
            yield key

        for key in self._disk_dict:
            yield key

    def iteritems(self):
        for key, value in self._in_memory.iteritems():
            yield key, value

        for key, value in self._disk_dict.iteritems():
            yield key, value