Ejemplo n.º 1
class dot_ds_store(CrawlPlugin):
    """
    Search for .DS_Store files and parse the file entries they contain.

    :author: Tomas Velazquez ( [email protected] )
    :author: Andres Riancho ( [email protected] )

    :credits: This code was based in cpan Mac::Finder::DSStore by Wim Lewis
              ( [email protected] )
    """
    DS_STORE = '.DS_Store'

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        # Directories for which the .DS_Store file was already requested
        self._analyzed_dirs = DiskSet()

    def crawl(self, fuzzable_request):
        """
        For every directory, fetch a list of files and analyze the response.

        :parameter fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        """
        directories_to_check = []

        for domain_path in fuzzable_request.get_url().get_directories():
            if domain_path not in self._analyzed_dirs:
                # Remember the directory so it is only requested once
                self._analyzed_dirs.add(domain_path)
                directories_to_check.append(domain_path)

        # Send the requests using threads
        self.worker_pool.map(self._check_and_analyze, directories_to_check)

    def _check_and_analyze(self, domain_path):
        """
        Check if a .DS_Store filename exists in the domain_path.

        :param domain_path: The directory URL to look for the file in.
        :return: None, everything is saved to the self.out_queue.
        """
        # Request the file
        url = domain_path.url_join(self.DS_STORE)

        try:
            response = self.http_get_and_parse(url, binary_response=True)
        except BaseFrameworkException as w3:
            msg = 'Failed to GET .DS_Store file: %s. Exception: %s.'
            # The arguments must be interpolated into the message; passing
            # the tuple as a second positional argument does not format it.
            om.out.debug(msg % (url, w3))
            return

        # Check if it's a .DS_Store file
        if is_404(response):
            return

        try:
            store = DsStore(response.get_raw_body())
            entries = store.get_file_entries()
        except Exception as e:
            om.out.debug('Unexpected error while parsing DS_Store file: "%s"' % e)
            return

        # NOTE(review): the code that consumes `entries` is not visible in
        # this excerpt (the snippet ends here) - confirm against the full file.
Ejemplo n.º 2
    def test_add_HTTPPostDataRequest(self):
        """
        Requests added to a DiskSet are stored once, in insertion order.
        """
        ds = DiskSet()

        uri = URL('http://w3af.org/?id=2')
        hdr = Headers([('Referer', 'http://w3af.org/')])

        pdr1 = HTTPPostDataRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=3')
        pdr2 = HTTPPostDataRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=7')
        pdr3 = HTTPPostDataRequest(uri, method='FOO', headers=hdr)

        # Populate the set; the repeated adds must not create duplicates
        # (pdr3 is intentionally never added).
        ds.add(pdr1)
        ds.add(pdr2)
        ds.add(pdr2)
        ds.add(pdr1)

        self.assertEqual(ds[0], pdr1)
        self.assertEqual(ds[1], pdr2)
        self.assertNotIn(pdr3, ds)
        self.assertIn(pdr2, ds)
        self.assertEqual(len(ds), 2)

        # This forces an internal change in the URL object
        # NOTE(review): the statement that triggers the change was missing
        # from this excerpt; presumably an attribute access on the URL -
        # confirm against the original test.
        pdr2.get_url().url_string
        self.assertIn(pdr2, ds)
Ejemplo n.º 3
    def test_add_HTTPPostDataRequest(self):
        """
        Requests added to a DiskSet are stored once, in insertion order.
        """
        ds = DiskSet()

        uri = URL("http://w3af.org/?id=2")
        hdr = Headers([("Referer", "http://w3af.org/")])

        pdr1 = HTTPPostDataRequest(uri, method="GET", headers=hdr)

        uri = URL("http://w3af.org/?id=3")
        pdr2 = HTTPPostDataRequest(uri, method="GET", headers=hdr)

        uri = URL("http://w3af.org/?id=7")
        pdr3 = HTTPPostDataRequest(uri, method="FOO", headers=hdr)

        # Populate the set; the repeated adds must not create duplicates
        # (pdr3 is intentionally never added).
        ds.add(pdr1)
        ds.add(pdr2)
        ds.add(pdr2)
        ds.add(pdr1)

        self.assertEqual(ds[0], pdr1)
        self.assertEqual(ds[1], pdr2)
        self.assertNotIn(pdr3, ds)
        self.assertIn(pdr2, ds)
        self.assertEqual(len(ds), 2)

        # This forces an internal change in the URL object
        # NOTE(review): the statement that triggers the change was missing
        # from this excerpt; presumably an attribute access on the URL -
        # confirm against the original test.
        pdr2.get_url().url_string
        self.assertIn(pdr2, ds)
Ejemplo n.º 4
    def test_store_in_disk_set(self):
        """
        A MultipartContainer keeps its token after a DiskSet round-trip.
        """
        boundary, post_data = multipart_encode([
            ('a', 'bcd'),
        ], [])
        multipart_boundary = MultipartContainer.MULTIPART_HEADER

        headers = Headers([('content-length', str(len(post_data))),
                           ('content-type', multipart_boundary % boundary)])

        dc = MultipartContainer.from_postdata(headers, post_data)

        dc.set_token(('a', 0))

        disk_set = DiskSet()
        # Store the container so that index 0 can be read back below
        disk_set.add(dc)

        dc_read = disk_set[0]

        # These are different objects
        self.assertIsNot(dc_read, dc)

        # But they hold the same data
        self.assertEqual(dc.get_token(), dc_read.get_token())
        self.assertEqual(dc_read.get_token().get_name(), 'a')
Ejemplo n.º 5
    def test_store_fuzzable_request_two(self):
        """
        Two different fuzzable requests can be stored and read back by index.
        """
        ds = DiskSet()

        # Add a simple fr, without post-data
        fr = FuzzableRequest(URL('http://example.com/?id=1'))
        ds.add(fr)

        # Add a fr with post-data
        form_params = FormParameters()
        form_params.add_field_by_attr_items([("name", "username"),
                                             ("value", "abc")])
        form_params.add_field_by_attr_items([("name", "address"),
                                             ("value", "")])

        form = dc_from_form_params(form_params)

        fr = FuzzableRequest.from_form(form)
        ds.add(fr)

        # Compare: index 1 is the second (form based) request
        stored_fr = ds[1]

        self.assertEqual(stored_fr, fr)
        self.assertIsNot(stored_fr, fr)
Ejemplo n.º 6
    def test_add_QsRequest(self):
        """
        Query string requests added to a DiskSet are stored once, in order.
        """
        ds = DiskSet()

        uri = URL('http://w3af.org/?id=2')
        hdr = Headers([('Referer', 'http://w3af.org/')])

        qsr1 = FuzzableRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=3')
        qsr2 = FuzzableRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=7')
        qsr3 = FuzzableRequest(uri, method='FOO', headers=hdr)

        # Populate the set; the repeated adds must not create duplicates
        # (qsr3 is intentionally never added).
        ds.add(qsr1)
        ds.add(qsr2)
        ds.add(qsr2)
        ds.add(qsr1)

        self.assertEqual(ds[0], qsr1)
        self.assertEqual(ds[1], qsr2)
        self.assertNotIn(qsr3, ds)
        self.assertIn(qsr2, ds)
        self.assertEqual(len(ds), 2)

        # This forces an internal change in the URL object
        # NOTE(review): the statement that triggers the change was missing
        # from this excerpt; presumably an attribute access on the URL -
        # confirm against the original test.
        qsr2.get_url().url_string
        self.assertIn(qsr2, ds)
Ejemplo n.º 7
    def test_add_QsRequest(self):
        """
        Query string requests added to a DiskSet are stored once, in order.
        """
        ds = DiskSet()

        uri = URL('http://w3af.org/?id=2')
        hdr = Headers([('Referer', 'http://w3af.org/')])

        qsr1 = FuzzableRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=3')
        qsr2 = FuzzableRequest(uri, method='GET', headers=hdr)

        uri = URL('http://w3af.org/?id=7')
        qsr3 = FuzzableRequest(uri, method='FOO', headers=hdr)

        # Populate the set; the repeated adds must not create duplicates
        # (qsr3 is intentionally never added).
        ds.add(qsr1)
        ds.add(qsr2)
        ds.add(qsr2)
        ds.add(qsr1)

        self.assertEqual(ds[0], qsr1)
        self.assertEqual(ds[1], qsr2)
        self.assertNotIn(qsr3, ds)
        self.assertIn(qsr2, ds)
        self.assertEqual(len(ds), 2)

        # This forces an internal change in the URL object
        # NOTE(review): the statement that triggers the change was missing
        # from this excerpt; presumably an attribute access on the URL -
        # confirm against the original test.
        qsr2.get_url().url_string
        self.assertIn(qsr2, ds)
Ejemplo n.º 8
class dwsync_xml(CrawlPlugin):
    """
    Search Dream Waver Sync file (dwsync.xml) and extract referenced files.

    :author: Tomas Velazquez ([email protected])
    """

    DWSYNC = '_notes/dwsync.xml'

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        # Directories for which the dwsync.xml file was already requested
        self._analyzed_dirs = DiskSet()

    def crawl(self, fuzzable_request):
        """
        For every directory, fetch a list of files and analyze the response.

        :parameter fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        """
        for domain_path in fuzzable_request.get_url().get_directories():
            if domain_path not in self._analyzed_dirs:
                # Remember the directory so it is only requested once
                self._analyzed_dirs.add(domain_path)
                self._find_dwsync(domain_path)

    def _find_dwsync(self, domain_path):
        """
        Request the dwsync.xml file in domain_path and extract the URLs of
        the files it references.

        :param domain_path: The directory URL to look for the file in.
        :return: None
        """
        dwsync_url = domain_path.url_join(self.DWSYNC)
        response = self.http_get_and_parse(dwsync_url)

        if is_404(response):
            return

        # Cheap sanity check before handing the body to the XML parser
        if '</dwsync>' not in response.get_body():
            return

        om.out.debug('Parsing dwsync.xml file at %s' % dwsync_url)

        try:
            dom = xml.dom.minidom.parseString(response.get_body())
        except Exception as e:
            msg = 'Exception while parsing dwsync.xml file at %s : "%s"'
            om.out.debug(msg % (dwsync_url, e))
            return

        parsed_url_list = set()

        for file_entry in dom.getElementsByTagName('file'):
            try:
                _file = file_entry.getAttribute('name')
                url = domain_path.url_join(_file)
            except ValueError as ve:
                msg = 'dwsync file had an invalid URL: "%s"'
                om.out.debug(msg % ve)
            except Exception as e:
                # The message used to mention a "Sitemap file", which is
                # misleading when debugging this plugin.
                msg = 'dwsync file had an invalid format: "%s"'
                om.out.debug(msg % e)
            else:
                parsed_url_list.add(url)

        # NOTE(review): the code that consumes `parsed_url_list` is not
        # visible in this excerpt (the snippet ends here) - confirm against
        # the full file.
Ejemplo n.º 9
class dwsync_xml(CrawlPlugin):
    """
    Search Dream Waver Sync file (dwsync.xml) and extract referenced files.

    :author: Tomas Velazquez ([email protected])
    """

    DWSYNC = '_notes/dwsync.xml'

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        # Directories for which the dwsync.xml file was already requested
        self._analyzed_dirs = DiskSet()

    def crawl(self, fuzzable_request):
        """
        For every directory, fetch a list of files and analyze the response.

        :parameter fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        """
        for domain_path in fuzzable_request.get_url().get_directories():
            if domain_path not in self._analyzed_dirs:
                # Remember the directory so it is only requested once
                self._analyzed_dirs.add(domain_path)
                self._find_dwsync(domain_path)

    def _find_dwsync(self, domain_path):
        """
        Request the dwsync.xml file in domain_path and extract the URLs of
        the files it references.

        :param domain_path: The directory URL to look for the file in.
        :return: None
        """
        dwsync_url = domain_path.url_join(self.DWSYNC)
        response = self.http_get_and_parse(dwsync_url)

        if is_404(response):
            return

        # Cheap sanity check before handing the body to the XML parser
        if '</dwsync>' not in response.get_body():
            return

        om.out.debug('Parsing dwsync.xml file at %s' % dwsync_url)

        try:
            dom = xml.dom.minidom.parseString(response.get_body())
        except Exception as e:
            msg = 'Exception while parsing dwsync.xml file at %s : "%s"'
            om.out.debug(msg % (dwsync_url, e))
            return

        parsed_url_list = set()

        for file_entry in dom.getElementsByTagName('file'):
            try:
                _file = file_entry.getAttribute('name')
                url = domain_path.url_join(_file)
            except ValueError as ve:
                msg = 'dwsync file had an invalid URL: "%s"'
                om.out.debug(msg % ve)
            except Exception as e:
                # The message used to mention a "Sitemap file", which is
                # misleading when debugging this plugin.
                msg = 'dwsync file had an invalid format: "%s"'
                om.out.debug(msg % e)
            else:
                parsed_url_list.add(url)

        # NOTE(review): the code that consumes `parsed_url_list` is not
        # visible in this excerpt (the snippet ends here) - confirm against
        # the full file.
Ejemplo n.º 10
    def test_add(self):
        """
        Integers added to a DiskSet are kept once, in insertion order.
        """
        ds = DiskSet()

        # The last add repeats a value and must not create a duplicate
        ds.add(1)
        ds.add(2)
        ds.add(3)
        ds.add(1)

        self.assertEqual(list(ds), [1, 2, 3])
        self.assertEqual(len(ds), 3)
        self.assertEqual(unicode(ds), u'<DiskSet [1, 2, 3]>')
Ejemplo n.º 11
    def test_add(self):
        """
        Integers added to a DiskSet are kept once, in insertion order.
        """
        ds = DiskSet()

        # The last add repeats a value and must not create a duplicate
        ds.add(1)
        ds.add(2)
        ds.add(3)
        ds.add(1)

        self.assertEqual(list(ds), [1, 2, 3])
        self.assertEqual(len(ds), 3)
        self.assertEqual(unicode(ds), u'<DiskSet [1, 2, 3]>')
Ejemplo n.º 12
    def test_disk_set(self):
        """
        Store a large number of tuples and verify each one can be found.
        """
        ds = DiskSet()

        for i in xrange(20000):
            data = (i, i)
            ds.add(data)

        for i in xrange(20000):
            data = (i, i)
            # Assert the membership instead of discarding the result
            self.assertIn(data, ds)
Ejemplo n.º 13
    def test_add_urlobject(self):
        """
        URL objects added to a DiskSet are stored once, in insertion order.
        """
        ds = DiskSet()

        # The last add repeats a URL and must not create a duplicate
        ds.add(URL('http://w3af.org/?id=2'))
        ds.add(URL('http://w3af.org/?id=3'))
        ds.add(URL('http://w3af.org/?id=3'))

        self.assertEqual(ds[0], URL('http://w3af.org/?id=2'))
        self.assertEqual(ds[1], URL('http://w3af.org/?id=3'))
        self.assertEqual(len(ds), 2)
        self.assertNotIn(URL('http://w3af.org/?id=4'), ds)
        self.assertIn(URL('http://w3af.org/?id=2'), ds)
Ejemplo n.º 14
    def test_add_urlobject(self):
        """
        URL objects added to a DiskSet are stored once, in insertion order.
        """
        ds = DiskSet()

        # The last add repeats a URL and must not create a duplicate
        ds.add(URL('http://w3af.org/?id=2'))
        ds.add(URL('http://w3af.org/?id=3'))
        ds.add(URL('http://w3af.org/?id=3'))

        self.assertEqual(ds[0], URL('http://w3af.org/?id=2'))
        self.assertEqual(ds[1], URL('http://w3af.org/?id=3'))
        self.assertEqual(len(ds), 2)
        self.assertNotIn(URL('http://w3af.org/?id=4'), ds)
        self.assertIn(URL('http://w3af.org/?id=2'), ds)
Ejemplo n.º 15
    def test_remove_table(self):
        # Creates a DiskSet and looks up its backing table name and the
        # default temporary DB instance.
        #
        # NOTE(review): no assertions are visible in this excerpt, so as
        # shown the test cannot fail. Judging by the test name it presumably
        # checked that the table exists, removed it and checked that it is
        # gone - confirm against the original test file and restore the
        # missing statements.
        disk_set = DiskSet()

        table_name = disk_set.table_name
        db = get_default_temp_db_instance()



Ejemplo n.º 16
    def test_remove_table(self):
        # Creates a DiskSet and looks up its backing table name and the
        # default temporary DB instance.
        #
        # NOTE(review): no assertions are visible in this excerpt, so as
        # shown the test cannot fail. Judging by the test name it presumably
        # checked that the table exists, removed it and checked that it is
        # gone - confirm against the original test file and restore the
        # missing statements.
        disk_set = DiskSet()

        table_name = disk_set.table_name
        db = get_default_temp_db_instance()



Ejemplo n.º 17
class phpinfo(CrawlPlugin):
    """
    Search PHP Info file and if it finds it will determine the version of PHP.

    :author: Viktor Gazdag ( [email protected] )

    CHANGELOG:
        Feb/17/2009- Added PHP Settings Audit Checks by Aung Khant (aungkhant[at]yehg.net)
    """

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        # Directories which were already checked for phpinfo files
        self._analyzed_dirs = DiskSet()
        self._has_audited = 0

    def crawl(self, fuzzable_request):
        """
        For every directory, fetch a list of files and analyze the response.

        :param fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        """
        for domain_path in fuzzable_request.get_url().get_directories():

            if domain_path in self._analyzed_dirs:
                continue

            # Remember the directory so it is only checked once
            self._analyzed_dirs.add(domain_path)

            # Pair the same directory with every candidate file name; izip
            # stops when the (finite) file name iterable is exhausted.
            url_repeater = repeat(domain_path)
            args = izip(url_repeater, self._get_potential_phpinfos())

            self.worker_pool.map_multi_args(self._check_and_analyze, args)

    def _check_and_analyze(self, domain_path, php_info_filename):
        """
        Check if a php_info_filename exists in the domain_path.

        :param domain_path: The directory URL to look for the file in.
        :param php_info_filename: The candidate file name to request.
        :return: None, everything is put() into the self.output_queue.
        """
        # Request the file
        php_info_url = domain_path.url_join(php_info_filename)
        try:
            response = self._uri_opener.GET(php_info_url, cache=True)
        except BaseFrameworkException as w3:
            msg = 'Failed to GET phpinfo file: "%s". Exception: "%s".'
            om.out.debug(msg % (php_info_url, w3))
            return

        # NOTE(review): the code that analyzes `response` is not visible in
        # this excerpt (the snippet ends here) - confirm against the full
        # file.
Ejemplo n.º 18
    def test_store_fuzzable_request(self):
        """
        A form based FuzzableRequest survives a DiskSet round-trip.
        """
        form_params = FormParameters()
        form_params.add_input([("name", "username"), ("value", "abc")])
        form_params.add_input([("name", "address"), ("value", "")])

        form = dc_from_form_params(form_params)

        fr = FuzzableRequest.from_form(form)

        ds = DiskSet()
        # Store the request so that index 0 can be read back below
        ds.add(fr)

        stored_fr = ds[0]

        # Equal content, but a different object
        self.assertEqual(stored_fr, fr)
        self.assertIsNot(stored_fr, fr)
Ejemplo n.º 19
    def test_store_fuzzable_request(self):
        """
        A form based FuzzableRequest survives a DiskSet round-trip.
        """
        form_params = FormParameters()
        form_params.add_input([("name", "username"), ("value", "abc")])
        form_params.add_input([("name", "address"), ("value", "")])

        form = dc_from_form_params(form_params)

        fr = FuzzableRequest.from_form(form)

        ds = DiskSet()
        # Store the request so that index 0 can be read back below
        ds.add(fr)

        stored_fr = ds[0]

        # Equal content, but a different object
        self.assertEqual(stored_fr, fr)
        self.assertIsNot(stored_fr, fr)
Ejemplo n.º 20
    def test_multipart_fuzzable_request_store(self):
        """
        A multipart FuzzableRequest read back from a DiskSet still exposes
        its post-data as a MultipartContainer.
        """
        boundary, post_data = multipart_encode([('a', 'bcd'), ], [])
        multipart_boundary = MultipartContainer.MULTIPART_HEADER

        headers = Headers([('content-length', str(len(post_data))),
                           ('content-type', multipart_boundary % boundary)])

        dc = MultipartContainer.from_postdata(headers, post_data)
        post_data = str(dc)

        # NOTE(review): the closing argument of this call was missing from
        # the excerpt; `headers` is the only prepared value left unused -
        # confirm against the original test.
        fr = FuzzableRequest.from_parts(URL('http://www.w3af.com/'),
                                        method='POST', post_data=post_data,
                                        headers=headers)

        disk_set = DiskSet()
        # Store the request so that index 0 can be read back below
        disk_set.add(fr)

        fr_read = disk_set[0]

        self.assertIsInstance(fr_read.get_raw_data(), MultipartContainer)
        self.assertIn('a', fr_read.get_raw_data())
Ejemplo n.º 21
    def test_multipart_fuzzable_request_store(self):
        """
        A multipart FuzzableRequest read back from a DiskSet still exposes
        its post-data as a MultipartContainer.
        """
        boundary, post_data = multipart_encode([('a', 'bcd'), ], [])
        multipart_boundary = MultipartContainer.MULTIPART_HEADER

        headers = Headers([('content-length', str(len(post_data))),
                           ('content-type', multipart_boundary % boundary)])

        dc = MultipartContainer.from_postdata(headers, post_data)
        post_data = str(dc)

        # NOTE(review): the closing argument of this call was missing from
        # the excerpt; `headers` is the only prepared value left unused -
        # confirm against the original test.
        fr = FuzzableRequest.from_parts(URL('http://www.w3af.com/'),
                                        method='POST', post_data=post_data,
                                        headers=headers)

        disk_set = DiskSet()
        # Store the request so that index 0 can be read back below
        disk_set.add(fr)

        fr_read = disk_set[0]

        self.assertIsInstance(fr_read.get_raw_data(), MultipartContainer)
        self.assertIn('a', fr_read.get_raw_data())
Ejemplo n.º 22
    def test_store_in_disk_set(self):
        """
        A MultipartContainer keeps its token after a DiskSet round-trip.
        """
        boundary, post_data = multipart_encode([('a', 'bcd'), ], [])
        multipart_boundary = MultipartContainer.MULTIPART_HEADER

        headers = Headers([('content-length', str(len(post_data))),
                           ('content-type', multipart_boundary % boundary)])

        dc = MultipartContainer.from_postdata(headers, post_data)

        dc.set_token(('a', 0))

        disk_set = DiskSet()
        # Store the container so that index 0 can be read back below
        disk_set.add(dc)

        dc_read = disk_set[0]

        # These are different objects
        self.assertIsNot(dc_read, dc)

        # But they hold the same data
        self.assertEqual(dc.get_token(), dc_read.get_token())
        self.assertEqual(dc_read.get_token().get_name(), 'a')
Ejemplo n.º 23
    def test_store_fuzzable_request_two(self):
        """
        Two different fuzzable requests can be stored and read back by index.
        """
        ds = DiskSet()

        # Add a simple fr, without post-data
        fr = FuzzableRequest(URL('http://example.com/?id=1'))
        ds.add(fr)

        # Add a fr with post-data
        form_params = FormParameters()
        form_params.add_field_by_attr_items([("name", "username"), ("value", "abc")])
        form_params.add_field_by_attr_items([("name", "address"), ("value", "")])

        form = dc_from_form_params(form_params)

        fr = FuzzableRequest.from_form(form)
        ds.add(fr)

        # Compare: index 1 is the second (form based) request
        stored_fr = ds[1]

        self.assertEqual(stored_fr, fr)
        self.assertIsNot(stored_fr, fr)
Ejemplo n.º 24
class DBKnowledgeBase(BasicKnowledgeBase):
    """
    This class saves the data that is sent to it by plugins. It is the only way
    in which plugins can exchange information.

    Data is stored in a DB.

    :author: Andres Riancho ([email protected])
    """
    COLUMNS = [('location_a', 'TEXT'),
               ('location_b', 'TEXT'),
               ('uniq_id', 'TEXT'),
               ('pickle', 'BLOB')]

    def __init__(self):
        super(DBKnowledgeBase, self).__init__()
        self.initialized = False

        # TODO: Why doesn't this work with a WeakValueDictionary?
        self.observers = {} #WeakValueDictionary()
        self._observer_id = 0

    def setup(self):
        """
        Setup all the required backend stores. This was mostly created to avoid
        starting any threads during __init__() which is called during python's
        import phase and dead-locks in some cases.

        :return: None
        """
        with self._kb_lock:
            if self.initialized:
                return

            self.urls = DiskSet(table_prefix='kb_urls')
            self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')

            self.db = get_default_persistent_db_instance()

            self.table_name = 'knowledge_base_' + rand_alpha(30)
            self.db.create_table(self.table_name, self.COLUMNS)
            self.db.create_index(self.table_name, ['location_a', 'location_b'])
            self.db.create_index(self.table_name, ['uniq_id'])

            # Only initialize once
            self.initialized = True

    def clear(self, location_a, location_b):
        """
        Remove everything stored at (location_a, location_b).
        """
        location_a = self._get_real_name(location_a)

        query = "DELETE FROM %s WHERE location_a = ? and location_b = ?"
        params = (location_a, location_b)
        self.db.execute(query % self.table_name, params)

    def raw_write(self, location_a, location_b, value):
        """
        This method saves value to (location_a,location_b) but previously
        clears any pre-existing values.

        :raises TypeError: When value is an Info instance.
        """
        if isinstance(value, Info):
            raise TypeError('Use append or append_uniq to store vulnerabilities')

        location_a = self._get_real_name(location_a)

        self.clear(location_a, location_b)
        self.append(location_a, location_b, value, ignore_type=True)

    def raw_read(self, location_a, location_b):
        """
        This method reads the value from (location_a, location_b)

        :return: The stored value, or an empty list when nothing is stored.
        :raises RuntimeError: When more than one value is stored.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=False)

        if len(result) > 1:
            msg = 'Incorrect use of raw_write/raw_read, found %s results.'
            raise RuntimeError(msg % len(result))
        elif len(result) == 0:
            return []
        else:
            return result[0]

    def get_one(self, location_a, location_b):
        """
        This method reads the value from (location_a, location_b), checking it's
        type and making sure only one is stored at that address.

        Similar to raw_read, but checking types.

        :see: https://github.com/andresriancho/w3af/issues/3955
        :return: The stored value, or an empty list when nothing is stored.
        :raises RuntimeError: When more than one value is stored.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=True)

        if len(result) > 1:
            msg = 'Incorrect use of get_one(), found %s results.'
            raise RuntimeError(msg % result)
        elif len(result) == 0:
            return []
        else:
            return result[0]

    def _get_uniq_id(self, obj):
        """
        :return: A string that identifies obj: its own unique id for
                 Info/InfoSet instances, otherwise a value derived from
                 hash().
        """
        if isinstance(obj, (Info, InfoSet)):
            return obj.get_uniq_id()

        if isinstance(obj, collections.Iterable):
            # Combine the hashes of the items, since the container itself
            # may not be hashable
            concat_all = ''.join([str(hash(i)) for i in obj])
            return str(hash(concat_all))

        return str(hash(obj))

    def append(self, location_a, location_b, value, ignore_type=False):
        """
        This method appends the location_b value to a dict.

        :raises TypeError: When value is not an Info, Shell or InfoSet
                           instance and ignore_type is False.
        """
        if not ignore_type and not isinstance(value, (Info, Shell, InfoSet)):
            msg = ('You MUST use raw_write/raw_read to store non-info objects'
                   ' to the KnowledgeBase.')
            raise TypeError(msg)

        location_a = self._get_real_name(location_a)
        uniq_id = self._get_uniq_id(value)

        pickled_obj = cpickle_dumps(value)
        t = (location_a, location_b, uniq_id, pickled_obj)

        query = "INSERT INTO %s VALUES (?, ?, ?, ?)" % self.table_name
        self.db.execute(query, t)

        # NOTE(review): the last argument of this call was missing from the
        # excerpt; forwarding ignore_type is assumed - confirm against the
        # observer interface.
        self._notify_observers(self.APPEND, location_a, location_b, value,
                               ignore_type=ignore_type)

    def get(self, location_a, location_b, check_types=True):
        """
        :param location_a: The plugin that saved the data to the
                           kb.info Typically the name of the plugin,
                           but could also be the plugin instance.

        :param location_b: The name of the variables under which the vuln
                           objects were saved. Typically the same name of
                           the plugin, or something like "vulns", "errors",
                           etc. In most cases this is NOT None. When set
                           to None, a dict with all the vuln objects found
                           by the plugin_name is returned.

        :return: Returns the data that was saved by another plugin.
        :raises TypeError: When check_types is True and a stored object is
                           not an Info, InfoSet or Shell instance.
        """
        location_a = self._get_real_name(location_a)

        if location_b is None:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'
            params = (location_a,)
        else:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'\
                                           ' and location_b = ?'
            params = (location_a, location_b)

        result_lst = []

        results = self.db.select(query % self.table_name, params)
        for r in results:
            obj = cPickle.loads(r[0])

            if check_types and not isinstance(obj, (Info, InfoSet, Shell)):
                raise TypeError('Use raw_write and raw_read to query the'
                                ' knowledge base for non-Info objects')

            result_lst.append(obj)

        return result_lst

    def get_by_uniq_id(self, uniq_id):
        """
        :return: The object stored with the given uniq_id, or None when
                 there is no match.
        """
        query = 'SELECT pickle FROM %s WHERE uniq_id = ?'
        params = (uniq_id,)

        result = self.db.select_one(query % self.table_name, params)

        if result is not None:
            result = cPickle.loads(result[0])

        return result

    def update(self, old_info, update_info):
        """
        :param old_info: The info/vuln instance to be updated in the kb.
        :param update_info: The info/vuln instance with new information
        :return: Nothing
        :raises TypeError: When any of the parameters is not an Info,
                           InfoSet or Shell instance.
        :raises DBException: When no row was updated.
        """
        old_not_info = not isinstance(old_info, (Info, InfoSet, Shell))
        update_not_info = not isinstance(update_info, (Info, InfoSet, Shell))

        if old_not_info or update_not_info:
            msg = ('You MUST use raw_write/raw_read to store non-info objects'
                   ' to the KnowledgeBase.')
            raise TypeError(msg)

        old_uniq_id = old_info.get_uniq_id()
        new_uniq_id = update_info.get_uniq_id()
        pickled_obj = cpickle_dumps(update_info)

        # Update the pickle and unique_id after finding by original uniq_id
        query = "UPDATE %s SET pickle = ?, uniq_id = ? WHERE uniq_id = ?"

        params = (pickled_obj, new_uniq_id, old_uniq_id)
        result = self.db.execute(query % self.table_name, params).result()

        if not result.rowcount:
            ex = ('Failed to update() %s instance because'
                  ' the original unique_id (%s) does not exist in the DB,'
                  ' or the new unique_id (%s) is invalid.')
            raise DBException(ex % (old_info.__class__.__name__,
                                    old_uniq_id,
                                    new_uniq_id))

        self._notify_observers(self.UPDATE, old_info, update_info)

    def add_observer(self, observer):
        """
        Add the observer instance to the list.
        """
        observer_id = self.get_observer_id()
        self.observers[observer_id] = observer

    def get_observer_id(self):
        """
        :return: A new numeric id for an observer.
        """
        self._observer_id += 1
        return self._observer_id

    def _notify_observers(self, method, *args, **kwargs):
        """
        Call the observer if the location_a/location_b matches with the
        configured observers.

        :return: None
        """
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for _, observer in list(self.observers.items()):
            functor = getattr(observer, method)
            functor(*args, **kwargs)

    def get_all_entries_of_class(self, klass):
        """
        :return: A list of all objects of class == klass that are saved in the
                 kb.
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            if isinstance(obj, klass):
                result_lst.append(obj)

        return result_lst

    def get_all_vulns(self):
        """
        :return: A list of all info instances with severity in (LOW, MEDIUM,
                 HIGH)
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            if hasattr(obj, 'get_severity'):
                severity = obj.get_severity()
                if severity in (LOW, MEDIUM, HIGH):
                    result_lst.append(obj)

        return result_lst

    def get_all_infos(self):
        """
        :return: A list of all info instances with severity eq INFORMATION
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            if hasattr(obj, 'get_severity'):
                severity = obj.get_severity()
                if severity in (INFORMATION,):
                    result_lst.append(obj)

        return result_lst

    def dump(self):
        """
        :return: A dict with the form {location_a: {location_b: [obj, ...]}}
                 containing all the stored objects.
        """
        result_dict = {}

        query = 'SELECT location_a, location_b, pickle FROM %s'
        results = self.db.select(query % self.table_name)

        for location_a, location_b, pickled_obj in results:
            obj = cPickle.loads(pickled_obj)

            # Keep every object stored at the location, not only the first
            by_location_b = result_dict.setdefault(location_a, {})
            by_location_b.setdefault(location_b, []).append(obj)

        return result_dict

    def cleanup(self):
        """
        Cleanup internal data.
        """
        self.db.execute("DELETE FROM %s WHERE 1=1" % self.table_name)

        # Remove the old, create new.
        # NOTE(review): the calls that dispose of the old DiskSet instances
        # were missing from the excerpt; DiskSet.cleanup() is assumed -
        # confirm against the DiskSet API.
        old_urls = self.urls
        self.urls = DiskSet(table_prefix='kb_urls')
        old_urls.cleanup()

        old_fuzzable_requests = self.fuzzable_requests
        self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')
        old_fuzzable_requests.cleanup()

    def remove(self):
        """
        Remove the backend stores used by this knowledge base.
        """
        # NOTE(review): the body of this method was missing from the
        # excerpt. The calls below are an assumption (drop the table created
        # in setup() and dispose of both DiskSet instances) - confirm
        # against the DB client and DiskSet APIs before relying on them.
        self.db.drop_table(self.table_name)
        self.urls.cleanup()
        self.fuzzable_requests.cleanup()

    def get_all_known_urls(self):
        """
        :return: A DiskSet with all the known URLs as URL objects.
        """
        return self.urls

    def add_url(self, url):
        """
        :return: True if the URL was previously unknown
        :raises TypeError: When url is not a URL instance.
        """
        if not isinstance(url, URL):
            msg = 'add_url requires a URL as parameter got %s instead.'
            raise TypeError(msg % type(url))

        self._notify_observers(self.ADD_URL, url)
        return self.urls.add(url)

    def get_all_known_fuzzable_requests(self):
        """
        :return: A DiskSet with all the known fuzzable requests.
        """
        return self.fuzzable_requests

    def add_fuzzable_request(self, fuzzable_request):
        """
        :return: True if the FuzzableRequest was previously unknown
        :raises TypeError: When fuzzable_request is not a FuzzableRequest
                           instance.
        """
        if not isinstance(fuzzable_request, FuzzableRequest):
            msg = ('add_fuzzable_request requires a FuzzableRequest as'
                   ' parameter, got "%s" instead.')
            raise TypeError(msg % type(fuzzable_request))

        return self.fuzzable_requests.add(fuzzable_request)
Ejemplo n.º 25
class open_api(CrawlPlugin):
    """
    Extract REST API calls from Open API specifications.

    :author: Andres Riancho ([email protected])
    """

    # File names under which an Open API specification is searched for
    FILENAMES = ['swagger.json', 'openapi.json', 'openapi.yaml']

    # Directories which are combined with FILENAMES when looking for the
    # specification in the most common locations.
    # NOTE(review): the opening and closing lines of this tuple were missing;
    # the list may have had more entries after '/api/1.0/' - confirm.
    DIRECTORIES = (
        '/', '/api/', '/api/v2/', '/api/v1/', '/api/v2.0/', '/api/v2.1/',
        '/api/v1.0/', '/api/v1.1/', '/api/2.0/', '/api/2.1/', '/api/1.0/',
    )

    def __init__(self):
        """
        Initialize the internal state and the default value of every user
        configurable option.
        """
        # NOTE(review): the base initializer call was missing; this assumes
        # CrawlPlugin.__init__ takes no extra arguments - confirm.
        CrawlPlugin.__init__(self)

        # Internal variables
        self._first_run = True
        self._already_analyzed = DiskSet(table_prefix='open_api')

        # User configured variables
        self._query_string_auth = QueryString()
        self._header_auth = Headers()
        self._no_spec_validation = False
        self._custom_spec_location = ''
        self._discover_fuzzable_headers = True
        self._discover_fuzzable_url_parts = True

    def crawl(self, fuzzable_request, debugging_id):
        """
        Try to extract all the API endpoints from various locations
        if no custom location specified.

        :param debugging_id: A unique identifier for this call to discover()
        :param fuzzable_request: A fuzzable_request instance that contains
                                (among other things) the URL to test.
        """
        # Needs to run before the _analyze_* methods: it checks _first_run,
        # which those methods set to False.
        self._enable_file_name_fuzzing()

        if self._has_custom_spec_location():
            # A local specification replaces the remote discovery
            self._analyze_custom_spec()
        else:
            self._analyze_common_paths(fuzzable_request, debugging_id)
            self._analyze_current_path(fuzzable_request, debugging_id)

    def _enable_file_name_fuzzing(self):
        """
        Enable file name fuzzing.

        Users are not going to remember to enable this in misc-settings, and
        most of the APIs which are documented with Open API are REST APIs,
        so it makes sense to enable this automatically here.

        The settings are only saved during the first run, and only when the
        automatic discovery of fuzzable URL parts is disabled.

        :return: None
        """
        if self._first_run and not self._discover_fuzzable_url_parts:
            cf.cf.save('fuzz_url_filenames', True)
            cf.cf.save('fuzz_url_parts', True)

    def _should_analyze(self, url):
        Makes sure that we only analyze a URL once, this reduces the number
        of HTTP requests and the CPU usage required for parsing the

        :param url: The URL we want to analyze
        :return: True if we never analyzed this URL before
        if url in self._already_analyzed:
            return False

        return True

    def _analyze_common_paths(self, fuzzable_request, debugging_id):
        Try to find the open api specification in the most common paths,
        extract all the REST API endpoints when found.

        This is run only the first time the plugin is called.

        :return: None, everything we find is sent to the core.
        if not self._first_run:

        self._first_run = False

        args = izip(self._spec_url_generator_common(fuzzable_request),

        self.worker_pool.map_multi_args(self._extract_api_calls, args)

    def _extract_api_calls(self, spec_url, debugging_id):
        """
        HTTP GET the `spec_url` and try to parse it. Send all the newly found
        fuzzable requests to the core after adding any authentication data
        that might have been configured.

        :param spec_url: The URL where the specification might be found
        :param debugging_id: A unique identifier for this call to discover()
        :return: None
        """
        # Merge the user-configured authentication query string (if any)
        # with the spec_url query string
        qs = spec_url.get_querystring()

        for key, values in self._query_string_auth.iteritems():
            qs[key] = values

        # NOTE(review): `qs` is never explicitly written back to `spec_url`;
        # confirm whether get_querystring() returns a live reference.

        # Also add the authentication headers to the request (if any)
        # Disable the cache because we're sending auth headers which might
        # confuse the cache implementation
        # NOTE(review): the call below is truncated, the remaining arguments
        # and the closing parenthesis are missing from this copy of the file.
        http_response = self._uri_opener.GET(spec_url,

        if is_404(http_response):
            # NOTE(review): the body of this guard is missing (presumably an
            # early return).

        self._extract_api_calls_from_response(spec_url, http_response)

    def _extract_api_calls_from_response(self, spec_url, http_response):
        """
        Try to parse an API specification from an HTTP response.
        Send all the newly found fuzzable requests to the core
        after adding any authentication data that might have been configured.

        :param spec_url: A URL to API specification
        :param http_response: An HTTP response
        :return: None
        """
        if not OpenAPI.can_parse(http_response):
            # NOTE(review): the body of this guard is missing (presumably an
            # early return).

        om.out.debug('OpenAPI parser is about to parse %s' % spec_url)

        # NOTE(review): truncated call, the remaining constructor arguments
        # and the closing parenthesis are missing.
        parser = OpenAPI(http_response, self._no_spec_validation,

        self._report_to_kb_if_needed(http_response, parser)

        # NOTE(review): truncated statement, the right hand side of the `%`
        # operator is missing.
        om.out.debug('OpenAPI parser identified %s API calls' %

        for api_call in parser.get_api_calls():
            if not self._is_target_domain(api_call):
                # NOTE(review): the body of this guard is missing (presumably
                # a `continue` to skip out of scope operations).

            # NOTE(review): nothing visible here sends `api_call` to the core
            # after the authentication data is added.
            api_call = self._set_authentication_data(api_call)

    def _send_spec_to_core(self, spec_url):
        # Wrap the specification URL in a GET fuzzable request.
        # NOTE(review): the request is created but never used or returned;
        # the statement which hands it over to the core seems to be missing.
        fuzzable_request = FuzzableRequest(spec_url, method='GET')

    @staticmethod
    def _is_target_domain(fuzzable_request):
        """
        Check that the API call points to the domain of the first configured
        target.

        This is a static method: it takes no `self` parameter, but callers
        invoke it as `self._is_target_domain(api_call)`.

        :param fuzzable_request: The api call as a fuzzable request
        :return: True if the target domain matches
        """
        targets = cf.cf.get('targets')
        if not targets:
            return False

        target_domain = targets[0].get_domain()
        api_call_domain = fuzzable_request.get_url().get_domain()

        if target_domain == api_call_domain:
            return True

        om.out.debug('The OpenAPI specification has operations which point'
                     ' to a domain (%s) outside the defined target (%s).'
                     ' Ignoring the operation to prevent scanning out of scope'
                     ' targets.' % (api_call_domain, target_domain))
        return False

    def _report_to_kb_if_needed(self, http_response, parser):
        """
        If the parser did find something, then we report it to the KB.

        One of these is stored in the `open_api` KB location:
            * A parsing failure, when no API calls were extracted and the
              parser reported errors
            * The specification itself, followed by a warning when no
              credentials were configured in the plugin

        :param http_response: The HTTP response that was parsed
        :param parser: The OpenAPI parser instance
        :return: None
        """
        if not parser.get_api_calls() and parser.get_parsing_errors():
            desc = (
                'An Open API specification was found at: "%s", but the scanner'
                ' was unable to extract any API endpoints. In most cases this'
                ' is because of a syntax error in the Open API specification.\n'
                'Use https://editor.swagger.io/ to inspect the Open API'
                ' specification, identify and fix any issues and try again.\n'
                'The errors found by the parser were:\n'
                '\n - %s')

            desc %= (http_response.get_url(),
                     '\n - '.join(parser.get_parsing_errors()))

            i = Info('Failed to parse Open API specification', desc,
                     http_response.id, self.get_name())

            kb.kb.append(self, 'open_api', i)

            # Stop here: reporting "specification found" with zero endpoints
            # right after a parsing failure would contradict this finding
            return

        # Save it to the kb!
        desc = ('An Open API specification was found at: "%s", the scanner'
                ' was able to extract %s API endpoints which will be audited'
                ' for vulnerabilities.')
        desc %= (http_response.get_url(), len(parser.get_api_calls()))

        i = Info('Open API specification found', desc, http_response.id,
                 self.get_name())

        kb.kb.append(self, 'open_api', i)

        # Warn the user about missing credentials
        if self._query_string_auth or self._header_auth:
            return

        desc = (
            'An Open API specification was found at: "%s", but no credentials'
            ' were provided in the `open_api` plugin. The scanner will try'
            ' to audit the identified endpoints but coverage will most likely'
            ' be reduced due to missing authentication.')
        desc %= http_response.get_url()

        i = Info('Open API missing credentials', desc, http_response.id,
                 self.get_name())

        kb.kb.append(self, 'open_api', i)

    def _set_authentication_data(self, fuzzable_request):
        :param fuzzable_request: The fuzzable request as returned by the parser

        :return: The same fuzzable request as before, but adding authentication
                 data configured by the user, such as headers and query string
        headers = fuzzable_request.get_headers()
        uri = fuzzable_request.get_uri()
        query_string = uri.get_querystring()

        if self._header_auth:
            for header_name, header_value in self._header_auth.iteritems():
                headers[header_name] = header_value

        if self._query_string_auth:
            for qs_param, qs_value in self._query_string_auth.iteritems():
                query_string[qs_param] = qs_value



        return fuzzable_request

    def _spec_url_generator_common(self, fuzzable_request):
        Generate the potential locations for the open api specification

        :param fuzzable_request: The fuzzable request we get from the core
        :return: URLs to test
        base_url = fuzzable_request.get_url().base_url()

        for directory in self.DIRECTORIES:
            for filename in self.FILENAMES:
                spec_url = base_url.url_join('%s%s' % (directory, filename))

                if not self._should_analyze(spec_url):

                yield spec_url

    def _spec_url_generator_current_path(self, fuzzable_request):
        Generate the potential locations for the open api specification
        based on the current path

        :param fuzzable_request: The fuzzable request we get from the core
        :return: URLs to test
        url = fuzzable_request.get_url()

        # If the user set the swagger.json URL as target, we want to test it
        if self._should_analyze(url):
            yield url

        # Now we create some URLs based on the received URL
        for directory_url in url.get_directories():
            for filename in self.FILENAMES:
                spec_url = directory_url.url_join(filename)

                if not self._should_analyze(spec_url):

                yield spec_url

    def _analyze_current_path(self, fuzzable_request, debugging_id):
        Try to find the common files in the current path.

        This is faster than `_analyze_common_paths` since it doesn't test all
        the directories (such as /api/ , /api/v2/, etc).

        :return: None, we send everything we find to the core.
        args = izip(self._spec_url_generator_current_path(fuzzable_request),

        self.worker_pool.map_multi_args(self._extract_api_calls, args)

    def _has_custom_spec_location(self):
        Checks if the plugin is configured to use a custom API specification
        from a local file.

        :return: True if the plugin is configured to read a custom API spec
        return self._custom_spec_location != ''

    def _analyze_custom_spec(self):
        """
        Loads a custom API specification from a local file, and try to parse it.

        :return: None
        """
        if not self._first_run:
            # NOTE(review): the body of this guard is missing (presumably an
            # early return, the flag is cleared right below).

        self._first_run = False

        url = URL('file://%s' % os.path.abspath(self._custom_spec_location))

        # The extension is used to validate the file type and to build the
        # content-type header of the synthetic response
        ext = os.path.splitext(self._custom_spec_location)[1][1:].lower()
        if ext not in ('yaml', 'json'):
            om.out.error('Skip loading custom API spec '
                         'because of unknown file extension: %s' % ext)
            # NOTE(review): nothing stops the method after this error is
            # logged; an early return seems to be missing.

        with open(self._custom_spec_location, 'r') as f:
            custom_spec_as_string = f.read()

        headers = Headers([('content-type', 'application/%s' % ext)])
        # NOTE(review): truncated call, the remaining constructor arguments
        # and the closing parenthesis are missing.
        http_response = HTTPResponse(200,

        self._extract_api_calls_from_response(url, http_response)

    def get_options(self):
        """
        :return: A list of option objects for this plugin.
        """
        # NOTE(review): most opt_factory() calls below are truncated, and no
        # visible statement adds the created options to `ol`. Only the
        # `header_auth` call is complete.
        ol = OptionList()

        d = 'Query string parameters to add in each API request'
        h = ('Some REST APIs use query string parameters, such as `api_key`'
             ' for authentication. Set this parameter to configure one or more'
             ' query string parameters which will be added to each API HTTP'
             ' request. An example value for this field is: "api_key=0x12345"')
        o = opt_factory('query_string_auth',

        d = 'Headers to add in each API request'
        h = (
            'Some REST APIs use HTTP headers, such as `X-Authenticate` or `Basic`'
            ' for authentication. Set this parameter to configure one or more'
            ' HTTP headers which will be added to each API request.'
            ' An example value for this field is: "Basic: bearer 0x12345"')
        o = opt_factory('header_auth', self._header_auth, d, HEADER, help=h)

        d = 'Disable Open API spec validation'
        h = 'By default, the plugin validates Open API specification before extracting endpoints.'
        o = opt_factory('no_spec_validation',

        d = 'Path to Open API specification'
        h = (
            'By default, the plugin looks for the API specification on the target,'
            ' but sometimes applications do not provide an API specification.'
            ' Set this parameter to specify a local path to the API specification.'
            ' The file must have .json or .yaml extension.')
        o = opt_factory('custom_spec_location',

        d = 'Automatic HTTP header discovery for further testing'
        # NOTE(review): the closing parenthesis of this help text is missing.
        h = (
            'By default, the plugin looks for parameters which are passed to endpoints via HTTP headers,'
            ' and enables them for further testing.'
            ' Set this options to False if you would like to disable this feature.'
            ' You can also set `misc-settings.fuzzable_headers` option to test only specific headers.'
        o = opt_factory('discover_fuzzable_headers',

        d = 'Automatic path parameter discovery for further testing'
        h = (
            'By default, URLs discovered by this plugin allow other plugins'
            ' to inject content into the path only at locations declared as path'
            ' parameters in the Open API specification.'
            ' For example, if the Open API specification declares an endpoint with the path'
            ' `/store/product-{productID}`, only the `{productID}` part of the URL will be'
            ' modified during fuzzing.'
            ' Set this option to False if you would like to disable this feature,'
            ' and instead fuzz all path segments. If this option is set to False,'
            ' the plugin will automatically set `misc-settings.fuzz_url_parts`'
            ' and `misc-settings.fuzz_url_filenames` to True')
        o = opt_factory('discover_fuzzable_url_parts',

        return ol

    def set_options(self, options_list):
        """
        This method sets all the options that are configured using the user
        interface generated by the framework using the result of get_options().

        :param options_list: A dictionary with the options for the plugin.
        :return: No value is returned.
        """
        # The keys are the option names declared in get_options()
        self._query_string_auth = options_list['query_string_auth'].get_value()
        self._header_auth = options_list['header_auth'].get_value()
        self._no_spec_validation = options_list[
            'no_spec_validation'].get_value()
        self._custom_spec_location = options_list[
            'custom_spec_location'].get_value()
        self._discover_fuzzable_headers = options_list[
            'discover_fuzzable_headers'].get_value()
        self._discover_fuzzable_url_parts = options_list[
            'discover_fuzzable_url_parts'].get_value()

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        # NOTE(review): the string literal below is truncated, its text and
        # closing quotes are missing from this copy of the file.
        return """
# Example no. 26
class error_500(GrepPlugin):
    """
    Grep every page for error 500 pages that haven't been identified as bugs by
    other plugins.

    :author: Andres Riancho ([email protected])
    """

    # Response codes in the 4xx / 5xx range which are never reported
    IGNORE_CODES = (404, 403, 401, 405, 400, 501)

    # Responses containing any of these strings are treated as false positives
    FALSE_POSITIVE_STRINGS = ('<h1>Bad Request (Invalid URL)</h1>', )

    def __init__(self):
        """
        Create the store for the (request, response id) tuples which are
        collected by grep().
        """
        # NOTE(review): the base initializer call was missing; this assumes
        # GrepPlugin.__init__ takes no extra arguments - confirm.
        GrepPlugin.__init__(self)

        self._error_500_responses = DiskSet()

    def grep(self, request, response):
        """
        Plugin entry point, identify which requests generated a 500 error.

        Text or HTML responses with a code between 401 and 599 which is not
        in IGNORE_CODES, and which are not known false positives, are stored
        for later analysis in end().

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        if not response.is_text_or_html():
            return

        code = response.get_code()

        if not 400 < code < 600:
            return

        if code in self.IGNORE_CODES:
            return

        if self._is_false_positive(response):
            return

        self._error_500_responses.add((request, response.id))

    def _is_false_positive(self, response):
        Filters out some false positives like this one:

        This false positive is generated by IIS when I send an URL that's "odd"
        Some examples of URLs that trigger this false positive:

        :return: True if the response is a false positive.
        for fps in self.FALSE_POSITIVE_STRINGS:
            if fps in response.get_body():
                return True
        return False

    def end(self):
        """
        This method is called when the plugin wont be used anymore.

        The real job of this plugin is done here, where I will try to see if
        one of the error_500 responses were not identified as a vuln by some
        of my audit plugins
        """
        all_vulns = kb.kb.get_all_vulns()
        # (uri, data container) tuples of every vulnerability in the KB
        all_vulns_tuples = [(v.get_uri(), v.get_dc()) for v in all_vulns]

        for request, error_500_response_id in self._error_500_responses:
            if (request.get_uri(), request.get_dc()) not in all_vulns_tuples:
                # Found a err 500 that wasnt identified !!!
                desc = 'An unidentified web application error (HTTP response'\
                       ' code 500) was found at: "%s". Enable all plugins and'\
                       ' try again, if the vulnerability still is not identified'\
                       ', please verify manually and report it to the w3af'\
                       ' developers.'
                desc = desc % request.get_url()

                # NOTE(review): truncated call, the remaining arguments and
                # the closing parenthesis are missing.
                v = Vuln('Unhandled error in web application',
                         desc, severity.MEDIUM, error_500_response_id,


                self.kb_append_uniq(self, 'error_500', v, 'VAR')


    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        # NOTE(review): the string literal below is truncated, its text and
        # closing quotes are missing from this copy of the file.
        return """
# Example no. 27
class find_captchas(CrawlPlugin):
    """
    Identify captcha images on web pages.

    :author: Andres Riancho ([email protected])
    """
    def __init__(self):
        """
        Create the store for the image URLs which were already identified as
        CAPTCHAs.
        """
        # NOTE(review): the base initializer call was missing; this assumes
        # CrawlPlugin.__init__ takes no extra arguments - confirm.
        CrawlPlugin.__init__(self)

        self._captchas_found = DiskSet(table_prefix='find_captchas')

    def crawl(self, fuzzable_request):
        """
        Find CAPTCHA images.

        Every identified CAPTCHA is stored as an Info object in the `CAPTCHA`
        location of the knowledge base.

        :param fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        """
        result, captchas = self._identify_captchas(fuzzable_request)

        if not result:
            return

        for captcha in captchas:
            desc = 'Found a CAPTCHA image at: "%s".' % captcha.img_src
            response_ids = [response.id
                            for response in captcha.http_responses]

            # NOTE(review): the last argument was truncated; the plugin name
            # follows the Info(name, desc, response_ids, plugin_name) form -
            # confirm against the Info constructor.
            i = Info('Captcha image detected', desc, response_ids,
                     self.get_name())

            kb.kb.append(self, 'CAPTCHA', i)

    def _identify_captchas(self, fuzzable_request):
        :return: A tuple with the following information:
                    * True indicating that the page has CAPTCHAs
                    * A list with tuples that contain:
                        * The CAPTCHA image source
                        * The http responses used to verify that the image was
                          indeed a CAPTCHA
        found_captcha = False
        captchas = []

        # GET the document, and fetch the images
        images_1 = self._get_images(fuzzable_request)

        # Re-GET the document, and fetch the images
        images_2 = self._get_images(fuzzable_request)

        # If the number of images in each response is different, don't even
        # bother to perform any analysis since our simplistic approach will fail
        # TODO: Add something more advanced.
        if len(images_1) == len(images_2):

            not_in_2 = []

            for img_src_1, img_hash_1, http_response_1 in images_1:
                for _, img_hash_2, http_response_2 in images_2:
                    if img_hash_1 == img_hash_2:
                        # The image is in both lists, can't be a CAPTCHA
                    not_in_2.append((img_src_1, img_hash_1,
                                     [http_response_1, http_response_2]))

            # Results
            # TODO: This allows for more than one CAPTCHA in the same page. Does
            #       that make sense? When that's found, should I simply declare
            #       defeat and don't report anything?
            for img_src, _, http_responses in not_in_2:

                CaptchaInfo = namedtuple('CaptchaInfo',
                                         ['img_src', 'http_responses'])
                img_src = img_src.uri2url()

                if img_src not in self._captchas_found:
                    found_captcha = True

                    captchas.append(CaptchaInfo(img_src, http_responses))

        return found_captcha, captchas

    def _get_images(self, fuzzable_request):
        """
        Get all img tags and retrieve the src.

        :param fuzzable_request: The request to modify
        :return: A list with tuples containing (img_src, image_hash, http_response)
        """
        # NOTE(review): this method is damaged in this copy of the file. The
        # indentation and the `except` clause below show that `try:` /
        # `except` / `else:` lines are missing, and the GET() call is
        # truncated.
        res = []

            response = self._uri_opener.GET(fuzzable_request.get_uri(),
            om.out.debug('Failed to retrieve the page for finding captchas.')
            # Do not use parser_cache here, it's not good since CAPTCHA implementations
            # *might* change the image name for each request of the HTML
            # dp = parser_cache.dpc.get_document_parser_for( response )
                document_parser = DocumentParser.DocumentParser(response)
            except BaseFrameworkException:
                return []

            image_path_list = document_parser.get_references_of_tag('img')

            GET = self._uri_opener.GET
            sha1 = hashlib.sha1

            # Retrieve all the images using the worker pool
            result_iter = self.worker_pool.imap_unordered(GET, image_path_list)

            for image_response in result_iter:
                if image_response.is_image():
                    img_src = image_response.get_uri()
                    img_hash = sha1(image_response.get_body()).hexdigest()
                    # The third item is the response of the page which holds
                    # the image, not the response of the image itself
                    res.append((img_src, img_hash, response))

        return res

    def end(self):
        # NOTE(review): the body of this method is missing from this copy of
        # the file and can not be inferred from the surrounding code.

    def get_long_desc(self):
        """
        :return: A DETAILED description of the plugin functions and features.
        """
        # NOTE(review): the string literal below is truncated, its text and
        # closing quotes are missing from this copy of the file.
        return """
# Example no. 28
class retirejs(GrepPlugin):
    """
    Uses retirejs to identify javascript libraries with known vulnerabilities

    :author: Andres Riancho ([email protected])
    """

    # Only the responses to these HTTP methods / with these codes are analyzed
    METHODS = ('GET', )
    HTTP_CODES = (200, )

    # Command template, receives (output file path, file to analyze)
    RETIRE_CMD = 'retire -j --outputformat json --outputpath %s --jspath %s'

    # NOTE(review): methods of this class read self.RETIRE_TIMEOUT, which is
    # not defined in this copy of the file; it needs to be restored.

    def __init__(self):
        """
        Initialize the set of analyzed hashes and the state of the retirejs
        sanity check.
        """
        # NOTE(review): the base initializer call was missing; this assumes
        # GrepPlugin.__init__ takes no extra arguments - confirm.
        GrepPlugin.__init__(self)

        self._analyzed_hashes = DiskSet(table_prefix='retirejs')
        self._retirejs_path = self._get_retirejs_path()

        # Cached result of _retirejs_exit_code(), which is only run once
        self._retirejs_exit_code_result = None
        self._retirejs_exit_code_was_run = False

    def grep(self, request, response):
        """
        Send HTTP responses to retirejs and parse JSON output.

        For performance, avoid running retirejs on the same file more than once.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        if not self._retirejs_exit_code():
            return

        if request.get_method() not in self.METHODS:
            return

        if response.get_code() not in self.HTTP_CODES:
            return

        if not response.is_text_or_html():
            return

        if not self._should_analyze(response):
            return

        # All the filters passed, run retirejs on the response body
        self._analyze_response(response)


    def end(self):
        # NOTE(review): the body of this method is missing from this copy of
        # the file and can not be inferred from the surrounding code.

    def _retirejs_exit_code(self):
        """
        Runs retirejs on an empty file to check that the return code is 0, this
        is just a safety check to make sure everything is working. It is only
        run once.

        :return: True if everything works
        """
        # The result of the first run is cached in the instance
        if self._retirejs_exit_code_was_run:
            return self._retirejs_exit_code_result

        # NOTE(review): both NamedTemporaryFile() calls are truncated, the
        # remaining arguments and closing parentheses are missing.
        check_file = tempfile.NamedTemporaryFile(prefix='retirejs-check-',

        output_file = tempfile.NamedTemporaryFile(prefix='retirejs-output-',

        args = (output_file.name, check_file.name)
        cmd = self.RETIRE_CMD % args

        # NOTE(review): the `try:` line is missing before the call below, and
        # an `else:` seems to be missing before the success branch. As shown,
        # `msg` is built but never logged and both branches run in sequence.
            subprocess.check_output(cmd, shell=True)
        except subprocess.CalledProcessError:
            msg = ('Unexpected retire.js exit code.'
                   ' Disabling grep.retirejs plugin.')

            self._retirejs_exit_code_was_run = True
            self._retirejs_exit_code_result = False
            om.out.debug('retire.js returned the expected exit code.')

            self._retirejs_exit_code_was_run = True
            self._retirejs_exit_code_result = True

        return self._retirejs_exit_code_result

    def _should_analyze(self, response):
        :param response: HTTP response
        :return: True if we should analyze this HTTP response
        # Avoid running this plugin twice on the same URL
        url_hash = hashlib.md5(response.get_url().url_string).hexdigest()
        if url_hash in self._analyzed_hashes:
            return False


        # Avoid running this plugin twice on the same file content
        response_hash = hashlib.md5(response.get_body()).hexdigest()

        if response_hash in self._analyzed_hashes:
            return False

        return True

    def _analyze_response(self, response):
        :return: None, save the findings to the KB.
        response_file = self._save_response_to_file(response)
        json_doc = self._analyze_file(response_file)
        self._json_to_kb(response, json_doc)

    def _save_response_to_file(self, response):
        # Note: The file needs to have .js extension to force retirejs to
        #       scan it. Any other extension will be ignored.
        response_file = tempfile.NamedTemporaryFile(
            prefix='retirejs-response-', suffix='.w3af.js', delete=False)


        return response_file.name

    def _analyze_file(self, response_file):
        """
        Analyze a file and return the result as JSON

        :param response_file: File holding HTTP response body
        :return: JSON document
        """
        # NOTE(review): truncated call, the remaining arguments and the
        # closing parenthesis are missing.
        json_file = tempfile.NamedTemporaryFile(prefix='retirejs-output-',

        args = (json_file.name, response_file)
        cmd = self.RETIRE_CMD % args

        process = subprocess.Popen(cmd, shell=True)

        # This will terminate the retirejs process in case it hangs
        # NOTE(review): the statement which starts the timer is not visible.
        t = Timer(self.RETIRE_TIMEOUT, kill, [process])

        # Wait for the retirejs process to complete
        # NOTE(review): the statement announced by the comment above is
        # missing.

        # Cancel the timer if it wasn't run
        # NOTE(review): the statement announced by the comment above is
        # missing.

        # retirejs will return code != 0 when a vulnerability is found
        # we use this to decide when we need to parse the output
        json_doc = []

        if process.returncode != 0:
                # NOTE(review): the `try:` line which matches the `except`
                # below is missing.
                json_doc = json.loads(file(json_file.name).read())
            except Exception, e:
                msg = 'Failed to parse retirejs output. Exception: "%s"'
                om.out.debug(msg % e)

        return json_doc
# Example no. 29
class DBKnowledgeBase(BasicKnowledgeBase):
    """
    This class saves the data that is sent to it by plugins. It is the only way
    in which plugins can exchange information.

    Data is stored in a DB.

    :author: Andres Riancho ([email protected])
    """
    # (column name, SQL type) pairs used to create the backing table
    COLUMNS = [('location_a', 'TEXT'),
               ('location_b', 'TEXT'),
               ('uniq_id', 'TEXT'),
               ('pickle', 'BLOB')]

    def __init__(self):
        super(DBKnowledgeBase, self).__init__()

        # Nothing is created here besides bookkeeping attributes
        self.initialized = False

        # TODO: Why doesn't this work with a WeakValueDictionary?
        self.observers = dict()  # WeakValueDictionary()
        self._observer_id = 0

    def setup(self):
        Setup all the required backend stores. This was mostly created to avoid
        starting any threads during __init__() which is called during python's
        import phase and dead-locks in some cases.

        :return: None
        with self._kb_lock:
            if self.initialized:

            self.urls = DiskSet(table_prefix='kb_urls')
            self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')

            self.db = get_default_persistent_db_instance()

            self.table_name = 'knowledge_base_' + rand_alpha(30)
            self.db.create_table(self.table_name, self.COLUMNS)
            self.db.create_index(self.table_name, ['location_a', 'location_b'])
            self.db.create_index(self.table_name, ['uniq_id'])

            # Only initialize once
            self.initialized = True

    def clear(self, location_a, location_b):
        """
        Delete every row stored under (location_a, location_b).
        """
        real_location_a = self._get_real_name(location_a)

        sql = "DELETE FROM %s WHERE location_a = ? and location_b = ?"
        self.db.execute(sql % self.table_name, (real_location_a, location_b))

    def raw_write(self, location_a, location_b, value):
        """
        This method saves value to (location_a, location_b) but previously
        clears any pre-existing values.

        :raises TypeError: When value is an Info instance; those are stored
                           with append or append_uniq.
        """
        if isinstance(value, Info):
            raise TypeError('Use append or append_uniq to store vulnerabilities')

        location_a = self._get_real_name(location_a)

        self.clear(location_a, location_b)
        self.append(location_a, location_b, value, ignore_type=True)

    def raw_read(self, location_a, location_b):
        """
        This method reads the value from (location_a, location_b)

        :return: The single stored value, or an empty list when nothing is
                 stored at that location.
        :raises RuntimeError: When more than one value is stored there.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=False)

        if len(result) > 1:
            msg = 'Incorrect use of raw_write/raw_read, found %s results.'
            raise RuntimeError(msg % len(result))
        elif len(result) == 0:
            return []
        else:
            return result[0]

    def get_one(self, location_a, location_b):
        """
        This method reads the value from (location_a, location_b), checking it's
        type and making sure only one is stored at that address.

        Similar to raw_read, but checking types.

        :see: https://github.com/andresriancho/w3af/issues/3955
        :return: The single stored value, or an empty list when nothing is
                 stored at that location.
        :raises RuntimeError: When more than one value is stored there.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=True)

        if len(result) > 1:
            msg = 'Incorrect use of get_one(), found %s results.'
            # Report how many results were found, not the results themselves
            raise RuntimeError(msg % len(result))
        elif len(result) == 0:
            return []
        else:
            return result[0]

    def _get_uniq_id(self, obj):
        """
        :param obj: The object that is about to be stored
        :return: A string id: the object's own get_uniq_id() for Info and
                 InfoSet instances, otherwise a string built from hash().
        """
        if isinstance(obj, (Info, InfoSet)):
            return obj.get_uniq_id()

        if isinstance(obj, collections.Iterable):
            # Hash the concatenated string form of every item
            concat_all = ''.join([str(i) for i in obj])
            return str(hash(concat_all))

        return str(hash(obj))

    def append(self, location_a, location_b, value, ignore_type=False):
        """
        This method appends the location_b value to a dict.

        :param ignore_type: When True, skip the check that value is an Info,
                            Shell or InfoSet instance.
        :raises TypeError: When the type check is enabled and fails.
        """
        if not ignore_type and not isinstance(value, (Info, Shell, InfoSet)):
            msg = 'You MUST use raw_write/raw_read to store non-info objects'\
                  ' to the KnowledgeBase.'
            raise TypeError(msg)

        location_a = self._get_real_name(location_a)
        uniq_id = self._get_uniq_id(value)

        pickled_obj = cpickle_dumps(value)
        t = (location_a, location_b, uniq_id, pickled_obj)

        query = "INSERT INTO %s VALUES (?, ?, ?, ?)" % self.table_name
        self.db.execute(query, t)
        self._notify_observers(self.APPEND, location_a, location_b, value,
                               ignore_type=ignore_type)

    def get(self, location_a, location_b, check_types=True):
        """
        :param location_a: The plugin that saved the data to the
                           kb.info Typically the name of the plugin,
                           but could also be the plugin instance.

        :param location_b: The name of the variables under which the vuln
                           objects were saved. Typically the same name of
                           the plugin, or something like "vulns", "errors",
                           etc. In most cases this is NOT None. When set
                           to None, all the objects saved under location_a
                           are returned.

        :param check_types: When True, raise TypeError if a stored object is
                            not an Info, InfoSet or Shell instance.

        :return: A list with the data that was saved by another plugin.
        """
        location_a = self._get_real_name(location_a)

        if location_b is None:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'
            params = (location_a,)
        else:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'\
                                           ' and location_b = ?'
            params = (location_a, location_b)

        result_lst = []

        results = self.db.select(query % self.table_name, params)
        for r in results:
            obj = cPickle.loads(r[0])

            if check_types and not isinstance(obj, (Info, InfoSet, Shell)):
                raise TypeError('Use raw_write and raw_read to query the'
                                ' knowledge base for non-Info objects')

            result_lst.append(obj)

        return result_lst

    def get_by_uniq_id(self, uniq_id):
        """
        :param uniq_id: The unique id to look for
        :return: The unpickled object stored with that id, or None when
                 select_one() finds no row.
        """
        sql = 'SELECT pickle FROM %s WHERE uniq_id = ?' % self.table_name
        row = self.db.select_one(sql, (uniq_id,))

        if row is None:
            return None

        return cPickle.loads(row[0])

    def update(self, old_info, update_info):
        """
        :param old_info: The info/vuln instance to be updated in the kb.
        :param update_info: The info/vuln instance with new information
        :return: Nothing
        :raises TypeError: When either argument is not an Info, InfoSet or
                           Shell instance.
        :raises DBException: When the UPDATE statement changed no rows.
        """
        old_not_info = not isinstance(old_info, (Info, InfoSet, Shell))
        update_not_info = not isinstance(update_info, (Info, InfoSet, Shell))

        if old_not_info or update_not_info:
            msg = 'You MUST use raw_write/raw_read to store non-info objects'\
                  ' to the KnowledgeBase.'
            raise TypeError(msg)

        old_uniq_id = old_info.get_uniq_id()
        new_uniq_id = update_info.get_uniq_id()
        pickle = cpickle_dumps(update_info)

        # Update the pickle and unique_id after finding by original uniq_id
        query = "UPDATE %s SET pickle = ?, uniq_id = ? WHERE uniq_id = ?"

        params = (pickle, new_uniq_id, old_uniq_id)
        result = self.db.execute(query % self.table_name, params).result()

        if result.rowcount:
            self._notify_observers(self.UPDATE, old_info, update_info)
        else:
            ex = 'Failed to update() %s instance because' \
                 ' the original unique_id (%s) does not exist in the DB,' \
                 ' or the new unique_id (%s) is invalid.'
            raise DBException(ex % (old_info.__class__.__name__,
                                    old_uniq_id,
                                    new_uniq_id))

    def add_observer(self, observer):
        """
        Add the observer instance to the list.

        :param observer: The observer to register; it is stored under a newly
                         generated observer id.
        """
        observer_id = self.get_observer_id()
        self.observers[observer_id] = observer

    def get_observer_id(self):
        """
        :return: The next observer id; the internal counter is incremented on
                 every call.
        """
        next_id = self._observer_id + 1
        self._observer_id = next_id
        return next_id

    def _notify_observers(self, method, *args, **kwargs):
        Call the observer if the location_a/location_b matches with the
        configured observers.

        :return: None
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for _, observer in self.observers.items()[:]:
            functor = getattr(observer, method)
            functor(*args, **kwargs)

    def get_all_entries_of_class(self, klass):
        """
        :param klass: The class (or tuple of classes) to filter by
        :return: A list of all stored objects that are instances of klass.
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            if isinstance(obj, klass):
                result_lst.append(obj)

        return result_lst

    def get_all_vulns(self):
        """
        :return: A list of all info instances with severity in (LOW, MEDIUM,
                 HIGH)
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            # Stored objects without a severity are ignored
            if hasattr(obj, 'get_severity'):
                severity = obj.get_severity()
                if severity in (LOW, MEDIUM, HIGH):
                    result_lst.append(obj)

        return result_lst

    def get_all_infos(self):
        """
        :return: A list of all info instances with severity eq INFORMATION
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for r in results:
            obj = cPickle.loads(r[0])
            # Stored objects without a severity are ignored
            if hasattr(obj, 'get_severity'):
                severity = obj.get_severity()
                if severity in (INFORMATION,):
                    result_lst.append(obj)

        return result_lst

    def dump(self):
        """
        :return: A dict of the form {location_a: {location_b: [obj, ...]}}
                 holding every object stored in the table.
        """
        result_dict = {}

        query = 'SELECT location_a, location_b, pickle FROM %s'
        results = self.db.select(query % self.table_name)

        for location_a, location_b, pickled_obj in results:
            obj = cPickle.loads(pickled_obj)

            # Create the nested containers on first use, then always append so
            # that several objects under the same location are all kept
            by_location_b = result_dict.setdefault(location_a, {})
            by_location_b.setdefault(location_b, []).append(obj)

        return result_dict

    def cleanup(self):
        """
        Cleanup internal data.

        Deletes every row from the knowledge base table and replaces the URL
        and fuzzable request sets with new DiskSet instances.
        """
        self.db.execute("DELETE FROM %s WHERE 1=1" % self.table_name)

        # Remove the old, create new.
        # NOTE(review): the previous DiskSet instances are only dereferenced
        # here; confirm whether they need an explicit cleanup call.
        self.urls = DiskSet(table_prefix='kb_urls')

        self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')


    def remove(self):

    def get_all_known_urls(self):
        """
        :return: A DiskSet with all the known URLs as URL objects.
        """
        return self.urls

    def add_url(self, url):
        """
        Notify the observers about the URL and add it to the known URL set.

        :param url: The URL instance to add
        :return: True if the URL was previously unknown
        :raises TypeError: When url is not a URL instance.
        """
        if not isinstance(url, URL):
            msg = 'add_url requires a URL as parameter got %s instead.'
            raise TypeError(msg % type(url))

        # Observers are notified even when the URL was already known
        self._notify_observers(self.ADD_URL, url)
        return self.urls.add(url)

    def get_all_known_fuzzable_requests(self):
        """
        :return: A DiskSet with all the known fuzzable requests.
        """
        return self.fuzzable_requests

    def add_fuzzable_request(self, fuzzable_request):
        """
        :param fuzzable_request: The FuzzableRequest instance to add
        :return: True if the FuzzableRequest was previously unknown
        :raises TypeError: When fuzzable_request is not a FuzzableRequest.
        """
        if not isinstance(fuzzable_request, FuzzableRequest):
            msg = 'add_fuzzable_request requires a FuzzableRequest as '\
                  'parameter, got "%s" instead.'
            raise TypeError(msg % type(fuzzable_request))

        return self.fuzzable_requests.add(fuzzable_request)
Ejemplo n.º 30
class find_captchas(CrawlPlugin):
    """
    Identify captcha images on web pages.
    :author: Andres Riancho ([email protected])
    """

    def __init__(self):

        # Image sources already seen; _identify_captchas() checks membership
        # in this set before reporting a CAPTCHA
        self._captchas_found = DiskSet(table_prefix='find_captchas')

    def crawl(self, fuzzable_request):
        """
        Find CAPTCHA images.

        :param fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        :return: None, the findings are saved to the KB.
        """
        result, captchas = self._identify_captchas(fuzzable_request)
        if not result:
            return

        for captcha in captchas:
            # Remember the image source: _identify_captchas() skips sources
            # that are already in this set, so each one is reported once
            self._captchas_found.add(captcha.img_src)

            desc = 'Found a CAPTCHA image at: "%s".' % captcha.img_src
            response_ids = [response.id for response in captcha.http_responses]

            i = Info('Captcha image detected', desc, response_ids, self.get_name())

            kb.kb.append(self, 'CAPTCHA', i)

    def _identify_captchas(self, fuzzable_request):
        :return: A tuple with the following information:
                    * True indicating that the page has CAPTCHAs
                    * A list with tuples that contain:
                        * The CAPTCHA image source
                        * The http responses used to verify that the image was
                          indeed a CAPTCHA
        found_captcha = False
        captchas = []
        # GET the document, and fetch the images
        images_1 = self._get_images(fuzzable_request)

        # Re-GET the document, and fetch the images
        images_2 = self._get_images(fuzzable_request)

        # If the number of images in each response is different, don't even
        # bother to perform any analysis since our simplistic approach will fail
        # TODO: Add something more advanced.
        if len(images_1) == len(images_2):

            not_in_2 = []

            for img_src_1, img_hash_1, http_response_1 in images_1:
                for _, img_hash_2, http_response_2 in images_2:
                    if img_hash_1 == img_hash_2:
                        # The image is in both lists, can't be a CAPTCHA
                    not_in_2.append((img_src_1, img_hash_1, [http_response_1, http_response_2]))

            # Results
            # TODO: This allows for more than one CAPTCHA in the same page. Does
            #       that make sense? When that's found, should I simply declare
            #       defeat and don't report anything?
            for img_src, _, http_responses in not_in_2:

                CaptchaInfo = namedtuple('CaptchaInfo', ['img_src',
                img_src = img_src.uri2url()
                if img_src not in self._captchas_found:
                    found_captcha = True
                    captchas.append(CaptchaInfo(img_src, http_responses))
        return found_captcha, captchas
    def _get_images(self, fuzzable_request):
        Get all img tags and retrieve the src.

        :param fuzzable_request: The request to modify
        :return: A list with tuples containing (img_src, image_hash, http_response)
        res = []

            response = self._uri_opener.GET(fuzzable_request.get_uri(),
            om.out.debug('Failed to retrieve the page for finding captchas.')
            # Do not use parser_cache here, it's not good since CAPTCHA implementations
            # *might* change the image name for each request of the HTML
            #dp = parser_cache.dpc.get_document_parser_for( response )
                document_parser = DocumentParser.DocumentParser(response)
            except BaseFrameworkException:
                return []
            image_path_list = document_parser.get_references_of_tag('img')

            GET = self._uri_opener.GET
            sha1 = hashlib.sha1
            result_iter = self.worker_pool.imap_unordered(GET, image_path_list)
            for image_response in result_iter:
                if image_response.is_image():
                    img_src = image_response.get_uri()
                    img_hash = sha1(image_response.get_body()).hexdigest()
                    res.append((img_src, img_hash, response))

        return res

    def end(self):

    def get_long_desc(self):
        :return: A DETAILED description of the plugin functions and features.
        return """
Ejemplo n.º 31
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    # Size passed to the parser LRU dict
    CACHE_SIZE = 10
    # Parsers for bodies of this length (1 MiB) or more are not cached
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        # Parser cache and the (ten times larger) cache of can_parse results
        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(10 * self.CACHE_SIZE)

        # Events keyed by the response hash string
        self._parser_finished_events = dict()

        # Hash strings of the responses that will not be parsed again
        self._parser_blacklist = DiskSet()

    def clear(self):
        Clear all the internal variables
        :return: None
        # Stop any workers

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):

        # We don't need the parsers anymore

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response, which
                 is when the body is shorter than MAX_CACHEABLE_BODY_LEN
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        response_id = http_response.get_id()
        cached_can_parse = self._can_parse_cache.get(response_id, default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        # We need to verify if we can parse this HTTP response
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except Exception:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        # Store the result under the response id, which is the key used for
        # the lookup above
        self._can_parse_cache[response_id] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response never again.

        :param hash_string: The string that identifies the HTTP response
        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        # We know that we can parse this document, lets work!
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            return parser
            # Not in cache, have to work.

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException, e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
Ejemplo n.º 32
class error_500(GrepPlugin):
    Grep every page for error 500 pages that haven't been identified as bugs by
    other plugins.

    :author: Andres Riancho ([email protected])

    IGNORE_CODES = (404, 403, 401, 405, 400, 501)
    FALSE_POSITIVE_STRINGS = ('<h1>Bad Request (Invalid URL)</h1>',

    def __init__(self):

        # (request, response id) tuples collected by grep() and processed
        # in end()
        self._error_500_responses = DiskSet(table_prefix='error_500')

    def grep(self, request, response):
        """
        Plugin entry point, identify which requests generated a 500 error.

        The (request, response id) pair is stored when the response is text
        or HTML, its code is between 401 and 599, the code is not listed in
        IGNORE_CODES and the body is not a known false positive.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        if not response.is_text_or_html():
            return

        code = response.get_code()
        if not 400 < code < 600 or code in self.IGNORE_CODES:
            return

        if self._is_false_positive(response):
            return

        self._error_500_responses.add((request, response.id))

    def _is_false_positive(self, response):
        Filters out some false positives like this one:

        This false positive is generated by IIS when I send an URL that's "odd"
        Some examples of URLs that trigger this false positive:

        :return: True if the response is a false positive.
        for fps in self.FALSE_POSITIVE_STRINGS:
            if fps in response.get_body():
                return True
        return False

    def end(self):
        """
        This method is called when the plugin wont be used anymore.

        The real job of this plugin is done here, where I will try to see if
        one of the error_500 responses were not identified as a vuln by some
        of my audit plugins
        """
        # Collect the response ids referenced by every finding in the KB
        all_vuln_ids = set()

        for info in kb.kb.get_all_findings():
            for _id in info.get_id():
                all_vuln_ids.add(_id)

        for request, error_500_response_id in self._error_500_responses:

            if error_500_response_id not in all_vuln_ids:
                # Found a error 500 that wasn't identified !
                desc = 'An unidentified web application error (HTTP response'\
                       ' code 500) was found at: "%s". Enable all plugins and'\
                       ' try again, if the vulnerability still is not'\
                       ' identified, please verify manually and report it to'\
                       ' the w3af developers.'
                desc = desc % request.get_url()

                v = Vuln('Unhandled error in web application', desc,
                         severity.MEDIUM, error_500_response_id,
                         self.get_name())

                self.kb_append_uniq(self, 'error_500', v, 'VAR')


    def get_long_desc(self):
        :return: A DETAILED description of the plugin functions and features.
        return """
Ejemplo n.º 33
class dot_ds_store(CrawlPlugin):
    """
    Search .DS_Store file and checks for files containing.

    :author: Tomas Velazquez ( [email protected] )
    :author: Andres Riancho ( [email protected] )

    :credits: This code was based in cpan Mac::Finder::DSStore by Wim Lewis ( [email protected] )
    """
    # File name that is requested in every directory
    DS_STORE = '.DS_Store'

    def __init__(self):

        # Internal variables
        # crawl() checks membership in this set before testing a directory
        self._analyzed_dirs = DiskSet()

    def crawl(self, fuzzable_request, debugging_id):
        """
        For every directory, fetch a list of files and analyze the response.

        :param debugging_id: A unique identifier for this call to discover()
        :param fuzzable_request: A fuzzable_request instance that contains
                                 (among other things) the URL to test.
        """
        directories_to_check = []

        for domain_path in fuzzable_request.get_url().get_directories():
            if domain_path not in self._analyzed_dirs:
                # Remember the directory so it is only requested once
                self._analyzed_dirs.add(domain_path)
                directories_to_check.append(domain_path)

        # Send the requests using threads
        self.worker_pool.map(self._check_and_analyze, directories_to_check)

    def _check_and_analyze(self, domain_path):
        """
        Check if a .DS_Store filename exists in the domain_path.

        :param domain_path: The directory URL where the file is requested
        :return: None, everything is saved to the self.out_queue.
        """
        # Request the file
        url = domain_path.url_join(self.DS_STORE)

        try:
            response = self.http_get_and_parse(url, binary_response=True)
        except BaseFrameworkException as w3:
            msg = 'Failed to GET .DS_Store file: %s. Exception: %s.'
            # Format the message before logging it
            om.out.debug(msg % (url, w3))
            return

        # Check if it's a .DS_Store file
        if is_404(response):
            return

        try:
            store = DsStore(response.get_raw_body())
            entries = store.get_file_entries()
        except Exception as e:
            # Any parsing failure only means there is nothing to report
            om.out.debug('Unexpected error while parsing DS_Store file: "%s"' % e)
            return

        # Request every file name listed in the .DS_Store file
        parsed_url_list = [domain_path.url_join(filename)
                           for filename in entries]

        self.worker_pool.map(self.http_get_and_parse, parsed_url_list)

        desc = ('A .DS_Store file was found at: %s. The contents of this file'
                ' disclose filenames')
        desc %= (response.get_url())

        v = Vuln('.DS_Store file found', desc, severity.LOW, response.id, self.get_name())

        kb.kb.append(self, 'dot_ds_store', v)
        om.out.vulnerability(v.get_desc(), severity=v.get_severity())

    def get_long_desc(self):
        :return: A DETAILED description of the plugin functions and features.
        return '''
Ejemplo n.º 34
class DBKnowledgeBase(BasicKnowledgeBase):
    """
    This class saves the data that is sent to it by plugins. It is the only way
    in which plugins can exchange information.

    Data is stored in a DB.

    :author: Andres Riancho ([email protected])
    """
    def __init__(self):
        super(DBKnowledgeBase, self).__init__()

        self.urls = DiskSet()
        self.fuzzable_requests = DiskSet()

        self.db = get_default_persistent_db_instance()

        # (column name, SQL type) pairs for the backing table
        columns = [('location_a', 'TEXT'), ('location_b', 'TEXT'),
                   ('uniq_id', 'TEXT'), ('pickle', 'BLOB')]

        # The table gets a random name
        self.table_name = rand_alpha(30)
        self.db.create_table(self.table_name, columns)
        self.db.create_index(self.table_name, ['location_a', 'location_b'])
        self.db.create_index(self.table_name, ['uniq_id'])

        # TODO: Why doesn't this work with a WeakValueDictionary?
        self.observers = {}  #WeakValueDictionary()
        self.type_observers = {}  #WeakValueDictionary()
        self.url_observers = []
        self._observer_id = 0

    def clear(self, location_a, location_b):
        """
        Delete every row stored under (location_a, location_b).
        """
        real_location_a = self._get_real_name(location_a)

        sql = "DELETE FROM %s WHERE location_a = ? and location_b = ?"
        self.db.execute(sql % self.table_name, (real_location_a, location_b))

    def raw_write(self, location_a, location_b, value):
        """
        This method saves value to (location_a, location_b) but previously
        clears any pre-existing values.

        :raises TypeError: When value is an Info instance; those are stored
                           with append or append_uniq.
        """
        if isinstance(value, Info):
            raise TypeError(
                'Use append or append_uniq to store vulnerabilities')

        location_a = self._get_real_name(location_a)

        self.clear(location_a, location_b)
        self.append(location_a, location_b, value, ignore_type=True)

    def raw_read(self, location_a, location_b):
        """
        This method reads the value from (location_a, location_b)

        :return: The single stored value, or an empty list when nothing is
                 stored at that location.
        :raises RuntimeError: When more than one value is stored there.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=False)

        if len(result) > 1:
            msg = 'Incorrect use of raw_write/raw_read, found %s rows.'
            # Report the number of rows, not the rows themselves
            raise RuntimeError(msg % len(result))
        elif len(result) == 0:
            return []
        else:
            return result[0]

    def _get_uniq_id(self, obj):
        """
        :param obj: The object that is about to be stored
        :return: A string id: the object's own get_uniq_id() for Info
                 instances, otherwise a string built from hash().
        """
        if isinstance(obj, Info):
            return obj.get_uniq_id()

        if isinstance(obj, collections.Iterable):
            # Hash the concatenated string form of every item
            concat_all = ''.join([str(i) for i in obj])
            return str(hash(concat_all))

        return str(hash(obj))

    def append(self, location_a, location_b, value, ignore_type=False):
        """
        This method appends the location_b value to a dict.

        :param ignore_type: When True, skip the check that value is an Info
                            or Shell instance.
        :raises TypeError: When the type check is enabled and fails.
        """
        if not ignore_type and not isinstance(value, (Info, Shell)):
            msg = 'You MUST use raw_write/raw_read to store non-info objects'\
                  ' to the KnowledgeBase.'
            raise TypeError(msg)

        location_a = self._get_real_name(location_a)
        uniq_id = self._get_uniq_id(value)

        pickled_obj = cPickle.dumps(value)
        t = (location_a, location_b, uniq_id, pickled_obj)

        query = "INSERT INTO %s VALUES (?, ?, ?, ?)" % self.table_name
        self.db.execute(query, t)

        # Let the registered observers know about the new value
        self._notify(location_a, location_b, value)

    def get(self, location_a, location_b, check_types=True):
        """
        :param location_a: The plugin that saved the data to the
                           kb.info Typically the name of the plugin,
                           but could also be the plugin instance.

        :param location_b: The name of the variables under which the vuln
                           objects were saved. Typically the same name of
                           the plugin, or something like "vulns", "errors",
                           etc. In most cases this is NOT None. When set
                           to None, all the objects saved under location_a
                           are returned.

        :param check_types: When True, raise TypeError if a stored object is
                            not an Info or Shell instance.

        :return: A list with the data that was saved by another plugin.
        """
        location_a = self._get_real_name(location_a)

        if location_b is None:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'
            params = (location_a, )
        else:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'\
                                           ' and location_b = ?'
            params = (location_a, location_b)

        result_lst = []

        results = self.db.select(query % self.table_name, params)
        for r in results:
            obj = cPickle.loads(r[0])

            if check_types and not isinstance(obj, (Info, Shell)):
                raise TypeError('Use raw_write and raw_read to query the'
                                ' knowledge base for non-Info objects')

            result_lst.append(obj)

        return result_lst

    def get_by_uniq_id(self, uniq_id):
        """
        :param uniq_id: The unique id to look for
        :return: The unpickled object stored with that id, or None when
                 select_one() finds no row.
        """
        sql = 'SELECT pickle FROM %s WHERE uniq_id = ?' % self.table_name
        row = self.db.select_one(sql, (uniq_id, ))

        if row is None:
            return None

        return cPickle.loads(row[0])

    def add_observer(self, location_a, location_b, observer):
        """
        Add the observer function to the observer list. The function will be
        called when there is a change in (location_a, location_b).

        You can use None in location_a or location_b as wildcards.

        The observer function needs to be a function which takes three params:
            * location_a
            * location_b
            * value that's added to the kb location

        :return: None
        :raises TypeError: When location_a or location_b is neither a string
                           nor None.
        """
        # Both locations are validated
        if not isinstance(location_a, (basestring, types.NoneType)) or \
        not isinstance(location_b, (basestring, types.NoneType)):
            raise TypeError('Observer locations need to be strings or None.')

        observer_id = self.get_observer_id()
        self.observers[(location_a, location_b, observer_id)] = observer

    def add_types_observer(self, type_filter, observer):
        """
        Add the observer function to the list of functions to be called when a
        new object that is of type "type_filter" is added to the KB.

        :param type_filter: Must be one of Info, Vuln or Shell.
        :param observer: The callable to register
        :raises TypeError: When type_filter is not one of the allowed classes
        :return: None
        """
        if type_filter not in (Info, Vuln, Shell):
            msg = 'The type_filter needs to be one of Info, Vuln or Shell'
            raise TypeError(msg)

        observer_id = self.get_observer_id()
        self.type_observers[(type_filter, observer_id)] = observer

    def get_observer_id(self):
        """
        Advance the internal observer counter by one.

        :return: The new counter value.
        """
        next_id = self._observer_id + 1
        self._observer_id = next_id
        return next_id

    def _notify(self, location_a, location_b, value):
        Call the observer if the location_a/location_b matches with the
        configured observers.
        :return: None
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for (obs_loc_a, obs_loc_b, _), observer in self.observers.items()[:]:

            if obs_loc_a is None and obs_loc_b is None:
                observer(location_a, location_b, value)

            if obs_loc_a == location_a and obs_loc_b is None:
                observer(location_a, location_b, value)

            if obs_loc_a == location_a and obs_loc_b == location_b:
                observer(location_a, location_b, value)

        for (type_filter, _), observer in self.type_observers.items()[:]:
            if isinstance(value, type_filter):
                observer(location_a, location_b, value)

    def get_all_entries_of_class(self, klass):
        """
        :param klass: The class (or tuple of classes) to filter by.
        :return: A list of all the objects stored in the table which are
                 instances of klass (isinstance() is used, so subclasses
                 are included).
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        entries = (cPickle.loads(r[0]) for r in results)
        return [obj for obj in entries if isinstance(obj, klass)]

    def dump(self):
        """
        Load every row of the table into a nested dict.

        :return: A dict with the form {location_a: {location_b: [obj, ...]}}
                 where the objects are appended in the order returned by
                 the query.
        """
        result_dict = {}

        query = 'SELECT location_a, location_b, pickle FROM %s'
        results = self.db.select(query % self.table_name)

        for location_a, location_b, pickled in results:
            obj = cPickle.loads(pickled)

            # setdefault() creates the missing levels on first use
            by_location_b = result_dict.setdefault(location_a, {})
            by_location_b.setdefault(location_b, []).append(obj)

        return result_dict

    def cleanup(self):
        """
        Cleanup internal data: delete all the rows from the table and replace
        the URL and fuzzable request sets with new, empty DiskSet instances.
        """
        self.db.execute("DELETE FROM %s WHERE 1=1" % self.table_name)

        # Remove the old, create new.
        self.urls = DiskSet()

        self.fuzzable_requests = DiskSet()


    def remove(self):

    def get_all_known_urls(self):
        """
        :return: A DiskSet with all the known URLs as URL objects.
        """
        return self.urls

    def add_url_observer(self, observer):

    def _notify_url_observers(self, new_url):
        Call the observer with new_url.
        :return: None
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for observer in self.url_observers[:]:

    def add_url(self, url):
        """
        :param url: The URL instance to store
        :raises TypeError: When url is not a URL instance
        :return: True if the URL was previously unknown
        """
        if not isinstance(url, URL):
            msg = 'add_url requires a URL as parameter got %s instead.'
            raise TypeError(msg % type(url))

        return self.urls.add(url)

    def get_all_known_fuzzable_requests(self):
        """
        :return: The set (self.fuzzable_requests) holding all the known
                 fuzzable requests.
        """
        return self.fuzzable_requests

    def add_fuzzable_request(self, fuzzable_request):
        """
        :param fuzzable_request: The FuzzableRequest instance to store
        :raises TypeError: When fuzzable_request is not a FuzzableRequest
        :return: True if the FuzzableRequest was previously unknown
        """
        if not isinstance(fuzzable_request, FuzzableRequest):
            msg = 'add_fuzzable_request requires a FuzzableRequest as '\
                  'parameter, got "%s" instead.'
            raise TypeError(msg % type(fuzzable_request))

        return self.fuzzable_requests.add(fuzzable_request)
Ejemplo n.º 35
class web_spider(CrawlPlugin):
    Crawl the web application.

    :author: Andres Riancho ([email protected])
    UNAUTH_FORBID = {http_constants.UNAUTHORIZED, http_constants.FORBIDDEN}

    def __init__(self):

        # Internal variables
        self._compiled_ignore_re = None
        self._compiled_follow_re = None
        self._broken_links = DiskSet(table_prefix='web_spider')
        self._first_run = True
        self._target_urls = []
        self._target_domain = None
        self._already_filled_form = ScalableBloomFilter()
        self._variant_db = VariantDB()

        # User configured variables
        self._ignore_regex = ''
        self._follow_regex = '.*'
        self._only_forward = False
        self._ignore_extensions = []

    def crawl(self, fuzzable_request, debugging_id):
        Searches for links on the html.

        :param debugging_id: A unique identifier for this call to discover()
        :param fuzzable_request: A fuzzable_req instance that contains
                                 (among other things) the URL to test.

        # If it is a form, then smart_fill the parameters to send something that
        # makes sense and will allow us to cover more code.
        data_container = fuzzable_request.get_raw_data()
        if isinstance(data_container, Form):

            if fuzzable_request.get_url() in self._already_filled_form:


        # Send the HTTP request
        resp = self._uri_opener.send_mutant(fuzzable_request)

        # Nothing to do here...
        if resp.get_code() == http_constants.UNAUTHORIZED:

        # Nothing to do here...
        if resp.is_image():

        # And we don't trust what comes from the core, check if 404
        if is_404(resp):

        self._extract_html_forms(resp, fuzzable_request)
        self._extract_links_and_verify(resp, fuzzable_request)

    def _extract_html_forms(self, resp, fuzzable_req):
        Parses the HTTP response body and extract HTML forms, resulting forms
        are put() on the output queue.
        # Try to find forms in the document
            dp = parser_cache.dpc.get_document_parser_for(resp)
        except BaseFrameworkException:
            # Failed to find a suitable parser for the document

        # Create one FuzzableRequest for each form variant
        mode = cf.cf.get('form_fuzzing_mode')
        for form_params in dp.get_forms():

            # Form exclusion #15161
            form_id_json = form_params.get_form_id().to_json()
            om.out.debug('A new form was found! Form-id is: "%s"' % form_id_json)

            if not self._should_analyze_url(form_params.get_action()):

            headers = fuzzable_req.get_headers()

            for form_params_variant in form_params.get_variants(mode):
                data_container = dc_from_form_params(form_params_variant)

                # Now data_container is one of Multipart of URLEncoded form
                # instances, which is a DataContainer. Much better than the
                # FormParameters instance we had before in form_params_variant
                r = FuzzableRequest.from_form(data_container, headers=headers)

    def _handle_first_run(self):
        """
        Initialize the target URL list and the target domain the first time
        this method is called; later calls do nothing.

        :return: None
        """
        if not self._first_run:
            return

        # I have to set some variables, in order to be able to code
        # the "only_forward" feature
        self._first_run = False

        # The targets are read only once so the URL list and the domain are
        # always computed from the same value.
        targets = cf.cf.get('targets')
        self._target_urls = [i.uri2url() for i in targets]

        # The following line triggered lots of bugs when the "stop" button
        # was pressed and the core did this: "cf.cf.save('targets', [])"
        #self._target_domain = cf.cf.get('targets')[0].get_domain()
        #    Changing it to something awful but bug-free.
        if not targets:
            return

        self._target_domain = targets[0].get_domain()
    def _urls_to_verify_generator(self, resp, fuzzable_req):
        """
        Yields tuples containing:
            * Newly found URL
            * The FuzzableRequest instance passed as parameter
            * The HTTPResponse generated by the FuzzableRequest
            * Boolean indicating if we trust this reference or not

        Only the references accepted by _should_verify_extracted_url() are
        yielded.

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        """
        gen = itertools.chain(self._url_path_url_generator(resp, fuzzable_req),
                              self._body_url_generator(resp, fuzzable_req),
                              headers_url_generator(resp, fuzzable_req))

        # Loop names differ from the parameters to avoid shadowing them
        for ref, origin_req, original_resp, possibly_broken in gen:
            if self._should_verify_extracted_url(ref, original_resp):
                yield ref, origin_req, original_resp, possibly_broken

    def _url_path_url_generator(self, resp, fuzzable_req):
        """
        Yields tuples containing:
            * Newly found URL
            * The FuzzableRequest instance passed as parameter
            * The HTTPResponse generated by the FuzzableRequest
            * Boolean indicating if we trust this reference or not

        The URLs are the directories of the response URL and the boolean is
        always False.

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        """
        # Analyze all directories, if the URL w3af just found is:
        #   http://localhost/a/b/c/f00.php
        # I want to GET:
        #   http://localhost/a/b/c/
        #   http://localhost/a/b/
        #   http://localhost/a/
        #   http://localhost/
        # And analyze the responses...
        dirs = resp.get_url().get_directories()

        for ref in unique_justseen(dirs):
            yield ref, fuzzable_req, resp, False

    def _body_url_generator(self, resp, fuzzable_req):
        """
        Yields tuples containing:
            * Newly found URL
            * The FuzzableRequest instance passed as parameter
            * The HTTPResponse generated by the FuzzableRequest
            * Boolean indicating if we trust this reference or not

        The newly found URLs are extracted from the http response body using
        one of the framework's parsers. Nothing is yielded when no parser is
        available for the response.

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        """
        # Note: I WANT to follow links that are in the 404 page.
        try:
            doc_parser = parser_cache.dpc.get_document_parser_for(resp)
        except BaseFrameworkException as w3:
            om.out.debug('Failed to find a suitable document parser. '
                         'Exception "%s"' % w3)
        else:
            # Note:
            # - With parsed_refs I'm 100% that it's really
            #   something in the HTML that the developer intended to add.
            # - The re_refs are the result of regular expressions,
            #   which in some cases are just false positives.
            parsed_refs, re_refs = doc_parser.get_references()

            dirs = resp.get_url().get_directories()
            only_re_refs = set(re_refs) - set(dirs + parsed_refs)

            all_refs = itertools.chain(parsed_refs, re_refs)
            resp_is_404 = is_404(resp)

            for ref in unique_justseen(sorted(all_refs)):
                # References found only by the regular expressions, or
                # found in a 404 page, are flagged as possibly broken
                possibly_broken = resp_is_404 or (ref in only_re_refs)
                yield ref, fuzzable_req, resp, possibly_broken

    def _should_analyze_url(self, ref):
        """
        Apply the domain, follow/ignore regex, ignored extension and
        only-forward filters (in that order) to ref. A debug message is
        written for the first filter that rejects the URL.

        :param ref: A URL instance to match against the user configured filters
        :return: True if we should navigate to this URL
        """
        # I don't want w3af sending requests to 3rd parties!
        if ref.get_domain() != self._target_domain:
            msg = 'web_spider will ignore %s (different domain name)'
            args = (ref.get_domain(),)
            om.out.debug(msg % args)
            return False

        # Filter the URL according to the configured regular expressions
        if not self._compiled_follow_re.match(ref.url_string):
            msg = 'web_spider will ignore %s (not match follow regex)'
            args = (ref.url_string,)
            om.out.debug(msg % args)
            return False

        if self._compiled_ignore_re.match(ref.url_string):
            msg = 'web_spider will ignore %s (match ignore regex)'
            args = (ref.url_string,)
            om.out.debug(msg % args)
            return False

        if self._has_ignored_extension(ref):
            msg = 'web_spider will ignore %s (match ignore extensions)'
            args = (ref.url_string,)
            om.out.debug(msg % args)
            return False

        # Implementing only forward
        if self._only_forward and not self._is_forward(ref):
            msg = 'web_spider will ignore %s (is not forward)'
            args = (ref.url_string,)
            om.out.debug(msg % args)
            return False

        return True

    def _has_ignored_extension(self, new_url):
        if not self._ignore_extensions:
            return False

        return new_url.get_extension().lower() in self._ignore_extensions

    def _should_verify_extracted_url(self, ref, resp):
        """
        :param ref: A newly found URL
        :param resp: The HTTP response where the URL was found

        :return: True when ref should be verified: it is not the URI of
                 resp, it passes _should_analyze_url() and
                 self._variant_db.append() returns a true value for it.
        """
        # Ignore myself
        if ref == resp.get_uri():
            return False

        if not self._should_analyze_url(ref):
            return False

        # I tried to have only one VariantDB in the framework instead of two,
        # but after some tests and architecture considerations it was better
        # to duplicate the data.
        # In the future I'll run plugins in different processes than the core,
        # so it makes sense to have independent plugins.
        # If I remove the web_spider VariantDB and just leave the one in the
        # core the framework keeps working but this method
        # (_should_verify_extracted_url) will return True much more often, which
        # leads to extra HTTP requests for URLs which we already checked and the
        # core will dismiss anyway
        fuzzable_request = FuzzableRequest(ref)
        return bool(self._variant_db.append(fuzzable_request))

    def _extract_links_and_verify(self, resp, fuzzable_req):
        This is a very basic method that will send the work to different
        threads. Work is generated by the _urls_to_verify_generator

        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
            self._urls_to_verify_generator(resp, fuzzable_req))

    def _verify_reference(self, reference, original_request,
                          original_response, possibly_broken,
        The parameters are:
            * Newly found URL
            * The FuzzableRequest instance which generated the response where
              the new URL was found
            * The HTTPResponse generated by the FuzzableRequest
            * Boolean indicating if we trust this reference or not

        This method GET's every new link and parses it in order to get
        new links and forms.
        # Remember that this "breaks" the cache=True in most cases!
        #     headers = { 'Referer': original_url }
        # But this does not, and it is friendlier than simply ignoring the
        # referer
        referer = original_response.get_url().base_url().url_string
        headers = Headers([('Referer', referer)])

        # Note: We're not grep'ing this HTTP request/response now because it
        #       has high probability of being a 404, and the grep plugins
        #       already got enough 404 responses to analyze (from is_404 for
        #       example). If it's not a 404 then we'll push it to the core
        #       and it will come back to this plugin's crawl() where it will
        #       be requested with grep=True
        resp = self._uri_opener.GET(reference, cache=True, headers=headers,

        if not is_404(resp):
            msg = '[web_spider] Found new link "%s" at "%s"'
            args = (reference, original_response.get_url())
            om.out.debug(msg % args)

            fuzz_req = FuzzableRequest(reference, headers=headers)

            # These next steps are simple, but actually allows me to set the
            # referer and cookie for the FuzzableRequest instances I'm sending
            # to the core, which will then allow the fuzzer to create
            # CookieMutant and HeadersMutant instances.
            # Without setting the Cookie, the CookieMutant would never have any
            # data to modify; remember that cookies are actually set by the
            # urllib2 cookie handler when the request already exited the
            # framework.
            cookie = Cookie.from_http_response(original_response)



        # Note: I WANT to follow links that are in the 404 page, but
        # DO NOT return the 404 itself to the core.
        # This will parse the 404 response and add the 404-links in the
        # output queue, so that the core can get them
        if be_recursive:
            # Only follow one level of links in 404 pages, this limits the
            # potential issue when this is found:
            #   http://foo.com/abc/ => 404
            #   Body: <a href="def/">link</a>
            # Which would lead to this function to perform requests to:
            #   * http://foo.com/abc/
            #   * http://foo.com/abc/def/
            #   * http://foo.com/abc/def/def/
            #   * http://foo.com/abc/def/def/def/
            #   * ...

            # Do not use threads here, it will dead-lock (for unknown
            # reasons). This is tested in TestDeadLock unittest.
            for args in self._urls_to_verify_generator(resp, original_request):
                self._verify_reference(*args, be_recursive=False)

        # Store the broken links
        if not possibly_broken and resp.get_code() not in self.UNAUTH_FORBID:
            t = (resp.get_url(), original_request.get_uri())

    def end(self):
        """
        Called when the process ends, prints out the list of broken links.

        Nothing is printed when no broken links were stored.
        """
        if not len(self._broken_links):
            return

        om.out.information('The following is a list of broken links that'
                           ' were found by the web_spider plugin:')

        for broken, where in unique_justseen(self._broken_links.ordered_iter()):
            om.out.information('- %s [ referenced from: %s ]' %
                               (broken, where))

    def _is_forward(self, reference):
        Check if the reference is inside the target directories.

        :return: True if reference is an URL inside the directory structure of
                 at least one of the target URLs.
        for domain_path in self._target_urls:
            if reference.url_string.startswith(domain_path.url_string):
                return True

        return False

    def get_options(self):
        :return: A list of option objects for this plugin.
        ol = OptionList()

        d = 'Only crawl links to paths inside the URL given as target.'
        o = opt_factory('only_forward', self._only_forward, d, BOOL)

        d = ('Only crawl links that match this regular expression.'
             ' Note that ignore_regex has precedence over follow_regex.')
        o = opt_factory('follow_regex', self._follow_regex, d, REGEX)

        d = ('DO NOT crawl links that match this regular expression.'
             ' Note that ignore_regex has precedence over follow_regex.')
        o = opt_factory('ignore_regex', self._ignore_regex, d, REGEX)

        d = 'DO NOT crawl links that use these extensions.'
        h = ('This configuration parameter is commonly used to ignore'
             ' static files such as zip, pdf, jpeg, etc. It is possible to'
             ' ignore these files using `ignore_regex`, but configuring'
             ' this parameter is easier and performs case insensitive'
             ' matching.')
        o = opt_factory('ignore_extensions', self._ignore_extensions, d, LIST, help=h)

        return ol

    def set_options(self, options_list):
        """
        This method sets all the options that are configured using the user
        interface generated by the framework using the result of get_options().

        :param options_list: A dictionary with the options for the plugin.
        :return: No value is returned.
        """
        self._only_forward = options_list['only_forward'].get_value()

        self._ignore_regex = options_list['ignore_regex'].get_value()
        self._follow_regex = options_list['follow_regex'].get_value()

        # Keep the compiled patterns in sync with the configured strings.
        # _should_analyze_url() calls match() on the compiled attributes,
        # which are None until _compile_re() runs.
        self._compile_re()

        # Extensions are stored in lower case because
        # _has_ignored_extension() compares against a lower-cased extension
        extensions = options_list['ignore_extensions'].get_value()
        self._ignore_extensions = [ext.lower() for ext in extensions]

    def _compile_re(self):
        Compile the regular expressions that are going to be used to ignore
        or follow links.
        if self._ignore_regex:
            # Compilation of this regex can't fail because it was already
            # verified as valid at regex_option.py: see REGEX in get_options()
            self._compiled_ignore_re = re.compile(self._ignore_regex)
            # If the self._ignore_regex is empty then I don't have to ignore
            # anything. To be able to do that, I simply compile an re with "abc"
            # as the pattern, which won't match any URL since they will all
            # start with http:// or https://
            self._compiled_ignore_re = re.compile('abc')

        # Compilation of this regex can't fail because it was already
        # verified as valid at regex_option.py: see REGEX in get_options()
        self._compiled_follow_re = re.compile(self._follow_regex)

    def get_long_desc(self):
        :return: A DETAILED description of the plugin functions and features.
        return """
Ejemplo n.º 36
class ParserCache(CacheStats):
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        super(ParserCache, self).__init__()

        self._cache = SynchronizedLRUDict(self.CACHE_SIZE)
        self._can_parse_cache = SynchronizedLRUDict(self.CACHE_SIZE * 10)
        self._parser_finished_events = {}
        self._parser_blacklist = DiskSet()

    def clear(self):
        Clear all the internal variables
        :return: None
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):

        # We don't need the parsers anymore

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response, which
                 is when the body length is below MAX_CACHEABLE_BODY_LEN
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        The answer is cached by HTTP response id, so DocumentParser is only
        asked once per response while the entry stays in the cache.

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        response_id = http_response.get_id()
        cached_can_parse = self._can_parse_cache.get(response_id, None)

        if cached_can_parse is not None:
            return cached_can_parse

        # We need to verify if we can parse this HTTP response
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except Exception:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        # Store under the response id. The previous code used the boolean
        # result as the key, so the lookup above never found an entry.
        self._can_parse_cache[response_id] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        """
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't ever try to parse this response again.

        :param hash_string: The value later looked up in the blacklist with
                            "hash_string in self._parser_blacklist"
        :return: None
        """
        self._parser_blacklist.add(hash_string)

    def get_document_parser_for(self, http_response, cache=True):
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        # We know that we can parse this document, lets work!
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            return parser
            # Not in cache, have to work.

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url(
                raise BaseFrameworkException(msg)
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
                save_to_cache = self.should_cache(http_response) and cache
                if save_to_cache:
                    self._cache[hash_string] = parser
                self._parser_finished_events.pop(hash_string, None)

            return parser

    def _log_return_empty(self, http_response, detail):
        """
        Write a debug message explaining why get_tags_by_filter() returns an
        empty list for http_response.

        :param http_response: The response whose URI is included in the message
        :param detail: Text appended to the message template before the URI
                       is interpolated
        """
        template = 'Returning empty list in get_tags_by_filter("%s"). ' + detail
        om.out.debug(template % http_response.get_uri())

    def get_tags_by_filter(self,
        Get specific tags from http_response using the cache if possible

        :param http_response: The http response instance
        :param tags: List of tags to get, or None if all tags should be returned
        :param yield_text: Include the tag text (<a>text</a>)
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        # This is a performance hack that should reduce the time consumed by
        # this method without impacting its results. Note that in HTML this is
        # valid:
        #   <script
        # And this is invalid:
        #   < script
        # We use that in order to speed-up this function
        if tags is not None:
            body_lower = http_response.get_body().lower()

            for tag in tags:
                lt_tag = '<%s' % tag
                if lt_tag in body_lower:
                # No tag was found in the HTML
                return []

        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        if not self.can_parse(http_response):
            self._log_return_empty(http_response, 'No parser available')
            return []

        args = '%r%r' % (tags, yield_text)
        hash_string = get_body_unique_id(http_response, prepend=args)

        if hash_string in self._parser_blacklist:
                                   'HTTP response is blacklisted')
            return []

        # We know that we can parse this document, lets work!
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(
            if not wait_result:
                # Act just like when there is no parser
                                       'Timeout waiting for response')
                return []

        # metric increase

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            return parser
            # Not in cache, have to work.

            # Create a new instance of DocumentParser, add it to the cache
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

                tags = mp_doc_parser.get_tags_by_filter(http_response,
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.

                # Act just like when there is no parser
                    http_response, 'Timeout waiting for get_tags_by_filter()')
                return []
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.

                # Act just like when there is no parser
                                       'Reached memory usage limit')
                return []
            except ScanMustStopException as e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            except Exception as e:
                # Act just like when there is no parser
                msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
                args = (http_response.get_url(), e)
                raise BaseFrameworkException(msg % args)
                if cache:
                    self._cache[hash_string] = tags
                self._parser_finished_events.pop(hash_string, None)

            return tags
Ejemplo n.º 37
class error_500(GrepPlugin):
    Grep every page for error 500 pages that haven't been identified as bugs by
    other plugins.

    :author: Andres Riancho ([email protected])
    # Some HTTP response codes are okay to receive and should not be tagged as
    # a false positive. A couple of examples:
    #   - 400 is bad request, which is potentially generated by long query
    #     string parameters being sent by the scanner
    #   - 406 is Not Acceptable, which is most likely generated by the
    #     content_negotiation plugin during resource discovery
    IGNORE_CODES = (400,

        '<h1>Bad Request (Invalid URL)</h1>',
        '<title>406 Not Acceptable</title>'

    def __init__(self):

        self._error_500_responses = DiskSet(table_prefix='error_500')

    def grep(self, request, response):
        """
        Plugin entry point, identify which requests generated a 500 error.

        The (request, response id) pair is stored for later analysis only
        when the response passes every filter below.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        :return: None
        """
        code = response.get_code()

        # NOTE(review): the strict lower bound already excludes 400, so the
        # 400 entry in IGNORE_CODES never matches here -- confirm intent.
        if not 400 < code < 600:
            return

        if code in self.IGNORE_CODES:
            return

        if not response.is_text_or_html():
            return

        if self._is_false_positive(response):
            return

        self._error_500_responses.add((request, response.id))

    def _is_false_positive(self, response):
        Filters out some false positives like this one:

        This false positive is generated by IIS when I send an URL that's "odd"
        Some examples of URLs that trigger this false positive:

        :return: True if the response is a false positive.
        for fps in self.FALSE_POSITIVE_STRINGS:
            if fps in response.get_body():
                return True

        return False

    def end(self):
        This method is called when the plugin wont be used anymore.

        The real job of this plugin is done here, where I will try to see if
        one of the error_500 responses were not identified as a vuln by some
        of my audit plugins
        all_vuln_ids = set()

        for info in kb.kb.get_all_findings_iter():
            for _id in info.get_id():

        for request, error_500_response_id in self._error_500_responses:

            if error_500_response_id not in all_vuln_ids:
                # Found a error 500 that wasn't identified !
                desc = ('An unidentified web application error (HTTP response'
                        ' code 500) was found at: "%s". Enable all plugins and'
                        ' try again, if the vulnerability still is not'
                        ' identified, please verify manually and report it to'
                        ' the w3af developers.')
                desc %= request.get_url()

                v = Vuln('Unhandled error in web application', desc,
                         severity.MEDIUM, error_500_response_id,


                self.kb_append_uniq(self, 'error_500', v, 'VAR')


    def get_long_desc(self):
        :return: A DETAILED description of the plugin functions and features.
        return """
Ejemplo n.º 38
class dir_file_bruter(CrawlPlugin):
    Finds Web server directories and files by bruteforcing.

    :author: Jon Rose ( [email protected] )
    :author: Andres Riancho ( [email protected] )
    :author: Tomas Velazquez
    BASE_PATH = os.path.join(ROOT_PATH, 'plugins', 'crawl', 'dir_file_bruter')
    def __init__(self):

        # User configured parameters
        self._dir_list = os.path.join(self.BASE_PATH, 'common_dirs_small.db')
        self._file_list = os.path.join(self.BASE_PATH, 'common_files_small.db')

        self._bf_directories = True
        self._bf_files = False
        self._be_recursive = False

        # Internal variables
        self._exec = True
        self._already_tested = DiskSet(table_prefix='dir_file_bruter')

    def crawl(self, fuzzable_request):
        Get the file and parse it.

        :param fuzzable_request: A fuzzable_request instance that contains
                               (among other things) the URL to test.
        if not self._exec:
            raise RunOnce()
            domain_path = fuzzable_request.get_url().get_domain_path()

            # Should I run more than once?
            if not self._be_recursive:
                self._exec = False

            if domain_path not in self._already_tested:

    def _dir_name_generator(self, base_path):
        Simple generator that returns the names of the directories and files to
        test. It extracts the information from the user configured wordlist

        @yields: (A string with the directory or file name,
                  a URL object with the dir or file name)
        if self._bf_directories:
            for directory_name in file(self._dir_list):
                directory_name = directory_name.strip()

                # ignore comments and empty lines
                if directory_name and not directory_name.startswith('#'):
                        dir_url = base_path.url_join(directory_name + '/')
                    except ValueError, ve:
                        msg = 'The "%s" line at "%s" generated an ' \
                              'invalid URL: %s'
                        om.out.debug(msg % (directory_name, self._dir_list, ve))
                        yield directory_name, dir_url

        if self._bf_files:
            for file_name in file(self._file_list):
                file_name = file_name.strip()

                # ignore comments and empty lines
                if file_name and not file_name.startswith('#'):
                        dir_url = base_path.url_join(file_name)
                    except ValueError, ve:
                        msg = 'The "%s" line at "%s" generated an ' \
                              'invalid URL: %s'
                        om.out.debug(msg % (file_name, self._file_list, ve))
                        yield file_name, dir_url
Ejemplo n.º 39
class click_jacking(GrepPlugin):
    Grep every page for missing click jacking protection headers.

    :author: Taras ([email protected])
    :author: Andres ([email protected])

    MAX_SAMPLES = 25
    DO_NOT_FRAME = {301, 302, 303, 307, 400, 403, 404, 500}

    def __init__(self):

        self._total_http_request_count = 0
        self._vuln_count = 0
        self._vuln_urls = DiskSet(table_prefix='click_jacking')
        self._vuln_ids = DiskSet(table_prefix='click_jacking')

    def grep(self, request, response):
        Check x-frame-options header
        # Can not iframe a POST, PUT, etc.
        if request.get_method() != 'GET':

        if response.get_code() in self.DO_NOT_FRAME:

        if not response.is_text_or_html():

        # An attacker will never run a clickjacking attack on an empty response
        # Empty responses are common in redirects, 400 and 500 errors, etc.
        if not response.get_body():

        if not self._response_will_be_rendered(response):

        if is_404(response):

        self._total_http_request_count += 1

        if self._is_protected_against_clickjacking(request, response):


    def _response_will_be_rendered(self, response):
        Browsers will never render responses with application/javascript
        content-type, so it doesn't make sense for an attacker to do a
        click-jacking attack on these.

        :param response: An HTTP response
        :return: True if the response has javascript content type
        if 'javascript' in response.content_type:
            return False

        if 'css' in response.content_type:
            return False

        if 'application/xml' in response.content_type:
            return False

        return True

    def _add_response_to_findings(self, response):
        self._vuln_count += 1

        if len(self._vuln_urls) >= self.MAX_SAMPLES:


    def _is_protected_against_clickjacking(self, request, response):
        There are many methods to protect a site against clickjacking, this
        method checks for all of them.

        :param request: HTTP request
        :param response: HTTP response
        :return: True if the response is protected
        methods = [

        for method in methods:
            if method(request, response):
                return True

        return False

    def _is_protected_with_x_frame_options(self, request, response):
        Check if the HTTP response has the x-frame-options header set
        to the secure value.

        :param request: HTTP request
        :param response: HTTP response
        :return: True if the response is protected
        headers = response.get_headers()
        x_frame_options, header_name = headers.iget('x-frame-options', '')

        if x_frame_options.lower() in ('deny', 'sameorigin'):
            return True

        return False

    def _is_protected_with_csp(self, request, response):
        """
        Check if the HTTP response has a CSP header, parse it, extract the
        frame-ancestors attribute and check it is secure.

        :param request: HTTP request
        :param response: HTTP response
        :return: True if the response is protected
        """
        # These are the policies that will be enforced by the browser
        non_report_only_policies = retrieve_csp_policies(response, False, True)
        frame_ancestors = non_report_only_policies.get('frame-ancestors', [])
        lowered_ancestors = [policy.lower() for policy in frame_ancestors]

        # This is the strictest policy, nobody can frame me!
        # Content-Security-Policy: frame-ancestors 'none';
        if 'none' in lowered_ancestors:
            return True

        # Fail when the frame-ancestors has insecure wildcards
        #   Content-Security-Policy: frame-ancestors '*';
        #   Content-Security-Policy: frame-ancestors 'https://*';
        insecure_ancestors = ('*',
                              'http', 'https',
                              'http://', 'https://',
                              'http://*', 'https://*')

        if any(policy in insecure_ancestors for policy in lowered_ancestors):
            return False

        # Any remaining non-empty list restricts who can frame the page:
        #   Content-Security-Policy: frame-ancestors 'self';
        #   Content-Security-Policy: frame-ancestors 'foo.com' '*.somesite.com';
        return len(frame_ancestors) > 0

    def end(self):
        # If all URLs implement protection, don't report anything.
        if not self._vuln_count:

        response_ids = [_id for _id in self._vuln_ids]
        if self._total_http_request_count == self._vuln_count:
            # If none of the URLs implement protection, simply report
            # ONE vulnerability that says that
            desc = 'The application has no protection against Click-Jacking attacks.'

            if len(response_ids) >= self.MAX_SAMPLES:
                desc += (' All the received HTTP responses were found to be'
                         ' vulnerable, only the first %s samples were captured'
                         ' as proof.' % self.MAX_SAMPLES)

            # If most of the URLs implement the protection but some
            # don't, report ONE vulnerability saying: "Most are protected,
            # but x, y are not
            if len(response_ids) >= self.MAX_SAMPLES:
                desc = ('Multiple application URLs have no protection against'
                        ' Click-Jacking attacks. Only the first %s samples were'
                        ' captured as proof. The list of vulnerable URLs is:'
                        '\n\n - ' % self.MAX_SAMPLES)
                desc = ('Multiple application URLs have no protection against'
                        ' Click-Jacking attacks. The list of vulnerable URLs is:'
                        '\n\n - ')

            desc += ' - '.join([str(url) + '\n' for url in self._vuln_urls])

        v = Vuln('Click-Jacking vulnerability',
        self.kb_append(self, 'click_jacking', v)

    def get_long_desc(self):
        return """
Ejemplo n.º 40
class dir_file_bruter(CrawlPlugin):
    Finds Web server directories and files by bruteforcing.

    :author: Jon Rose ( [email protected] )
    :author: Andres Riancho ( [email protected] )
    :author: Tomas Velazquez

    BASE_PATH = os.path.join(ROOT_PATH, 'plugins', 'crawl', 'dir_file_bruter')

    def __init__(self):

        # User configured parameters
        self._dir_list = os.path.join(self.BASE_PATH, 'common_dirs_small.db')
        self._file_list = os.path.join(self.BASE_PATH, 'common_files_small.db')

        self._bf_directories = True
        self._bf_files = False
        self._be_recursive = False

        # Internal variables
        self._exec = True
        self._already_tested = DiskSet(table_prefix='dir_file_bruter')

    def crawl(self, fuzzable_request, debugging_id):
        Get the file and parse it.

        :param debugging_id: A unique identifier for this call to discover()
        :param fuzzable_request: A fuzzable_request instance that contains
                               (among other things) the URL to test.
        if not self._exec:
            raise RunOnce()

        domain_path = fuzzable_request.get_url().get_domain_path()

        # Should I run more than once?
        if not self._be_recursive:
            self._exec = False

        if domain_path in self._already_tested:


    def _bruteforce_directories(self, base_path):
        :param base_path: The base path to use in the bruteforcing process,
                          can be something like http://host.tld/ or
                          http://host.tld/images/ .

        :return: None, the data is stored in self.output_queue
        url_generator = self._url_generator(base_path)
        base_path_repeater = repeat(base_path)
        arg_iter = izip(base_path_repeater, url_generator)


    def _url_generator(self, base_path):
        Simple generator that yields the new URLs to test. It extracts the
        information from the user-configured wordlists and generates both
        directories and file names to test.

        :yields: (String with the directory or file name,
                  URL object with the dir or file name)
        if self._bf_directories:
            is_path = True
            for line, new_url in self._read_db_file_gen_url(
                    base_path, self._dir_list, is_path):
                yield line, new_url

        if self._bf_files:
            is_path = False
            for line, new_url in self._read_db_file_gen_url(
                    base_path, self._file_list, is_path):
                yield line, new_url

    def _read_db_file_gen_url(self, base_path, file_name, is_path):
        :param base_path: The base URL
        :param file_name: The wordlist filename to read
        :param is_path: True if we should generate directories, else generate files
        :yields: (String with the directory or file name,
                  URL object with the dir or file name)
        for line in file(file_name):
            line = line.strip()

            # ignore comments and empty lines
            if not line:

            if line.startswith('#'):

            if is_path:
                line = line + '/'

                new_url = base_path.url_join(line)
            except ValueError, ve:
                msg = 'The "%s" line at "%s" generated an invalid URL: %s'
                om.out.debug(msg % (line, file_name, ve))
                yield line, new_url
Ejemplo n.º 41
class phpinfo(CrawlPlugin):
    Search PHP Info file and if it finds it will determine the version of PHP.
    :author: Viktor Gazdag ( [email protected] )
    :author: Aung Khant ( aungkhant[at]yehg.net )
    PHP_VERSION_RE = re.compile('(<tr class="h"><td>\n|alt="PHP Logo" /></a>)'
                                '<h1 class="p">PHP Version (.*?)</h1>', re.I)
    SYSTEM_RE = re.compile('System </td><td class="v">(.*?)</td></tr>', re.I)

    def __init__(self):

        # Internal variables
        self._analyzed_dirs = DiskSet(table_prefix='phpinfo')
        self._has_audited = False

    def crawl(self, fuzzable_request):
        For every directory, fetch a list of files and analyze the response.

        :param fuzzable_request: A fuzzable_request instance that contains
                                    (among other things) the URL to test.
        for domain_path in fuzzable_request.get_url().get_directories():

            if domain_path in self._analyzed_dirs:

            url_repeater = repeat(domain_path)
            args = izip(url_repeater, self._get_potential_phpinfos())

            self.worker_pool.map_multi_args(self._check_and_analyze, args)

    def _get_potential_phpinfos(self):
        """
        :return: A list with the candidate phpinfo file names. When the
                 fingerprinted (or user configured) target OS contains
                 "windows" the names are lower-cased and de-duplicated, in
                 no particular order.
        """
        res = ['phpinfo.php', 'PhpInfo.php', 'PHPinfo.php', 'PHPINFO.php',
               'phpInfo.php', 'info.php', 'test.php?mode=phpinfo',
               'index.php?view=phpinfo', 'index.php?mode=phpinfo',
               'TEST.php?mode=phpinfo', '?mode=phpinfo', '?view=phpinfo',
               'install.php?mode=phpinfo', 'INSTALL.php?mode=phpinfo',
               'admin.php?mode=phpinfo', 'phpversion.php', 'phpVersion.php',
               'test1.php', 'phpinfo1.php', 'phpInfo1.php', 'info1.php',
               'PHPversion.php', 'x.php', 'xx.php', 'xxx.php']

        identified_os = kb.kb.raw_read('fingerprint_os', 'operating_system_str')

        # Fall back to the configured target OS when the KB does not hold
        # a string for the fingerprint
        if not isinstance(identified_os, basestring):
            identified_os = cf.cf.get('target_os')

        # pylint: disable=E1103
        if 'windows' in identified_os.lower():
            res = list(set(path.lower() for path in res))
        # pylint: enable=E1103

        return res

    def _check_and_analyze(self, domain_path, php_info_filename):
        """
        Check if a php_info_filename exists in the domain_path.

        :param domain_path: URL object for the directory to test
        :param php_info_filename: Candidate file name joined to domain_path
        :return: None, everything is put() into the self.output_queue.
        """
        php_info_url = domain_path.url_join(php_info_filename)

        try:
            response = self._uri_opener.GET(php_info_url, cache=True)
        except BaseFrameworkException as w3:
            msg = 'Failed to GET phpinfo file: "%s". Exception: "%s".'
            om.out.debug(msg % (php_info_url, w3))
            return

        # Needs to exist
        if is_404(response):
            return

        # Create the fuzzable request and send it to the core
        fr = FuzzableRequest.from_http_response(response)
        # NOTE(review): put() restored from the docstring above -- confirm
        self.output_queue.put(fr)

        # Check if it's a phpinfo file. The patterns are already compiled
        # with re.I; the second positional argument of a compiled pattern's
        # search() is the start position, so passing re.I there would skip
        # the first characters of the body instead of setting a flag.
        body = response.get_body()
        php_version = self.PHP_VERSION_RE.search(body)
        sysinfo = self.SYSTEM_RE.search(body)

        if php_version and sysinfo:
            desc = ('The phpinfo() file was found at: %s. The version'
                    ' of PHP is: "%s" and the system information is:'
                    ' "%s".')
            desc %= (response.get_url(), php_version.group(2), sysinfo.group(1))

            v = Vuln('phpinfo() file found', desc, severity.MEDIUM,
                     response.id, self.get_name())

            kb.kb.append(self, 'phpinfo', v)
            om.out.vulnerability(v.get_desc(), severity=v.get_severity())

            if not self._has_audited:
                self._has_audited = True
Ejemplo n.º 42
class dir_file_bruter(CrawlPlugin):
    Finds Web server directories and files by bruteforcing.

    :author: Jon Rose ( [email protected] )
    :author: Andres Riancho ( [email protected] )
    :author: Tomas Velazquez

    BASE_PATH = os.path.join(ROOT_PATH, 'plugins', 'crawl', 'dir_file_bruter')

    def __init__(self):

        # User configured parameters
        self._dir_list = os.path.join(self.BASE_PATH, 'common_dirs_small.db')
        self._file_list = os.path.join(self.BASE_PATH, 'common_files_small.db')

        self._bf_directories = True
        self._bf_files = False
        self._be_recursive = False

        # Internal variables
        self._exec = True
        self._already_tested = DiskSet(table_prefix='dir_file_bruter')

    def crawl(self, fuzzable_request):
        Get the file and parse it.

        :param fuzzable_request: A fuzzable_request instance that contains
                               (among other things) the URL to test.
        if not self._exec:
            raise RunOnce()
            domain_path = fuzzable_request.get_url().get_domain_path()

            # Should I run more than once?
            if not self._be_recursive:
                self._exec = False

            if domain_path not in self._already_tested:

    def _dir_name_generator(self, base_path):
        Simple generator that returns the names of the directories and files to
        test. It extracts the information from the user configured wordlist

        @yields: (A string with the directory or file name,
                  a URL object with the dir or file name)
        if self._bf_directories:
            for directory_name in file(self._dir_list):
                directory_name = directory_name.strip()

                # ignore comments and empty lines
                if directory_name and not directory_name.startswith('#'):
                        dir_url = base_path.url_join(directory_name + '/')
                    except ValueError, ve:
                        msg = 'The "%s" line at "%s" generated an ' \
                              'invalid URL: %s'
                        om.out.debug(msg %
                                     (directory_name, self._dir_list, ve))
                        yield directory_name, dir_url

        if self._bf_files:
            for file_name in file(self._file_list):
                file_name = file_name.strip()

                # ignore comments and empty lines
                if file_name and not file_name.startswith('#'):
                        dir_url = base_path.url_join(file_name)
                    except ValueError, ve:
                        msg = 'The "%s" line at "%s" generated an ' \
                              'invalid URL: %s'
                        om.out.debug(msg % (file_name, self._file_list, ve))
                        yield file_name, dir_url
Ejemplo n.º 43
    def test_update(self):
        ds = DiskSet()
        ds.update([2, 3, 1])

        self.assertEqual(list(ds), [1, 2, 3])
Ejemplo n.º 44
class DBKnowledgeBase(BasicKnowledgeBase):
    This class saves the data that is sent to it by plugins. It is the only way
    in which plugins can exchange information.

    Data is stored in a DB.

    :author: Andres Riancho ([email protected])

    def __init__(self):
        super(DBKnowledgeBase, self).__init__()
        self.urls = DiskSet(table_prefix='kb_urls')
        self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')
        self.db = get_default_persistent_db_instance()

        columns = [('location_a', 'TEXT'),
                   ('location_b', 'TEXT'),
                   ('uniq_id', 'TEXT'),
                   ('pickle', 'BLOB')]

        self.table_name = 'knowledge_base_' + rand_alpha(30)
        self.db.create_table(self.table_name, columns)
        self.db.create_index(self.table_name, ['location_a', 'location_b'])
        self.db.create_index(self.table_name, ['uniq_id',])
        # TODO: Why doesn't this work with a WeakValueDictionary?
        self.observers = {} #WeakValueDictionary()
        self.type_observers = {} #WeakValueDictionary()
        self.url_observers = []
        self._observer_id = 0

    def clear(self, location_a, location_b):
        """
        Delete every row stored under (location_a, location_b).

        :param location_a: Resolved with _get_real_name() before querying
        :param location_b: Matched as-is against the location_b column
        :return: None
        """
        real_name = self._get_real_name(location_a)
        statement = "DELETE FROM %s WHERE location_a = ? and location_b = ?"
        statement %= self.table_name
        self.db.execute(statement, (real_name, location_b))

    def raw_write(self, location_a, location_b, value):
        """
        This method saves value to (location_a,location_b) but previously
        clears any pre-existing values.

        :raises TypeError: When value is an Info instance; those need to be
                           stored with append or append_uniq.
        """
        if isinstance(value, Info):
            raise TypeError('Use append or append_uniq to store vulnerabilities')

        location_a = self._get_real_name(location_a)

        self.clear(location_a, location_b)
        # ignore_type lets append() accept values that are not Info/Shell
        self.append(location_a, location_b, value, ignore_type=True)

    def raw_read(self, location_a, location_b):
        """
        This method reads the value from (location_a,location_b)

        :return: The stored value, or an empty list when nothing is stored.
        :raises RuntimeError: When more than one row is found at the
                              location.
        """
        location_a = self._get_real_name(location_a)
        result = self.get(location_a, location_b, check_types=False)

        if len(result) > 1:
            # Report the row count, which is what the message describes
            msg = 'Incorrect use of raw_write/raw_read, found %s rows.'
            raise RuntimeError(msg % len(result))

        if len(result) == 0:
            return []

        return result[0]
    def _get_uniq_id(self, obj):
        """
        :param obj: The object that is about to be stored
        :return: The value of obj.get_uniq_id() for Info instances; for any
                 other object a string with a hash, calculated over the
                 concatenated str() of the items when obj is iterable, or
                 over obj itself when it is not.
        """
        if isinstance(obj, Info):
            return obj.get_uniq_id()

        if isinstance(obj, collections.Iterable):
            concat_all = ''.join([str(i) for i in obj])
            return str(hash(concat_all))

        return str(hash(obj))

    def append(self, location_a, location_b, value, ignore_type=False):
        This method appends the location_b value to a dict.
        if not ignore_type and not isinstance(value, (Info, Shell)):
            msg = 'You MUST use raw_write/raw_read to store non-info objects'\
                  ' to the KnowledgeBase.'
            raise TypeError(msg)
        location_a = self._get_real_name(location_a)
        uniq_id = self._get_uniq_id(value)
        pickled_obj = cpickle_dumps(value)
        t = (location_a, location_b, uniq_id, pickled_obj)
        query = "INSERT INTO %s VALUES (?, ?, ?, ?)" % self.table_name
        self.db.execute(query, t)
        self._notify(location_a, location_b, value)

    def get(self, location_a, location_b, check_types=True):
        """
        :param location_a: The plugin that saved the data to the
                           kb.info Typically the name of the plugin,
                           but could also be the plugin instance.

        :param location_b: The name of the variables under which the vuln
                           objects were saved. Typically the same name of
                           the plugin, or something like "vulns", "errors",
                           etc. In most cases this is NOT None. When set
                           to None, all the objects saved by location_a
                           are returned.

        :param check_types: When True, raise TypeError if a stored object
                            is not an Info or Shell instance.

        :return: A list with the data that was saved by another plugin.
        """
        location_a = self._get_real_name(location_a)

        if location_b is None:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'
            params = (location_a,)
        else:
            query = 'SELECT pickle FROM %s WHERE location_a = ?'\
                                           ' and location_b = ?'
            params = (location_a, location_b)

        result_lst = []

        for row in self.db.select(query % self.table_name, params):
            obj = cPickle.loads(row[0])

            if check_types and not isinstance(obj, (Info, Shell)):
                raise TypeError('Use raw_write and raw_read to query the'
                                ' knowledge base for non-Info objects')

            result_lst.append(obj)

        return result_lst

    def get_by_uniq_id(self, uniq_id):
        query = 'SELECT pickle FROM %s WHERE uniq_id = ?'
        params = (uniq_id,)
        result = self.db.select_one(query % self.table_name, params)
        if result is not None:
            result = cPickle.loads(result[0])
        return result

    def add_observer(self, location_a, location_b, observer):
        """
        Add the observer function to the observer list. The function will be
        called when there is a change in (location_a, location_b).

        You can use None in location_a or location_b as wildcards.

        The observer function needs to be a function which takes three params:
            * location_a
            * location_b
            * value that's added to the kb location

        :raises TypeError: When a location is neither a string nor None.
        :return: None
        """
        # Validate both locations
        for location in (location_a, location_b):
            if not isinstance(location, (basestring, types.NoneType)):
                raise TypeError('Observer locations need to be strings or None.')

        observer_id = self.get_observer_id()
        self.observers[(location_a, location_b, observer_id)] = observer
    def add_types_observer(self, type_filter, observer):
        Add the observer function to the list of functions to be called when a
        new object that is of type "type_filter" is added to the KB.
        The type_filter must be one of Info, Vuln or Shell.
        :return: None
        if type_filter not in (Info, Vuln, Shell):
            msg = 'The type_filter needs to be one of Info, Vuln or Shell'
            raise TypeError(msg)
        observer_id = self.get_observer_id()
        self.type_observers[(type_filter, observer_id)] = observer
    def get_observer_id(self):
        """
        :return: The internal observer counter after increasing it by one.
        """
        next_id = self._observer_id + 1
        self._observer_id = next_id
        return next_id
    def _notify(self, location_a, location_b, value):
        Call the observer if the location_a/location_b matches with the
        configured observers.
        :return: None
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for (obs_loc_a, obs_loc_b, _), observer in self.observers.items()[:]:
            if obs_loc_a is None and obs_loc_b is None:
                observer(location_a, location_b, value)

            if obs_loc_a == location_a and obs_loc_b is None:
                observer(location_a, location_b, value)
            if obs_loc_a == location_a and obs_loc_b == location_b:
                observer(location_a, location_b, value)
        for (type_filter, _), observer in self.type_observers.items()[:]:
            if isinstance(value, type_filter):
                observer(location_a, location_b, value)

    def get_all_entries_of_class(self, klass):
        """
        :param klass: The class (or tuple of classes) to filter by
        :return: A list of all objects that are instances of klass and are
                 saved in the knowledge base table.
        """
        query = 'SELECT pickle FROM %s'
        results = self.db.select(query % self.table_name)

        result_lst = []

        for row in results:
            obj = cPickle.loads(row[0])
            if isinstance(obj, klass):
                result_lst.append(obj)

        return result_lst

    def dump(self):
        """
        :return: A dict of dicts with every stored object, in the form
                 {location_a: {location_b: [obj, ...]}}.
        """
        result_dict = {}

        query = 'SELECT location_a, location_b, pickle FROM %s'
        results = self.db.select(query % self.table_name)

        for location_a, location_b, pickled_obj in results:
            obj = cPickle.loads(pickled_obj)

            # Create the missing levels on demand and always keep the object
            by_location_b = result_dict.setdefault(location_a, {})
            by_location_b.setdefault(location_b, []).append(obj)

        return result_dict

    def cleanup(self):
        Cleanup internal data.
        self.db.execute("DELETE FROM %s WHERE 1=1" % self.table_name)
        # Remove the old, create new.
        self.urls = DiskSet(table_prefix='kb_urls')
        self.fuzzable_requests = DiskSet(table_prefix='kb_fuzzable_requests')
    def remove(self):
    def get_all_known_urls(self):
        :return: A DiskSet with all the known URLs as URL objects.
        return self.urls

    def add_url_observer(self, observer):

    def _notify_url_observers(self, new_url):
        Call the observer with new_url.
        :return: None
        # Note that I copy the items list in order to iterate though it without
        # any issues like the size changing
        for observer in self.url_observers[:]:            
    def add_url(self, url):
        :return: True if the URL was previously unknown 
        if not isinstance(url, URL):
            msg = 'add_url requires a URL as parameter got %s instead.'
            raise TypeError(msg % type(url))
        return self.urls.add(url)
    def get_all_known_fuzzable_requests(self):
        """
        :return: The self.fuzzable_requests container, which holds all the
                 known fuzzable requests.
        """
        return self.fuzzable_requests
    def add_fuzzable_request(self, fuzzable_request):
        :return: True if the FuzzableRequest was previously unknown 
        if not isinstance(fuzzable_request, FuzzableRequest):
            msg = 'add_fuzzable_request requires a FuzzableRequest as '\
                  'parameter, got "%s" instead.'
            raise TypeError(msg % type(fuzzable_request))
        return self.fuzzable_requests.add(fuzzable_request)
Ejemplo n.º 45
class ParserCache(CacheStats):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    CACHE_SIZE = 10
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    DEBUG = core_profiling_is_enabled()

    def __init__(self):
        """
        Create the (empty) containers used to keep track of parsers.
        """
        super(ParserCache, self).__init__()

        parser_slots = self.CACHE_SIZE
        can_parse_slots = parser_slots * 10

        # Parser instances, indexed by the unique id of the HTTP response
        self._cache = SynchronizedLRUDict(parser_slots)

        # Results of previous can_parse() calls
        self._can_parse_cache = SynchronizedLRUDict(can_parse_slots)

        # Events used to wait for a parser which is still being created
        self._parser_finished_events = {}

        # HTTP responses we will not try to parse again
        self._parser_blacklist = DiskSet()

    def clear(self):
        # NOTE(review): the two lines below are the docstring text of this
        # method, the surrounding triple quotes were lost in this excerpt.
        Clear all the internal variables
        :return: None
        om.out.debug('Called clear() on ParserCache')

        # Stop any workers
        # NOTE(review): the statement which stops the workers is not present
        # in this excerpt.

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            if hasattr(parser, 'clear'):
                # NOTE(review): the body of this branch is not present in
                # this excerpt, presumably it calls parser.clear() -- confirm.

        # We don't need the parsers anymore
        # NOTE(review): the statement which drops the cached parsers is not
        # present in this excerpt.

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        Only responses with a body which is shorter than
        MAX_CACHEABLE_BODY_LEN are cached.

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def can_parse(self, http_response):
        """
        Check if we can parse an HTTP response

        The answer is stored in self._can_parse_cache using the HTTP response
        id as key, a second call for the same response returns the stored
        value.

        :param http_response: The HTTP response to verify
        :return: True if we can parse this HTTP response
        """
        response_id = http_response.get_id()
        cached_can_parse = self._can_parse_cache.get(response_id, default=None)

        if cached_can_parse is not None:
            return cached_can_parse

        # We need to verify if we can parse this HTTP response
        try:
            can_parse = DocumentParser.can_parse(http_response)
        except Exception:
            # We catch all the exceptions here and just return False because
            # the real parsing procedure will (most likely) fail to parse
            # this response too.
            can_parse = False

        # Store the result using the same key the lookup above uses. Using
        # the boolean result as key would make every lookup miss.
        self._can_parse_cache[response_id] = can_parse
        return can_parse

    def add_to_blacklist(self, hash_string):
        # NOTE(review): the four lines below are the docstring text of this
        # method, the surrounding triple quotes were lost in this excerpt.
        Add a hash_string representing an HTTP response to the blacklist,
        indicating that we won't try to parse this response never again.

        :return: None
        # NOTE(review): the body of this method is not present in this
        # excerpt. Presumably it adds hash_string to self._parser_blacklist,
        # the set checked by get_document_parser_for -- confirm.

    def get_document_parser_for(self, http_response, cache=True):
        # NOTE(review): the five lines below are the docstring text of this
        # method, the surrounding triple quotes were lost in this excerpt.
        Get a document parser for http_response using the cache if possible

        :param http_response: The http response instance
        :param cache: True if the document parser should be saved to the cache
        :return: An instance of DocumentParser
        # Before doing anything too complex like caching, sending the HTTP
        # response to a different process for parsing, checking events, etc.
        # check if we can parse this HTTP response.
        # This is a performance improvement that works *only if* the
        # DocumentParser.can_parse call is *fast*, which means that the
        # `can_parse` implementations of each parser needs to be fast
        # It doesn't matter if we say "yes" here and then parsing exceptions
        # appear later, that should be a 1 / 10000 calls and we would still
        # be gaining a lot of performance
        if not self.can_parse(http_response):
            msg = 'There is no parser for "%s".'
            raise BaseFrameworkException(msg % http_response.get_url())

        # Key used for the blacklist, the finished events and the parser cache
        hash_string = get_response_unique_id(http_response)

        if hash_string in self._parser_blacklist:
            msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
            raise BaseFrameworkException(msg % http_response.get_url())

        # We know that we can parse this document, lets work!
        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
            if not wait_result:
                # Act just like when there is no parser
                msg = 'There is no parser for "%s". Waited more than %s sec.'
                args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
                raise BaseFrameworkException(msg % args)

        # metric increase
        # NOTE(review): the statement which increases the metric is not
        # present in this excerpt.

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            return parser
            # NOTE(review): the indentation of the following lines suggests
            # that an "else:" line was lost in this excerpt -- confirm against
            # the original file.
            # Not in cache, have to work.

            # Create a new instance of DocumentParser, add it to the cache
            # The event is stored so other threads asking for the same
            # response can wait for this parser (see the wait() call above)
            event = threading.Event()
            self._parser_finished_events[hash_string] = event

                # NOTE(review): the "try:" line which matches the except
                # clauses below is not present in this excerpt.
                parser = mp_doc_parser.get_document_parser_for(http_response)
            except TimeoutError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles trying
                # to parse it over and over.
                # NOTE(review): the statement which bans the response is not
                # present in this excerpt.

                # Act just like when there is no parser
                msg = 'Reached timeout parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except MemoryError:
                # We failed to get a parser for this HTTP response, we better
                # ban this HTTP response so we don't waste more CPU cycles or
                # memory trying to parse it over and over.
                # NOTE(review): the statement which bans the response is not
                # present in this excerpt.

                # Act just like when there is no parser
                msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            except ScanMustStopException, e:
                msg = 'The document parser is in an invalid state! %s'
                raise ScanMustStopException(msg % e)
            # NOTE(review): the excerpt ends here, the code which stores the
            # parser in the cache, sets the event and returns the parser is
            # not visible.
Ejemplo n.º 46
    def test_update(self):
        """
        Items added with update([2, 3, 1]) are iterated as 1, 2, 3.
        """
        disk_set = DiskSet()
        disk_set.update([2, 3, 1])

        stored_items = list(disk_set)
        self.assertEqual(stored_items, [1, 2, 3])