def mapper(self, _, line):
        """Yield the technologies detected in each HTTP response of a WARC file.

        ``line`` is used as the key of a gzipped WARC object in the anonymous
        ``aws-publicdatasets`` S3 bucket; the object is read as a stream.
        For every HTTP response record a ``(info, 1)`` pair is yielded, where
        ``info`` holds the findings plus the URL, date and domain of the
        record.  The second mapper argument is ignored.
        """
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        # One detector serves every record; the checks used to build a new
        # one for each call.
        detector = Detector()

        for record in f:
            if record['Content-Type'] != 'application/http; msgtype=response':
                continue

            payload = record.payload.read()
            # partition() never raises, unlike unpacking the result of
            # split(): a response without a header/body separator is skipped
            # instead of aborting the whole task with a ValueError.
            _headers, separator, body = payload.partition('\r\n\r\n')
            if not separator:
                continue

            # Only the body is inspected; header-based detection is not run.
            tech = detector.check_script(body) + detector.check_html(body)
            info = {
                "tech": tech,
                "url": record.url,
                "date": record.date,
                "domain": urlparse(record.url).netloc
            }
            yield info, 1
Ejemplo n.º 2
0
    def scan_target(self, url):
        """Detect the technologies running on ``url`` and list them in the
        gui treeview.

        Each scanned URL that produced findings gets a parent row labelled
        with its host name and one child row per detected technology (type,
        software, version).  The status label reports the outcome: ``done``,
        ``No results found``, or an error message when a ``ValueError`` is
        raised while scanning.
        """
        try:
            detections = Detector().detect(url=url, timeout=5)
            found = False
            for result in detections:
                findings = detections[result]
                if not findings:
                    continue
                found = True

                # Drop empty parts so a URL without a subdomain is not
                # labelled with a leading dot (".example.com").
                ext = tldextract.extract(url)
                host = '.'.join(part for part in ext[:3] if part)
                parent = self.treeview.insert('', 'end', text=host)

                # List every finding, not just the first one.
                for finding in findings:
                    # The version column shows the text 'None' when the
                    # version is unknown.
                    version = finding.get('ver') or 'None'
                    self.treeview.insert(parent,
                                         'end',
                                         text=finding.get('type'),
                                         values=(finding.get('app'), version))

            # Decide the status once, so a later empty result cannot
            # overwrite an earlier success and an empty scan is reported.
            self.status['text'] = 'done' if found else 'No results found'

        except ValueError:
            self.status['text'] = "Invalid! Please input a full url"
Ejemplo n.º 3
0
    def test_remove_duplicates(self):
        """remove_duplicates() must prune the findings list in place."""
        def entries(*pairs):
            # Expand (app, version) pairs into finding dictionaries.
            return [{'app': app, 'ver': ver} for app, ver in pairs]

        findings = entries(
            ('A', None), ('B', "1.5"),
            ('C', None), ('D', "7.0"),
            ('E', "1"), ('F', "2.2"),
            ('A', None), ('B', "1.5"),
            ('C', "be"), ('D', "222"),
            ('A', None), ('B', "1.5"),
            ('E', None), ('E', "1.3"),
            ('F', "2"), ('F', None),
        )

        expected = entries(
            ('A', None), ('B', "1.5"),
            ('C', "be"), ('D', "7.0"),
            ('E', "1.3"),
            ('F', "2.2"), ('D', "222"),
        )

        Detector().remove_duplicates(findings)
        assert findings == expected
Ejemplo n.º 4
0
 def setUp(self):
     """Build the detector under test and keep its apps and categories."""
     detector = Detector()
     self.detector = detector
     self.apps = detector.apps
     self.categories = detector.categories
Ejemplo n.º 5
0
class TestDetector(unittest.TestCase):
    """Unit tests for ``Detector``; every test gets a fresh instance."""

    def setUp(self):
        """Create the detector under test and keep its apps and categories."""
        self.detector = Detector()
        self.apps = self.detector.apps
        self.categories = self.detector.categories

    def mock_detector_run(self, url='', content='', headers=None):
        """Run ``detect()`` against a faked page instead of a real fetch.

        ``wad.detection.tools`` is patched so that ``urlopen`` returns a page
        whose ``geturl()``, ``read()`` and ``info()`` report ``url``,
        ``content`` and ``headers`` (an empty dict when omitted).  Returns
        whatever ``detect()`` returns.
        """
        with mock.patch('wad.detection.tools') as mockObj:
            page = mock.MagicMock()
            page.geturl.return_value = url
            # read() is given bytes on Python 3 and the str as-is on Python 2.
            if six.PY3:
                page.read.return_value = bytes(content, encoding='utf-8')
            else:
                page.read.return_value = content
            page.info.return_value = headers or dict()
            mockObj.urlopen = mock.Mock(return_value=page)
            results = self.detector.detect('http://abc.xyz')
        return results

    def test_check_re(self):
        """check_re() reports the app together with the extracted version."""
        # checking version patterns:
        #
        #   "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (self.detector.check_re(
            self.apps['IIS']['headers_re']['Server'],
            self.apps['IIS']['headers']['Server'],
            'Microsoft-IIS/7.5',
            [], None, 'IIS') == [{'app': 'IIS', 'ver': '7.5'}])

        # (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        # maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (self.detector.check_re(
            self.apps['Google Maps']['script_re'][0],
            self.apps['Google Maps']['script'][0],
            'abc <script src="maps.google.com/maps?file=api&v=123"> def',
            [], None, 'Google Maps') == [{'app': 'Google Maps', 'ver': 'API v123'}])

        # "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/whatever"> def',
            [], None, 'Magento') == [])

        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/default"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Community'}])

        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/enterprise"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Enterprise'}])

    def test_check_url(self):
        """check_url() recognises apps from the URL alone."""
        assert self.detector.check_url("http://whatever.blogspot.com") == [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == [{'app': 'PHP', 'ver': None}]

    def test_check_html(self):
        """check_html() recognises apps from markup in the page body."""
        content = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        assert self.detector.check_html(content) == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        """check_meta() needs a real <meta> tag carrying name and content."""
        assert (self.detector.check_meta('<html>    s<meta name="generator" content="Percussion">sssss    whatever') ==
                [{'app': 'Percussion', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<meta   name    = 'cargo_title' dd  content  =   'Pdafadfda'  >") ==
                [{'app': 'Cargo', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<mfffffffeta     name='cargo_title' dd  content='Pdafadfda'  >") ==
                [])
        assert self.detector.check_meta(" dcsaasd f<meta     name='cargo_title' >") == []

    def test_check_script(self):
        """check_script() inspects the src attribute of <script> tags."""
        assert (self.detector.check_script('<html>    s<script  sda f     src    =  "jquery1.7.js">') ==
                [{'app': 'jQuery', 'ver': None}])
        assert self.detector.check_script(" dcsaasd f<script     src='' >") == []

    def test_check_headers(self):
        """check_headers() reads the (name, value) pairs from headers.items()."""
        headers = [('Host', 'abc.com'), ('Server', 'Linux Ubuntu 12.10')]
        headers_mock = mock.Mock()
        headers_mock.items.return_value = headers

        assert (self.detector.check_headers(headers_mock) ==
                [{'app': 'Ubuntu', 'ver': None}])

    def test_check_cookies(self):
        """check_cookies() recognises apps from the Set-Cookie header."""
        headers = {'Set-Cookie': 'x=1; xid=%s; y=2' % ('a'*32)}

        assert (self.detector.check_cookies(headers) ==
                [{'app': 'X-Cart', 'ver': None}])

    def test_implied_by(self):
        """implied_by() returns only implied apps that are not already known."""
        # ASP implies WS and IIS and IIS implies WS;
        # but we already know about IIS, so the only new implied app is WS
        assert self.detector.implied_by(['Microsoft ASP.NET', 'IIS']) == ['Windows Server']

    def test_follow_implies(self):
        """follow_implies() extends the findings in place, following chains."""
        # empty findings
        findings = []
        self.detector.follow_implies(findings)
        assert findings == []

        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]

        # Django CMS implies Django, and Django implies Python - let's see if this chain is followed
        findings = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(findings)
        assert (findings ==
                [{'app': 'Django CMS', 'ver': None},
                 {'app': 'Django', 'ver': None},
                 {'app': 'Python', 'ver': None}])

    def test_remove_duplicates(self):
        """remove_duplicates() prunes the findings list in place."""
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]

        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2.2"}, {'app': 'D', 'ver': "222"},
        ]

        # Use the fixture from setUp(), like every other test in this class.
        self.detector.remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        """excluded_by() lists the apps ruled out by the given apps."""
        # both 'Neos Flow' and 'Neos CMS' exclude 'TYPO3 CMS'
        assert self.detector.excluded_by(['Neos Flow', 'Neos CMS']) == ['TYPO3 CMS']
        # 'JBoss Web' excludes 'Apache Tomcat'; 'Mambo' excludes 'Joomla'
        assert set(self.detector.excluded_by(['JBoss Web', 'Jetty', 'Mambo'])) == {'Joomla', 'Apache Tomcat'}
        # 'IIS' doesn't exclude anything
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        """remove_exclusions() drops excluded apps from the findings in place."""
        # empty findings
        findings = []
        self.detector.remove_exclusions(findings)
        assert findings == []

        # no exclusions
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]

        # real exclusions
        findings = [{'app': 'JBoss Web', 'ver': None},
                    {'app': 'Apache Tomcat', 'ver': None},
                    {'app': 'IIS', 'ver': None},
                    {'app': 'TYPO3 CMS', 'ver': None},
                    {'app': 'Neos Flow', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert (findings ==
                [{'app': 'JBoss Web', 'ver': None},
                 {'app': 'IIS', 'ver': None},
                 {'app': 'Neos Flow', 'ver': None}])

    def test_add_categories(self):
        """add_categories() adds a "type" entry to every finding in place."""
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]
        original = copy.deepcopy(findings)
        original[0]["type"] = "CMS"
        original[1]["type"] = "Web Application Frameworks"
        original[2]["type"] = "Programming Languages"
        original[3]["type"] = "CMS,Ecommerce,Analytics"

        self.detector.add_categories(findings)
        assert original == findings

    def test_url_match(self):
        """url_match() returns the default when no regexp is given."""
        assert self.detector.url_match(url='', regexp=None, default='test') == 'test'
        assert self.detector.url_match(url='example.com', regexp='exampl', default='test') is not None
        assert self.detector.url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        """expected_url() applies the limit mask and the exclude mask."""
        url = "http://site.abc.com/dir/sub/script.php"
        assert self.detector.expected_url(url, None, None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")
        assert self.detector.expected_url(url, None, ".*\\.asp")
        assert not self.detector.expected_url(url, None, ".*\\.php")

    def test_detect(self):
        """detect() reports the expected findings for the recorded CERN page."""
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'Web Servers', 'ver': None},
                {'app': 'Drupal', 'type': 'CMS', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'JavaScript Libraries', 'ver': None},
                {'app': 'jQuery', 'type': 'JavaScript Libraries', 'ver': None},
                {'app': 'Google Font API', 'type': 'Font Scripts', 'ver': None},
                {'app': 'PHP', 'type': 'Programming Languages', 'ver': None}
            ]
        }

        results = self.mock_detector_run(url=cern_ch_test_data['geturl'], content=cern_ch_test_data['content'],
                                         headers=cern_ch_test_data['headers'])
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        # The order of findings is not part of the contract, so compare sorted.
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_detect_multiple(self):
        """detect_multiple() skips empty and repeated URLs and merges results."""
        urls_list = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]
        with mock.patch('wad.detection.Detector.detect') as mockObj:
            # Only two results are queued: one per distinct, non-empty URL.
            mockObj.side_effect = [{'test1': 1}, {'test2': 2}]
            assert self.detector.detect_multiple(urls_list) == {'test1': 1, 'test2': 2}
            assert (('example.com', None, None, TIMEOUT),) in mockObj.call_args_list
            assert (('http://cern.ch', None, None, TIMEOUT),) in mockObj.call_args_list

    def test_normalize_url(self):
        """normalize_url() appends the missing trailing slash to a bare host."""
        assert self.detector.normalize_url('http://abc.pl') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/def') == 'http://abc.pl/def'

    def test_regression_meta_attributes_order(self):
        """check_meta() must not depend on the order of <meta> attributes."""
        # This bug was caused by hardcoded attributes order in re_meta pattern.
        # Example app that was affected was GitLab CI.
        content1 = "<meta content='GitLab Continuous Integration' name='description'>"
        content2 = "<meta name='description' content='GitLab Continuous Integration'>"

        results1 = self.detector.check_meta(content1)
        results2 = self.detector.check_meta(content2)

        expected = [{'app': 'GitLab CI', 'ver': None}]

        assert results1 == results2 == expected

    def test_regression_empty_content_should_run_checks(self):
        """detect() must still run the remaining checks on an empty body."""
        # This bug was introduced while abstracting some methods in detect method of Detector
        # Shortly, if the content was empty, code didn't run further (while it should, there might be something in
        # headers etc.)
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'Web Servers', 'ver': None},
                {'app': 'Drupal', 'type': 'CMS', 'ver': '7'},
                {'app': 'PHP', 'type': 'Programming Languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'], content='',
                                         headers=cern_ch_test_data['headers'])
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_regression_urls_not_normalized(self):
        """A ``.pl`` top-level domain must not be detected as a Perl file."""
        # This bug caused .pl top level domain to be recognized as Perl file.
        # It is due to the fact, that Wappalyzer receives normalized URI from browser ("http://abc.xyz/")
        # even if you open "http://abc.xyz", while we didn't normalize the URL.
        results = self.mock_detector_run(url='http://abc.pl')
        assert results == {'http://abc.pl/': []}
Ejemplo n.º 6
0
def main(timeout=TIMEOUT):
    """Command-line entry point: detect the technologies behind given URLs.

    Parses the command line, runs detection on every URL passed with ``-u``
    (a comma-separated list, or ``@FILE`` naming a file with one URL per
    line), optionally groups the results, renders them in the requested
    format and prints them.  With ``-o`` the output is additionally written
    to a file.

    Invalid options end the program through ``parser.error()``; I/O errors
    on the URL or output file are logged and end the run without printing.

    :param timeout: default per-URL timeout in seconds; ``-t`` overrides it.
    """
    desc = """WAD -
This component analyzes given URL(s) and detects technologies, libraries,
frameworks etc. used by this application, from the OS and web server level,
to the programming platform and frameworks, and server- and client-side
applications, tools and libraries. For example: OS=Linux, webserver=Apache,
platform=PHP, cms=Drupal, analytics=Google Analytics, javascript-lib=jQuery
etc."""

    parser = OptionParser(
        description=desc,
        usage="Usage: %prog -u <URLs|@URLfile>\nHelp:  %prog -h",
        version="%prog 1.0")

    parser.add_option(
        "-u",
        "--url",
        dest="urls",
        metavar="URLS|@FILE",
        help=
        "list of URLs (comma-separated), or a file with a list of URLs (one per line)"
    )

    parser.add_option(
        "-l",
        "--limit",
        dest="limit",
        metavar="URLMASK",
        help=
        "in case of redirections, only include pages with URLs matching this mask - "
        # raw string: "\." is an invalid escape sequence in a normal literal
        r"e.g. 'https?://[^/]*\.abc\.com/'")

    parser.add_option(
        "-x",
        "--exclude",
        dest="exclude",
        metavar="URLMASK",
        help=
        "in case of redirections, exclude pages with URL matching this mask - "
        "e.g. 'https?://[^/]*/(login|logout)'")

    parser.add_option(
        "-o",
        "--output",
        dest="output_file",
        metavar="FILE",
        help="output file for detection results (default: STDOUT)")

    parser.add_option(
        "-c",
        "--clues",
        dest="clues_file",
        metavar="FILE",
        default=None,
        help="clues for detecting web applications and technologies")

    parser.add_option(
        "-t",
        "--timeout",
        action="store",
        dest="TIMEOUT",
        default=timeout,
        help="set timeout (in seconds) for accessing a single URL")

    parser.add_option(
        "-f",
        "--format",
        action="store",
        dest="format",
        default='json',
        help="output format, allowed values: csv, txt, json (default)")

    parser.add_option(
        "-g",
        "--group",
        action="store_true",
        dest="group",
        default=False,
        help=
        "group results (i.e. technologies found on subpages of other scanned URL "
        "aren't listed)")

    tools.add_log_options(parser)

    options = parser.parse_args()[0]

    tools.use_log_options(options)

    # parser.error() prints the message and exits, so nothing follows it.
    if not options.urls:
        parser.error("Argument -u missing")

    try:
        timeout = int(options.TIMEOUT)
    except (TypeError, ValueError):
        # Report a bad -t value as a usage error instead of a traceback.
        parser.error("Invalid timeout specified")

    if options.urls[0] == "@":
        try:
            with open(options.urls[1:]) as f:
                # Strip line endings and skip blank lines, matching what the
                # comma-separated branch does with its entries.
                urls = [x.strip() for x in f if x.strip() != ""]
        except Exception as e:
            # an I/O exception?
            logging.error("Error reading URL file %s, terminating: %s",
                          options.urls[1:], tools.error_to_str(e))
            return
    else:
        urls = [x.strip() for x in options.urls.split(",") if x.strip() != ""]

    if options.format not in output_format_map:
        parser.error("Invalid format specified")

    Clues.get_clues(options.clues_file)

    results = Detector().detect_multiple(urls,
                                         limit=options.limit,
                                         exclude=options.exclude,
                                         timeout=timeout)

    if options.group:
        results = group(results)

    output = output_format_map[options.format]().retrieve(results=results)

    if options.output_file:
        try:
            with open(options.output_file, "w") as f:
                f.write(output)
            logging.debug("Results written to file %s", options.output_file)
        except Exception as e:
            # an I/O exception?
            logging.error("Error writing results to file %s, terminating: %s",
                          options.output_file, tools.error_to_str(e))
            return

    # The results go to STDOUT even when they were also written to a file.
    print(output)
Ejemplo n.º 7
0
 def setUp(self):
     """Build the detector under test and keep its apps and categories."""
     detector = Detector()
     self.detector = detector
     self.apps = detector.apps
     self.categories = detector.categories
Ejemplo n.º 8
0
class TestDetector(unittest.TestCase):
    """Unit tests for ``Detector``; every test gets a fresh instance."""

    def setUp(self):
        """Create the detector under test and keep its apps and categories."""
        self.detector = Detector()
        self.apps = self.detector.apps
        self.categories = self.detector.categories

    def mock_detector_run(self, url='', content='', headers=None):
        """Run ``detect()`` against a faked page instead of a real fetch.

        ``wad.detection.tools`` is patched so that ``urlopen`` returns a page
        whose ``geturl()`` and ``read()`` report ``url`` and ``content``, and
        whose ``info().items()`` returns ``headers`` (an empty list when
        omitted).  Returns whatever ``detect()`` returns.
        """
        with mock.patch('wad.detection.tools') as mockObj:
            page = mock.Mock()
            page.geturl.return_value = url
            # read() is given bytes on Python 3 and the str as-is on Python 2.
            if six.PY3:
                page.read.return_value = bytes(content, encoding='utf-8')
            else:
                page.read.return_value = content
            headers_mock = mock.Mock()
            headers_mock.items.return_value = headers or []
            page.info.return_value = headers_mock
            mockObj.urlopen = mock.Mock(return_value=page)
            results = self.detector.detect('http://abc.xyz')
        return results

    def test_check_re(self):
        """check_re() reports the app together with the extracted version."""
        # checking version patterns:
        #
        #   "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (self.detector.check_re(
            self.apps['IIS']['headers_re']['Server'],
            self.apps['IIS']['headers']['Server'],
            'Microsoft-IIS/7.5',
            [], None, 'IIS') == [{'app': 'IIS', 'ver': '7.5'}])

        # (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        # maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (self.detector.check_re(
            self.apps['Google Maps']['script_re'][0],
            self.apps['Google Maps']['script'][0],
            'abc <script src="maps.google.com/maps?file=api&v=123"> def',
            [], None, 'Google Maps') == [{'app': 'Google Maps', 'ver': 'API v123'}])

        # "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/whatever"> def',
            [], None, 'Magento') == [])

        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/default"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Community'}])

        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/enterprise"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Enterprise'}])

    def test_check_url(self):
        """check_url() recognises apps from the URL alone."""
        assert self.detector.check_url("http://whatever.blogspot.com") == [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == [{'app': 'PHP', 'ver': None}]

    def test_check_html(self):
        """check_html() recognises apps from markup in the page body."""
        content = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        assert self.detector.check_html(content) == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        """check_meta() needs a real <meta> tag carrying name and content."""
        assert (self.detector.check_meta('<html>    s<meta name="generator" content="Percussion">sssss    whatever') ==
                [{'app': 'Percussion', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<meta   name    = 'cargo_title' dd  content  =   'Pdafadfda'  >") ==
                [{'app': 'Cargo', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<mfffffffeta     name='cargo_title' dd  content='Pdafadfda'  >") ==
                [])
        assert self.detector.check_meta(" dcsaasd f<meta     name='cargo_title' >") == []

    def test_check_script(self):
        """check_script() inspects the src attribute of <script> tags."""
        assert (self.detector.check_script('<html>    s<script  sda f     src    =  "jquery1.7.js">') ==
                [{'app': 'jQuery', 'ver': None}])
        assert self.detector.check_script(" dcsaasd f<script     src='' >") == []

    def test_check_headers(self):
        """check_headers() reads the (name, value) pairs from headers.items()."""
        headers = [('Host', 'abc.com'), ('Server', 'Linux Ubuntu 12.10')]
        headers_mock = mock.Mock()
        headers_mock.items.return_value = headers

        assert (self.detector.check_headers(headers_mock) ==
                [{'app': 'Ubuntu', 'ver': None}])

    def test_implied_by(self):
        """implied_by() returns only implied apps that are not already known."""
        # ASP implies WS and IIS and IIS implies WS;
        # but we already know about IIS, so the only new implied app is WS
        assert self.detector.implied_by(['Microsoft ASP.NET', 'IIS']) == ['Windows Server']

    def test_follow_implies(self):
        """follow_implies() extends the findings in place, following chains."""
        # empty findings
        findings = []
        self.detector.follow_implies(findings)
        assert findings == []

        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]

        # Django CMS implies Django, and Django implies Python - let's see if this chain is followed
        findings = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(findings)
        assert (findings ==
                [{'app': 'Django CMS', 'ver': None},
                 {'app': 'Django', 'ver': None},
                 {'app': 'Python', 'ver': None}])

    def test_remove_duplicates(self):
        """remove_duplicates() prunes the findings list in place."""
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]

        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2.2"}, {'app': 'D', 'ver': "222"},
        ]

        # Use the fixture from setUp(), like every other test in this class.
        self.detector.remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        """excluded_by() lists the apps ruled out by the given apps."""
        # both 'TYPO3 Flow' and 'TYPO3 Neos' exclude 'TYPO3 CMS'
        assert self.detector.excluded_by(['TYPO3 Flow', 'TYPO3 Neos']) == ['TYPO3 CMS']
        # 'JBoss Web' excludes 'Apache Tomcat'; 'Mambo' excludes 'Joomla'
        assert set(self.detector.excluded_by(['Jo', 'JBoss Web', 'Jetty', 'Mambo'])) == {'Joomla', 'Apache Tomcat'}
        # 'IIS' doesn't exclude anything
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        """remove_exclusions() drops excluded apps from the findings in place."""
        # empty findings
        findings = []
        self.detector.remove_exclusions(findings)
        assert findings == []

        # no exclusions
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]

        # real exclusions
        findings = [{'app': 'JBoss Web', 'ver': None},
                    {'app': 'Apache Tomcat', 'ver': None},
                    {'app': 'IIS', 'ver': None},
                    {'app': 'TYPO3 CMS', 'ver': None},
                    {'app': 'TYPO3 Flow', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert (findings ==
                [{'app': 'JBoss Web', 'ver': None},
                 {'app': 'IIS', 'ver': None},
                 {'app': 'TYPO3 Flow', 'ver': None}])

    def test_add_categories(self):
        """add_categories() adds a "type" entry to every finding in place."""
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]
        original = copy.deepcopy(findings)
        original[0]["type"] = "cms"
        original[1]["type"] = "web-frameworks"
        original[2]["type"] = "programming-languages"
        original[3]["type"] = "cms,ecommerce,analytics"

        self.detector.add_categories(findings)
        assert original == findings

    def test_url_match(self):
        """url_match() returns the default when no regexp is given."""
        assert self.detector.url_match(url='', regexp=None, default='test') == 'test'
        assert self.detector.url_match(url='example.com', regexp='exampl', default='test') is not None
        assert self.detector.url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        """expected_url() applies the limit mask and the exclude mask."""
        url = "http://site.abc.com/dir/sub/script.php"
        assert self.detector.expected_url(url, None, None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")
        # raw strings: "\." is an invalid escape sequence in a normal literal
        assert self.detector.expected_url(url, None, r".*\.asp")
        assert not self.detector.expected_url(url, None, r".*\.php")

    def test_detect(self):
        """detect() reports the expected findings for the recorded CERN page."""
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'photo-galleries,javascript-frameworks', 'ver': None},
                {'app': 'jQuery', 'type': 'javascript-frameworks', 'ver': None},
                {'app': 'Google Font API', 'type': 'font-scripts', 'ver': None},
                {'app': 'PHP', 'type': 'programming-languages', 'ver': None}
            ]
        }

        results = self.mock_detector_run(url=cern_ch_test_data['geturl'], content=cern_ch_test_data['content'],
                                         headers=cern_ch_test_data['headers'].items())
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        # The order of findings is not part of the contract, so compare sorted.
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_detect_multiple(self):
        """detect_multiple() skips empty and repeated URLs and merges results."""
        urls_list = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]
        with mock.patch('wad.detection.Detector.detect') as mockObj:
            # Only two results are queued: one per distinct, non-empty URL.
            mockObj.side_effect = [{'test1': 1}, {'test2': 2}]
            assert self.detector.detect_multiple(urls_list) == {'test1': 1, 'test2': 2}
            assert (('example.com', None, None, TIMEOUT),) in mockObj.call_args_list
            assert (('http://cern.ch', None, None, TIMEOUT),) in mockObj.call_args_list

    def test_normalize_url(self):
        """normalize_url() appends the missing trailing slash to a bare host."""
        assert self.detector.normalize_url('http://abc.pl') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/def') == 'http://abc.pl/def'

    def test_regression_meta_attributes_order(self):
        """check_meta() must not depend on the order of <meta> attributes."""
        # This bug was caused by hardcoded attributes order in re_meta pattern.
        # Example app that was affected was GitLab CI.
        content1 = "<meta content='GitLab Continuous Integration' name='description'>"
        content2 = "<meta name='description' content='GitLab Continuous Integration'>"

        results1 = self.detector.check_meta(content1)
        results2 = self.detector.check_meta(content2)

        expected = [{'app': 'GitLab CI', 'ver': None}]

        assert results1 == results2 == expected

    def test_regression_empty_content_should_run_checks(self):
        """detect() must still run the remaining checks on an empty body."""
        # This bug was introduced while abstracting some methods in detect method of Detector
        # Shortly, if the content was empty, code didn't run further (while it should, there might be something in
        # headers etc.)
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'PHP', 'type': 'programming-languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'], content='',
                                         headers=cern_ch_test_data['headers'].items())
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_regression_urls_not_normalized(self):
        """A ``.pl`` top-level domain must not be detected as a Perl file."""
        # This bug caused .pl top level domain to be recognized as Perl file.
        # It is due to the fact, that Wappalyzer receives normalized URI from browser ("http://abc.xyz/")
        # even if you open "http://abc.xyz", while we didn't normalize the URL.
        results = self.mock_detector_run(url='http://abc.pl')
        assert results == {'http://abc.pl/': []}
Ejemplo n.º 9
0
class TestDetector(unittest.TestCase):
    def setUp(self):
        # Create the Detector under test and expose its `apps` and
        # `categories` attributes directly on the test case.
        detector = Detector()
        self.detector = detector
        self.apps = detector.apps
        self.categories = detector.categories

    def test_check_re(self):
        # Every case calls check_re with, positionally: the app's `<section>_re`
        # entry, the matching raw `<section>` entry, the text to scan, an empty
        # list, None and the app name.
        def found(app, section, key, text):
            pattern_re = self.apps[app][section + '_re'][key]
            pattern_raw = self.apps[app][section][key]
            return self.detector.check_re(pattern_re, pattern_raw, text, [], None, app)

        # Version taken from a capture group:
        #
        #   "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (found('IIS', 'headers', 'Server', 'Microsoft-IIS/7.5') ==
                [{'app': 'IIS', 'ver': '7.5'}])

        # Capture group embedded in a version template:
        #
        #   (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        #   maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (found('Google Maps', 'script', 0,
                      'abc <script src="maps.google.com/maps?file=api&v=123"> def') ==
                [{'app': 'Google Maps', 'ver': 'API v123'}])

        # Conditional version, depending on whether the group matched:
        #
        #   "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        magento_cases = (
            ('abc <script src="skin/frontend/whatever"> def', []),
            ('abc <script src="skin/frontend/default"> def',
             [{'app': 'Magento', 'ver': 'Community'}]),
            ('abc <script src="skin/frontend/enterprise"> def',
             [{'app': 'Magento', 'ver': 'Enterprise'}]),
        )
        for text, expected in magento_cases:
            assert found('Magento', 'script', 1, text) == expected

    def test_check_url(self):
        # The URL alone can give an app away: by its host name ...
        blogger = [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("http://whatever.blogspot.com") == blogger

        # ... or by the extension of the requested file.
        php = [{'app': 'PHP', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == php

    def test_check_html(self):
        # A characteristic fragment of the page markup is enough for a finding.
        markup = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        found = self.detector.check_html(markup)
        assert found == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        # (page content, findings expected from check_meta)
        cases = (
            # plain <meta name=... content=...> tag
            ('<html>    s<meta name="generator" content="Percussion">sssss    whatever',
             [{'app': 'Percussion', 'ver': None}]),
            # extra whitespace and a stray token between the attributes
            (" dcsaasd f<meta   name    = 'cargo_title' dd  content  =   'Pdafadfda'  >",
             [{'app': 'Cargo', 'ver': None}]),
            # not a <meta> tag at all
            (" dcsaasd f<mfffffffeta     name='cargo_title' dd  content='Pdafadfda'  >",
             []),
            # <meta> tag without a content attribute
            (" dcsaasd f<meta     name='cargo_title' >",
             []),
        )
        for content, expected in cases:
            assert self.detector.check_meta(content) == expected

    def test_check_script(self):
        # A script tag whose src names a known library yields a finding, even
        # with odd spacing and junk attributes inside the tag.
        with_jquery = '<html>    s<script  sda f     src    =  "jquery1.7.js">'
        assert self.detector.check_script(with_jquery) == [{'app': 'jQuery', 'ver': None}]

        # An empty src gives nothing to match against.
        empty_src = " dcsaasd f<script     src='' >"
        assert self.detector.check_script(empty_src) == []

    def test_check_headers(self):
        # check_headers is handed an object carrying the response headers in
        # its `dict` attribute, so a Mock with that attribute stands in for it.
        response_headers = mock.Mock()
        response_headers.dict = {
            'Host': 'abc.com',
            'Server': 'Linux Ubuntu 12.10',
        }

        found = self.detector.check_headers(response_headers)

        assert found == [{'app': 'Ubuntu', 'ver': None}]

    def test_implied_by(self):
        # ASP.NET implies both Windows Server and IIS, and IIS implies Windows
        # Server too. IIS is already among the known apps, which leaves
        # Windows Server as the only newly implied one.
        known_apps = ['Microsoft ASP.NET', 'IIS']
        newly_implied = self.detector.implied_by(known_apps)
        assert newly_implied == ['Windows Server']

    def test_follow_implies(self):
        # follow_implies works on the list it is given, so each scenario
        # inspects its own list after the call.

        # Nothing found: nothing gets added.
        nothing_found = []
        self.detector.follow_implies(nothing_found)
        assert nothing_found == []

        # An app that implies nothing: the list is left as it was.
        recaptcha_only = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(recaptcha_only)
        assert recaptcha_only == [{'app': 'reCAPTCHA', 'ver': None}]

        # A chain: Django CMS implies Django, which in turn implies Python.
        # The whole chain has to be followed and appended.
        django_cms = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(django_cms)
        whole_chain = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': None},
        ]
        assert django_cms == whole_chain

    def test_remove_duplicates(self):
        # remove_duplicates is expected to edit the list it is given in place.
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]

        # What the data below pins down:
        #   * exact repeats collapse into one entry (A, B);
        #   * a versioned entry wins over the unversioned one (C);
        #   * a longer version wins over its own prefix
        #     (E: "1" -> "1.3", F: "2" -> "2.2");
        #   * unrelated versions of one app are both kept (D).
        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2.2"}, {'app': 'D', 'ver': "222"},
        ]

        # Use the detector built by setUp, like every other test in this
        # class, instead of constructing a second Detector just for this call.
        self.detector.remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        # 'TYPO3 Flow' and 'TYPO3 Neos' both exclude 'TYPO3 CMS', which must be
        # reported only once.
        typo3_variants = ['TYPO3 Flow', 'TYPO3 Neos']
        assert self.detector.excluded_by(typo3_variants) == ['TYPO3 CMS']

        # 'JBoss Web' excludes 'Apache Tomcat' and 'Mambo' excludes 'Joomla';
        # the order of the result does not matter here.
        mixed_apps = ['Jo', 'JBoss Web', 'Jetty', 'Mambo']
        expected_excluded = set(['Joomla', 'Apache Tomcat'])
        assert set(self.detector.excluded_by(mixed_apps)) == expected_excluded

        # 'IIS' excludes nothing.
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        # remove_exclusions works on the list it is given, so each scenario
        # inspects its own list after the call.

        # Nothing found: nothing to remove.
        nothing_found = []
        self.detector.remove_exclusions(nothing_found)
        assert nothing_found == []

        # A single app, nothing excluded: the list is left as it was.
        recaptcha_only = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(recaptcha_only)
        assert recaptcha_only == [{'app': 'reCAPTCHA', 'ver': None}]

        # Real exclusions: 'Apache Tomcat' and 'TYPO3 CMS' must disappear,
        # while the remaining findings keep their relative order.
        mixed = [
            {'app': 'JBoss Web', 'ver': None},
            {'app': 'Apache Tomcat', 'ver': None},
            {'app': 'IIS', 'ver': None},
            {'app': 'TYPO3 CMS', 'ver': None},
            {'app': 'TYPO3 Flow', 'ver': None},
        ]
        self.detector.remove_exclusions(mixed)
        survivors = [
            {'app': 'JBoss Web', 'ver': None},
            {'app': 'IIS', 'ver': None},
            {'app': 'TYPO3 Flow', 'ver': None},
        ]
        assert mixed == survivors

    def test_add_categories(self):
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]

        # Build the expectation from an independent copy taken *before* the
        # call: the same entries, each with its "type" filled in. An app that
        # belongs to several categories gets them comma-separated.
        categories = ("cms",
                      "web-frameworks",
                      "programming-languages",
                      "cms,ecommerce,analytics")
        expected = copy.deepcopy(findings)
        for entry, category in zip(expected, categories):
            entry["type"] = category

        # add_categories annotates the findings list it is given.
        self.detector.add_categories(findings)
        assert expected == findings

    def test_url_match(self):
        url_match = self.detector.url_match

        # Without a regexp the default comes back.
        assert url_match(url='', regexp=None, default='test') == 'test'

        # A regexp matching the beginning of the URL gives a non-None result ...
        assert url_match(url='example.com', regexp='exampl', default='test') is not None

        # ... while one that only occurs further inside the URL gives None.
        assert url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        # expected_url(url, <2nd>, <3rd>): judging by the cases below, the
        # second argument is a pattern the URL has to match and the third one
        # a pattern it must not match - TODO confirm against Detector.
        url = "http://site.abc.com/dir/sub/script.php"

        # No patterns at all: every URL is expected.
        assert self.detector.expected_url(url, None, None)

        # Second pattern only.
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)

        # Both patterns: "php" does not match from the start of the URL,
        # ".*php" does.
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")

        # Third pattern only. Raw strings keep the regex escape "\." intact
        # without relying on Python passing an invalid string escape through
        # (which raises a DeprecationWarning / SyntaxWarning on Python 3).
        assert self.detector.expected_url(url, None, r".*\.asp")
        assert not self.detector.expected_url(url, None, r".*\.php")

    def test_detect(self):
        # End-to-end run of detect() against the recorded cern.ch response;
        # the result is keyed by the final URL reported by the page.
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'photo-galleries,javascript-frameworks', 'ver': None},
                {'app': 'jQuery', 'type': 'javascript-frameworks', 'ver': None},
                {'app': 'Google Font API', 'type': 'font-scripts', 'ver': None},
                {'app': u'PHP', 'type': 'programming-languages', 'ver': None},
            ]
        }

        # Stand-in for the headers object: only its `dict` attribute is set.
        fake_headers = mock.Mock()
        fake_headers.dict = cern_ch_test_data['headers']

        # Stand-in for the opened page: final URL, body and headers.
        fake_page = mock.Mock()
        fake_page.geturl.return_value = cern_ch_test_data['geturl']
        fake_page.read.return_value = cern_ch_test_data['content']
        fake_page.info.return_value = fake_headers

        # Replace the `tools` module used by wad.wad so that urlopen hands
        # back the canned page instead of going to the network.
        with mock.patch('wad.wad.tools') as patched_tools:
            patched_tools.urlopen = mock.Mock(return_value=fake_page)
            assert self.detector.detect('http://cern.ch') == expected

    def test_detect_multiple(self):
        # The input holds a duplicate, a None and an empty string; only the two
        # distinct real URLs may reach detect().
        urls = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]

        with mock.patch('wad.detection.Detector.detect') as patched_detect:
            # One canned result per expected detect() call.
            patched_detect.side_effect = [{'test1': 1}, {'test2': 2}]

            # The per-URL results are merged into a single dict.
            merged = self.detector.detect_multiple(urls)
            assert merged == {'test1': 1, 'test2': 2}

            # NOTE(review): the expected call order differs from the input
            # order - presumably detect_multiple de-duplicates and orders the
            # URLs; confirm that this order is deterministic.
            expected_calls = [
                mock.call('example.com', None, None, TIMEOUT),
                mock.call('http://cern.ch', None, None, TIMEOUT),
            ]
            assert patched_detect.call_args_list == expected_calls