def mapper(self, _, line):
    """MRJob mapper: stream one Common Crawl WARC file from S3 and emit
    the technologies detected in every HTTP response record.

    :param _: input key, unused (mrjob mapper protocol).
    :param line: S3 key (path) of a gzipped WARC file inside the
        'aws-publicdatasets' bucket.
    :yields: ({"tech": [...], "url": ..., "date": ..., "domain": ...}, 1)
        one pair per HTTP response record.
    """
    conn = boto.connect_s3(anon=True)
    pds = conn.get_bucket('aws-publicdatasets')
    key = Key(pds, line)
    warc_file = warc.WARCFile(fileobj=GzipStreamFile(key))
    # One Detector is enough for the whole file; constructing it per record
    # re-loads the clues database and dominates runtime.
    detector = Detector()
    for record in warc_file:
        if record['Content-Type'] != 'application/http; msgtype=response':
            continue
        payload = record.payload.read()
        headers, body = payload.split('\r\n\r\n', 1)
        # NOTE(review): header-based detection is currently disabled;
        # re-enable with detector.check_headers(headers) if desired.
        findings = detector.check_script(body) + detector.check_html(body)
        yield {
            "tech": findings,
            "url": record.url,
            "date": record.date,
            "domain": urlparse(record.url).netloc
        }, 1
def scan_target(self, url):
    """Detect technologies running on *url* and list them in the GUI treeview.

    Inserts one root node per scanned URL (labelled with its registered
    domain) and one child row per detected technology (type, software,
    version), then updates the status bar. A malformed URL is reported in
    the status bar instead of raising.
    """
    try:
        results = Detector().detect(url=url, timeout=5)
        for result in results:
            findings = results[result]
            if not findings:
                self.status['text'] = 'No results found'
                continue
            ext = tldextract.extract(url)
            root = self.treeview.insert('', 'end', text='.'.join(ext[:3]))
            # Previously only findings[0] was shown; list every detected
            # technology, as the docstring promises.
            for finding in findings:
                # treeview cells need text, so render a missing version as 'None'
                version = finding.get('ver') or 'None'
                self.treeview.insert(root, 'end', text=finding.get('type'),
                                     values=(finding.get('app'), version))
            self.status['text'] = 'done'
    except ValueError:
        self.status['text'] = "Invalid! Please input a full url"
def test_remove_duplicates(self):
    """remove_duplicates must reduce the input in place to the known-good list."""
    def entries(pairs):
        # Build finding dicts from compact (app, ver) tuples.
        return [{'app': app, 'ver': ver} for app, ver in pairs]

    findings = entries([
        ('A', None), ('B', "1.5"), ('C', None), ('D', "7.0"),
        ('E', "1"), ('F', "2.2"), ('A', None), ('B', "1.5"),
        ('C', "be"), ('D', "222"), ('A', None), ('B', "1.5"),
        ('E', None), ('E', "1.3"), ('F', "2"), ('F', None),
    ])
    expected = entries([
        ('A', None), ('B', "1.5"), ('C', "be"), ('D', "7.0"),
        ('E', "1.3"), ('F', "2.2"), ('D', "222"),
    ])
    Detector().remove_duplicates(findings)
    assert findings == expected
def setUp(self):
    # Fresh Detector per test, plus shortcuts to its clue databases.
    detector = Detector()
    self.detector = detector
    self.apps = detector.apps
    self.categories = detector.categories
class TestDetector(unittest.TestCase):
    """Unit tests for wad.detection.Detector.

    Network access is stubbed out by patching ``wad.detection.tools`` so
    that ``Detector.detect`` reads canned page content/headers instead of
    opening real URLs.
    """

    def setUp(self):
        # Fresh Detector per test; expose its clue databases as shortcuts.
        self.detector = Detector()
        self.apps = self.detector.apps
        self.categories = self.detector.categories

    def mock_detector_run(self, url='', content='', headers=None):
        """Run Detector.detect against a fake page and return its results.

        ``url`` is what the fake page reports via geturl() (i.e. the final
        URL after redirects), ``content`` is the page body, ``headers`` the
        response headers mapping.
        """
        with mock.patch('wad.detection.tools') as mockObj:
            page = mock.MagicMock()
            page.geturl.return_value = url
            if six.PY3:
                # urlopen().read() yields bytes on Python 3
                page.read.return_value = bytes(content, encoding='utf-8')
            else:
                page.read.return_value = content
            page.info.return_value = headers or dict()
            mockObj.urlopen = mock.Mock(return_value=page)
            results = self.detector.detect('http://abc.xyz')
            return results

    def test_check_re(self):
        # checking version patterns:
        #
        # "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (self.detector.check_re(
            self.apps['IIS']['headers_re']['Server'],
            self.apps['IIS']['headers']['Server'],
            'Microsoft-IIS/7.5',
            [], None, 'IIS') == [{'app': 'IIS', 'ver': '7.5'}])
        # (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        # maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (self.detector.check_re(
            self.apps['Google Maps']['script_re'][0],
            self.apps['Google Maps']['script'][0],
            'abc <script src="maps.google.com/maps?file=api&v=123"> def',
            [], None, 'Google Maps') == [{'app': 'Google Maps', 'ver': 'API v123'}])
        # "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/whatever"> def',
            [], None, 'Magento') == [])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/default"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Community'}])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/enterprise"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Enterprise'}])

    def test_check_url(self):
        # detection purely from the URL itself
        assert self.detector.check_url("http://whatever.blogspot.com") == [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == [{'app': 'PHP', 'ver': None}]

    def test_check_html(self):
        content = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        assert self.detector.check_html(content) == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        # <meta> detection must tolerate attribute spacing and quoting variants
        assert (self.detector.check_meta('<html> s<meta name="generator" content="Percussion">sssss whatever') ==
                [{'app': 'Percussion', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<meta name = 'cargo_title' dd content = 'Pdafadfda' >") ==
                [{'app': 'Cargo', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<mfffffffeta name='cargo_title' dd content='Pdafadfda' >") == [])
        assert self.detector.check_meta(" dcsaasd f<meta name='cargo_title' >") == []

    def test_check_script(self):
        assert (self.detector.check_script('<html> s<script sda f src = "jquery1.7.js">') ==
                [{'app': 'jQuery', 'ver': None}])
        assert self.detector.check_script(" dcsaasd f<script src='' >") == []

    def test_check_headers(self):
        # headers object only needs an items() method
        headers = [('Host', 'abc.com'), ('Server', 'Linux Ubuntu 12.10')]
        headers_mock = mock.Mock()
        headers_mock.items.return_value = headers
        assert (self.detector.check_headers(headers_mock) == [{'app': 'Ubuntu', 'ver': None}])

    def test_check_cookies(self):
        # X-Cart is recognised by its 32-char 'xid' session cookie
        headers = {'Set-Cookie': 'x=1; xid=%s; y=2' % ('a'*32)}
        assert (self.detector.check_cookies(headers) == [{'app': 'X-Cart', 'ver': None}])

    def test_implied_by(self):
        # ASP implies WS and IIS and IIS implies WS;
        # but we already know about IIS, so the only new implied app is WS
        assert self.detector.implied_by(['Microsoft ASP.NET', 'IIS']) == ['Windows Server']

    def test_follow_implies(self):
        # empty findings
        findings = []
        self.detector.follow_implies(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # Django CMS implies Django, and Django implies Python - let's see if this chain is followed
        findings = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(findings)
        assert (findings == [{'app': 'Django CMS', 'ver': None},
                             {'app': 'Django', 'ver': None},
                             {'app': 'Python', 'ver': None}])

    def test_remove_duplicates(self):
        # remove_duplicates mutates the list in place
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]
        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'D', 'ver': "222"},
        ]
        Detector().remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        # both 'Neos Flow' and 'Neos CMS' exclude 'TYPO3 CMS'
        assert self.detector.excluded_by(['Neos Flow', 'Neos CMS']) == ['TYPO3 CMS']
        # 'JBoss Web' excludes 'Apache Tomcat'; 'Mambo' excludes 'Joomla'
        assert set(self.detector.excluded_by(['JBoss Web', 'Jetty', 'Mambo'])) == set(['Joomla', 'Apache Tomcat'])
        # 'IIS' doesn't exclude anything
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        # empty findings
        findings = []
        self.detector.remove_exclusions(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # real exclusions
        findings = [{'app': 'JBoss Web', 'ver': None}, {'app': 'Apache Tomcat', 'ver': None},
                    {'app': 'IIS', 'ver': None}, {'app': 'TYPO3 CMS', 'ver': None},
                    {'app': 'Neos Flow', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert (findings == [{'app': 'JBoss Web', 'ver': None}, {'app': 'IIS', 'ver': None},
                             {'app': 'Neos Flow', 'ver': None}])

    def test_add_categories(self):
        # add_categories annotates each finding with its human-readable 'type'
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]
        original = copy.deepcopy(findings)
        original[0]["type"] = "CMS"
        original[1]["type"] = "Web Application Frameworks"
        original[2]["type"] = "Programming Languages"
        original[3]["type"] = "CMS,Ecommerce,Analytics"
        self.detector.add_categories(findings)
        assert original == findings

    def test_url_match(self):
        # regexp=None falls back to the default; otherwise match is anchored at the start
        assert self.detector.url_match(url='', regexp=None, default='test') == 'test'
        assert self.detector.url_match(url='example.com', regexp='exampl', default='test') is not None
        assert self.detector.url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        # (url, limit-mask, exclude-mask) filtering
        url = "http://site.abc.com/dir/sub/script.php"
        assert self.detector.expected_url(url, None, None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")
        assert self.detector.expected_url(url, None, ".*\\.asp")
        assert not self.detector.expected_url(url, None, ".*\\.php")

    def test_detect(self):
        # end-to-end detection against canned cern.ch page data
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'Web Servers', 'ver': None},
                {'app': 'Drupal', 'type': 'CMS', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'JavaScript Libraries', 'ver': None},
                {'app': 'jQuery', 'type': 'JavaScript Libraries', 'ver': None},
                {'app': 'Google Font API', 'type': 'Font Scripts', 'ver': None},
                {'app': 'PHP', 'type': 'Programming Languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'],
                                         content=cern_ch_test_data['content'],
                                         headers=cern_ch_test_data['headers'])
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        # findings order is not guaranteed, so compare sorted by app name
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_detect_multiple(self):
        # None/empty/duplicate URLs must be dropped before detect() is called
        urls_list = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]
        with mock.patch('wad.detection.Detector.detect') as mockObj:
            mockObj.side_effect = [{'test1': 1}, {'test2': 2}]
            assert self.detector.detect_multiple(urls_list) == {'test1': 1, 'test2': 2}
            assert (('example.com', None, None, TIMEOUT),) in mockObj.call_args_list
            assert (('http://cern.ch', None, None, TIMEOUT),) in mockObj.call_args_list

    def test_normalize_url(self):
        # a bare host gets a trailing slash; paths are left untouched
        assert self.detector.normalize_url('http://abc.pl') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/def') == 'http://abc.pl/def'

    def test_regression_meta_attributes_order(self):
        # This bug was caused by hardcoded attributes order in re_meta pattern.
        # Example app that was affected was GitLab CI.
        content1 = "<meta content='GitLab Continuous Integration' name='description'>"
        content2 = "<meta name='description' content='GitLab Continuous Integration'>"
        results1 = self.detector.check_meta(content1)
        results2 = self.detector.check_meta(content2)
        expected = [{'app': 'GitLab CI', 'ver': None}]
        assert results1 == results2 == expected

    def test_regression_empty_content_should_run_checks(self):
        # This bug was introduced while abstracting some methods in detect method of Detector
        # Shortly, if the content was empty, code didn't run further (while it should, there might be something in
        # headers etc.)
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'Web Servers', 'ver': None},
                {'app': 'Drupal', 'type': 'CMS', 'ver': '7'},
                {'app': 'PHP', 'type': 'Programming Languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'],
                                         content='',
                                         headers=cern_ch_test_data['headers'])
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_regression_urls_not_normalized(self):
        # This bug caused .pl top level domain to be recognized as Perl file.
        # It is due to the fact, that Wappalyzer receives normalized URI from browser ("http://abc.xyz/")
        # even if you open "http://abc.xyz", while we didn't normalize the URL.
        results = self.mock_detector_run(url='http://abc.pl')
        assert results == {'http://abc.pl/': []}
def main(timeout=TIMEOUT):
    """CLI entry point: parse options, run detection on the given URLs and
    emit the results in the requested format (STDOUT and/or a file).

    :param timeout: default per-URL timeout in seconds (overridable via -t).
    """
    desc = """WAD -
This component analyzes given URL(s) and detects technologies, libraries,
frameworks etc. used by this application, from the OS and web server level,
to the programming platform and frameworks, and server- and client-side
applications, tools and libraries. For example: OS=Linux, webserver=Apache,
platform=PHP, cms=Drupal, analytics=Google Analytics, javascript-lib=jQuery
etc."""
    parser = OptionParser(
        description=desc,
        usage="Usage: %prog -u <URLs|@URLfile>\nHelp: %prog -h",
        version="%prog 1.0")
    parser.add_option(
        "-u", "--url", dest="urls", metavar="URLS|@FILE",
        help="list of URLs (comma-separated), or a file with a list of URLs (one per line)")
    parser.add_option(
        "-l", "--limit", dest="limit", metavar="URLMASK",
        # raw strings: these help texts contain regex backslashes
        help="in case of redirections, only include pages with URLs matching this mask - "
             r"e.g. 'https?://[^/]*\.abc\.com/'")
    parser.add_option(
        "-x", "--exclude", dest="exclude", metavar="URLMASK",
        help="in case of redirections, exclude pages with URL matching this mask - "
             "e.g. 'https?://[^/]*/(login|logout)'")
    parser.add_option(
        "-o", "--output", dest="output_file", metavar="FILE",
        help="output file for detection results (default: STDOUT)")
    parser.add_option(
        "-c", "--clues", dest="clues_file", metavar="FILE", default=None,
        help="clues for detecting web applications and technologies")
    parser.add_option(
        "-t", "--timeout", action="store", dest="TIMEOUT", default=timeout,
        help="set timeout (in seconds) for accessing a single URL")
    parser.add_option(
        "-f", "--format", action="store", dest="format", default='json',
        help="output format, allowed values: csv, txt, json (default)")
    parser.add_option(
        "-g", "--group", action="store_true", dest="group", default=False,
        help="group results (i.e. technologies found on subpages of other scanned URL "
             "aren't listed)")
    tools.add_log_options(parser)

    options = parser.parse_args()[0]
    tools.use_log_options(options)

    if not options.urls:
        parser.error("Argument -u missing")
        return

    timeout = int(options.TIMEOUT)

    # "@file" means: read one URL per line from the file.
    if options.urls[0] == "@":
        try:
            with open(options.urls[1:]) as f:
                urls = f.readlines()
        except Exception as e:  # an I/O exception?
            logging.error("Error reading URL file %s, terminating: %s",
                          options.urls[1:], tools.error_to_str(e))
            return
    else:
        urls = [x.strip() for x in options.urls.split(",") if x.strip() != ""]

    if options.format not in output_format_map:
        parser.error("Invalid format specified")
        return

    Clues.get_clues(options.clues_file)

    results = Detector().detect_multiple(urls, limit=options.limit,
                                         exclude=options.exclude,
                                         timeout=timeout)
    if options.group:
        results = group(results)

    output = output_format_map[options.format]().retrieve(results=results)

    if options.output_file:
        try:
            # 'with' guarantees the handle is closed even if write() fails
            with open(options.output_file, "w") as f:
                f.write(output)
            logging.debug("Results written to file %s", options.output_file)
        except Exception as e:  # an I/O exception?
            logging.error("Error writing results to file %s, terminating: %s",
                          options.output_file, tools.error_to_str(e))
            return

    print(output)
class TestDetector(unittest.TestCase):
    """Unit tests for wad.detection.Detector (headers exposed as an
    items()-style mock; category names are slug identifiers).

    Network access is stubbed out by patching ``wad.detection.tools`` so
    that ``Detector.detect`` reads canned page content/headers instead of
    opening real URLs.
    """

    def setUp(self):
        # Fresh Detector per test; expose its clue databases as shortcuts.
        self.detector = Detector()
        self.apps = self.detector.apps
        self.categories = self.detector.categories

    def mock_detector_run(self, url='', content='', headers=None):
        """Run Detector.detect against a fake page and return its results.

        ``headers`` is a sequence of (name, value) pairs; it is wrapped in a
        mock exposing items(), matching what page.info() returns.
        """
        with mock.patch('wad.detection.tools') as mockObj:
            page = mock.Mock()
            page.geturl.return_value = url
            if six.PY3:
                # urlopen().read() yields bytes on Python 3
                page.read.return_value = bytes(content, encoding='utf-8')
            else:
                page.read.return_value = content
            headers_mock = mock.Mock()
            headers_mock.items.return_value = headers or []
            page.info.return_value = headers_mock
            mockObj.urlopen = mock.Mock(return_value=page)
            results = self.detector.detect('http://abc.xyz')
            return results

    def test_check_re(self):
        # checking version patterns:
        #
        # "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (self.detector.check_re(
            self.apps['IIS']['headers_re']['Server'],
            self.apps['IIS']['headers']['Server'],
            'Microsoft-IIS/7.5',
            [], None, 'IIS') == [{'app': 'IIS', 'ver': '7.5'}])
        # (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        # maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (self.detector.check_re(
            self.apps['Google Maps']['script_re'][0],
            self.apps['Google Maps']['script'][0],
            'abc <script src="maps.google.com/maps?file=api&v=123"> def',
            [], None, 'Google Maps') == [{'app': 'Google Maps', 'ver': 'API v123'}])
        # "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/whatever"> def',
            [], None, 'Magento') == [])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/default"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Community'}])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/enterprise"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Enterprise'}])

    def test_check_url(self):
        # detection purely from the URL itself
        assert self.detector.check_url("http://whatever.blogspot.com") == [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == [{'app': 'PHP', 'ver': None}]

    def test_check_html(self):
        content = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        assert self.detector.check_html(content) == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        # <meta> detection must tolerate attribute spacing and quoting variants
        assert (self.detector.check_meta('<html> s<meta name="generator" content="Percussion">sssss whatever') ==
                [{'app': 'Percussion', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<meta name = 'cargo_title' dd content = 'Pdafadfda' >") ==
                [{'app': 'Cargo', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<mfffffffeta name='cargo_title' dd content='Pdafadfda' >") == [])
        assert self.detector.check_meta(" dcsaasd f<meta name='cargo_title' >") == []

    def test_check_script(self):
        assert (self.detector.check_script('<html> s<script sda f src = "jquery1.7.js">') ==
                [{'app': 'jQuery', 'ver': None}])
        assert self.detector.check_script(" dcsaasd f<script src='' >") == []

    def test_check_headers(self):
        # headers object only needs an items() method
        headers = [('Host', 'abc.com'), ('Server', 'Linux Ubuntu 12.10')]
        headers_mock = mock.Mock()
        headers_mock.items.return_value = headers
        assert (self.detector.check_headers(headers_mock) == [{'app': 'Ubuntu', 'ver': None}])

    def test_implied_by(self):
        # ASP implies WS and IIS and IIS implies WS;
        # but we already know about IIS, so the only new implied app is WS
        assert self.detector.implied_by(['Microsoft ASP.NET', 'IIS']) == ['Windows Server']

    def test_follow_implies(self):
        # empty findings
        findings = []
        self.detector.follow_implies(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # Django CMS implies Django, and Django implies Python - let's see if this chain is followed
        findings = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(findings)
        assert (findings == [{'app': 'Django CMS', 'ver': None},
                             {'app': 'Django', 'ver': None},
                             {'app': 'Python', 'ver': None}])

    def test_remove_duplicates(self):
        # remove_duplicates mutates the list in place
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]
        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'D', 'ver': "222"},
        ]
        Detector().remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        # both 'TYPO3 Flow' and 'TYPO3 Neos' exclude 'TYPO3 CMS'
        assert self.detector.excluded_by(['TYPO3 Flow', 'TYPO3 Neos']) == ['TYPO3 CMS']
        # 'JBoss Web' excludes 'Apache Tomcat'; 'Mambo' excludes 'Joomla'
        assert set(self.detector.excluded_by(['Jo', 'JBoss Web', 'Jetty', 'Mambo'])) == set(['Joomla', 'Apache Tomcat'])
        # 'IIS' doesn't exclude anything
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        # empty findings
        findings = []
        self.detector.remove_exclusions(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # real exclusions
        findings = [{'app': 'JBoss Web', 'ver': None}, {'app': 'Apache Tomcat', 'ver': None},
                    {'app': 'IIS', 'ver': None}, {'app': 'TYPO3 CMS', 'ver': None},
                    {'app': 'TYPO3 Flow', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert (findings == [{'app': 'JBoss Web', 'ver': None}, {'app': 'IIS', 'ver': None},
                             {'app': 'TYPO3 Flow', 'ver': None}])

    def test_add_categories(self):
        # add_categories annotates each finding with its 'type' slug(s)
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]
        original = copy.deepcopy(findings)
        original[0]["type"] = "cms"
        original[1]["type"] = "web-frameworks"
        original[2]["type"] = "programming-languages"
        original[3]["type"] = "cms,ecommerce,analytics"
        self.detector.add_categories(findings)
        assert original == findings

    def test_url_match(self):
        # regexp=None falls back to the default; otherwise match is anchored at the start
        assert self.detector.url_match(url='', regexp=None, default='test') == 'test'
        assert self.detector.url_match(url='example.com', regexp='exampl', default='test') is not None
        assert self.detector.url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        # (url, limit-mask, exclude-mask) filtering
        url = "http://site.abc.com/dir/sub/script.php"
        assert self.detector.expected_url(url, None, None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")
        assert self.detector.expected_url(url, None, ".*\.asp")
        assert not self.detector.expected_url(url, None, ".*\.php")

    def test_detect(self):
        # end-to-end detection against canned cern.ch page data
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'photo-galleries,javascript-frameworks', 'ver': None},
                {'app': 'jQuery', 'type': 'javascript-frameworks', 'ver': None},
                {'app': 'Google Font API', 'type': 'font-scripts', 'ver': None},
                {'app': 'PHP', 'type': 'programming-languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'],
                                         content=cern_ch_test_data['content'],
                                         headers=cern_ch_test_data['headers'].items())
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        # findings order is not guaranteed, so compare sorted by app name
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_detect_multiple(self):
        # None/empty/duplicate URLs must be dropped before detect() is called
        urls_list = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]
        with mock.patch('wad.detection.Detector.detect') as mockObj:
            mockObj.side_effect = [{'test1': 1}, {'test2': 2}]
            assert self.detector.detect_multiple(urls_list) == {'test1': 1, 'test2': 2}
            assert (('example.com', None, None, TIMEOUT),) in mockObj.call_args_list
            assert (('http://cern.ch', None, None, TIMEOUT),) in mockObj.call_args_list

    def test_normalize_url(self):
        # a bare host gets a trailing slash; paths are left untouched
        assert self.detector.normalize_url('http://abc.pl') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/') == 'http://abc.pl/'
        assert self.detector.normalize_url('http://abc.pl/def') == 'http://abc.pl/def'

    def test_regression_meta_attributes_order(self):
        # This bug was caused by hardcoded attributes order in re_meta pattern.
        # Example app that was affected was GitLab CI.
        content1 = "<meta content='GitLab Continuous Integration' name='description'>"
        content2 = "<meta name='description' content='GitLab Continuous Integration'>"
        results1 = self.detector.check_meta(content1)
        results2 = self.detector.check_meta(content2)
        expected = [{'app': 'GitLab CI', 'ver': None}]
        assert results1 == results2 == expected

    def test_regression_empty_content_should_run_checks(self):
        # This bug was introduced while abstracting some methods in detect method of Detector
        # Shortly, if the content was empty, code didn't run further (while it should, there might be something in
        # headers etc.)
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'PHP', 'type': 'programming-languages', 'ver': None}
            ]
        }
        results = self.mock_detector_run(url=cern_ch_test_data['geturl'],
                                         content='',
                                         headers=cern_ch_test_data['headers'].items())
        assert list(six.iterkeys(results)) == list(six.iterkeys(expected))
        assert (sorted(next(six.itervalues(results)), key=operator.itemgetter('app')) ==
                sorted(next(six.itervalues(expected)), key=operator.itemgetter('app')))

    def test_regression_urls_not_normalized(self):
        # This bug caused .pl top level domain to be recognized as Perl file.
        # It is due to the fact, that Wappalyzer receives normalized URI from browser ("http://abc.xyz/")
        # even if you open "http://abc.xyz", while we didn't normalize the URL.
        results = self.mock_detector_run(url='http://abc.pl')
        assert results == {'http://abc.pl/': []}
class TestDetector(unittest.TestCase):
    """Unit tests for the Detector (legacy variant: headers are exposed via
    the mock's ``.dict`` attribute and detect() is patched at 'wad.wad.tools').
    """

    def setUp(self):
        # Fresh Detector per test; expose its clue databases as shortcuts.
        self.detector = Detector()
        self.apps = self.detector.apps
        self.categories = self.detector.categories

    def test_check_re(self):
        # checking version patterns:
        #
        # "headers": { "Server": "IIS(?:/([\\d.]+))?\\;version:\\1" },
        assert (self.detector.check_re(
            self.apps['IIS']['headers_re']['Server'],
            self.apps['IIS']['headers']['Server'],
            'Microsoft-IIS/7.5',
            [], None, 'IIS') == [{'app': 'IIS', 'ver': '7.5'}])
        # (?:maps\\.google\\.com/maps\\?file=api(?:&v=([\\d.]+))?|
        # maps\\.google\\.com/maps/api/staticmap)\\;version:API v\\1
        assert (self.detector.check_re(
            self.apps['Google Maps']['script_re'][0],
            self.apps['Google Maps']['script'][0],
            'abc <script src="maps.google.com/maps?file=api&v=123"> def',
            [], None, 'Google Maps') == [{'app': 'Google Maps', 'ver': 'API v123'}])
        # "script": [ "js/mage", "skin/frontend/(?:default|(enterprise))\\;version:\\1?Enterprise:Community" ],
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/whatever"> def',
            [], None, 'Magento') == [])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/default"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Community'}])
        assert (self.detector.check_re(
            self.apps['Magento']['script_re'][1],
            self.apps['Magento']['script'][1],
            'abc <script src="skin/frontend/enterprise"> def',
            [], None, 'Magento') == [{'app': 'Magento', 'ver': 'Enterprise'}])

    def test_check_url(self):
        # detection purely from the URL itself
        assert self.detector.check_url("http://whatever.blogspot.com") == [{'app': 'Blogger', 'ver': None}]
        assert self.detector.check_url("https://whatever-else3414.de/script.php") == [{'app': 'PHP', 'ver': None}]

    def test_check_html(self):
        content = '<html><div id="gsNavBar" class="gcBorder1">whatever'
        assert self.detector.check_html(content) == [{'app': 'Gallery', 'ver': None}]

    def test_check_meta(self):
        # <meta> detection must tolerate attribute spacing and quoting variants
        assert (self.detector.check_meta('<html> s<meta name="generator" content="Percussion">sssss whatever') ==
                [{'app': 'Percussion', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<meta name = 'cargo_title' dd content = 'Pdafadfda' >") ==
                [{'app': 'Cargo', 'ver': None}])
        assert (self.detector.check_meta(" dcsaasd f<mfffffffeta name='cargo_title' dd content='Pdafadfda' >") == [])
        assert self.detector.check_meta(" dcsaasd f<meta name='cargo_title' >") == []

    def test_check_script(self):
        assert (self.detector.check_script('<html> s<script sda f src = "jquery1.7.js">') ==
                [{'app': 'jQuery', 'ver': None}])
        assert self.detector.check_script(" dcsaasd f<script src='' >") == []

    def test_check_headers(self):
        # here the headers object is expected to expose a .dict mapping
        headers = {
            'Host': 'abc.com',
            'Server': 'Linux Ubuntu 12.10',
        }
        headers_mock = mock.Mock()
        headers_mock.dict = headers
        assert (self.detector.check_headers(headers_mock) == [{'app': 'Ubuntu', 'ver': None}])

    def test_implied_by(self):
        # ASP implies WS and IIS and IIS implies WS;
        # but we already know about IIS, so the only new implied app is WS
        assert self.detector.implied_by(['Microsoft ASP.NET', 'IIS']) == ['Windows Server']

    def test_follow_implies(self):
        # empty findings
        findings = []
        self.detector.follow_implies(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.follow_implies(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # Django CMS implies Django, and Django implies Python - let's see if this chain is followed
        findings = [{'app': 'Django CMS', 'ver': None}]
        self.detector.follow_implies(findings)
        assert (findings == [{'app': 'Django CMS', 'ver': None},
                             {'app': 'Django', 'ver': None},
                             {'app': 'Python', 'ver': None}])

    def test_remove_duplicates(self):
        # remove_duplicates mutates the list in place
        with_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': None}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "222"},
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'E', 'ver': None}, {'app': 'E', 'ver': "1.3"},
            {'app': 'F', 'ver': "2"}, {'app': 'F', 'ver': None},
        ]
        without_duplicates = [
            {'app': 'A', 'ver': None}, {'app': 'B', 'ver': "1.5"},
            {'app': 'C', 'ver': "be"}, {'app': 'D', 'ver': "7.0"},
            {'app': 'E', 'ver': "1.3"}, {'app': 'F', 'ver': "2.2"},
            {'app': 'D', 'ver': "222"},
        ]
        Detector().remove_duplicates(with_duplicates)
        assert with_duplicates == without_duplicates

    def test_excluded_by(self):
        # both 'TYPO3 Flow' and 'TYPO3 Neos' exclude 'TYPO3 CMS'
        assert self.detector.excluded_by(['TYPO3 Flow', 'TYPO3 Neos']) == ['TYPO3 CMS']
        # 'JBoss Web' excludes 'Apache Tomcat'; 'Mambo' excludes 'Joomla'
        assert set(self.detector.excluded_by(['Jo', 'JBoss Web', 'Jetty', 'Mambo'])) == set(['Joomla', 'Apache Tomcat'])
        # 'IIS' doesn't exclude anything
        assert self.detector.excluded_by(['IIS']) == []

    def test_remove_exclusions(self):
        # empty findings
        findings = []
        self.detector.remove_exclusions(findings)
        assert findings == []
        # no implies
        findings = [{'app': 'reCAPTCHA', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert findings == [{'app': 'reCAPTCHA', 'ver': None}]
        # real exclusions
        findings = [{'app': 'JBoss Web', 'ver': None}, {'app': 'Apache Tomcat', 'ver': None},
                    {'app': 'IIS', 'ver': None}, {'app': 'TYPO3 CMS', 'ver': None},
                    {'app': 'TYPO3 Flow', 'ver': None}]
        self.detector.remove_exclusions(findings)
        assert (findings == [{'app': 'JBoss Web', 'ver': None}, {'app': 'IIS', 'ver': None},
                             {'app': 'TYPO3 Flow', 'ver': None}])

    def test_add_categories(self):
        # add_categories annotates each finding with its 'type' slug(s)
        findings = [
            {'app': 'Django CMS', 'ver': None},
            {'app': 'Django', 'ver': None},
            {'app': 'Python', 'ver': '2.7'},
            {'app': 'Dynamicweb', 'ver': 'beta'}]
        original = copy.deepcopy(findings)
        original[0]["type"] = "cms"
        original[1]["type"] = "web-frameworks"
        original[2]["type"] = "programming-languages"
        original[3]["type"] = "cms,ecommerce,analytics"
        self.detector.add_categories(findings)
        assert original == findings

    def test_url_match(self):
        # regexp=None falls back to the default; otherwise match is anchored at the start
        assert self.detector.url_match(url='', regexp=None, default='test') == 'test'
        assert self.detector.url_match(url='example.com', regexp='exampl', default='test') is not None
        assert self.detector.url_match(url='example.com', regexp='ampl', default='test') is None

    def test_expected_url(self):
        # (url, limit-mask, exclude-mask) filtering
        url = "http://site.abc.com/dir/sub/script.php"
        assert self.detector.expected_url(url, None, None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', None)
        assert not self.detector.expected_url(url, 'http://abc.com/', None)
        assert self.detector.expected_url(url, 'http://.*abc.com/', "php")
        assert not self.detector.expected_url(url, 'http://.*abc.com/', ".*php")
        assert self.detector.expected_url(url, None, ".*\.asp")
        assert not self.detector.expected_url(url, None, ".*\.php")

    def test_detect(self):
        # end-to-end detection against canned cern.ch page data; exact
        # (ordered) equality is asserted here, unlike the newer variants
        expected = {
            'http://home.web.cern.ch/': [
                {'app': 'Apache', 'type': 'web-servers', 'ver': None},
                {'app': 'Drupal', 'type': 'cms', 'ver': '7'},
                {'app': 'Lightbox', 'type': 'photo-galleries,javascript-frameworks', 'ver': None},
                {'app': 'jQuery', 'type': 'javascript-frameworks', 'ver': None},
                {'app': 'Google Font API', 'type': 'font-scripts', 'ver': None},
                {'app': u'PHP', 'type': 'programming-languages', 'ver': None}
            ]
        }
        with mock.patch('wad.wad.tools') as mockObj:
            page = mock.Mock()
            page.geturl.return_value = cern_ch_test_data['geturl']
            page.read.return_value = cern_ch_test_data['content']
            headers_mock = mock.Mock()
            headers_mock.dict = cern_ch_test_data['headers']
            page.info.return_value = headers_mock
            mockObj.urlopen = mock.Mock(return_value=page)
            assert self.detector.detect('http://cern.ch') == expected

    def test_detect_multiple(self):
        # None/empty/duplicate URLs must be dropped before detect() is called
        urls_list = ["http://cern.ch", None, "", "http://cern.ch", "example.com"]
        with mock.patch('wad.detection.Detector.detect') as mockObj:
            mockObj.side_effect = [{'test1': 1}, {'test2': 2}]
            assert self.detector.detect_multiple(urls_list) == {'test1': 1, 'test2': 2}
            assert mockObj.call_args_list == [(('example.com', None, None, TIMEOUT),),
                                              (('http://cern.ch', None, None, TIMEOUT),)]