def identify_rfu(tempdir): """ Given a temporary directory, attempts to distinguish CMS' from non-CMS websites and from each other. If a single CMS file is identified, then no hashing is performed and the file is assumed to be of that particular CMS. False positives will be weeded during the version detection phase. If all files requested were responded with 200 OK, the site is discarded. This is a design decision I might reconsider if it results in too many false negatives. @param tempfile: as returned by download_rfu. @return: DeferredList """ rfu = pu.get_rfu() plugins = pu.plugins_base_get() files_found = async.rfu_path(tempdir, plugins) if len(rfu) == len(files_found): msg = "Url responded 200 OK to everything" return defer.fail(UnknownCMSException(msg)) cms_name = identify_rfu_easy(tempdir, files_found) if cms_name: return defer.succeed(cms_name) return defer.fail(UnknownCMSException("This shouldn't happen too often."))
def download_rfu(base_url, host_header): """ Download all "regular file urls" for all CMS. @param base_url: @param host_header: @return DeferredList """ def ret_result(results, tempdir, location): succ = filter(lambda r: r[0], results) if len(succ) == 0: msg = "'%s' not identified as any CMS." return Failure(UnknownCMSException(msg % str(location))) else: return tempdir def clean(fail, tempdir): delete_tempdir(tempdir) return fail tempdir = mkdtemp(prefix='dscan') + "/" required_files = pu.get_rfu() ds = [] for f in required_files: url = base_url + f download_location = tempdir + async.filename_encode(f) d = async.download_url(url, host_header, download_location) ds.append(d) dl = defer.DeferredList(ds, consumeErrors=True) dl.addCallback(ret_result, tempdir, (base_url, host_header)) dl.addErrback(clean, tempdir) return dl
def identify_rfu(tempdir): """ Given a temporary directory, attempts to distinguish CMS' from non-CMS websites and from each other. If a single CMS file is identified, then no hashing is performed and the file is assumed to be of that particular CMS. False positives will be weeded during the version detection phase. If all files requested were responded with 200 OK, the site is discarded. This is a design decision I might reconsider if it results in too many false negatives. @param tempfile: as returned by download_rfu. @return: DeferredList """ rfu = pu.get_rfu() plugins = pu.plugins_base_get() files_found = async .rfu_path(tempdir, plugins) if len(rfu) == len(files_found): msg = "Url responded 200 OK to everything" return defer.fail(UnknownCMSException(msg)) cms_name = identify_rfu_easy(tempdir, files_found) if cms_name: return defer.succeed(cms_name) return defer.fail(UnknownCMSException("This shouldn't happen too often."))
def download_rfu(base_url, host_header): """ Download all "regular file urls" for all CMS. @param base_url: @param host_header: @return DeferredList """ def ret_result(results, tempdir, location): succ = filter(lambda r: r[0], results) if len(succ) == 0: msg = "'%s' not identified as any CMS." return Failure(UnknownCMSException(msg % str(location))) else: return tempdir def clean(fail, tempdir): delete_tempdir(tempdir) return fail tempdir = mkdtemp(prefix='dscan') + "/" required_files = pu.get_rfu() ds = [] for f in required_files: url = base_url + f download_location = tempdir + async .filename_encode(f) d = async .download_url(url, host_header, download_location) ds.append(d) dl = defer.DeferredList(ds, consumeErrors=True) dl.addCallback(ret_result, tempdir, (base_url, host_header)) dl.addErrback(clean, tempdir) return dl
def test_identify_calls_all_rfu(self): rfu = pu.get_rfu() with patch(ASYNC + 'download_url', autospec=True) as du: identify_url(self.base_url, None) self.assertEquals(du.call_count, len(rfu)) for i, call in enumerate(du.call_args_list): args, kwargs = call self.assertEquals(args[0], self.base_url + rfu[i]) self.assertTrue(args[2].endswith(async.filename_encode(rfu[i])))
def test_identify_calls_all_rfu(self): rfu = pu.get_rfu() with patch(ASYNC + 'download_url', autospec=True) as du: identify_url(self.base_url, None) self.assertEquals(du.call_count, len(rfu)) for i, call in enumerate(du.call_args_list): args, kwargs = call self.assertEquals(args[0], self.base_url + rfu[i]) self.assertTrue(args[2].endswith(async .filename_encode( rfu[i])))
def test_identify_rfu_single_file(self): rfu = pu.get_rfu() fake_dir = '/tmp/dsadasdadaa/' joomla_file = fake_dir + async.filename_encode("media/system/js/validate.js") def isfile(path): if path == joomla_file: return True else: return False with patch("os.path.isfile", side_effect=isfile, autospec=True) as if_mock: d = identify_rfu(fake_dir) cms_name = self.successResultOf(d) self.assertEquals(cms_name, "joomla") self.assertEquals(if_mock.call_count, len(rfu))
def test_identify_raises_when_none_found(self, rt, mt, isdir):
    """When every download fails, identify_url must fail with
    UnknownCMSException, never reach identify_rfu, and remove the tempdir
    it created."""
    ret = '/tmp/lelelellee'
    mt.return_value = ret

    def fail(*args, **kwargs):
        # `f` is provided by the enclosing test fixture — presumably a
        # Failure factory; TODO confirm against the test class.
        return f()

    # NOTE(review): `rfu` is never used below — kept for parity with the
    # sibling tests; consider removing.
    rfu = pu.get_rfu()
    with patch(ASYNC + 'download_url', side_effect=fail,
               autospec=True) as du:
        with patch(ASYNC_SCAN + 'identify_rfu') as ir:
            self.assertFailure(identify_url(self.base_url, None),
                               UnknownCMSException)
            self.assertEqual(ir.call_count, 0)

    args, kwargs = rt.call_args
    # assertEquals is a deprecated alias; assertEqual is preferred.
    self.assertEqual(rt.call_count, 1)
    self.assertEqual(mt.call_count, 1)
    self.assertEqual(args[0], ret + "/")
def test_identify_rfu_single_file(self): rfu = pu.get_rfu() fake_dir = '/tmp/dsadasdadaa/' joomla_file = fake_dir + async .filename_encode( "media/system/js/validate.js") def isfile(path): if path == joomla_file: return True else: return False with patch("os.path.isfile", side_effect=isfile, autospec=True) as if_mock: d = identify_rfu(fake_dir) cms_name = self.successResultOf(d) self.assertEquals(cms_name, "joomla") self.assertEquals(if_mock.call_count, len(rfu))