def test_simple_regex_match(self):
    """Feed a scan specification for data_url through the pipeline and
    check that the RegexRule produces the expected matches."""
    print(Source.from_url(data_url).to_json_object())
    scan_spec = {
        "scan_tag": {
            "scanner": {
                "name": "integration_test",
                "pk": 0
            },
            "time": "2020-01-01T00:00:00+00:00"
        },
        "source": Source.from_url(data_url).to_json_object(),
        "rule": rule.to_json_object()
    }
    self.messages.append((scan_spec, "os2ds_scan_specs",))
    self.run_pipeline()

    # Exactly two result messages should come out the other end
    self.assertEqual(len(self.unhandled), 2)
    by_origin = {body["origin"]: body for body, _ in self.unhandled}
    self.assertTrue(
            by_origin["os2ds_matches"]["matched"],
            "RegexRule match failed")
    self.assertEqual(
            by_origin["os2ds_matches"]["matches"],
            expected_matches,
            "RegexRule match did not produce expected result")
def test_simple_regex_match(self):
    """Publish a scan spec for data_url to the runner's queue and verify
    that the collected match message reports the expected RegexRule
    matches."""
    print(Source.from_url(data_url).to_json_object())
    obj = {
        "scan_tag": {
            "scanner": {
                "name": "integration_test",
                "pk": 0
            },
            "time": "2020-01-01T00:00:00+00:00"
        },
        "source": Source.from_url(data_url).to_json_object(),
        "rule": rule.to_json_object()
    }
    self.runner.channel.basic_publish(
            exchange='',
            routing_key="os2ds_scan_specs",
            body=dumps(obj).encode())
    try:
        self.runner.run_consumer()
    except StopHandling:
        # Fixed: the caught exception was bound ("as e") but never used.
        # NOTE(review): these assertions only run if StopHandling is
        # raised; presumably run_consumer() can't return normally —
        # confirm against the runner implementation.
        self.assertTrue(
                self.runner.messages["os2ds_matches"]["matched"],
                "RegexRule match failed")
        self.assertEqual(
                self.runner.messages["os2ds_matches"]["matches"],
                expected_matches,
                "RegexRule match did not produce expected result")
def test_simple_regex_match(self):
    """Publish a scan spec for data_url directly on the channel and
    verify the two messages collected from os2ds_results."""
    print(Source.from_url(data_url).to_json_object())
    obj = {
        "scan_tag": "integration_test",
        "source": Source.from_url(data_url).to_json_object(),
        "rule": rule.to_json_object()
    }
    self.channel.basic_publish(
            exchange='',
            routing_key="os2ds_scan_specs",
            body=dumps(obj).encode())

    messages = {}

    def result_received(channel, method, properties, body):
        # Fixed: parameters were opaquely named (a, b, c, d); these are
        # the standard pika consumer callback arguments
        decoded = loads(body.decode("utf-8"))
        messages[decoded["origin"]] = decoded
        if len(messages) == 2:
            # Stop consuming once both expected messages have arrived
            raise StopHandling()

    self.channel.basic_consume("os2ds_results", result_received)
    try:
        self.channel.start_consuming()
    except StopHandling:
        # Fixed: dropped the unused "as e" binding
        self.assertTrue(
                messages["os2ds_matches"]["matched"],
                "RegexRule match failed")
        self.assertEqual(
                messages["os2ds_matches"]["matches"],
                expected_matches,
                "RegexRule match did not produce expected result")
def process(self, source, sm, depth=0):
    """Recursively walk every handle of *source*, printing an indented
    tree as it goes.  Handles that can be reinterpreted as Sources are
    recursed into; leaf test vectors are checked for consistency
    between their stream- and file-based access paths and against the
    expected content (self.correct_content).
    """
    if depth == 0:
        # Only a top-level Source should lack a backing handle
        self.assertIsNone(
                source.handle,
                "{0}: unexpected backing handle".format(source))
    for handle in source.handles(sm):
        print("{0}{1}".format(" " * depth, handle))
        guessed = Source.from_handle(handle)
        computed = Source.from_handle(handle, sm)
        if computed or guessed:
            # This handle is itself a container: explore it, preferring
            # the SourceManager-assisted interpretation
            self.process(computed or guessed, sm, depth + 1)
        elif handle.name == "url":
            # An indirection: the object's content is a URL pointing at
            # the real Source to explore
            with handle.follow(sm).make_stream() as fp:
                url = fp.read().decode("utf-8")
            self.process(Source.from_url(url), sm, depth + 1)
        elif handle.name == "test-vector" or isinstance(
                source, DataSource):
            r = handle.follow(sm)
            self.assertTrue(r.check(), "check() method failed")
            reported_size = r.get_size()
            last_modified = r.get_last_modified()
            # Read the object twice: once as a stream...
            with r.make_stream() as fp:
                stream_raw = fp.read()
                stream_size = len(stream_raw)
                stream_content = stream_raw.decode("utf-8")
            # ...and once through a filesystem path
            with r.make_path() as p:
                with open(p, "rb") as fp:
                    file_raw = fp.read()
                    file_size = len(file_raw)
                    file_content = file_raw.decode("utf-8")
            self.assertIsInstance(
                    last_modified, SingleResult,
                    ("{0}: last modification date is not a"
                     " SingleResult").format(handle))
            self.assertIsInstance(
                    last_modified.value, datetime,
                    # Fixed: missing space between the concatenated
                    # fragments produced "...not adatetime.datetime"
                    ("{0}: last modification date value is not a"
                     " datetime.datetime").format(handle))
            self.assertIsInstance(
                    reported_size, SingleResult,
                    ("{0}: resource length is not a"
                     " SingleResult").format(handle))
            self.assertEqual(
                    stream_size, reported_size.value,
                    "{0}: model stream length invalid".format(handle))
            self.assertEqual(
                    file_size, reported_size.value,
                    # Fixed copy-pasted message: this assertion checks
                    # the *file* length, not the stream length
                    "{0}: model file length invalid".format(handle))
            self.assertEqual(
                    file_raw, stream_raw,
                    "{0}: model file and stream not equal".format(handle))
            self.assertEqual(
                    stream_content, self.correct_content,
                    "{0}: model stream invalid".format(handle))
            self.assertEqual(
                    file_content, self.correct_content,
                    "{0}: model file invalid".format(handle))
def test_incomplete_json(self):
    """JSON objects missing their "type" key must fail to deserialise."""
    incomplete_source = {"hostname": "gopher.invalid"}
    with self.assertRaises(DeserialisationError):
        Source.from_json_object(incomplete_source)

    incomplete_handle = {
        "source": {
            "type": "gopher",
            "hostname": "gopher.invalid"
        },
        "path": "/Reference"
    }
    with self.assertRaises(DeserialisationError):
        Handle.from_json_object(incomplete_handle)
def test_invalid_json(self):
    """JSON objects with an unregistered "type" must raise
    UnknownSchemeError on deserialisation."""
    bogus_source = {
        "type": "gopher",
        "hostname": "gopher.invalid"
    }
    with self.assertRaises(UnknownSchemeError):
        Source.from_json_object(bogus_source)

    bogus_handle = {
        "type": "gopher",
        "source": {
            "type": "gopher",
            "hostname": "gopher.invalid"
        },
        "path": "/Reference"
    }
    with self.assertRaises(UnknownSchemeError):
        Handle.from_json_object(bogus_handle)
def test_corrupted_doc(self):
    """A truncated/unrecognisable CDFV2 document should expose no
    handles at all rather than raising."""
    doc_path = os.path.join(
            test_data_path, "msoffice/corrupted/test.trunc.doc")
    corrupted_doc = Source.from_handle(
            FilesystemHandle.make_handle(doc_path))
    with SourceManager() as sm:
        self.assertEqual(
            list(corrupted_doc.handles(sm)),
            [],
            "unrecognised CDFV2 document should be empty and wasn't")
def test_derived_source(self):
    """A Source derived from a Handle must remember that Handle."""
    with SourceManager() as sm:
        base = FilesystemSource(test_data_path)
        zip_handle = FilesystemHandle(
                base, "data/engine2/zip-here/test-vector.zip")
        derived = Source.from_handle(zip_handle)
        self.assertIsNotNone(
            derived.handle,
            "{0}: derived source has no handle".format(derived))
def try_apply(sm, source):
    """Explore *source* depth-first and return the text representation
    of the first leaf object encountered.

    Handles that can be reinterpreted as Sources (archives, mails, ...)
    are recursed into; the first non-container handle is converted to
    OutputType.Text and its value returned.  Returns None implicitly if
    no leaf is found.
    """
    for handle in source.handles(sm):
        derived = Source.from_handle(handle, sm)
        if derived:
            # Fixed: the recursive call's result was previously
            # discarded, losing any content found inside derived
            # sources (compare the generator variant, which uses
            # "yield from")
            result = try_apply(sm, derived)
            if result is not None:
                return result
        else:
            resource = handle.follow(sm)
            # NOTE(review): convert() may return None for unsupported
            # types elsewhere in this codebase — confirm whether a
            # guard is needed here before .value is taken
            representation = convert(resource, OutputType.Text)
            return representation.value
def test_libreoffice_size(self):
    """HTML rendered by LibreOffice for a pathological spreadsheet must
    stay under 1 MiB."""
    doc_path = os.path.join(
            test_data_path, "libreoffice/html-explosion.ods")
    large_doc = Source.from_handle(
            FilesystemHandle.make_handle(doc_path))
    with SourceManager() as sm:
        for handle in large_doc.handles(sm):
            if not handle.name.endswith(".html"):
                continue
            resource = handle.follow(sm)
            self.assertLess(
                    resource.get_size().value,
                    1048576,
                    "LibreOffice HTML output was too big")
def test_smbc_url(self):
    """Explore a test Samba share, skipping when the server is absent."""
    with SourceManager() as sm:
        source = Source.from_url(
            "smbc://*****:*****@samba/general")
        # Probe for the first handle; a failure here means the test
        # server isn't reachable, not that the model is broken
        try:
            with contextlib.closing(source.handles(sm)) as probe:
                next(probe)
        except Exception:
            self.skipTest("test Samba server not up (not running in CI?)")
        self.process(source, sm)
def try_apply(sm, source, rule):
    """Explore *source* depth-first, yielding every match produced by
    applying *rule* to each leaf object's converted representation."""
    for handle in source.handles(sm):
        derived = Source.from_handle(handle, sm)
        if derived:
            # Container handle: descend and forward its matches
            yield from try_apply(sm, derived, rule)
            continue
        resource = handle.follow(sm)
        representation = convert(resource, rule.operates_on)
        if representation:
            yield from rule.match(representation.value)
def test_eml_files(self):
    """Every file under test_data_path should reinterpret as a
    MailSource, and each of its parts as a MailPartHandle."""
    fs = FilesystemSource(test_data_path)
    with SourceManager() as sm:
        for file_handle in fs.handles(sm):
            mail_source = Source.from_handle(file_handle)
            self.assertIsInstance(
                mail_source,
                MailSource,
                "conversion of {0} to MailSource failed".format(
                        file_handle))
            for part in mail_source.handles(sm):
                self.assertIsInstance(part, MailPartHandle)
def handle(self, **kwargs):
    """Print an exploration of every given URL, silently skipping URLs
    whose scheme isn't recognised."""
    guess = kwargs['guess']
    summarise = kwargs['summarise']
    with SourceManager() as sm:
        for url in kwargs['urls']:
            try:
                source = Source.from_url(url)
                url_explorer.print_source(
                        sm, source, guess=guess, summarise=summarise)
            except UnknownSchemeError:
                # Unsupported scheme: move on to the next URL
                pass
def test_simple_regex_match(self):
    """Publish a scan spec for data_url through the runner's channel,
    consume (and acknowledge) the two result messages, and verify the
    RegexRule matches."""
    print(Source.from_url(data_url).to_json_object())
    obj = {
        "scan_tag": {
            "scanner": {
                "name": "integration_test",
                "pk": 0
            },
            "time": "2020-01-01T00:00:00+00:00"
        },
        "source": Source.from_url(data_url).to_json_object(),
        "rule": rule.to_json_object()
    }
    self.runner.channel.basic_publish(
            exchange='',
            routing_key="os2ds_scan_specs",
            body=dumps(obj).encode())

    messages = {}

    def result_received(channel, method, properties, body):
        # Acknowledge receipt so the broker won't redeliver
        channel.basic_ack(method.delivery_tag)
        # Fixed: don't rebind the "body" parameter; decode into a new
        # local instead
        decoded = loads(body.decode("utf-8"))
        messages[decoded["origin"]] = decoded
        if len(messages) == 2:
            raise StopHandling()

    self.runner.channel.basic_consume("os2ds_results", result_received)
    try:
        self.runner.run_consumer()
    except StopHandling:
        # Fixed: dropped the unused "as e" binding
        self.assertTrue(
                messages["os2ds_matches"]["matched"],
                "RegexRule match failed")
        self.assertEqual(
                messages["os2ds_matches"]["matches"],
                expected_matches,
                "RegexRule match did not produce expected result")
def test_invalid_url(self):
    """A nonsense string is rejected with UnknownSchemeError."""
    not_a_url = "Well, this just isn't a URL at all!"
    with self.assertRaises(UnknownSchemeError):
        Source.from_url(not_a_url)
def test_handles_failure(self):
    """Enumerating handles for an unresolvable host must raise."""
    with self.assertRaises(Exception):
        with SourceManager() as sm:
            bad_source = Source.from_url("http://example.invalid./")
            with contextlib.closing(bad_source.handles(sm)) as generator:
                # Forcing the first handle triggers the network failure
                next(generator)
def test_invalid_scheme(self):
    """A URL with an unregistered scheme raises UnknownSchemeError."""
    bad_url = "xxx-invalid://data/20"
    with self.assertRaises(UnknownSchemeError):
        Source.from_url(bad_url)
def test_local_url(self):
    """Run the generic exploration over the local test data folder."""
    with SourceManager() as sm:
        file_url = "file://" + test_data_path
        self.process(Source.from_url(file_url), sm)
"name": "test.txt", } # json `data` content needs to be base64 encoded json_gzip = { "type": "data", "content": b64encode(gzip_content), "mime": "application/gzip", "name": "test.txt", } for j in ( json_data, json_gzip, ): s = Source.from_json_object(j) while True: h_generator = s.handles(sm) h = next(h_generator) r = h.follow(sm) #if h.guess_type() == "text/plain": print(f"handle\t{h}") print(f"resource\t{r}") print("raw content:") with r.make_stream() as fp: print("\t\t{0}".format(fp.read())) # should succed for text -> text conversion try:
def run_rule_on_handle(self, handle):
    """Reinterpret *handle* as a Source and run the rule over it,
    failing if the reinterpretation isn't possible."""
    with SourceManager() as sm:
        source = Source.from_handle(handle, sm)
        failure_msg = "{0} couldn't be made into a Source".format(handle)
        self.assertIsNotNone(source, failure_msg)
        self.run_rule(source, sm)
# Minimal demand-scan specification: a case-insensitive regex rule run
# over a tiny inline base64-encoded "data" source.
body = {
    "rule": {
        "type": "regex",
        "expression": "[Tt]est"
    },
    "source": {
        "type": "data",
        # "This is only a test", base64-encoded
        "content": "VGhpcyBpcyBvbmx5IGEgdGVzdA==",
        "mime": "text/plain",
        "name": "test.txt"
    }
}
source = Source.from_json_object(body["source"])
# type_label of the outermost Source behind this one (here "data");
# presumably used for bookkeeping by later code — verify against caller
top_type = _get_top(source).type_label
rule = Rule.from_json_object(body["rule"])
# Wrap source and rule in a complete ScanSpecMessage (serialised to
# JSON), tagged as an API-server demand scan with the current time
message = messages.ScanSpecMessage(scan_tag=messages.ScanTagFragment(
        time=time_now(),
        user=None,
        scanner=messages.ScannerFragment(
                pk=0, name="API server demand scan"),
        organisation=messages.OrganisationFragment(
                name="API server", uuid=uuid4())),
        source=source, rule=rule, configuration={},
        progress=None).to_json_object()
def get_content_from_handle(handle):
    """Reinterpret *handle* as a Source and return its extracted text
    content via try_apply.

    Raises AssertionError if the handle cannot be made into a Source.
    """
    with SourceManager() as sm:
        source = Source.from_handle(handle, sm)
        # Fixed typo in the failure message ("cound" -> "could")
        assert source is not None, f"{handle} could not be made into a Source"
        return try_apply(sm, source)
with r.make_stream() as fp: content = fp.read() # same as r.compute_type() implemented in FileResource # we could only read the first 512 bytes to get mime type mtype = mime.from_buffer(content) with open(fname, 'wb') as fh: fh.write(content) # To see how the pipeline can work with data sources of all kinds without # knowing what they are, we can try working with the JSON form of ToySource: from os2datascanner.engine2.model.core import Source, SourceManager sm = SourceManager() generic_source = Source.from_json_object({ "type": "toy", "username": "******", "password": "******" }) print([h.relative_path for h in generic_source.handles(sm)]) """ The description of Handles earlier glossed them as references to "objects". But what is an object? To some extent this depends on the Source. In a filesystem, an object is a file: a named stream of bytes with some metadata. In an email account, an object is an email. In a case management system, an object is a case. But sometimes the lines are blurrier than that. For example, consider a Zip file. It is a file: it's a stream of bytes with a name, a size, and some metadata. It can also, however, be viewed as a container for other files, each of which in turn also has these properties.