Ejemplo n.º 1
0
 def test_corrupted_ocr(self):
     """Check that converting corrupted files to text yields None.

     Every handle found under the "corrupted" folder of the test data is
     followed and converted; any result other than None fails the test
     with a message naming the offending handle.
     """
     corrupted_dir = os.path.join(test_data_path, "corrupted")
     source = FilesystemSource(corrupted_dir)
     with SourceManager() as manager:
         for handle in source.handles(manager):
             converted = convert(handle.follow(manager), OutputType.Text)
             self.assertEqual(
                 converted, None,
                 "{0}: error handling failed".format(handle))
Ejemplo n.º 2
0
 def test_ocr_conversions(self):
     """Check the text extracted from each file in the "good" folder.

     Each handle is followed and converted to text; the value of that
     conversion must equal expected_result.
     """
     good_dir = os.path.join(test_data_path, "good")
     source = FilesystemSource(good_dir)
     with SourceManager() as manager:
         for handle in source.handles(manager):
             converted = convert(handle.follow(manager), OutputType.Text)
             extracted_text = converted.value
             self.assertEqual(
                 extracted_text, expected_result,
                 "{0}: content failed".format(handle))
Ejemplo n.º 3
0
 def test_size_computation(self):
     """Check the image dimensions computed for every test file.

     Each handle under test_data_path is followed and converted to
     OutputType.ImageDimensions; the value of the conversion must equal
     expected_size. A falsy conversion result is compared as-is, so it
     fails the assertion unless the file is skipped below.
     """
     fs = FilesystemSource(test_data_path)
     with SourceManager() as sm:
         for h in fs.handles(sm):
             resource = h.follow(sm)
             size = convert(resource, OutputType.ImageDimensions)
             if not size:
                 # skipTest raises, so the first failing "rgba32" file
                 # ends the whole test as skipped rather than failed.
                 if "rgba32" in h.relative_path:
                     self.skipTest("Pillow RGBA bug detected -- skipping")
             else:
                 size = size.value
             # Format the handle into the message so that a failure
             # identifies which file produced the wrong size.
             self.assertEqual(
                 size, expected_size, "{0}: size failed".format(h))
Ejemplo n.º 4
0
 def test_eml_files(self):
     """Check that test files become MailSources with MailPartHandles.

     Every handle under test_data_path is turned into a Source, which must
     be a MailSource; every handle that source yields in turn must be a
     MailPartHandle.
     """
     root = FilesystemSource(test_data_path)
     with SourceManager() as manager:
         for file_handle in root.handles(manager):
             mail_source = Source.from_handle(file_handle)
             failure_note = "conversion of {0} to MailSource failed".format(
                 file_handle)
             self.assertIsInstance(mail_source, MailSource, failure_note)
             for part_handle in mail_source.handles(manager):
                 self.assertIsInstance(part_handle, MailPartHandle)
Ejemplo n.º 5
0
    def test_ocr_skip(self):
        """Run the pipeline with images skipped and expect no matches.

        A scan specification for the "ocr/good" test folder is queued on
        "os2ds_scan_specs" with "image/*" listed in "skip_mime_types".
        Every unhandled message must then come from "os2ds_results" and
        must not be marked as matched.
        """
        source_json = FilesystemSource(
            os.path.join(test_data_path, "ocr", "good")).to_json_object()
        rule_json = CPRRule(
            modulus_11=False, ignore_irrelevant=False).to_json_object()
        scan_spec = {
            "scan_tag": {
                "scanner": {"name": "integration_test", "pk": 0},
                "time": "2020-01-01T00:00:00+00:00",
            },
            "source": source_json,
            "rule": rule_json,
            "configuration": {"skip_mime_types": ["image/*"]},
        }

        self.messages.append((scan_spec, "os2ds_scan_specs"))
        self.run_pipeline()

        for message, queue in self.unhandled:
            # Anything outside the results queue is unexpected; fail()
            # raises, so the assertion below only sees result messages.
            if queue != "os2ds_results":
                self.fail("unexpected message in queue {0}".format(queue))
            self.assertFalse(message["matched"],
                             "OCR match found with OCR disabled")
Ejemplo n.º 6
0
    def test_derived_source(self):
        """Check that a Source derived from a handle keeps that handle.

        A FilesystemHandle for the test Zip vector is passed to
        Source.from_handle, and the resulting source's handle attribute
        must not be None.
        """
        with SourceManager():
            zip_handle = FilesystemHandle(
                FilesystemSource(test_data_path),
                "data/engine2/zip-here/test-vector.zip")

            derived = Source.from_handle(zip_handle)
            derived_handle = derived.handle
            self.assertIsNotNone(
                derived_handle,
                "{0}: derived source has no handle".format(derived))
def get_different_filesystemhandle(file_ending, folder_level):
    """Build a FilesystemHandle with a randomly generated relative path.

    The path starts with '/', continues with folder_level folders whose
    names are ten random lowercase ASCII letters each, and ends in a file
    named with one random lowercase letter followed by file_ending. The
    handle belongs to a FilesystemSource for
    "/mnt/fs01.magenta.dk/brugere/af".
    """
    letters = string.ascii_lowercase
    # Folder names are drawn first and the file name last, so a seeded
    # random generator produces the same sequence of choices as before.
    folders = [
        ''.join(random.choice(letters) for _ in range(10))
        for _ in range(folder_level)
    ]
    path = '/' + ''.join(name + '/' for name in folders)
    source = FilesystemSource("/mnt/fs01.magenta.dk/brugere/af")
    file_stem = random.choice(letters)
    return FilesystemHandle(
        source, "{0}{1}{2}".format(path, file_stem, file_ending))
Ejemplo n.º 8
0
 def test_encrypted_zip(self):
     """Follow every handle an encrypted Zip file exposes.

     The file is allowed to expose no ZipHandles at all, but each one it
     does expose must work: it is followed and asked for its type, and
     any exception raised along the way fails the test.
     """
     container = FilesystemHandle(
         FilesystemSource(test_data_path), "encrypted-test-vector.zip")
     encrypted_file = ZipSource(container)
     with SourceManager() as manager:
         for zip_handle in encrypted_file.handles(manager):
             resource = zip_handle.follow(manager)
             resource.compute_type()
Ejemplo n.º 9
0
    def test_corrupted_container(self):
        """Scan the corrupted PDF folder and expect one problem message.

        A scan specification for the "pdf/corrupted" test folder is queued
        on "os2ds_scan_specs". After the pipeline has run there must be
        exactly one unhandled entry, and the "origin" field of its message
        must be "os2ds_problems".
        """
        source_json = FilesystemSource(
            os.path.join(test_data_path, "pdf", "corrupted")).to_json_object()
        rule_json = CPRRule(
            modulus_11=False, ignore_irrelevant=False).to_json_object()
        scan_spec = {
            "scan_tag": "integration_test",
            "source": source_json,
            "rule": rule_json,
            "configuration": {},
        }

        self.messages.append((scan_spec, "os2ds_scan_specs"))
        self.run_pipeline()

        # Dump the leftovers so a failing run shows what was produced.
        print(self.unhandled)

        self.assertEqual(len(self.unhandled), 1)
        first_message = self.unhandled[0][0]
        self.assertEqual(first_message["origin"], "os2ds_problems")
import os.path
import unittest

from os2datascanner.utils.metadata import guess_responsible_party
from os2datascanner.engine2.model.core import Handle, Source, SourceManager
from os2datascanner.engine2.model.file import (FilesystemHandle,
                                               FilesystemSource)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeObjectHandle, LibreOfficeSource)

# Directory that contains this module.
here_path = os.path.dirname(__file__)
# The "data" folder next to this module.
test_data_path = os.path.join(here_path, "data")
# Handle named "embedded-cpr.html" inside the LibreOfficeSource built on the
# "libreoffice/embedded-cpr.odt" file under test_data_path.
test_handle = LibreOfficeObjectHandle(
    LibreOfficeSource(
        FilesystemHandle(FilesystemSource(test_data_path),
                         "libreoffice/embedded-cpr.odt")), "embedded-cpr.html")


class CountingProxy:
    """Proxy that forwards attribute access and counts each forwarded name.

    Attributes that the proxy itself does not define are looked up on the
    wrapped object, and every such lookup is tallied per attribute name.
    The tally is incremented before the lookup is forwarded, so a lookup
    that ends in AttributeError on the wrapped object is counted too.
    """

    def __init__(self, real_handle):
        """Wrap real_handle and start with an empty access tally."""
        self.__attr_accesses = {}
        self._real_handle = real_handle

    def __getattr__(self, attr):
        """Count the access to attr, then return it from the wrapped object.

        Raises AttributeError if the wrapped object lacks attr, or if the
        proxy's own state has not been initialised.
        """
        # __getattr__ only runs when normal lookup has failed. If the name
        # that is missing is part of our own state (the instance was made
        # without __init__, e.g. through __new__), reading that state below
        # would re-enter this method without end; report it as missing.
        if attr in ("_real_handle", "_CountingProxy__attr_accesses"):
            raise AttributeError(attr)
        self.__attr_accesses[attr] = self.get_attr_access_count(attr) + 1
        return getattr(self._real_handle, attr)

    def get_attr_access_count(self, attr):
        """Return how many times attr has been forwarded (0 if never)."""
        return self.__attr_accesses.get(attr, 0)

Ejemplo n.º 11
0
 def from_json_object(obj):
     """Build a FilesystemSource from the "path" entry of obj."""
     # NOTE(review): takes no self/cls; presumably decorated as a static
     # method above the visible lines -- confirm against the full class.
     source_path = obj["path"]
     return FilesystemSource(path=source_path)
Ejemplo n.º 12
0
    def test_sources(self):
        """Check that each example source serialises to the expected URL.

        Every (source, url) pair is checked in its own subTest, labelled
        with the expected URL, by comparing that URL with source.to_url().
        For the SMB sources the expected URLs spell out the constructor's
        user, password and domain arguments in the form
        "domain;user:password@host", with "$" in the share name
        percent-encoded as "%24".
        """
        sources_and_urls = [
            (FilesystemSource("/usr"), "file:///usr"),
            (
                SMBSource("//10.0.0.30/Share$/Documents"),
                "smb://10.0.0.30/Share%24/Documents",
            ),
            (
                SMBSource("//10.0.0.30/Share$/Documents", "FaithfullA"),
                "smb://FaithfullA@10.0.0.30/Share%24/Documents",
            ),
            (
                SMBSource(
                    "//10.0.0.30/Share$/Documents",
                    "FaithfullA",
                    "secretpassword",
                ),
                "smb://FaithfullA:secretpassword@10.0.0.30/Share%24"
                "/Documents",
            ),
            (
                SMBSource(
                    "//10.0.0.30/Share$/Documents",
                    "FaithfullA",
                    "secretpassword",
                    "SYSGRP",
                ),
                "smb://SYSGRP;FaithfullA:secretpassword@10.0.0.30/Share%24"
                "/Documents",
            ),
            (
                SMBSource(
                    "//10.0.0.30/Share$/Documents",
                    "FaithfullA",
                    None,
                    "SYSGRP",
                ),
                "smb://SYSGRP;FaithfullA@10.0.0.30/Share%24/Documents",
            ),
            (
                SMBCSource(
                    "//INT-SRV-01/Q$",
                    "FaithfullA",
                    None,
                    "SYSGRP",
                ),
                "smbc://SYSGRP;FaithfullA@INT-SRV-01/Q%24",
            ),
            (WebSource("http://www.example.com"), "http://www.example.com"),
            (
                SecureWebSource("https://www.example.com"),
                "https://www.example.com",
            ),
            (
                DataSource(b"This is a test", "text/plain"),
                "data:text/plain;base64,VGhpcyBpcyBhIHRlc3Q=",
            ),
        ]

        for source, url in sources_and_urls:
            with self.subTest(url):
                generated_url = source.to_url()
                self.assertEqual(url, generated_url)
Ejemplo n.º 13
0
    def test_json_round_trip(self):
        """Check that handles survive a trip through their JSON form.

        Each example handle is serialised with to_json_object and rebuilt
        with from_json_object; the rebuilt handle must compare equal to
        the original. Every handle runs in its own subTest, and the handle
        and its JSON form are printed along the way.
        """
        example_handles = [
            FilesystemHandle(
                FilesystemSource("/usr/share/common-licenses"), "GPL-3"),
            DataHandle(DataSource(b"Test", "text/plain"), "file"),
            FilteredHandle(
                GzipSource(
                    FilesystemHandle(
                        FilesystemSource("/usr/share/doc/coreutils"),
                        "changelog.Debian.gz",
                    )
                ),
                "changelog.Debian",
            ),
            SMBHandle(
                SMBSource("//SERVER/Resource", "username"),
                "~ocument.docx",
            ),
            SMBCHandle(
                SMBCSource(
                    "//SERVER/Resource", "username", "topsecret",
                    "WORKGROUP8"),
                "~ocument.docx",
            ),
            ZipHandle(
                ZipSource(
                    SMBCHandle(
                        SMBCSource(
                            "//SERVER/Resource", "username",
                            driveletter="W"),
                        "Confidential Documents.zip",
                    )
                ),
                "doc/Personal Information.docx",
            ),
            WebHandle(
                WebSource("https://secret.data.invalid/"),
                "lottery-numbers-for-next-week.txt",
            ),
            TarHandle(
                TarSource(
                    FilesystemHandle(
                        FilesystemSource("/home/user"),
                        "Downloads/data.tar.gz",
                    )
                ),
                "data0.txt",
            ),
            MailPartHandle(
                MailSource(
                    EWSMailHandle(
                        EWSAccountSource(
                            domain="cloudy.example",
                            server=CLOUD,
                            admin_user="******",
                            admin_password="******",
                            user="******",
                        ),
                        "SW5ib3hJRA==.TWVzc2dJRA==",
                        "Re: Castles in the sky",
                    )
                ),
                "1/pictograph.jpeg",
                "image/jpeg",
            ),
            PDFObjectHandle(
                PDFPageSource(
                    PDFPageHandle(
                        PDFSource(
                            FilesystemHandle(
                                FilesystemSource("/home/kiddw/Documents"),
                                "1699 Gardiners trip/treasure_map.pdf",
                            )
                        ),
                        "10",
                    )
                ),
                "X-marks-the-spot_000-0.png",
            ),
            LibreOfficeObjectHandle(
                LibreOfficeSource(
                    FilesystemHandle(
                        FilesystemSource("/media/user/USB STICK"),
                        "What I Did On My Holidays.doc",
                    )
                ),
                "What I Did On My Holidays.html",
            ),
        ]

        for handle in example_handles:
            with self.subTest(handle):
                serialised = handle.to_json_object()
                print(handle)
                print(serialised)
                restored = handle.from_json_object(serialised)
                self.assertEqual(handle, restored)
                print("--")
# Timestamp used by scan_tag2 below (time0 and time1 are defined earlier).
time2 = "2020-10-28T14:36:20+01:00"
# Three scan tags that name the same scanner and differ only in their time.
scan_tag0 = {
    "scanner": "Dummy test scanner",
    "time": time0
}
scan_tag1 = {
    "scanner": "Dummy test scanner",
    "time": time1
}
scan_tag2 = {
    "scanner": "Dummy test scanner",
    "time": time2
}

# Handle and rules shared by the example messages in this module.
common_handle = FilesystemHandle(
        FilesystemSource("/mnt/fs01.magenta.dk/brugere/af"),
        "OS2datascanner/Dokumenter/Verdensherredømme - plan.txt")
common_rule = RegexRule("Vores hemmelige adgangskode er",
                        sensitivity=Sensitivity.WARNING)
dimension_rule = DimensionsRule()


# Template scan specification; its scan_tag is left as None here and filled
# in per message with _replace(scan_tag=...).
common_scan_spec = messages.ScanSpecMessage(
        scan_tag=None, # placeholder
        source=common_handle.source,
        rule=common_rule,
        configuration={},
        progress=None)

positive_match = messages.MatchesMessage(
        scan_spec=common_scan_spec._replace(scan_tag=scan_tag0),
Ejemplo n.º 15
0
                                                        MSGraphFileHandle)

from os2datascanner.engine2.model.derived.filtered import (GzipSource,
                                                           FilteredHandle)
from os2datascanner.engine2.model.derived.libreoffice import (
    LibreOfficeSource, LibreOfficeObjectHandle)
from os2datascanner.engine2.model.derived.mail import (MailSource,
                                                       MailPartHandle)
from os2datascanner.engine2.model.derived.pdf import (PDFSource, PDFPageHandle,
                                                      PDFPageSource,
                                                      PDFObjectHandle)
from os2datascanner.engine2.model.derived.tar import TarSource, TarHandle
from os2datascanner.engine2.model.derived.zip import ZipSource, ZipHandle

example_handles = [
    FilesystemHandle(FilesystemSource("/usr/share/common-licenses"), "GPL-3"),
    DataHandle(DataSource(b"Test", "text/plain"), "file"),
    FilteredHandle(
        GzipSource(
            FilesystemHandle(FilesystemSource("/usr/share/doc/coreutils"),
                             "changelog.Debian.gz")), "changelog.Debian"),
    SMBHandle(SMBSource("//SERVER/Resource", "username"), "~ocument.docx"),
    SMBCHandle(
        SMBCSource("//SERVER/Resource", "username", "topsecret", "WORKGROUP8"),
        "~ocument.docx"),
    ZipHandle(
        ZipSource(
            SMBCHandle(
                SMBCSource("//SERVER/Resource", "username", driveletter="W"),
                "Confidential Documents.zip")),
        "doc/Personal Information.docx"),
Ejemplo n.º 16
0
    Handle,
    SourceManager,
    UnknownSchemeError,
    DeserialisationError,
)

# Are we running from a console? Then __file__ is undefined and evaluating it
# raises NameError; fall back to the resolved path of "./derived.py". Only
# NameError is caught so that e.g. KeyboardInterrupt is not swallowed.
try:
    __file__
except NameError:
    __file__ = str(Path("./derived.py").resolve())

# "data/files" folder located one directory above the one holding this file.
datadir = (Path(__file__).parents[1] / "data/files").resolve()
# NOTE(review): datadir is already resolved, so absolute() looks redundant;
# fwd is kept as a separate module-level name.
fwd = datadir.absolute()

# Handle for a plain file directly in the data folder.
testfile = FilesystemHandle(FilesystemSource(fwd), "test.txt")

# Chain for a single-level Zip: source -> Zip file handle -> Zip source ->
# handle for the text file inside the Zip.
fs = FilesystemSource(fwd)
fh = FilesystemHandle(fs, "cpr-test-single.zip")
zs = ZipSource(fh)
zh = ZipHandle(zs, "cpr-test-single.txt")

# Zip source whose two handles below name further ".zip" entries.
zsm = ZipSource(
    FilesystemHandle(FilesystemSource(fwd), "cpr-test-multiple.zip"))
zhm1 = ZipHandle(zsm, "cpr-test/cpr-test2.zip")
zhm2 = ZipHandle(zsm, "cpr-test/cpr-test3.zip")

# Zip source derived from the first nested Zip handle, plus a handle for a
# text file inside it.
zsmd1 = ZipSource(zhm1)
zhmd1 = ZipHandle(zsmd1, "cpr2-test.txt")

# Module-level SourceManager, created without a "with" block.
sm = SourceManager()
Ejemplo n.º 17
0
 def test_relative_filesystemsource(self):
     """Constructing a FilesystemSource from this path must raise ValueError."""
     self.assertRaises(
         ValueError, FilesystemSource, "../../projects/admin/tests/data/")
Ejemplo n.º 18
0
 def test_relative_filesystemsource(self):
     """Constructing a FilesystemSource from "data/" must raise ValueError."""
     self.assertRaises(ValueError, FilesystemSource, "data/")