Example #1
0
 def test_expected_compression_results(self):
     """ Pin the compressed length of a handful of known inputs.

         The expected lengths depend on the compression algorithm and its decoding table, so
         they have to be updated whenever either of those changes. """
     expected_lengths = (
         ('thethethe', 3),
         ('thewhich', 2),
         ('123thewhich123', 12),
         ('not-a-g00d-Exampl333', 20),
     )
     for text, expected in expected_lengths:
         self.assertEqual(len(compress(text)), expected)
Example #2
0
 def test_expected_compression_results(self):
     """ Pin the compressed length of a handful of known inputs.

         The expected lengths depend on the compression algorithm and its decoding table, so
         they have to be updated whenever either of those changes. """
     expected_lengths = (
         ("thethethe", 3),
         ("thewhich", 2),
         ("123thewhich123", 12),
         ("not-a-g00d-Exampl333", 20),
     )
     for text, expected in expected_lengths:
         self.assertEqual(len(compress(text)), expected)
Example #3
0
    def cycle(
        self, input_str, quiet=False, compress_tree=None, decompress_table=None, show_input_and_output=True, strict=True
    ):
        """ Exercise a complete compress -> decompress cycle and assert that it round-trips.

        The input is compressed three ways (plain, with back-tracking, and with the classic
        compressor), each result is decompressed again, and all three must equal ``input_str``.
        Unless ``quiet`` is set or the input is empty, the compression ratios are printed next to
        those of zlib and bz2 for comparison.

        :param input_str: text to push through the cycle
        :param quiet: suppress all printing (this also skips the back-tracking size assertion,
            which lives inside the reporting branch)
        :param compress_tree: forwarded to ``compress`` as ``compression_tree``
        :param decompress_table: forwarded to ``decompress`` for the plain and back-tracked results
        :param show_input_and_output: also print the decompressed and compressed text
        :param strict: forwarded to ``compress`` as ``check_ascii``
        """
        compressed_text = compress(input_str, compression_tree=compress_tree, backtracking=False, check_ascii=strict)
        backtracked_compressed_text = compress(
            input_str, compression_tree=compress_tree, backtracking=True, check_ascii=strict
        )
        decompressed_text = decompress(compressed_text, decompress_table=decompress_table)
        backtracked_decompressed_text = decompress(backtracked_compressed_text, decompress_table=decompress_table)
        classic_compressed_text = compress_classic(input_str)
        classic_decompressed_text = decompress(classic_compressed_text)

        def size_ratio(compressed):
            # Compressed size expressed as a fraction of the input size.
            return 1.0 / (float(len(input_str)) / float(len(compressed)))

        if not quiet and input_str:
            backtracking_differs = backtracked_compressed_text != compressed_text

            print("---------------------------------------------------------------------")
            if show_input_and_output:
                print(decompressed_text)
                print(compressed_text)
                if backtracking_differs:
                    print("--back tracked:--")
                    print(backtracked_compressed_text)

            ratio = size_ratio(compressed_text)
            b_ratio = size_ratio(backtracked_compressed_text)
            c_ratio = size_ratio(classic_compressed_text)

            # bz2 and zlib only accept bytes; on Python 3 passing text raises TypeError, so text
            # is encoded first. Input that is already bytes is passed through untouched.
            raw_input = input_str if isinstance(input_str, bytes) else input_str.encode("utf-8")
            bz2c = bz2.compress(raw_input)
            zlibc = zlib.compress(raw_input, 9)
            bz2ratio = size_ratio(bz2c)
            zlibratio = size_ratio(zlibc)

            if backtracking_differs:
                print(
                    "backtracked compression ratio 1:%f (%.2f%%) from %d bytes to %d bytes"
                    % (b_ratio, b_ratio * 100.0, len(input_str), len(backtracked_compressed_text))
                )
                # Report the two sizes actually being compared: back-tracked first, plain second.
                self.assertTrue(
                    len(compressed_text) >= len(backtracked_compressed_text),
                    "Back-tracking (%d) should always be better than not-backtracking (%d)"
                    % (len(backtracked_compressed_text), len(compressed_text)),
                )
            print(
                "compression ratio 1:%f (%.2f%%) from %d bytes to %d bytes"
                % (ratio, ratio * 100.0, len(input_str), len(compressed_text))
            )
            print(" vs ")
            print("  zlib ratio 1:%f (%.2f%%) to %d bytes" % (zlibratio, zlibratio * 100.0, len(zlibc)))
            print("  bz2 ratio 1:%f (%.2f%%) to %d bytes" % (bz2ratio, bz2ratio * 100.0, len(bz2c)))
            print(
                "  smaz classic 1:%f (%.2f%%) to %d bytes" % (c_ratio, c_ratio * 100.0, len(classic_compressed_text))
            )

        self.assertEqual(input_str, decompressed_text)
        self.assertEqual(input_str, backtracked_decompressed_text)
        self.assertEqual(input_str, classic_decompressed_text)
Example #4
0
def encrypt(text_to_encrypt, encryption_base):
    """Compress ``text_to_encrypt`` and render the result as a base-``encryption_base`` string.

    The text is compressed with smaz (falling back to zlib when smaz yields an empty
    result). Every byte of the compressed payload is treated as a zero-padded three-digit
    decimal group, the groups are read as one big integer, and that integer is written in
    the requested base using consecutive characters starting at ``"0"`` (code point 48).

    NOTE(review): despite the name, no key is involved - this is an encoding, not encryption.

    Args:
        text_to_encrypt: Value to encode; it is converted with ``str()`` first.
        encryption_base: Number of consecutive characters to use as digits.

    Returns:
        The encoded string; empty when the payload's numeric value is zero.

    Raises:
        ValueError: If a non-zero payload has to be written with fewer than two digit
            characters (one digit would never terminate, none would divide by zero).
    """
    digits = []
    for code_point in range(48, 48 + encryption_base):
        try:
            # The UTF-8 round trip rejects characters that cannot be encoded (lone
            # surrogates); those are deliberately left out of the digit alphabet.
            digits.append(chr(code_point).encode("utf-8").decode("utf-8"))
        except UnicodeEncodeError:
            continue

    source_text = str(text_to_encrypt)
    payload = smaz.compress(source_text)
    if payload == b"":
        # Encode the same str() form that was offered to smaz, so non-string input works too.
        payload = zlib.compress(source_text.encode("utf-8"))

    # Each byte is one three-digit decimal group, i.e. one base-1000 digit. Folding the
    # bytes arithmetically yields the same integer as parsing the concatenated decimal
    # text, but is not subject to Python 3.11+'s limit on str -> int conversion length,
    # which previously turned large payloads into an empty result.
    number = 0
    try:
        for element in payload:
            number = number * 1000 + element
    except TypeError:
        # NOTE(review): the payload was not a sequence of integers (e.g. a text payload);
        # the decimal parse used previously fell back to zero for such input - confirm.
        number = 0

    base = len(digits)
    if number and base < 2:
        raise ValueError(
            f"encryption_base must provide at least two digit characters, got {base}"
        )

    # Repeated division yields the least significant digit first.
    encoded = []
    while number:
        number, remainder = divmod(number, base)
        encoded.append(digits[remainder])
    return "".join(reversed(encoded))
Example #5
0
    def cycle(self, input_str, quiet=False, compress_tree=None, decompress_table=None, show_input_and_output=True,
              strict=True):
        """ Exercise a complete compress -> decompress cycle and assert that it round-trips.

        The input is compressed three ways (plain, with back-tracking, and with the classic
        compressor), each result is decompressed again, and all three must equal ``input_str``.
        Unless ``quiet`` is set or the input is empty, the compression ratios are printed next to
        those of zlib and bz2 for comparison.

        :param input_str: text to push through the cycle
        :param quiet: suppress all printing (this also skips the back-tracking size assertion,
            which lives inside the reporting branch)
        :param compress_tree: forwarded to ``compress`` as ``compression_tree``
        :param decompress_table: forwarded to ``decompress`` for the plain and back-tracked results
        :param show_input_and_output: also print the decompressed and compressed text
        :param strict: forwarded to ``compress`` as ``check_ascii``
        """
        compressed_text = compress(input_str, compression_tree=compress_tree, backtracking=False,
                                   check_ascii=strict)
        backtracked_compressed_text = compress(input_str, compression_tree=compress_tree, backtracking=True,
                                               check_ascii=strict)
        decompressed_text = decompress(compressed_text, decompress_table=decompress_table)
        backtracked_decompressed_text = decompress(backtracked_compressed_text, decompress_table=decompress_table)
        classic_compressed_text = compress_classic(input_str)
        classic_decompressed_text = decompress(classic_compressed_text)

        def size_ratio(compressed):
            # Compressed size expressed as a fraction of the input size.
            return 1.0 / (float(len(input_str)) / float(len(compressed)))

        if not quiet and input_str:
            backtracking_differs = backtracked_compressed_text != compressed_text

            print('---------------------------------------------------------------------')
            if show_input_and_output:
                print(decompressed_text)
                print(compressed_text)
                if backtracking_differs:
                    print('--back tracked:--')
                    print(backtracked_compressed_text)

            ratio = size_ratio(compressed_text)
            b_ratio = size_ratio(backtracked_compressed_text)
            c_ratio = size_ratio(classic_compressed_text)

            # bz2 and zlib only accept bytes; on Python 3 passing text raises TypeError, so text
            # is encoded first. Input that is already bytes is passed through untouched.
            raw_input = input_str if isinstance(input_str, bytes) else input_str.encode('utf-8')
            bz2c = bz2.compress(raw_input)
            zlibc = zlib.compress(raw_input, 9)
            bz2ratio = size_ratio(bz2c)
            zlibratio = size_ratio(zlibc)

            if backtracking_differs:
                print('backtracked compression ratio 1:%f (%.2f%%) from %d bytes to %d bytes' %
                      (b_ratio, b_ratio * 100., len(input_str), len(backtracked_compressed_text)))
                # Report the two sizes actually being compared: back-tracked first, plain second.
                self.assertTrue(len(compressed_text) >= len(backtracked_compressed_text),
                                'Back-tracking (%d) should always be better than not-backtracking (%d)'
                                % (len(backtracked_compressed_text), len(compressed_text)))
            print('compression ratio 1:%f (%.2f%%) from %d bytes to %d bytes' %
                  (ratio, ratio * 100., len(input_str), len(compressed_text)))
            print(' vs ')
            print('  zlib ratio 1:%f (%.2f%%) to %d bytes' %
                  (zlibratio, zlibratio * 100., len(zlibc)))
            print('  bz2 ratio 1:%f (%.2f%%) to %d bytes' %
                  (bz2ratio, bz2ratio * 100., len(bz2c)))
            print('  smaz classic 1:%f (%.2f%%) to %d bytes' %
                  (c_ratio, c_ratio * 100., len(classic_compressed_text)))

        self.assertEqual(input_str, decompressed_text)
        self.assertEqual(input_str, backtracked_decompressed_text)
        self.assertEqual(input_str, classic_decompressed_text)
Example #6
0
    def test_scaling(self):
        """ Print timing figures showing how SMAZ scales with input length; nothing is asserted.

        For an O(N) implementation the per-character factor in the output stays roughly constant.
        """
        print('factor should remain roughly constant if performance is O(N)')
        for length in (1, 5, 10, 20, 50, 100, 250, 500, 1000, 2500, 10000, 100000):
            # Shorter inputs are compressed more times (presumably to get a measurable interval).
            if length < 500:
                repetitions = 1000
            elif length < 10000:
                repetitions = 100
            else:
                repetitions = 1

            started = datetime.datetime.now()
            results = [compress(FIVE_MEGABYTES_OF_MOBY_DICK[0:length]) for _ in range(repetitions)]
            finished = datetime.datetime.now()
            elapsed = self.timedelta_to_float(finished - started)
            per_character = elapsed / (float(length) * float(repetitions))
            print(('%i, %f, factor: %.10f - %d' % (length, elapsed, per_character, len(results))))
Example #7
0
 def assert_smaz_optimal(self, comb, display=False):
     """ Assert that SMAZ compresses ``comb`` at least as well as both bz2 and zlib.

     :param comb: the string to compress with all three algorithms
     :param display: print ``comb`` before compressing it
     :raises AssertionError: when the bz2 or the zlib output is shorter than the SMAZ output;
         the message lists all three sizes and the offending string
     """
     if display:
         print(comb)
     # bz2 and zlib only accept bytes; on Python 3 passing text raises TypeError, so text is
     # encoded first. Input that is already bytes is passed through untouched.
     raw = comb if isinstance(comb, bytes) else comb.encode('utf-8')
     bz2_comp = bz2.compress(raw)
     zlib_comp = zlib.compress(raw, 9)
     smaz_comp = compress(comb)
     # Compare explicitly instead of catching AssertionError: an AssertionError raised by the
     # compressors themselves now propagates as-is rather than being reformatted.
     if len(bz2_comp) < len(smaz_comp) or len(zlib_comp) < len(smaz_comp):
         raise AssertionError(
             'Found String (%d) where SMAZ not >=. SMAZ len: %d bz2 len: %d zlib len: %d string: %s' %
             (len(comb), len(smaz_comp), len(bz2_comp), len(zlib_comp), comb))
Example #8
0
    def test_scaling(self):
        """ Print timing figures showing how SMAZ scales with input length; nothing is asserted.

        For an O(N) implementation the per-character factor in the output stays roughly constant.
        """
        print("factor should remain roughly constant if performance is O(N)")
        for i in (1, 5, 10, 20, 50, 100, 250, 500, 1000, 2500, 10000, 100000):
            # Shorter inputs are compressed more times (presumably to get a measurable interval).
            if i < 500:
                runs = 1000
            elif i < 10000:
                runs = 100
            else:
                runs = 1

            tick = datetime.datetime.now()
            # range() exists on both Python 2 and 3; xrange() is a NameError on Python 3.
            cdata = [compress(FIVE_MEGABYTES_OF_MOBY_DICK[0:i]) for _ in range(runs)]
            tock = datetime.datetime.now()
            tdf = self.timedelta_to_float(tock - tick)
            print("%i, %f, factor: %.10f - %d" % (i, tdf, tdf / (float(i) * float(runs)), len(cdata)))
Example #9
0
 def assert_smaz_optimal(self, comb, display=False):
     """ Assert that SMAZ compresses ``comb`` at least as well as both bz2 and zlib.

     :param comb: the string to compress with all three algorithms
     :param display: print ``comb`` before compressing it
     :raises AssertionError: when the bz2 or the zlib output is shorter than the SMAZ output;
         the message lists all three sizes and the offending string
     """
     if display:
         print(comb)
     # bz2 and zlib only accept bytes; on Python 3 passing text raises TypeError, so text is
     # encoded first. Input that is already bytes is passed through untouched.
     raw = comb if isinstance(comb, bytes) else comb.encode("utf-8")
     bz2_comp = bz2.compress(raw)
     zlib_comp = zlib.compress(raw, 9)
     smaz_comp = compress(comb)
     # Compare explicitly instead of catching AssertionError: an AssertionError raised by the
     # compressors themselves now propagates as-is rather than being reformatted.
     if len(bz2_comp) < len(smaz_comp) or len(zlib_comp) < len(smaz_comp):
         raise AssertionError(
             "Found String (%d) where SMAZ not >=. SMAZ len: %d bz2 len: %d zlib len: %d string: %s"
             % (len(comb), len(smaz_comp), len(bz2_comp), len(zlib_comp), comb)
         )
Example #10
0
    def corpus_line_by_line(self, filename):
        """ Compress a .txt corpus file line by line and print the total size per algorithm.

        Every line is compressed independently with SMAZ, bz2, zlib and the classic SMAZ
        compressor (with and without pathological case detection); only the summed sizes are
        reported, nothing is asserted.

        :param filename: path of the text file to read
        """
        with open(filename, 'r') as f:
            lines = f.read()
        test_data = lines.split('\n')

        def as_bytes(text):
            # bz2 and zlib only accept bytes; on Python 3 a line read in text mode is str and
            # would raise TypeError. Bytes input is passed through untouched.
            return text if isinstance(text, bytes) else text.encode('utf-8')

        # Only the sizes are reported, so keep running totals instead of every compressed line.
        smaz_size = bz2_size = zlib_size = classic_size = classic_path_size = 0
        for test in test_data:
            raw = as_bytes(test)
            smaz_size += len(compress(test))
            bz2_size += len(bz2.compress(raw))
            zlib_size += len(zlib.compress(raw))
            classic_size += len(compress_classic(test, pathological_case_detection=False))
            classic_path_size += len(compress_classic(test, pathological_case_detection=True))

        print('Total data size %d bytes' % sum(len(x) for x in test_data))
        print(' Smaz size %d bytes' % smaz_size)
        print(' bz2 size %d bytes' % bz2_size)
        print(' zlib size %d bytes' % zlib_size)
        print(' Smaz classic size %d bytes' % classic_size)
        print(' Smaz classic with pathological case detection size %d bytes' % classic_path_size)
Example #11
0
    def corpus_line_by_line(self, filename):
        """ Compress a .txt corpus file line by line and print the total size per algorithm.

        Every line is compressed independently with SMAZ, bz2, zlib and the classic SMAZ
        compressor (with and without pathological case detection); only the summed sizes are
        reported, nothing is asserted.

        :param filename: path of the text file to read
        """
        with open(filename, "r") as f:
            lines = f.read()
        test_data = lines.split("\n")

        def as_bytes(text):
            # bz2 and zlib only accept bytes; on Python 3 a line read in text mode is str and
            # would raise TypeError. Bytes input is passed through untouched.
            return text if isinstance(text, bytes) else text.encode("utf-8")

        # Only the sizes are reported, so keep running totals instead of every compressed line.
        smaz_size = bz2_size = zlib_size = classic_size = classic_path_size = 0
        for test in test_data:
            raw = as_bytes(test)
            smaz_size += len(compress(test))
            bz2_size += len(bz2.compress(raw))
            zlib_size += len(zlib.compress(raw))
            classic_size += len(compress_classic(test, pathological_case_detection=False))
            classic_path_size += len(compress_classic(test, pathological_case_detection=True))

        print("Total data size %d bytes" % sum(len(x) for x in test_data))
        print(" Smaz size %d bytes" % smaz_size)
        print(" bz2 size %d bytes" % bz2_size)
        print(" zlib size %d bytes" % zlib_size)
        print(" Smaz classic size %d bytes" % classic_size)
        print(" Smaz classic with pathological case detection size %d bytes" % classic_path_size)
Example #12
0
 def compress(self, inp):
     """ Hand ``inp`` to ``smaz.compress`` and return whatever it produces. """
     compressed = smaz.compress(inp)
     return compressed
Example #13
0
 def test_ascii_check(self):
     """ Exercise ``_check_ascii`` and the non-raising error path of ``compress``. """
     plain_text = '1230ABCZADSADW'
     high_characters = chr(129) + chr(129)
     self.assertTrue(_check_ascii(plain_text))
     self.assertFalse(_check_ascii(high_characters))
     # With raise_on_error switched off the expected result for bad input is None.
     outcome = compress(chr(129), raise_on_error=False)
     self.assertEqual(None, outcome)
Example #14
0
 def test_ascii(self):
     """ Compressing a character outside ASCII raises by default; code points 0-126 do not. """
     with self.assertRaises(ValueError):
         compress(chr(129))
     for code_point in range(127):
         character = chr(code_point)
         compress(character)  # Valid data, so no exception is expected
Example #15
0
 def test_ascii_check(self):
     """ Exercise ``_check_ascii`` and the non-raising error path of ``compress``. """
     self.assertTrue(_check_ascii("1230ABCZADSADW"))
     self.assertFalse(_check_ascii(chr(129) + chr(129)))
     # assertEqual replaces the deprecated assertEquals alias, which Python 3.12 removed.
     # With raise_on_error switched off the expected result for bad input is None.
     self.assertEqual(None, compress(chr(129), raise_on_error=False))
Example #16
0
 def test_ascii(self):
     """ Compressing a character outside ASCII raises by default; code points 0-126 do not. """
     self.assertRaises(ValueError, compress, chr(129))
     # range() exists on both Python 2 and 3; xrange() is a NameError on Python 3.
     for i in range(127):
         compress(chr(i))  # Doesn't raise - valid data