def test_is_block(self):
    """Check the IsBlock production::

        IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+

    Each named block is parsed and the resulting class is probed with
    characters it must accept and characters it must reject.  A
    misspelled block name must raise RegularExpressionError.
    """
    # block name -> (characters accepted, characters rejected)
    samples = {
        'BasicLatin': ("ABC", ul("\xc0\xdf\xa9")),
        'Latin-1Supplement': (ul("\xc0\xdf\xa9"), "ABC"),
        'CurrencySymbols': (
            u8(b'\xe2\x82\xa4\xe2\x82\xa9\xe2\x82\xac'),
            ul("\x24\xa2\xa3")),
        'NumberForms': (
            u8(b'\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98'),
            "1/5 2/5 3/5 4/5"),
    }
    for block in dict_keys(samples):
        parser = xsi.RegularExpressionParser("Is" + block)
        char_class = parser.require_is_block()
        # the whole expression must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[block]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), block))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in Is%s" % (repr(ch), block))
    # an unknown block name is an error
    parser = xsi.RegularExpressionParser("IsNumberFoams")
    try:
        parser.require_is_block()
    except xsi.RegularExpressionError:
        pass
    else:
        self.fail("IsNumberFoams")
def test_char_prop(self):
    """Check the charProp production::

        charProp ::= IsCategory | IsBlock

    Both category names and block names are parsed through
    require_char_prop and probed with accepted/rejected characters.
    """
    basic = ("ABC", ul("\xc0\xdf\xa9"))
    # property name -> (characters accepted, characters rejected)
    samples = {
        'Nd': (
            u8(b'123\xdb\xb1\xdb\xb2\xdb\xb3'),
            u8(b'ABC\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98')),
        'S': (u8(b'+<=>\xe2\x81\x84\xe2\x82\xac'), "(){}"),
        'IsBasicLatin': basic,
        # the Latin-1 supplement case is the mirror image of BasicLatin
        'IsLatin-1Supplement': (ul("\xc0\xdf\xa9"), "ABC"),
        'IsCurrencySymbols': (
            u8(b'\xe2\x82\xa4\xe2\x82\xa9\xe2\x82\xac'),
            ul("\x24\xa2\xa3")),
        'IsNumberForms': (
            u8(b'\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98'),
            "1/5 2/5 3/5 4/5"),
    }
    for prop in dict_keys(samples):
        parser = xsi.RegularExpressionParser(prop)
        char_class = parser.require_char_prop()
        # the whole expression must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[prop]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), prop))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), prop))
def test_constructor(self):
    """Exercise uri.URI construction from strings and from octets.

    Covers the plain constructor, from_octets with non-ASCII
    characters (with and without strict=True) and from_octets with
    binary input.
    """
    octets_msg = "octets must be a character string"

    simple = uri.URI(SIMPLE_EXAMPLE)
    self.assertTrue(isinstance(simple, uri.URI))
    self.assertTrue(str(simple) == SIMPLE_EXAMPLE)
    self.assertTrue(is_unicode(simple.octets), octets_msg)
    if py2:
        self.assertTrue(to_text(simple) == SIMPLE_EXAMPLE)

    # list input: we don't support this type of thing any more, so
    # both success and URIException are tolerated here
    try:
        uri.URI(LIST_EXAMPLE)
    except uri.URIException:
        pass

    # non-ASCII characters are %-encoded by default...
    chinese_name = u8(b'\xe8\x8b\xb1\xe5\x9b\xbd.xml')
    escaped = uri.URI.from_octets(chinese_name)
    self.assertTrue(
        str(escaped) == '%E8%8B%B1%E5%9B%BD.xml',
        "Unicode example: %s" % str(escaped))
    self.assertTrue(is_unicode(escaped.octets), octets_msg)
    # ...but are rejected in strict mode
    try:
        uri.URI.from_octets(
            u8(b'\xe8\x8b\xb1\xe5\x9b\xbd.xml'), strict=True)
    except uri.URIException:
        pass
    else:
        self.fail("strict mode requires %-encoding")

    # binary string must be US-ASCII clean
    try:
        uri.URI.from_octets(b'Caf\xe9')
    except UnicodeDecodeError:
        pass
    else:
        self.fail("binary string must be US-ASCII")
    # but URI-encoded is OK even if it is binary
    from_binary = uri.URI.from_octets(b'Caf%E9')
    self.assertTrue(is_unicode(from_binary.octets), octets_msg)
def test_is_block(self):
    """Check the IsBlock production::

        IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+

    Each named block is parsed and the resulting class is probed with
    characters it must accept and characters it must reject.  A
    misspelled block name must raise RegularExpressionError.
    """
    # block name -> (characters accepted, characters rejected)
    samples = {
        'BasicLatin': ("ABC", ul("\xc0\xdf\xa9")),
        'Latin-1Supplement': (ul("\xc0\xdf\xa9"), "ABC"),
        'CurrencySymbols': (
            u8(b'\xe2\x82\xa4\xe2\x82\xa9\xe2\x82\xac'),
            ul("\x24\xa2\xa3")),
        'NumberForms': (
            u8(b'\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98'),
            "1/5 2/5 3/5 4/5"),
    }
    for block in dict_keys(samples):
        parser = xsi.RegularExpressionParser("Is" + block)
        char_class = parser.require_is_block()
        # the whole expression must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[block]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), block))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in Is%s" % (repr(ch), block))
    # an unknown block name is an error
    parser = xsi.RegularExpressionParser("IsNumberFoams")
    try:
        parser.require_is_block()
    except xsi.RegularExpressionError:
        pass
    else:
        self.fail("IsNumberFoams")
def test_char_prop(self):
    """Check the charProp production::

        charProp ::= IsCategory | IsBlock

    Both category names and block names are parsed through
    require_char_prop and probed with accepted/rejected characters.
    """
    basic = ("ABC", ul("\xc0\xdf\xa9"))
    # property name -> (characters accepted, characters rejected)
    samples = {
        'Nd': (
            u8(b'123\xdb\xb1\xdb\xb2\xdb\xb3'),
            u8(b'ABC\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98')),
        'S': (u8(b'+<=>\xe2\x81\x84\xe2\x82\xac'), "(){}"),
        'IsBasicLatin': basic,
        # the Latin-1 supplement case is the mirror image of BasicLatin
        'IsLatin-1Supplement': (ul("\xc0\xdf\xa9"), "ABC"),
        'IsCurrencySymbols': (
            u8(b'\xe2\x82\xa4\xe2\x82\xa9\xe2\x82\xac'),
            ul("\x24\xa2\xa3")),
        'IsNumberForms': (
            u8(b'\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98'),
            "1/5 2/5 3/5 4/5"),
    }
    for prop in dict_keys(samples):
        parser = xsi.RegularExpressionParser(prop)
        char_class = parser.require_char_prop()
        # the whole expression must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[prop]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), prop))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), prop))
def test_parse_hex_digit(self):
    """Check parse_hex_digit on character and binary sources.

    Each source is scanned one position at a time and every value
    returned by parse_hex_digit is collected; only the ASCII hex
    digits are expected in the collected output.
    """
    def harvest(parser):
        # keep every parsed digit, stepping over anything that is
        # not a hex digit
        found = []
        while parser.the_char is not None:
            item = parser.parse_hex_digit()
            if item is None:
                parser.next_char()
            else:
                found.append(item)
        return found

    # character source: ASCII alphanumerics followed by non-ASCII
    # digits, which must not be collected
    text_parser = unicode5.BasicParser(
        u8(b"0123456789abcdefghijklmnopqrstuvwxyz"
           b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
           b"\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5"
           b"\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9"))
    text_digits = harvest(text_parser)
    self.assertTrue(
        ul('').join(text_digits) == ul('0123456789abcdefABCDEF'))

    # and now binary
    binary_parser = unicode5.BasicParser(
        b"0123456789abcdefghijklmnopqrstuvwxyz"
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    binary_digits = harvest(binary_parser)
    self.assertTrue(join_bytes(binary_digits) == b'0123456789abcdefABCDEF')
def test_parse_hex_digit(self):
    """Check parse_hex_digit on character and binary sources.

    Each source is scanned one position at a time and every value
    returned by parse_hex_digit is collected; only the ASCII hex
    digits are expected in the collected output.
    """
    def harvest(parser):
        # keep every parsed digit, stepping over anything that is
        # not a hex digit
        found = []
        while parser.the_char is not None:
            item = parser.parse_hex_digit()
            if item is None:
                parser.next_char()
            else:
                found.append(item)
        return found

    # character source: ASCII alphanumerics followed by non-ASCII
    # digits, which must not be collected
    text_parser = unicode5.BasicParser(
        u8(b"0123456789abcdefghijklmnopqrstuvwxyz"
           b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
           b"\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5"
           b"\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9"))
    text_digits = harvest(text_parser)
    self.assertTrue(
        ul('').join(text_digits) == ul('0123456789abcdefABCDEF'))

    # and now binary
    binary_parser = unicode5.BasicParser(
        b"0123456789abcdefghijklmnopqrstuvwxyz"
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    binary_digits = harvest(binary_parser)
    self.assertTrue(join_bytes(binary_digits) == b'0123456789abcdefABCDEF')
def test_is_category(self):
    """Check the IsCategory production::

        IsCategory ::= Letters | Marks | Numbers | Punctuation |
                       Separators | Symbols | Others
        Letters ::= 'L' [ultmo]?
        Marks ::= 'M' [nce]?
        Numbers ::= 'N' [dlo]?
        Punctuation ::= 'P' [cdseifo]?
        Separators ::= 'Z' [slp]?
        Symbols ::= 'S' [mcko]?
        Others ::= 'C' [cfon]?
    """
    # (major class, allowed minor letters), mirroring the grammar above
    grammar = (
        ("L", "ultmo"), ("M", "nce"), ("N", "dlo"), ("P", "cdseifo"),
        ("Z", "slp"), ("S", "mcko"), ("C", "cfon"))
    categories = []
    for major, minors in grammar:
        # the bare major class first, then each two-letter form
        categories.append(major)
        categories.extend(major + minor for minor in minors)

    for name in categories:
        parser = xsi.RegularExpressionParser(name)
        parsed = parser.require_is_category()
        self.assertTrue(
            isinstance(parsed, CharClass), "Missing category: %s" % name)
        self.assertTrue(
            parser.the_char is None,
            "Incomplete parse of category: %s" % name)

    # bad names must either raise or leave input unconsumed
    for name in ["A", "Za"]:
        parser = xsi.RegularExpressionParser(name)
        try:
            parser.require_is_category()
        except xsi.RegularExpressionError:
            pass
        else:
            self.assertFalse(
                parser.the_char is None,
                "Undetected bad category: %s" % name)

    # category name -> (characters accepted, characters rejected)
    samples = {
        'Nd': (
            u8(b'123\xdb\xb1\xdb\xb2\xdb\xb3'),
            u8(b'ABC\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98')),
        'S': (u8(b'+<=>\xe2\x81\x84\xe2\x82\xac'), "(){}"),
    }
    for name in dict_keys(samples):
        parser = xsi.RegularExpressionParser(name)
        char_class = parser.require_is_category()
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[name]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), name))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), name))
def test_is_category(self):
    """Check the IsCategory production::

        IsCategory ::= Letters | Marks | Numbers | Punctuation |
                       Separators | Symbols | Others
        Letters ::= 'L' [ultmo]?
        Marks ::= 'M' [nce]?
        Numbers ::= 'N' [dlo]?
        Punctuation ::= 'P' [cdseifo]?
        Separators ::= 'Z' [slp]?
        Symbols ::= 'S' [mcko]?
        Others ::= 'C' [cfon]?
    """
    # (major class, allowed minor letters), mirroring the grammar above
    grammar = (
        ("L", "ultmo"), ("M", "nce"), ("N", "dlo"), ("P", "cdseifo"),
        ("Z", "slp"), ("S", "mcko"), ("C", "cfon"))
    categories = []
    for major, minors in grammar:
        # the bare major class first, then each two-letter form
        categories.append(major)
        categories.extend(major + minor for minor in minors)

    for name in categories:
        parser = xsi.RegularExpressionParser(name)
        parsed = parser.require_is_category()
        self.assertTrue(
            isinstance(parsed, CharClass), "Missing category: %s" % name)
        self.assertTrue(
            parser.the_char is None,
            "Incomplete parse of category: %s" % name)

    # bad names must either raise or leave input unconsumed
    for name in ["A", "Za"]:
        parser = xsi.RegularExpressionParser(name)
        try:
            parser.require_is_category()
        except xsi.RegularExpressionError:
            pass
        else:
            self.assertFalse(
                parser.the_char is None,
                "Undetected bad category: %s" % name)

    # category name -> (characters accepted, characters rejected)
    samples = {
        'Nd': (
            u8(b'123\xdb\xb1\xdb\xb2\xdb\xb3'),
            u8(b'ABC\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98')),
        'S': (u8(b'+<=>\xe2\x81\x84\xe2\x82\xac'), "(){}"),
    }
    for name in dict_keys(samples):
        parser = xsi.RegularExpressionParser(name)
        char_class = parser.require_is_category()
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[name]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), name))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), name))
def test_unicode(self):
    """Check non-ASCII file names and package names in a content package.

    Opens the 'package_1' test package, checks that its single file
    has a %-encoded href, reads the referenced document, then opens a
    package whose directory name is itself non-ASCII.
    """
    chinese = u8(b'\xe8\x8b\xb1\xe5\x9b\xbd')

    package = imscp.ContentPackage(TEST_DATA_DIR.join('package_1'))
    first_resource = package.manifest.root.Resources.Resource[0]
    self.assertTrue(len(first_resource.File) == 1)
    entry = first_resource.File[0]
    href_ok = (isinstance(entry, imscp.File) and
               str(entry.href) == "%E8%8B%B1%E5%9B%BD.xml")
    self.assertTrue(href_ok, "File path")

    # the referenced document must load and carry the expected text
    doc = xmlns.Document(baseURI=entry.resolve_uri(entry.href))
    doc.read()
    content_ok = (
        doc.root.xmlname == 'tag' and
        doc.root.get_value() ==
        u8(b'Unicode Test: \xe8\x8b\xb1\xe5\x9b\xbd'))
    self.assertTrue(content_ok)

    # a package directory with a non-ASCII name keeps that name
    named_package = imscp.ContentPackage(TEST_DATA_DIR.join(chinese))
    self.assertTrue(
        named_package.GetPackageName() == u8(b'\xe8\x8b\xb1\xe5\x9b\xbd'),
        "Unicode package name test")
def test_char_class_esc(self):
    """Check the charClassEsc production::

        charClassEsc ::= ( SingleCharEsc | MultiCharEsc |
                           catEsc | complEsc )
    """
    symbols = u8(b'+<=>\xe2\x81\x84\xe2\x82\xac')
    # escape -> (characters accepted, characters rejected)
    samples = {
        '\\?': ("?", "\\"),
        '\\d': (
            u8(b'123\xd9\xa1\xd9\xa2\xd9\xa3'),
            u8(b'ABC\xe2\x82\x81\xe2\x82\x82\xe2\x82\x83')),
        '\\p{S}': (symbols, "(){}"),
        # the complement escape swaps the two sample sets
        '\\P{S}': ("(){}", symbols),
    }
    for escape in dict_keys(samples):
        parser = xsi.RegularExpressionParser(escape)
        char_class = parser.require_char_class_esc()
        # the whole escape must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[escape]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), escape))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), escape))
def test_char_class_esc(self):
    """Check the charClassEsc production::

        charClassEsc ::= ( SingleCharEsc | MultiCharEsc |
                           catEsc | complEsc )
    """
    symbols = u8(b'+<=>\xe2\x81\x84\xe2\x82\xac')
    # escape -> (characters accepted, characters rejected)
    samples = {
        '\\?': ("?", "\\"),
        '\\d': (
            u8(b'123\xd9\xa1\xd9\xa2\xd9\xa3'),
            u8(b'ABC\xe2\x82\x81\xe2\x82\x82\xe2\x82\x83')),
        '\\p{S}': (symbols, "(){}"),
        # the complement escape swaps the two sample sets
        '\\P{S}': ("(){}", symbols),
    }
    for escape in dict_keys(samples):
        parser = xsi.RegularExpressionParser(escape)
        char_class = parser.require_char_class_esc()
        # the whole escape must have been consumed
        self.assertTrue(parser.the_char is None)
        accepted, rejected = samples[escape]
        for ch in accepted:
            self.assertTrue(
                char_class.test(ch), "%s not in %s" % (repr(ch), escape))
        for ch in rejected:
            self.assertFalse(
                char_class.test(ch), "%s in %s" % (repr(ch), escape))
def test_compl_esc(self):
    r"""Check the complEsc production::

        complEsc ::= '\P{' charProp '}'

    The docstring is a raw string so that the backslash in the
    grammar is kept literally and is not read as an (invalid) escape
    sequence.
    """
    tests = {
        # positive and negative tests
        '\\P{Nd}': (
            u8(b'ABC\xe2\x85\x95\xe2\x85\x96\xe2\x85\x97\xe2\x85\x98'),
            u8(b'123\xdb\xb1\xdb\xb2\xdb\xb3')),
        '\\P{S}': ("(){}", u8(b'+<=>\xe2\x81\x84\xe2\x82\xac')),
        '\\P{IsBasicLatin}': (ul("\xc0\xdf\xa9"), "ABC")
    }
    for b in dict_keys(tests):
        p = xsi.RegularExpressionParser(b)
        cclass = p.require_compl_esc()
        # the whole escape must have been consumed
        self.assertTrue(p.the_char is None)
        t1, t2 = tests[b]
        for c in t1:
            self.assertTrue(cclass.test(c), "%s not in %s" % (repr(c), b))
        for c in t2:
            self.assertFalse(cclass.test(c), "%s in %s" % (repr(c), b))
def test_parse_integer(self):
    """Check parse_integer bounds, digit limits and binary input.

    The calls are order-sensitive: each one starts from wherever the
    previous call (or an explicit setpos) left the parser.
    """
    parser = unicode5.BasicParser(ul("23p"))
    # all defaults, unbounded
    self.assertTrue(parser.parse_integer() == 23)
    self.assertTrue(parser.pos == 2)
    # from position 1 only the digit 3 is available: minimum only
    parser.setpos(1)
    self.assertTrue(parser.parse_integer(4) is None)
    self.assertTrue(parser.parse_integer(2) == 3)
    # ...and again with both a minimum and a maximum
    parser.setpos(1)
    self.assertTrue(parser.parse_integer(0, 2) is None)
    self.assertTrue(parser.parse_integer(1, 4) == 3)
    parser.setpos(0)
    # invalid bounds must raise ValueError without moving the parser
    invalid_bounds = (
        ((-1, ), "min = -1 didn't raise exception"),
        ((3, 1), "min > max didn't raise exception"))
    for bounds, complaint in invalid_bounds:
        try:
            parser.parse_integer(*bounds)
        except ValueError:
            self.assertTrue(parser.pos == 0)
        else:
            self.fail(complaint)
    # check we can exceed ordinary integer sizes
    long_digits = ul("123456789" + "0" * 256)
    parser = unicode5.BasicParser(long_digits)
    # max digits limits how much is consumed per call
    self.assertTrue(parser.parse_integer(0, None, 10) == 1234567890)
    # a run of ten zeros is still just zero
    self.assertTrue(parser.parse_integer(0, None, 10) == 0)
    self.assertTrue(parser.pos == 20)
    parser.setpos(0)
    # check large numbers
    self.assertTrue(parser.parse_integer(0, None, 15) == 123456789000000)
    # test Arabic digits, should not parse!
    parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertTrue(parser.parse_integer() is None)
        parser.next_char()
    # test binary forms
    parser = unicode5.BasicParser(b"234p")
    self.assertTrue(parser.parse_integer(max_digits=1) == 2)
    self.assertTrue(parser.parse_integer(0, 2) is None)
    self.assertTrue(parser.parse_integer() == 34)
    parser.next_char()
    self.assertTrue(parser.parse_integer() is None)
def test_zip_write(self):
    """Round-trip a zipped package through ExportToPIF.

    The package is opened from 'package_1.zip', exported to
    'Package2.zip' and reopened; the document referenced by the first
    file of the first resource must still carry the expected text.
    """
    source = imscp.ContentPackage(TEST_DATA_DIR.join('package_1.zip'))
    # record the package directory in self.dList
    self.dList.append(source.dPath)
    source.ExportToPIF('Package2.zip')

    exported = imscp.ContentPackage('Package2.zip')
    self.dList.append(exported.dPath)
    entry = exported.manifest.root.Resources.Resource[0].File[0]
    doc = xmlns.Document(baseURI=entry.resolve_uri(entry.href))
    doc.read()
    content_ok = (
        doc.root.xmlname == 'tag' and
        doc.root.get_value() ==
        u8(b'Unicode Test: \xe8\x8b\xb1\xe5\x9b\xbd'))
    self.assertTrue(content_ok)
def test_multi_char_esc(self):
    r"""Check the MultiCharEsc production::

        MultiCharEsc ::= '\' [sSiIcCdDwW]

    For each lower-case escape the upper-case escape is parsed as
    well and is expected to behave as its complement on the sample
    characters.  An unknown escape must raise
    RegularExpressionError.
    """
    tests = {
        # escape letter -> (characters in the class, characters outside)
        's': ("\x09\x0A\x0D ", "ABC"),
        'i': ("ABC_:", "-123"),
        'c': ("ABC_:-_123", "@<>?"),
        'd': (u8(b'123\xd9\xa1\xd9\xa2\xd9\xa3'),
              u8(b'ABC\xe2\x82\x81\xe2\x82\x82\xe2\x82\x83')),
        'w': ("ABC", u8(b'!\xcd\xbe \xe2\x80\x82\x0c')),
    }
    for c in dict_keys(tests):
        p1 = xsi.RegularExpressionParser("\\" + c)
        cclass1 = p1.require_multi_char_esc()
        # both characters of the escape must have been consumed
        self.assertTrue(p1.pos == 2)
        p2 = xsi.RegularExpressionParser("\\" + c.upper())
        cclass2 = p2.require_multi_char_esc()
        self.assertTrue(p2.pos == 2)
        t1, t2 = tests[c]
        for c1 in t1:
            self.assertTrue(
                cclass1.test(c1), "%s not in \\%s" % (repr(c1), c))
            self.assertFalse(
                cclass2.test(c1), "%s in \\%s" % (repr(c1), c.upper()))
        for c2 in t2:
            self.assertFalse(
                cclass1.test(c2), "%s in \\%s" % (repr(c2), c))
            # this check fails when c2 is missing from the upper-case
            # class, so the message must say "not in"
            self.assertTrue(
                cclass2.test(c2),
                "%s not in \\%s" % (repr(c2), c.upper()))
    # \x is not a multi-character escape
    p = xsi.RegularExpressionParser("\\x")
    try:
        p.require_multi_char_esc()
    except xsi.RegularExpressionError:
        pass
    else:
        self.fail("\\x")
def test_zip_read(self):
    """Open a zipped package and read a document from it.

    Checks that the package path is a directory, that the package
    name drops the '.zip' extension and that the document referenced
    by the first file of the first resource carries the expected
    text.
    """
    package = imscp.ContentPackage(TEST_DATA_DIR.join('package_1.zip'))
    unpacked = package.dPath
    self.assertTrue(
        unpacked.isdir(), "Zip constructor must create a temp directory")
    # Ensure the temporary directory is cleaned up
    self.dList.append(unpacked)
    self.assertTrue(
        package.GetPackageName() == 'package_1',
        "Zip extension not removed for name")

    entry = package.manifest.root.Resources.Resource[0].File[0]
    doc = xmlns.Document(baseURI=entry.resolve_uri(entry.href))
    doc.read()
    content_ok = (
        doc.root.xmlname == 'tag' and
        doc.root.get_value() ==
        u8(b'Unicode Test: \xe8\x8b\xb1\xe5\x9b\xbd'))
    self.assertTrue(content_ok)
def test_multi_char_esc(self):
    r"""Check the MultiCharEsc production::

        MultiCharEsc ::= '\' [sSiIcCdDwW]

    For each lower-case escape the upper-case escape is parsed as
    well and is expected to behave as its complement on the sample
    characters.  An unknown escape must raise
    RegularExpressionError.
    """
    tests = {
        # escape letter -> (characters in the class, characters outside)
        's': ("\x09\x0A\x0D ", "ABC"),
        'i': ("ABC_:", "-123"),
        'c': ("ABC_:-_123", "@<>?"),
        'd': (u8(b'123\xd9\xa1\xd9\xa2\xd9\xa3'),
              u8(b'ABC\xe2\x82\x81\xe2\x82\x82\xe2\x82\x83')),
        'w': ("ABC", u8(b'!\xcd\xbe \xe2\x80\x82\x0c')),
    }
    for c in dict_keys(tests):
        p1 = xsi.RegularExpressionParser("\\" + c)
        cclass1 = p1.require_multi_char_esc()
        # both characters of the escape must have been consumed
        self.assertTrue(p1.pos == 2)
        p2 = xsi.RegularExpressionParser("\\" + c.upper())
        cclass2 = p2.require_multi_char_esc()
        self.assertTrue(p2.pos == 2)
        t1, t2 = tests[c]
        for c1 in t1:
            self.assertTrue(
                cclass1.test(c1), "%s not in \\%s" % (repr(c1), c))
            self.assertFalse(
                cclass2.test(c1), "%s in \\%s" % (repr(c1), c.upper()))
        for c2 in t2:
            self.assertFalse(
                cclass1.test(c2), "%s in \\%s" % (repr(c2), c))
            # this check fails when c2 is missing from the upper-case
            # class, so the message must say "not in"
            self.assertTrue(
                cclass2.test(c2),
                "%s not in \\%s" % (repr(c2), c.upper()))
    # \x is not a multi-character escape
    p = xsi.RegularExpressionParser("\\x")
    try:
        p.require_multi_char_esc()
    except xsi.RegularExpressionError:
        pass
    else:
        self.fail("\\x")
def test_parse_digits(self):
    """Check parse_digits minimum/maximum handling and binary input.

    The calls are order-sensitive: each one starts from wherever the
    previous call (or an explicit setpos) left the parser.
    """
    parser = unicode5.BasicParser(ul("23p"))
    # minimum of 0 digits
    self.assertTrue(parser.parse_digits(0) == ul("23"))
    self.assertTrue(parser.pos == 2)
    # minimum of 2 digits with only one available: fails...
    parser.setpos(1)
    self.assertTrue(parser.parse_digits(2) is None)
    # ...and shouldn't move the parser
    self.assertTrue(parser.pos == 1)
    # invalid bounds (negative minimum, minimum above maximum) must
    # raise ValueError without moving the parser
    invalid_bounds = (
        ((-1, ), "min=-1 didn't raise exception"),
        ((3, 1), "min > max didn't raise exception"))
    for bounds, complaint in invalid_bounds:
        try:
            parser.parse_digits(*bounds)
        except ValueError:
            self.assertTrue(parser.pos == 1)
        else:
            self.fail(complaint)
    # check we can exceed ordinary integer sizes
    long_digits = ul("123456789" + "0" * 256)
    parser = unicode5.BasicParser(long_digits)
    self.assertTrue(len(parser.parse_digits(0, 256)) == 256)
    # and check that runs of 0 don't mean a thing
    self.assertTrue(parser.parse_digits(0, 256) == ul("000000000"))
    # test Arabic digits, should not parse!
    parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertTrue(parser.parse_digits(1) is None)
        parser.next_char()
    # test binary forms
    parser = unicode5.BasicParser(b"234p")
    # unlike parse_digit we return a string, even if only one digit
    self.assertTrue(parser.parse_digits(1, 1) == b"2")
    self.assertTrue(parser.parse_digits(1) == b"34")
    parser.next_char()
    self.assertTrue(parser.parse_digits(1) is None)
    self.assertTrue(parser.parse_digits(0) == b"")
def test_parse_digit_value(self):
    """Check parse_digit_value on character, non-ASCII and binary input."""
    text_parser = unicode5.BasicParser(ul("2p"))
    # the leading digit is returned as an integer and consumed
    first = text_parser.parse_digit_value()
    self.assertTrue(first == 2)
    self.assertTrue(text_parser.pos == 1)
    # 'p' is not a digit, and neither is the end of the input
    self.assertTrue(text_parser.parse_digit_value() is None)
    text_parser.next_char()
    self.assertTrue(text_parser.parse_digit_value() is None)

    # test Arabic digits, should not parse!
    arabic_parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertTrue(arabic_parser.parse_digit_value() is None)
        arabic_parser.next_char()

    # test binary forms
    binary_parser = unicode5.BasicParser(b"2p")
    self.assertTrue(binary_parser.parse_digit_value() == 2)
    self.assertTrue(binary_parser.parse_digit_value() is None)
    binary_parser.next_char()
    self.assertTrue(binary_parser.parse_digit_value() is None)
def test_match_digit(self):
    """Check match_digit on character, non-ASCII and binary input."""
    def check_digit_then_letter(parser):
        # the source is a digit followed by a letter: only the first
        # position matches, and the end of input does not match
        self.assertTrue(parser.match_digit())
        parser.next_char()
        self.assertFalse(parser.match_digit())
        parser.next_char()
        self.assertFalse(parser.match_digit())

    check_digit_then_letter(unicode5.BasicParser(ul("2p")))

    # test Arabic digits, should not match!
    arabic_parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertFalse(arabic_parser.match_digit())
        arabic_parser.next_char()

    # the binary form behaves like the character form
    check_digit_then_letter(unicode5.BasicParser(b"2p"))
def test_char_class(self):
    """Check the charClass production::

        charClass ::= charClassEsc | charClassExpr | WildcardEsc

    Each expression is parsed with require_char_class and the
    resulting class is probed with accepted and rejected characters.
    A parse failure is logged (to identify the expression) and then
    re-raised.
    """
    tests = {
        # expression -> (characters accepted, characters rejected)
        '\\P{S}': ("(){}", u8(b'+<=>\xe2\x81\x84\xe2\x82\xac')),
        # backslashes are written explicitly: the expression is
        # [A-z-[\[-\]]] and "\[" / "\]" are not valid Python escapes
        '[A-z-[\\[-\\]]]': ("AZaz^_`", "[\\]@{-"),
        '.': ("abcABC ", "\x0a\x0d")
    }
    for b in dict_keys(tests):
        p = xsi.RegularExpressionParser(b)
        try:
            cclass = p.require_char_class()
        except xsi.RegularExpressionError:
            logging.debug("Failed to parse %s", repr(b))
            raise
        # the whole expression must have been consumed
        self.assertTrue(p.the_char is None)
        t1, t2 = tests[b]
        for c in t1:
            self.assertTrue(cclass.test(c), "%s not in %s" % (repr(c), b))
        for c in t2:
            self.assertFalse(cclass.test(c), "%s in %s" % (repr(c), b))
def test_literals(self):
    """Check the py2.u8 and py2.ul literal helpers.

    Covers correctly written binary literals first, then the common
    mistakes: a missing 'b' prefix and input already qualified with
    'u'.  Several expectations depend on the running Python major
    version and are guarded with sys.version_info.
    """
    data1 = "hello"
    if sys.version_info[0] < 3:
        target_type = types.UnicodeType
    else:
        target_type = str
    self.assertTrue(py2.u8(b"hello") == data1)
    self.assertTrue(isinstance(py2.u8(b"hello"), target_type))
    self.assertTrue(py2.ul(b"hello") == data1)
    self.assertTrue(isinstance(py2.ul(b"hello"), target_type))
    data2 = b'Caf\xc3\xa9'.decode('utf-8')
    self.assertTrue(py2.u8(b'Caf\xc3\xa9') == data2)
    self.assertTrue(py2.ul(b'Caf\xe9') == data2)
    data3 = b'\xe8\x8b\xb1\xe5\x9b\xbd'.decode('utf-8')
    self.assertTrue(py2.u8(b'\xe8\x8b\xb1\xe5\x9b\xbd') == data3)
    # Catch common errors
    # 1: missing b in literal, OK for ASCII text
    self.assertTrue(py2.u8("hello") == data1)
    self.assertTrue(py2.ul("hello") == data1)
    # 2: missing b, u8 fails for 8-bit character
    try:
        py2.u8('Caf\xe9')
        self.fail('8-bit unqualified literal (bad UTF-8)')
    except UnicodeDecodeError:
        # must be listed before ValueError, its base class
        self.fail('8-bit unqualified literal decoded as utf-8')
    except ValueError:
        pass
    # ... but in Python 2 we can't catch valid utf-8 sequences
    # pretending to be unicode strings
    try:
        py2.u8('Caf\xc3\xa9')
        self.assertTrue(sys.version_info[0] < 3,
                        '8-bit unqualified literal (good UTF-8)')
    except ValueError:
        pass
    # 3: missing b, ul accepted with 8-bit character
    # (the comparison belongs inside assertTrue, otherwise only the
    # truthiness of the result is checked)
    self.assertTrue(py2.ul('Caf\xe9') == data2)
    # 4: missing b, u8 fails for 16-bit character
    try:
        # in python 2 we can't catch this but it was probably a bug
        # before anyway due to the missing 'u'
        result = py2.u8('\u82f1\u56fd')
        self.assertTrue(sys.version_info[0] < 3,
                        '16-bit unqualified literal')
        self.assertTrue(result == '\\u82f1\\u56fd')
    except ValueError:
        self.assertFalse(sys.version_info[0] < 3,
                         '16-bit unqualified literal')
    # 5: missing b, ul fails for 16-bit character
    try:
        result = py2.ul('\u82f1\u56fd')
        self.assertTrue(sys.version_info[0] < 3,
                        '16-bit unqualified literal')
        self.assertTrue(result == '\\u82f1\\u56fd')
    except ValueError:
        self.assertFalse(sys.version_info[0] < 3,
                         '16-bit unqualified literal')
    # 6: input already qualified with 'u', benign for ASCII
    self.assertTrue(py2.u8(u"hello") == data1)
    self.assertTrue(py2.ul(u"hello") == data1)
    # ...u8 fails for 8-bit character
    try:
        py2.u8(u'Caf\xe9')
        self.fail('8-bit qualified literal')
    except UnicodeEncodeError:
        self.fail('8-bit qualified literal uncaught encode error')
    except ValueError:
        pass
    # ...ul accepted with 8-bit character
    self.assertTrue(py2.ul(u'Caf\xe9') == data2)
    # ...u8 fails for 16-bit character
    try:
        py2.u8(u'\u82f1\u56fd')
        self.fail('16-bit qualified literal')
    except UnicodeEncodeError:
        self.fail('16-bit qualified literal uncaught encode error')
    except ValueError:
        pass
    # ...ul fails for 16-bit character
    try:
        py2.ul(u'\u82f1\u56fd')
        self.fail('16-bit qualified literal')
    except UnicodeEncodeError:
        self.fail('16-bit qualified literal uncaught encode error')
    except ValueError:
        pass
import codecs import logging import unittest from sys import maxunicode import pyslet.unicode5 as unicode5 from pyslet.py2 import (byte, character, is_text, join_bytes, py2, range3, u8, ul) MAX_CHAR = 0x10FFFF if maxunicode < MAX_CHAR: MAX_CHAR = maxunicode CHINESE_TEST = u8(b'\xe8\x8b\xb1\xe5\x9b\xbd') def suite(): return unittest.TestSuite( (unittest.makeSuite(EncodingTests, 'test'), unittest.makeSuite(CharClassTests, 'test'), unittest.makeSuite(UCDTests, 'test'), unittest.makeSuite(ParserTests, 'test'))) class EncodingTests(unittest.TestCase): def test_detection(self): test_string = u"Caf\xe9" for codec, bom in (('utf_8', codecs.BOM_UTF8), ('utf_32_be', codecs.BOM_UTF32_BE),
def test_parse_hex_digits(self):
    """Check parse_hex_digits on character and binary sources.

    A character parser and a binary parser over the same ASCII text
    are driven in lockstep and must produce matching runs of hex
    digits for each combination of minimum and maximum length.
    """
    def compare_runs(text_parser, byte_parser, expected, *bounds):
        # scan both parsers to the end of input; every run the
        # character parser returns must equal the next expected value
        # and the binary parser must return its ASCII encoding
        matched = 0
        while text_parser.the_char is not None:
            text_run = text_parser.parse_hex_digits(*bounds)
            byte_run = byte_parser.parse_hex_digits(*bounds)
            if text_run is not None:
                self.assertTrue(text_run == expected[matched], text_run)
                self.assertTrue(
                    byte_run == expected[matched].encode('ascii'),
                    byte_run)
                matched += 1
            text_parser.next_char()
            byte_parser.next_char()
        self.assertTrue(matched == len(expected))

    src = ul("23.FG.fg.0.00.abcdefABCDEF0123456789")
    p = unicode5.BasicParser(src)
    pb = unicode5.BasicParser(src.encode('ascii'))
    # invalid bounds (negative minimum, minimum above maximum) must
    # raise ValueError without moving the parser
    invalid_bounds = (
        ((-1, ), "min=-1 didn't raise exception"),
        ((3, 1), "min > max didn't raise exception"))
    for bounds, complaint in invalid_bounds:
        try:
            p.parse_hex_digits(*bounds)
        except ValueError:
            self.assertTrue(p.pos == 0)
        else:
            self.fail(complaint)
    # minimum of 1 digit
    compare_runs(
        p, pb,
        [ul("23"), ul("F"), ul("f"), ul("0"), ul("00"),
         ul("abcdefABCDEF0123456789")],
        1)
    # minimum of 2 digits
    p.setpos(0)
    pb.setpos(0)
    compare_runs(
        p, pb,
        [ul("23"), ul("00"), ul("abcdefABCDEF0123456789")],
        2)
    # between 2 and 5 digits: the long run is split into pieces
    p.setpos(0)
    pb.setpos(0)
    compare_runs(
        p, pb,
        [ul("23"), ul("00"), ul("abcde"), ul("ABCDE"), ul("01234"),
         ul("6789")],
        2, 5)
    # check we can exceed ordinary integer sizes
    long_digits = ul("123456789aBcDeF" + "0" * 256)
    long_parser = unicode5.BasicParser(long_digits)
    self.assertTrue(len(long_parser.parse_hex_digits(1, 256)) == 256)
    # and check that runs of 0 don't mean a thing
    self.assertTrue(
        long_parser.parse_hex_digits(1, 256) == ul("000000000000000"))
    # test Arabic digits, should not parse!
    arabic_parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertTrue(arabic_parser.parse_hex_digits(1) is None)
        arabic_parser.next_char()
def test_nss(self):
    """Check translation to and from URN characters in the NSS.

    Syntax for URN char::

        <trans> | "%" <hex> <hex>

    Translation is done by encoding each character outside the URN
    character set as a sequence of one to six octets using UTF-8
    encoding [5], and the encoding of each of those octets as "%"
    followed by two characters from the <hex> character set above.

    the character [%] used in a literal sense MUST be encoded

    a character MUST NOT be "%"-encoded if the character is not a
    reserved character

    SHOULD NOT use [other reserved characters] characters in
    unencoded form

    each character outside the URN character set [is encoded] as a
    sequence of one to six octets using UTF-8 encoding

    The presence of an "%" character in an URN MUST be followed by
    two characters from the <hex> character set

    In addition, octet 0 (0 hex) should NEVER be used, in either
    unencoded or %-encoded form."""
    # source text -> expected URN-character translation
    trans_tests = {
        ul('\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10'
           '\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
           '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\'
           ']^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f'):
        '%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10'
        '%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%20'
        '!%22%23$%25%26\'()*+,-.%2F0123456789:;%3C=%3E%3F@ABCDEFGHIJKLMN'
        'OPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D'
        '%7E%7F',
        u8(b'\xe8\x8b\xb1\xe5\x9b\xbd'): '%E8%8B%B1%E5%9B%BD',
        ul('Caf\xe9'): 'Caf%C3%A9'
    }
    # NOTE(review): the statements up to the last u.nss check are
    # assumed to belong to this loop body - confirm against the
    # original layout
    for src, dst in dict_items(trans_tests):
        # the translation must round-trip in both directions
        self.assertTrue(
            urn.translate_to_urnchar(src) == dst,
            "%s -> \n%s, expected \n%s" %
            (repr(src), repr(urn.translate_to_urnchar(src)), repr(dst)))
        self.assertTrue(
            urn.translate_from_urnchar(dst) == src,
            "%s -> \n%s, expected \n%s" %
            (repr(dst), repr(urn.translate_from_urnchar(dst)), repr(src)))
        # the translated text must be kept unchanged as the nss,
        # whether the URN is built directly or parsed from octets
        u = urn.URN(nid='foo', nss=dst)
        self.assertTrue(u.nss == dst)
        u = uri.URI.from_octets('urn:foo:%s' % dst)
        self.assertTrue(u.nss == dst)
    # a "%" not followed by two hex digits, and an encoded zero
    # octet, must both be rejected
    for wrong in ("100% wrong", "Zero%00"):
        try:
            urn.translate_from_urnchar(wrong)
            self.fail("%s test in URN" % repr(wrong))
        except ValueError:
            pass
    # an unencoded zero octet must be rejected too
    try:
        urn.translate_to_urnchar("Zero\x00Byte")
        self.fail("Zero byte test in URN")
    except ValueError:
        pass

    # let's invent a scheme whereby the reserved characters
    # include . which is reserved for special meaning and
    # / is used unencoded as a path separator (even though
    # it is reserved and *SHOULD* be encoded)
    def dot(c):
        return c == "."
    src = "urn:path:.steve/file%2Ename/easy_come%2Feasy_go"
    u = uri.URI.from_octets(src)
    # expand the leading "." and decode each path segment
    path = u.nss.replace('.', 'users/')
    path = [urn.translate_from_urnchar(s) for s in path.split('/')]
    self.assertTrue(path == [
        'users', 'steve', 'file.name', 'easy_come/easy_go'],
        "Parsed: %s" % repr(path))
    # drop the expanded 'users' segment again before re-encoding
    path = path[1:]
    # / is always reserved so we don't need to call this out
    path = [urn.translate_to_urnchar(x, dot) for x in path]
    # add the newly reserved characters after translation...
    path = '.' + '/'.join(path)
    # the rebuilt URN must equal the parsed one
    u2 = urn.URN(nid='path', nss=path)
    self.assertTrue(u == u2)
    self.assertTrue(str(u) == str(u2))
import logging import unittest from sys import maxunicode import pyslet.unicode5 as unicode5 from pyslet.py2 import byte, character, is_text, join_bytes, u8, ul from pyslet.py2 import py2, range3 MAX_CHAR = 0x10FFFF if maxunicode < MAX_CHAR: MAX_CHAR = maxunicode CHINESE_TEST = u8(b'\xe8\x8b\xb1\xe5\x9b\xbd') def suite(): return unittest.TestSuite(( unittest.makeSuite(EncodingTests, 'test'), unittest.makeSuite(CharClassTests, 'test'), unittest.makeSuite(UCDTests, 'test'), unittest.makeSuite(ParserTests, 'test') )) class EncodingTests(unittest.TestCase): def test_detection(self): test_string = u"Caf\xe9"
def test_parse_hex_digits(self):
    """Check parse_hex_digits on character and binary sources.

    A character parser and a binary parser over the same ASCII text
    are driven in lockstep and must produce matching runs of hex
    digits for each combination of minimum and maximum length.
    """
    def compare_runs(text_parser, byte_parser, expected, *bounds):
        # scan both parsers to the end of input; every run the
        # character parser returns must equal the next expected value
        # and the binary parser must return its ASCII encoding
        matched = 0
        while text_parser.the_char is not None:
            text_run = text_parser.parse_hex_digits(*bounds)
            byte_run = byte_parser.parse_hex_digits(*bounds)
            if text_run is not None:
                self.assertTrue(text_run == expected[matched], text_run)
                self.assertTrue(
                    byte_run == expected[matched].encode('ascii'),
                    byte_run)
                matched += 1
            text_parser.next_char()
            byte_parser.next_char()
        self.assertTrue(matched == len(expected))

    src = ul("23.FG.fg.0.00.abcdefABCDEF0123456789")
    p = unicode5.BasicParser(src)
    pb = unicode5.BasicParser(src.encode('ascii'))
    # invalid bounds (negative minimum, minimum above maximum) must
    # raise ValueError without moving the parser
    invalid_bounds = (
        ((-1, ), "min=-1 didn't raise exception"),
        ((3, 1), "min > max didn't raise exception"))
    for bounds, complaint in invalid_bounds:
        try:
            p.parse_hex_digits(*bounds)
        except ValueError:
            self.assertTrue(p.pos == 0)
        else:
            self.fail(complaint)
    # minimum of 1 digit
    compare_runs(
        p, pb,
        [ul("23"), ul("F"), ul("f"), ul("0"), ul("00"),
         ul("abcdefABCDEF0123456789")],
        1)
    # minimum of 2 digits
    p.setpos(0)
    pb.setpos(0)
    compare_runs(
        p, pb,
        [ul("23"), ul("00"), ul("abcdefABCDEF0123456789")],
        2)
    # between 2 and 5 digits: the long run is split into pieces
    p.setpos(0)
    pb.setpos(0)
    compare_runs(
        p, pb,
        [ul("23"), ul("00"), ul("abcde"), ul("ABCDE"), ul("01234"),
         ul("6789")],
        2, 5)
    # check we can exceed ordinary integer sizes
    long_digits = ul("123456789aBcDeF" + "0" * 256)
    long_parser = unicode5.BasicParser(long_digits)
    self.assertTrue(len(long_parser.parse_hex_digits(1, 256)) == 256)
    # and check that runs of 0 don't mean a thing
    self.assertTrue(
        long_parser.parse_hex_digits(1, 256) == ul("000000000000000"))
    # test Arabic digits, should not parse!
    arabic_parser = unicode5.BasicParser(
        u8(b'\xd9\xa0\xd9\xa1\xd9\xa2\xd9\xa3\xd9\xa4\xd9\xa5'
           b'\xd9\xa6\xd9\xa7\xd9\xa8\xd9\xa9'))
    for _ in range3(10):
        self.assertTrue(arabic_parser.parse_hex_digits(1) is None)
        arabic_parser.next_char()