Beispiel #1
0
def test_name2unicode_uni_empty_string():
    """Check that "uni20ac" (lowercase hex digits) yields the euro sign.

    The name consists of a single component. A strict reading of the
    specification would map it to an empty string, but lowercase
    hexadecimals are supported on purpose.
    """
    expected = '\u20ac'
    actual = name2unicode('uni20ac')
    assert expected == actual
Beispiel #2
0
 def __join_line(self, textline_list):
     """Flatten a list of text lines into one cleaned-up string.

     Each element of ``textline_list`` is an iterable of string fragments.
     For every line the method:

     1. drops empty (falsy) fragments;
     2. passes fragments containing ``cid`` through
        ``encodingdb.name2unicode``;
     3. replaces the HTML entities ``&amp;``, ``&lt;``, ``&gt;``,
        ``&quot;`` and ``&#39;`` with the character they stand for;
     4. removes a trailing hyphen (optionally followed by a newline) so a
        word split across lines is glued back together, and otherwise
        makes sure the line ends with a newline;
     5. turns every newline fragment into a single space.

     :param textline_list: iterable of lines, each an iterable of strings.
     :return: the concatenation of all cleaned lines.
     """
     # Entity -> character table for step 3.
     entities = {
         '&amp;': '&',
         '&lt;': '<',
         '&gt;': '>',
         '&quot;': '"',
         '&#39;': "'",
     }
     newline = '\n'
     hyphen = '-'

     pieces = []
     for line in textline_list:
         # A PDF font change can leave raw glyph references in the text,
         # e.g. (cid:404) (cid:148) (cid:13) (cid:176) (cid:131)
         #      (cid:147) (cid:150) (cid:146) (cid:151)
         tokens = [tok for tok in line if tok]

         # Decode cid references.
         # NOTE(review): the raw fragment is handed to name2unicode
         # unchanged; confirm it accepts "(cid:NNN)"-style names.
         tokens = [
             encodingdb.name2unicode(tok) if re.search('cid', tok) else tok
             for tok in tokens
         ]

         # Decode HTML entities; anything else passes through untouched.
         tokens = [entities.get(tok, tok) for tok in tokens]

         # Handle end-of-line hyphenation. An empty line is left as is.
         if tokens:
             if tokens[-1] == newline:
                 # "...-\n": drop both the hyphen and the line break.
                 if len(tokens) >= 2 and tokens[-2] == hyphen:
                     tokens = tokens[:-2]
             elif tokens[-1] == hyphen:
                 # Trailing hyphen without a line break: drop the hyphen.
                 tokens = tokens[:-1]
             else:
                 # No terminator at all: add one so lines stay separated.
                 tokens = tokens + [newline]

         # Line breaks become plain spaces in the flattened text.
         pieces.extend(' ' if tok == newline else tok for tok in tokens)

     return ''.join(pieces)
Beispiel #3
0
def test_name2unicode_u_with_5_digits():
    """Check that the single-component name "u1040C" maps to U+1040C."""
    expected = '\U0001040C'
    actual = name2unicode('u1040C')
    assert expected == actual
Beispiel #4
0
def test_name2unicode_u_with_4_digits_lowercase():
    """Check that the lowercase form "u013b" maps to U+013B.

    "Lcommaaccent", "uni013B" and "u013B" all denote that same string.
    """
    expected = '\u013B'
    actual = name2unicode('u013b')
    assert expected == actual
Beispiel #5
0
def test_name2unicode_uni_pua_lowercase():
    """Check that the lowercase form "unif6fb" maps to U+F6FB.

    "Ogoneksmall" and "uniF6FB" both denote that same string.
    """
    expected = '\uF6FB'
    actual = name2unicode('unif6fb')
    assert expected == actual
Beispiel #6
0
def test_name2unicode_uni_with_sequence_of_digits_lowercase():
    """Check that the lowercase form "uni20ac0308" maps to U+20AC U+0308.

    The name "uni20AC0308" is a single component that carries two code
    points.
    """
    expected = '\u20AC\u0308'
    actual = name2unicode('uni20ac0308')
    assert expected == actual
Beispiel #7
0
def test_name2unicode_uni():
    """Check that "uni013B" maps to U+013B.

    "Lcommaaccent", "uni013B" and "u013B" all denote that same string.
    """
    expected = '\u013B'
    actual = name2unicode('uni013B')
    assert expected == actual
Beispiel #8
0
def test_name2unicode_pua_ogoneksmall():
    """Check that the glyph name "Ogoneksmall" maps to U+F6FB.

    "Ogoneksmall" and "uniF6FB" both denote that same string.
    """
    expected = '\uF6FB'
    actual = name2unicode('Ogoneksmall')
    assert expected == actual
Beispiel #9
0
def test_name2unicode_multiple_components_lowercase():
    """Check a multi-component name written with lowercase hex digits.

    "Lcommaaccent_uni20AC0308_u1040C.alternate" is mapped to the string
    U+013B U+20AC U+0308 U+1040C; the lowercase spelling must give the
    same result.
    """
    expected = '\u013B\u20AC\u0308\U0001040C'
    actual = name2unicode('Lcommaaccent_uni20ac0308_u1040c.alternate')
    assert expected == actual
Beispiel #10
0
def test_name2unicode_name_in_agl():
    """Check that "Lcommaaccent" is resolved through the AGL to U+013B.

    The name has a single component.
    """
    expected = '\u013B'
    actual = name2unicode('Lcommaaccent')
    assert expected == actual