Exemple #1
0
    def raw_data(self, ord_):
        """
        Return similar Hanzi/Kanji etc characters
        using the MultiRadicals database.

        Every code point listed in `DBothRads` under one of the
        character's radicals (as given by `self.get_rads`) is a candidate,
        except the character itself. Candidates are sorted by the tuples
        `self._get_cmp_value` returns, and the last element of each tuple
        is returned.

        Returns None when the character is not in `DBothChars`.
        """
        char = w_unichr(ord_)
        if char not in DBothChars:
            return None

        LRads = self.get_rads(char)
        LNumStrokes = self.char_data.raw_data(
            'unihan.totalstrokes', ord_
        ) or []

        SSeen = set()
        LCmpValues = []
        for rad in LRads:
            for i_ord in DBothRads[rad]:
                # Skip the character itself, and candidates that were
                # already collected under an earlier radical
                if char == w_unichr(i_ord) or i_ord in SSeen:
                    continue

                SSeen.add(i_ord)
                LCmpValues.append(
                    self._get_cmp_value(LRads, LNumStrokes, i_ord)
                )

        return [cmp_value[-1] for cmp_value in sorted(LCmpValues)]
Exemple #2
0
    def _get_cmp_value(self, LRads, LNumStrokes, ord_):
        """
        Build the sort key used to rank the candidate `ord_` against a
        reference character.

        `LRads` and `LNumStrokes` are the reference character's radicals
        and total stroke counts. The result is the tuple
        `(-num_same, stroke_diff, LFreqs, ord_)`, so sorting ascending puts
        candidates with more shared radicals first, then those with the
        smaller stroke difference, then those with the lower frequency
        values.
        """
        i_LRads = self.get_rads(w_unichr(ord_))

        # Number of the candidate's multiradicals which the
        # reference character has as well
        num_same = sum(1 for i in i_LRads if i in LRads)

        # Smallest difference in stroke count over every pairing of the
        # two characters' stroke counts; stays at `maxsize` when either
        # side has no stroke data
        LStrokes = self.char_data.raw_data('unihan.totalstrokes', ord_) or []
        stroke_diff = min([maxsize] + [
            abs(i_num_strokes - num_strokes)
            for num_strokes in LStrokes
            for i_num_strokes in LNumStrokes
        ])

        # TODO: Get the frequency depending on language! ==========================================
        # A missing frequency sorts last by falling back to `maxsize`
        LFreqs = [
            freq[0] if freq else maxsize
            for freq in (
                self.char_data.raw_data(key, ord_)
                for key in ('unihan.frequency', 'kanjidic.freq')
            )
        ]

        return (-num_same, stroke_diff, LFreqs, ord_)
    def raw_data(self, ord_):
        """
        Return the `DBothChars` entry for the character at `ord_` as a
        single space-separated string, leaving out any '�' items.

        Returns None when the character has no `DBothChars` entry.
        """
        char = w_unichr(ord_)
        if char not in DBothChars:
            return None

        return ' '.join(i for i in DBothChars[char] if i != '�')
Exemple #4
0
    def raw_data(self, ord_):
        """
        Return the character at `ord_` normalized with
        `unicodedata.normalize`, using `self.typ` as the normalization
        form.

        Returns None if normalization fails (for example when `self.typ`
        is not a form `unicodedata.normalize` accepts).
        """
        char = w_unichr(ord_)

        try:
            return unicodedata.normalize(self.typ, char)
        except Exception:
            # Best-effort lookup: a failed normalization means "no data".
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            return None
Exemple #5
0
 def get_L_ranges(self, L):
     """
     Turn `(typ, value)` search pairs into characters/character pairs.

     Each pair in `L` is looked up with `self.char_indexes.search`.
     A list/tuple hit is unpacked as two code points and becomes a
     2-tuple of characters; any other hit is converted to one character.
     The hits are returned as a tuple, in search order.
     """
     # FIXME: Don't allow searches by
     # fulltext unless it's a direct match for
     # name/conscript name/definitions etc! ===============================
     # (readings should allow partial matches I think)

     def to_chars(hit):
         if isinstance(hit, (list, tuple)):
             from_, to = hit
             return (w_unichr(from_), w_unichr(to))
         return w_unichr(hit)

     return tuple(
         to_chars(hit)
         for typ, value in L
         for hit in self.char_indexes.search(typ, value)
     )
Exemple #6
0
    def raw_data(self, ord_):
        """
        Return the character at `ord_` encoded with `self.encoding`
        (used to add encoding mappings).

        Returns None when the character can't be encoded.
        """
        char = w_unichr(ord_)

        try:
            encoded = char.encode(self.encoding)
        except Exception:
            # Not representable in this encoding (or encoding failed
            # for another reason) - there is no mapping to report
            return None
        return encoded
Exemple #7
0
def get_simp_chars(char_data, Char, DTradChars=None):
    """
    Collect Simplified-side substitutes for `Char`.

    `Char` may be a character or an integer code point (converted with
    `w_unichr`). The candidates are:

    * when `DTradChars` is truthy, the character's Unihan simplified,
      semantic, specialized semantic and z-variants from `char_data`;
    * always, the result of the 'Chinese Traditional-Simplified'
      transliteration engine.

    When `DTradChars` is truthy, candidates which are themselves in
    `DTradChars` are dropped. The result is passed through `rem_dupes`
    (presumably removing duplicates - confirm against its definition).
    """
    # isinstance rather than an exact type comparison,
    # so that int subclasses are converted as well
    if isinstance(Char, int):
        Char = w_unichr(Char)

    LSubChars = []

    if DTradChars:
        # convert the converted characters/radicals Traditional-Simplified
        LVariants = [
            'unihan.simplifiedvariant',
            'unihan.semanticvariant', # CHECK ME!
            'unihan.specializedsemanticvariant', # CHECK ME!
            'unihan.zvariant'
        ]
        char_ord = w_ord(Char)

        for Variant in LVariants:
            V = char_data.raw_data(Variant, char_ord)
            if V:
                LSubChars += [
                    w_unichr(i) for i in V
                ]
                # NOTE(review): looks like leftover debug output; kept
                # unchanged in case something relies on it - confirm
                print(('Char:', Char.encode('utf-8'), 'LSubChars:', ''.join(LSubChars).encode('utf-8'), 'V:', V))

    # Use the transliteration system to convert from T-S,
    # only allowing if different and MultiRad data not
    # already available for that character
    from multi_translit.translit.my_engine.TranslitEngine import get_engine
    TradToSimp = get_engine('Chinese Traditional-Simplified')
    Conv = TradToSimp.convert(Char)
    LSubChars.append(Conv)

    # HACK: Make sure characters that are already
    # Traditional aren't converted so information isn't lost!
    # NOTE: This shouldn't be performed for RADICAL conversion,
    # just the CHARS as it can stuff up the radical lists :-(
    if DTradChars:
        LSubChars = [
            i for i in LSubChars if i not in DTradChars
        ]

    return rem_dupes(LSubChars)
Exemple #8
0
def conv_hex(key, s):
    """
    Annotate 4-digit hex code points found in `s` with "U+" and the
    character they stand for.

    * For the name-like keys listed below, `s` is returned untouched.
    * For 'Names List' values starting with '# ', every 4-character hex
      token gets a "U+" prefix and the characters are appended in
      parentheses, e.g. '# 0041 0301' -> '# U+0041 U+0301 (...)'.
      Tokens which aren't hex (e.g. "<super>") are kept as they are.
    * Otherwise, if the last space-separated token of `s` is a
      4-character hex value (optionally "U+"-prefixed), the result is
      '<preceding text> U+XXXX (<character>)'.

    Whenever the value can't be converted it is returned unchanged;
    that includes values which aren't strings.
    """
    if key in ('Block Subnames', 'Name',
               'Unicode 1.0 Name',
               'Xiandai Hanyu Pinlu'):
        return s # HACK!

    if key == 'Names List' and s.startswith('# '):
        LLabels = []
        LChars = []

        for token in s.split():
            ord_ = _parse_hex4(token)
            if ord_ is None:
                # Not hex, e.g. "<super>" in the trademark symbol?
                LLabels.append(token)
                continue

            try:
                char = w_unichr(ord_)
            except (ValueError, OverflowError):
                # Parsed as a number, but not a valid code point
                # (e.g. "-1ab") - leave the whole value alone
                return s

            LLabels.append('U+%s' % token)
            LChars.append(char)
        return '%s (%s)' % (' '.join(LLabels), ''.join(LChars))

    if not isinstance(s, str):
        # Non-text values are passed through as-is
        return s

    t_s = s.strip('() ')
    LParts = t_s.split(' ')
    hex_ = LParts[-1].upper().replace('U+', '')

    ord_ = _parse_hex4(hex_)
    if ord_ is None:
        return s

    try:
        char = w_unichr(ord_)
    except (ValueError, OverflowError):
        return s

    prefix = ' '.join(LParts[:-1]).strip(' -')
    return ('%s U+%s (%s)' % (prefix, hex_, char)).strip()


def _parse_hex4(token):
    """Return the value of a 4-character base-16 token, or None if it isn't one."""
    if len(token) != 4:
        return None

    try:
        return int(token, 16)
    except ValueError:
        return None
Exemple #9
0
    def raw_data(self, ord_, data):
        """
        Return the character at `ord_` with the casing selected by
        `self.typ` applied: 'lower', 'upper', 'title' or 'capitalize',
        each calling the string method of the same name.

        `data` is accepted but not used here.

        Raises Exception for any other `self.typ`.
        """
        char = w_unichr(ord_)

        for casing in ('lower', 'upper', 'title', 'capitalize'):
            if self.typ == casing:
                return getattr(char, casing)()

        raise Exception("unknown casing type %s" % self.typ)
Exemple #10
0
    def get_D_latin(self, script='Latin'):
        """
        Map each character of `script` to a list of lookup keys.

        This can also work for Latin-like scripts such as
        Cyrillic to an extent, but works best for Latin.

        The map is meant for things like input popups (offering accented
        characters after a non-accented character is pressed) and
        grouping characters under like headers.

        For every character the keys are the result of
        `get_smallest_name` on its lowercased name data plus, when that
        result is longer than one item, its first item. The keys are
        then re-cased to match the character's own case.
        """
        DLatin = {}

        for L in self.char_indexes.search('unicodedata.script', script):
            # A hit which isn't a list/tuple is a single code point;
            # treat it as a range covering only itself
            from_, to = L if type(L) in (list, tuple) else [L, L]

            for ord_ in range(from_, to + 1):
                char = w_unichr(ord_)

                # Find the relevant key
                name = [
                    part.lower()
                    for part in self.char_data.raw_data('name', ord_)
                ]
                letters = get_smallest_name(name)

                LKeys = DLatin.setdefault(char, [])
                LKeys.append(letters)
                if len(letters) > 1:
                    LKeys.append(letters[0])

                # TODO: What about "alternates" for characters which
                # look like e.g. an 'o' but sound like another letter?

                # Adjust case as appropriate
                if char.istitle():
                    DLatin[char] = [i.title() for i in LKeys]
                elif char.isupper():
                    DLatin[char] = [i.upper() for i in LKeys]
                elif char.islower():
                    DLatin[char] = [i.lower() for i in LKeys]

        # Spanish HACKS!
        DLatin.update({
            '!': ['¡'],
            '?': ['¿'],
            '¡': ['!'],
            '¿': ['?'],
        })
        return DLatin
Exemple #11
0
    def get_D_latin_to_L_chars(self, script='Latin'):
        """
        Group the characters of `script` by the first letter of their
        smallest name.

        Returns a dict mapping each key to the list of characters filed
        under it. The key is the first item of the lowercased
        `get_smallest_name` result, except for 'zz', which is kept whole.
        """
        DRtn = {}

        LChars = self.char_indexes.search('unicodedata.script', script)
        for LRange in LChars:
            # Hits are either a single code point or a (from, to) pair.
            # The pair is taken as inclusive and may be a list or a
            # tuple, the same way get_D_latin reads this index.
            if isinstance(LRange, (list, tuple)):
                from_, to = LRange
            else:
                from_ = to = LRange

            for ord_ in range(from_, to + 1):
                name = self.char_data.raw_data('name', ord_)
                key = get_smallest_name(name).lower()

                if key != 'zz':
                    key = key[0]  # HACK!

                DRtn.setdefault(key, []).append(w_unichr(ord_))
        return DRtn
Exemple #12
0
    def process_curly_braces(self, x, s):
        """
        Characters inside curly braces/
        brackets ({...}) make a string.

        `x` is the index of the opening brace in `s`. Scanning starts at
        the following character and stops at the first unescaped '}',
        or at the end of `s` if there is none.

        Inside the braces:
        * unescaped whitespace is skipped;
        * a backslash followed by 'u' and four hex digits gives that
          code point;
        * a backslash followed by any other character gives that
          character literally (including whitespace, '}' and '\\').

        Returns `(x, text)`, where `x` is the index the scan stopped at
        and `text` is the collected string.

        Raises ValueError if the digits after a backslash-u aren't valid
        hex.
        """
        L = []
        backslash_mode = False
        x += 1

        while True:
            # Get the current char, stopping once past the end of `s`
            try:
                c = s[x]
            except IndexError:
                break

            if backslash_mode and c == 'u':
                # A Unicode backslash
                L.append(w_unichr(int(s[x + 1:x + 5], 16)))
                backslash_mode = False
                # Skip the four hex digits just consumed
                x += 4

            elif backslash_mode:
                L.append(c)
                backslash_mode = False

            elif c.strip():
                if c == '}':
                    break
                elif c == '\\':
                    backslash_mode = True
                else:
                    L.append(c)
            x += 1

        return x, ''.join(L)
Exemple #13
0
 def _format_data(self, ord_, data):
     """
     Return the characters for the code points in `data`,
     separated by single spaces. `ord_` is not used here.
     """
     LChars = [w_unichr(i_ord) for i_ord in data]
     return ' '.join(LChars)
Exemple #14
0
    def get_ranges(self, s):
        """
        Parse the character-set expression `s` into a nested tuple.

        `s` is scanned one character at a time. Each recognised piece is
        appended to a list as one of:

        * `(STRING, text)` - a literal character or string;
        * `(RANGES, neg, ranges)` - a set of ranges, from perl/POSIX
          syntax, from an `a-e` style range or from a nested `[...]`;
        * `(OPERATOR, (op, item))` - `-` (difference) or `&` (intersect)
          applied to the preceding item, with `item` as the operand.

        Returns `(RANGES, neg, tuple_of_items)`.

        Raises Exception on a ']' without a matching '['. The `assert`
        statements raise AssertionError on operator/range combinations
        the parser doesn't expect. Looking up an unknown `$variable`
        raises whatever `self.DVars[name]` raises.
        """
        x = 0
        LRtn = []

        backslash_mode = False
        cur_operator = None
        neg = False

        while 1:
            # Get the current char; running past the end of `s`
            # (or any other indexing failure) ends the scan
            try:
                c = s[x]
            except:
                break
            #print 'GET_RANGES:', c

            if backslash_mode:
                # The previous character was an unescaped backslash
                if c in 'pP':
                    # perl-syntax!
                    # NOTE(review): this rebinds the function-level `neg`,
                    # which is also what the final return reports -
                    # confirm that is intended
                    x, neg, LRanges = self.process_perl(x, s)
                    LRtn.append((RANGES, neg, LRanges))

                elif c == 'u':
                    # Unicode backslash: the next four characters
                    # are read as a hex code point
                    LRtn.append((STRING, w_unichr(int(s[x + 1:x + 5], 16))))
                    x += 4

                elif c == '\\':
                    # a literal backslash
                    LRtn.append((STRING, '\\'))

                # Any other escaped character adds nothing to LRtn
                backslash_mode = False

            elif c == '\\':
                # Activate backslash mode
                backslash_mode = True

            elif c == '{':
                # Extract a string
                x, str_ = self.process_curly_braces(x, s)
                LRtn.append((STRING, str_))

            elif s[x:x + 2] == '[:':
                # A POSIX range
                # NOTE(review): rebinds the function-level `neg` as well
                x, neg, LRanges = self.process_posix(x, s)
                LRtn.append((RANGES, neg, LRanges))

            elif c == '[':
                # Embedded ranges: the bracketed text is parsed
                # recursively, without its outer brackets
                # TODO: Fix PERL BACKSLASHES! =======================================
                x, range_text = self.process_range(x, s)
                assert (range_text[0], range_text[-1]) == ('[', ']')
                LRtn.append(self.get_ranges(range_text[1:-1]))

            elif c == ']':
                # Should never be a closing bracket here!
                raise Exception("']' without '['!")

            elif c in '-&':
                # Operators
                # - is difference
                # & is intersect
                # The operator is only remembered here; it is applied
                # once the next item has been read
                cur_operator = c
                x += 1
                continue

            elif not x and c == '^':
                # Negative mode (only when '^' is the very first character)
                neg = True

            elif c == '$' and (x == len(s) - 1):
                # A '$' as the last character is currently ignored
                pass
                #FIXME # FIXME! ================================================

            elif c == '$' and self.DVars != None:
                # Process `$variableName`
                y, name = self.process_variable(x, s)

                value = self.DVars[name]
                if isinstance(value, (tuple, list)):
                    # Joins the last element of each entry into one string
                    value = ''.join(_[-1]
                                    for _ in value)  # TYPE WARNING! ==========

                # Insert the variable, and reprocess
                #print 'BEFORE:', s, value
                s = s[:x] + value + s[y:]
                #print s, s[x]
                # Step back one so that the `x += 1` at the bottom of the
                # loop lands on the first character of the inserted value
                x -= 1

            elif c.strip():
                # Non-whitespace - add a single character as a string
                # OPEN ISSUE: Ignore whitespace??? ================================
                LRtn.append((STRING, c))

            else:
                # Whitespace - ignore it
                x += 1
                continue

            if not backslash_mode and cur_operator and len(LRtn) < 2:
                # Allow initial "-" and "&" characters: with fewer than
                # two items there is nothing to combine, so the pending
                # operator is added as a literal
                LRtn.append((STRING, cur_operator))
                cur_operator = None

            elif not backslash_mode and cur_operator:
                # A pending operator and at least two items to work with

                if LRtn[-1][0] in (RANGES, OPERATOR):
                    # A range difference etc, e.g. "[[a-e]-[c]]"
                    # The newest item is wrapped together with the operator
                    assert LRtn[-2][0] in (RANGES, OPERATOR)
                    LRtn.append((OPERATOR, (cur_operator, LRtn.pop())))
                else:
                    # an ordinary range, e.g. "[a-e]"
                    # The two newest items are the range's end and start
                    item2, item1 = LRtn.pop(), LRtn.pop()

                    assert cur_operator == '-'
                    # NOTE: the failure message reads `self.s`,
                    # not the local `s`
                    assert (item1[0], item2[0]) == (STRING,
                                                    STRING), (item1, item2,
                                                              self.s)
                    assert (len(item1[1]), len(item2[1])) == (1, 1)

                    # Note the "FALSE!"
                    LRtn.append((RANGES, False, ((item1[1], item2[1]), )))

                cur_operator = None

            x += 1
        return (RANGES, neg, tuple(LRtn))