Example #1
0
class CPT(object):
    """Code Point Table.

    Associates a glyph (or its integer code point) with a table number using
    a multi-level lookup table stored in ``self.data``.  The code point is
    split into segments by ``self.snip.cut``; each segment indexes one row,
    and the value found is either the index of the next row or, on the last
    segment, the negated table number.

    ``self.redo`` keeps every codepoint -> table assignment so the lookup
    table can be rebuilt from scratch after a deletion.

    Table numbers are stored negated and an untouched cell holds 0, so a
    table number of 0 cannot be told apart from "no entry".
    """

    def _preinit(self):
        """Reset the lookup table to its empty two-row state."""
        # Row 0 is never written by this class and stays all zero: a lookup
        # that reads an empty cell is sent to row 0 and keeps reading zeros,
        # so it finally yields 0.  Row 1 is where every traversal starts.
        self.base = 1
        self.data = [[0] * self.size, [0] * self.size]

    def __init__(self):
        """Set up row sizing, the empty lookup table and the redo dictionary."""
        # Cut is defined outside this class; it presumably splits a code
        # point into fixed-width index segments -- confirm against Cut.
        self.snip = Cut()
        self.size = self.snip.the.enum  # width of every row
        self._preinit()
        self.redo = {}

    def __setitem__(self, glyph, table):
        """Set the table number for the glyph.

        ``glyph`` is either an integer code point or a single character.
        ``table`` is stored negated in the final row and also remembered in
        ``self.redo`` for later rebuilds.
        """
        codepoint = glyph if isinstance(glyph, int) else ord(glyph)
        this, cuts = 1, self.snip.cut(codepoint)
        self.redo[codepoint] = table
        for segment in cuts[:-1]:
            N = len(self.data)  # index the next appended row will get
            if self.data[this][segment] == 0:
                if this < self.base:
                    # Traversal always starts at row 1 == base, so this
                    # branch is not taken on the first step.
                    self.data[this][segment] = table
                else:
                    # Empty cell: link it to a freshly appended row.
                    self.data[this][segment] = N
                    self.data.append([0] * self.size)
                    this = N
            else:
                this = self.data[this][segment]
        self.data[this][cuts[-1]] = -table

    def __getitem__(self, glyph):
        """Return the table number for the glyph, or 0 when none is set."""
        codepoint = glyph if isinstance(glyph, int) else ord(glyph)
        this = 1
        for segment in self.snip.cut(codepoint):
            this = self.data[this][segment]
        # Final cells hold the negated table number (see __setitem__).
        return -this

    def __delitem__(self, glyph):
        """Remove the glyph:table association from the lookup table.

        Deleting a glyph that has no entry is a silent no-op.  Otherwise the
        whole lookup table is rebuilt from the remaining ``self.redo``
        entries.
        """
        codepoint = glyph if isinstance(glyph, int) else ord(glyph)
        if self[codepoint] == 0:
            return
        del self.redo[codepoint]
        self._preinit()
        # Iterate a snapshot: __setitem__ writes to self.redo while refilling.
        # items() is used because iteritems() exists only on Python 2.
        for remaining, table in list(self.redo.items()):
            self[remaining] = table

    def __len__(self):
        """Return the count of glyphs available for lookup."""
        return len(self.redo)
Example #2
0
class Digit(list):
    """
    This class ingests UnicodeData.txt and organizes digits from all languages
    into an efficient search tree for conversions.

    The instance itself is the search tree: a list of equally wide rows.
    Rows 0 .. base-1 are filled with their own index, row ``base`` is the
    root every lookup starts from, and further rows are appended on demand.
    A cell holds -1 (empty), a digit value, or the index of the next row.

    Keyword arguments read by the class: ``verbose``, ``wide``, ``ingest``
    and ``unique``.
    """

    class UnicodeDataLine(dict):
        """
        This class supports turning a line into a Unicode compliant dictionary.
        Every column name becomes both a key and an attribute of the instance.
        """

        def __init__(self, columnNames):
            """Remember the column names used to label the fields of a line."""
            # Alias the attribute namespace to the dict itself, so keys and
            # attributes are one and the same storage.
            self.__dict__ = self
            self.columnNames = columnNames

        def __call__(self, line):
            """
            Split a ';' separated line and store its fields by column name.
            Return True on success, False for an empty line or a line whose
            field count differs from the number of column names.
            """
            if not line:
                return False
            fields = line.split(';')
            if len(fields) != len(self.columnNames):
                return False
            self.update(zip(self.columnNames, fields))
            return True

    def _columnNameIngest(self):
        """ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
        Column names are hand copied from this URL changing ' ' and '.' to '_',
        and changing 10646_comment_field to comment_field_10646,
        and changing column 0 to its common name: Codepoint.
        It is possible to extract column names using BeautifulSoup but
        the added complexity over using manifest constants is not merited.
        The alternative is to use BeautifulSoup to extract column names from
        TR44 which is organized poorly for column name extraction.
        These names become member names in a UnicodeDataLine instance.
        This considerably simplifies maintenance relative to the publications.
        The resulting line parser is stored in self.line.
        """
        Unicode_3_0_0_columns = """
Codepoint
Character_name
General_Category
Canonical_Combining_Classes
Bidirectional_Category
Character_Decomposition_Mapping
Decimal_digit_value
Digit_value
Numeric_value
Mirrored
Unicode_1_0_Name
comment_field_10646
Uppercase_Mapping
Lowercase_Mapping
Titlecase_Mapping
        """
        columnNames = Unicode_3_0_0_columns.strip().split('\n')
        self.line = Digit.UnicodeDataLine(columnNames)

    def _digitDataIngest(self):
        """
        Get data from unicode.org file from which to create tables.
        Reads local/UnicodeData.txt and records every 'Nd' (decimal digit)
        entry in the four lookup dictionaries.  Reading stops at end of file
        or at the first line self.line cannot parse.
        """
        try:
            toCharacter = unichr  # Python 2
        except NameError:
            toCharacter = chr  # Python 3 has no unichr
        with open("local/UnicodeData.txt") as source:
            for text in source:
                if not self.line(text):
                    break
                if self.line.General_Category != 'Nd':
                    continue
                codepoint = int(self.line.Codepoint, 0x10)
                digit = int(self.line.Digit_value)
                # The name minus its last two words is used as the language;
                # a name of only two words leaves nothing, labelled 'ASCII'.
                language = ' '.join(self.line.Character_name.split(' ')[:-2])
                if language.strip() == "":
                    language = 'ASCII'
                self.codepointToLanguage[codepoint] = language
                self.codepointToDigit[codepoint] = digit
                self.integerToCodepointList[digit].append(codepoint)
                if self.wide or codepoint < 0x10000:  # unichr restriction
                    known = self.languageToDigits.get(language, "")
                    self.languageToDigits[language] = (
                        known + toCharacter(codepoint))

    def _ingest(self):
        """
        Get data to initialize the tables and return self.
        With kw 'ingest' the tables are rebuilt from UnicodeData.txt;
        with kw 'unique' only the ASCII digits are loaded;
        otherwise the ASCII digits and the Tamil digits are loaded.
        """
        if self.kw.get('ingest', False):
            self.integerToCodepointList = {i: [] for i in range(10)}
            self.codepointToDigit = {}
            self.codepointToLanguage = {}
            self.languageToDigits = {}

            self._columnNameIngest()
            self._digitDataIngest()

            if self.verbose:
                print(str(self.integerToCodepointList))
            for integer, codepoints in self.integerToCodepointList.items():
                for codepoint in codepoints:
                    self(codepoint, integer)
        else:
            digitStrings = [u"0123456789"]
            if not self.kw.get('unique', False):
                digitStrings.append(u"௦௧௨௩௪௫௬௭௮௯")
            for digits in digitStrings:
                for integer, character in enumerate(digits):
                    self(ord(character), integer)
        return self

    def sequence(self, language=None):
        """
        Generate the digit string '0-9' for a given language.
        Without a language, return the known language names instead.
        An unrecognized language yields "unknown".
        """
        if language is None:
            return self.languageToDigits.keys()
        return self.languageToDigits.get(language, "unknown")

    def whatLanguageIs(self, codepoint):
        """
        Return the language block name associated with a codepoint
        (an integer or a single unicode character), or 'Unknown'.
        """
        if isinstance(codepoint, type(u"")):
            codepoint = ord(codepoint)
        return self.codepointToLanguage.get(codepoint, 'Unknown')

    def __init__(self, **kw):
        """Create tables."""
        self.kw = kw
        self.verbose = self.kw.get('verbose', False)
        self.wide = self.kw.get('wide', False)
        # Cut is defined outside this class; presumably it splits a code
        # point into fixed-width index segments -- confirm against Cut.
        self.preIndex = Cut()
        # Every table starts empty so that sequence(), whatLanguageIs() and
        # productions() also work when no ingest was requested.
        self.integerToCodepointList = {i: [] for i in range(10)}
        self.codepointToDigit = {}
        self.codepointToLanguage = {}
        self.languageToDigits = {}
        width = self.preIndex.the.enum
        # Rows 0 .. base-1 hold their own index in every cell.
        for value in range(self.preIndex.the.base):
            self.append([value] * width)
        # Row number 'base' is the (still empty) root of the search tree.
        self.append([-1] * width)
        self._ingest()
        self.shape = (len(self), len(self[0]))
        if self.verbose:
            print('table shape (%d,%d)' % self.shape)

    def __call__(self, glyph, digit=-1):
        """
        functor to either add digit codepoint pairs or find codepoint digits
        TODO fix this to use CPT mechanism to follow DRY principle.

        glyph is an integer code point or a single character.
        With digit == -1 (get mode) the digit stored for glyph is returned,
        or -1 when there is none.  With any other digit (put mode) the pair
        is inserted and None is returned.
        """
        codepoint = glyph if isinstance(glyph, int) else ord(glyph)
        base = self.preIndex.the.base
        this = base  # the root row sits right after the 'base' seed rows
        cuts = self.preIndex.cut(codepoint)
        if digit == -1:
            # get mode
            for segment in cuts:
                this = self[this][segment]
                # Values below base are results: a digit, or -1 for empty.
                if this < base:
                    return this
            return digit

        # put mode
        if self.verbose:
            print("    INSERT %d %06x %s" % (digit, codepoint, cuts))
        for segment in cuts[:-1]:
            parent = this
            N = len(self)  # index the next appended row will get
            if self[this][segment] == -1:
                if this < base:
                    self[this][segment] = digit
                else:
                    # Empty cell: link it to a freshly appended row.
                    self[this][segment] = N
                    self.append([-1] * self.preIndex.the.enum)
                    this = N
            else:
                this = self[this][segment]
            if self.verbose:
                print('self[%d][%d] = %d' % (
                    parent, segment, self[parent][segment]))
        self[this][cuts[-1]] = digit

    def productions(self):
        """
        create javascript usable content
        Returns the generated text; nothing is written here.
        """
        # NOTE(review): the emitted text is not valid JavaScript as written
        # ('=>' inside array literals, 'new int[need]', 'var this').  The
        # format is kept; only the missing comma separator of the
        # codepointToDigit listing was added to match the other listings.
        show = u"""
document.jlettvin = {};
document.jlettvin.unidigit = {
/** Digit tables for handling digits in all Unicode supported alphabets.
Unicode strings containing numbers may be converted to integer,
arithmetic operations may be performed,
and the result output generated in the alphabet of choice.
Digits may be freely intermixed in number strings.
 */
"""

        # TODO Fix the bug in this code
        show += u"var integerToCodepointList = [\n    "
        show += u',\n    '.join([
            u"%d => %s" % (i, str(self.integerToCodepointList[i]))
            for i in range(10)])
        show += u"\n];\n"

        show += u"var codepointToDigit = [\n    "
        show += u',\n    '.join([
            u"0x%06x => '%d'" % (k, w)
            for k, w in self.codepointToDigit.items()])
        show += u"\n];\n"

        show += u"var codepointToLanguage = [\n    "
        show += u',\n    '.join([
            u"0x%06x => '%s'" % (k, w)
            for k, w in self.codepointToLanguage.items()])
        show += u"\n];\n"

        show += u"var languageToDigits = [\n    "
        show += u',\n    '.join([
            u"'%s' => '%s'" % (k, w)
            for k, w in self.languageToDigits.items()])
        show += u"\n];\n"

        show += u"var digitLookup = [\n    "
        show += u',\n    '.join([
            u"%3d => %s" % (index, str(page))
            for index, page in enumerate(self)])
        show += u"\n];\n"

        # NOTE(review): bits, mask and need are not assigned anywhere in this
        # class; they have to be set on the instance before calling this
        # method (presumably from the Cut helper) -- confirm.
        show += """
var cutCodepoint = function(codepoint) {
    var bits = %d;
    var mask = %d;
    var need = %d;
    var cuts = new int[need];
    for (var shft = need; shft--; ) {
        cuts[need-shft-1] = codepoint >> (shft * bits) & mask;
    }
    return cuts;
};

var asDigit = function(codepoint) {
    var cuts = cutCodepoint(codepoint);
    var this = %d;
    var base = %d;
    var done = null;
    for (var segment of cuts) {
        var this = digitLookup[this][segment];
        if (this < base) {
            done = this;
            break;
        }
    }
    cuts = null;
    return done == null ? -1 : done;
};
};
""" % ( self.bits,
        self.mask,
        self.need,
        self.preIndex.the.base,
        self.preIndex.the.base)

        return show

    def emit(self):
        """
        Generate javascript usable tables and function.
        Writes productions() plus a trailing newline to Digit.js as UTF-8
        and returns self.
        """
        with open('Digit.js', 'w+b') as target:
            UTF8Writer = codecs.getwriter('utf8')
            writer = UTF8Writer(target)
            writer.write(self.productions())
            writer.write(u"\n")
        return self