예제 #1
0
def write_radical_strokes_index(f, key, DData):
    '''
    Build and write an index of Kangxi radical/additional-stroke values.

    Every codepoint is filed under two keys, e.g.

    {"150'.5": [array.array('I'), ...],
     "150'": [array.array('I')]}

    * (radical).(additional strokes)
    * (radical)
    so that you can search even if you don't know the number of strokes.

    Simplified radicals are indicated by a final "'".

    A codepoint is added to a given key at most once, even when the
    value has no "." part (both keys are then identical) or when several
    of its values share the same radical.

    `f` is passed straight through to `write_arrays`; `key` is unused.
    `DData` maps each ordinal to an iterable of value strings.
    Returns whatever `write_arrays` returns.
    '''
    DArrays = {}
    for ord_ in DData:
        # Keys this codepoint has already been filed under, so that
        # the same ordinal is never appended twice to one array
        SAdded = set()

        for value in DData[ord_]:
            no_strokes_value = value.split('.')[0]  # no added strokes

            for index_key in (value, no_strokes_value):
                if index_key in SAdded:
                    continue
                SAdded.add(index_key)

                if index_key not in DArrays:
                    DArrays[index_key] = get_int_array()
                DArrays[index_key].append(ord_)

    # Write to disk
    return write_arrays(f, DArrays)
예제 #2
0
def write_encoding(f, key, DOrds):
    """
    Encoding [BigFive etc]
    Variables: LSeek[+1] -> LValues[+1]

    For every ordinal yielded by `iter_ranges`, LSeek gets either 0
    (no data for that ordinal) or the 1-based position in LValues
    where that ordinal's entries start. `key` is unused.

    Returns a dict holding the results of the individual
    `write_array`/`write_json` calls.
    """
    DFlags, DOrds = coorce_to_encodings(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    DOrds = ranges_to_single_ords(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    LSeek = get_int_array()  # [+1]

    # LValues should be [+1], but can't be used as "FFFF"
    # will be chopped off. Instead the *first* value at each
    # seekpoint gives the number of values that follow, similar
    # to pascal strings which don't use NUL terminators
    LValues = get_int_array()  # [should be +1, but can't]

    # Flags assigned to variants if there's a source associated
    # to it, e.g. in Unihan's kSemanticVariant
    # "U+7A69<kFenn,kMatthews" might store kFenn as `1`
    # and kMatthews as `2`, making the flag for U+7A69 `3`
    LFlags = get_int_array()

    if not DOrds:
        # With no single ordinals there must at least be ranges
        assert LRanges
    else:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                continue

            LEnc = DOrds[ord_]
            LSeek.append(len(LValues) + 1)

            # The count slot; LFlags gets a placeholder
            # so that it stays parallel to LValues
            LValues.append(len(LEnc))
            LFlags.append(0)  # NOTE ME!

            for enc, flags in LEnc:
                LValues.append(enc)
                LFlags.append(flags)

    # Write to disk (the order of the writes is significant)
    DRtn = {
        'LSeek': write_array(f, LSeek),
        'LValues': write_array(f, LValues),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, LRanges),
    }

    if DFlags:
        # Write flags to disk if there are any for this key,
        # mapping the stringified flag value back to its name
        DFlagsReversed = {
            str(value): i_key for i_key, value in DFlags.items()
        }
        DRtn['DFlags'] = write_json(f, DFlagsReversed)
        DRtn['LFlags'] = write_array(f, LFlags)

    return DRtn
예제 #3
0
    def __init__(self, key, DData, LISO=None):
        """
        For readings in other languages etc where the data isn't in English.

        Collects (hash, ordinal) pairs for every value in `DData`,
        sorts them, and stores them as the two parallel arrays
        `self.LHash` and `self.LOrds`. `LISO` is accepted but unused.

        FIXME: ADD RANGE SUPPORT!
        TODO: REMOVE DUPE WORDS?
        """
        self.key = key
        self.DData = DData
        self.iso = key_to_iso(key)
        print('FulltextWriter ISO:', key, self.iso)
        self.SSpell = set()

        LPairs = []
        for ord_ in list(self.DData):
            raw = self.DData.get(ord_, [])
            if isinstance(raw, (list, tuple)):
                LItems = raw
            else:
                # A lone value is treated as a one-item list
                LItems = [raw]

            for item in LItems:
                if self.iso == 'ltc':
                    # Tang dynasty Chinese
                    LFound = self.get_L_tang(ord_, item)
                elif self.iso:
                    LFound = self.get_L_inflected(ord_, item)
                else:
                    LFound = self.get_L_general(ord_, item)
                LPairs.extend(LFound)
        LPairs.sort()

        # Convert to c array types
        self.LHash = get_int_array(signed=False)
        self.LOrds = get_int_array()

        for hash_, ord_ in LPairs:
            self.LHash.append(hash_)
            self.LOrds.append(ord_)
예제 #4
0
def write_string_keys_index(f, key, DData):
    """
    Write an index from string values to the codepoints that carry them.

    Single codepoints are stored in dicts, e.g.
    {'Arabic': [array.array('I')], ...}
    while keys of `DData` that are a list/tuple are treated as
    (from, to) ranges and stored as [from, to, value] entries.

    `key` is unused. Returns a dict with the results of
    `write_arrays` ('DStringKeys') and `write_json` ('LRanges').
    """
    DStringKeys = {}
    LRangesOut = []

    for ord_ in DData:
        is_range = type(ord_) in (list, tuple)

        for value in iter_values(DData[ord_]):
            if value not in DStringKeys:
                # One int array per value->ordinals mapping (created
                # even when the value is only ever used by ranges)
                DStringKeys[value] = get_int_array()

            if is_range:
                # e.g. a Unicode-specified range
                from_, to = ord_
                LRangesOut.append([from_, to, value])
            else:
                # A single value, so add just the codepoint
                DStringKeys[value].append(ord_)

    # Write to disk
    # TODO: Divide larger keys into smaller categories?
    DRtn = {}
    DRtn['DStringKeys'] = write_arrays(f, DStringKeys)
    DRtn['LRanges'] = write_json(f, LRangesOut)
    return DRtn
예제 #5
0
def write_integer_list(f, key, DOrds):
    """
    IntegerList [Grades/Frequencies (storing only numbers)]

    For every ordinal yielded by `iter_ranges`, LShort gets the first
    integer of that ordinal plus one, or 0 when the ordinal has no data.
    Any further integers of an ordinal go to DMultiVals, keyed by the
    ordinal as a string.

    If no single ordinals remain after range compression, the fill
    loop is skipped and LShort is written empty (rather than failing
    on `max()` of an empty mapping).

    `key` is unused. Returns a dict with the results of the individual
    `write_array`/`write_json` calls.
    """
    DOrds = coerce_to_int(DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    LShort = get_int_array()  # [+1]

    # Fill the data gaps - TODO: ADD SUPPORT FOR MULTIPLE VALUES!
    DMultiVals = {}
    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LShort.append(0)
                continue

            L = DOrds[ord_]
            LShort.append(int(L[0]) + 1)

            if len(L) > 1:
                # Append to `DMultiVals` if there is a
                # character with multiple integers mappings!
                assert str(ord_) not in DMultiVals
                DMultiVals[str(ord_)] = L[1:]

    # Write to disk
    print('WRITE LShort!')
    DRtn = {}
    DRtn['LShort'] = write_array(f, LShort)
    print('WRITE LIgnoreRanges!')
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    print('WRITE LRanges!')
    DRtn['LRanges'] = write_json(f, LRanges)
    print('WRITE DMultiVals!')
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    print(('OK:', DRtn))
    return DRtn
예제 #6
0
def write_sentence_data(f, key, DOrds):
    """
    Names [UnicodeData (single), NamesList (multiple)]

    Variables:
      LSeek[+1; 0 means "no link/value for this codepoint"] ->
      LWordLinks[+2; 1 separates multiple names, 0 ends a sequence] ->
      LWords (each distinct word is stored only once)
    LAmount runs parallel to LWordLinks and holds what
    `LWords.extend` returned for the word (0 for the markers).

    `key` is unused. Returns a dict with the results of the
    individual `write_array`/`write_json` calls.
    """
    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWordLinks = get_int_array()  # [+2]
    LWords = get_uni_array()

    # MASSIVE SPACE WASTAGE WARNING!
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    DWords = {}      # word -> (seek, amount)
    DWordLinks = {}  # tuple of names -> seek into LWordLinks

    def get_word_seek(word):
        if word in DWords:
            return DWords[word]

        # First time this word is seen: store it and remember where
        position = len(LWords)
        amount = LWords.extend(word)
        DWords[word] = (position, amount)
        return DWords[word]

    def get_wordlinks_seek(value):
        # Multiple definitions are allowed, so
        # normalise a lone value to a 1-tuple
        if type(value) in (list, tuple):
            names = tuple(value)
        else:
            names = (value,)

        if names in DWordLinks:
            return DWordLinks[names]

        # Process each definition separately
        LSentences = [name.split() for name in names]
        wordlinks_seek = len(LWordLinks)
        i_last = len(LSentences) - 1

        for i_sentence, LSentence in enumerate(LSentences):
            for word in LSentence:
                # Append to LWordLinks[+2]
                position, amount = get_word_seek(word)
                LWordLinks.append(position + 2)
                LAmount.append(amount)

            if i_sentence != i_last:
                # 1 signifies that another name follows
                LWordLinks.append(1)
                LAmount.append(0)

        # 0 signifies the end of sequence
        LWordLinks.append(0)
        LAmount.append(0)

        # Store for next time
        DWordLinks[names] = wordlinks_seek
        return wordlinks_seek

    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                continue
            LSeek.append(get_wordlinks_seek(DOrds[ord_]) + 1)

    # Make the values in the LRanges to point
    # to seek positions as well to save space
    LRanges = [
        (from_, to, get_wordlinks_seek(value))
        for from_, to, value in LRanges
    ]

    # Write to disk (the order of the writes is significant)
    DRtn = {
        'LSeek': write_array(f, LSeek),
        'LWordLinks': write_array(f, LWordLinks),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
        'LRanges': write_json(f, LRanges),
    }
    return DRtn
예제 #7
0
def write_string_data(f, key, DOrds):
    """
    StringData (storing string lists by number to save space)

    Variables:
      LRanges
      LSeek[+1][Ranges Subtracted] -> LWords
    LAmount runs parallel to LSeek and holds what `LWords.extend`
    returned for the string. Each distinct value is stored in
    LWords only once.

    `key` is unused. Returns a dict with the results of the
    individual `write_array`/`write_json` calls.
    """
    LRanges, DOrds = compress_ord_ranges(DOrds)

    LSeek = get_int_array()  # [+1]
    LAmount = get_int_array()
    LWords = get_uni_array()

    DWordSeek = {}  # value -> (seek, amount)

    def seek_for(item):
        # Store the string form of `item` the first time it is seen
        if item not in DWordSeek:
            position = len(LWords)
            amount = LWords.extend(str(item))
            DWordSeek[item] = (position, amount)
        return DWordSeek[item]

    def get_L_seek(L):
        # A lone value is treated as a one-item list
        LItems = L if isinstance(L, (tuple, list)) else [L]
        return [seek_for(item) for item in LItems]

    LIgnoreRanges = []
    if DOrds:
        # Only process if not Blocks etc
        LIgnoreRanges = get_char_gaps(DOrds)

        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if ord_ not in DOrds:
                LSeek.append(0)
                # NOTE ME: It may pay to make this -1 and add assertions (!)
                LAmount.append(0)
                continue

            # NOTE(review): one slot is appended per value, so an ordinal
            # with several values adds several slots - confirm the
            # reader copes with that
            for position, amount in get_L_seek(DOrds[ord_]):
                LSeek.append(position + 1)
                LAmount.append(amount)

    # Make the values in the LRanges to point
    # to seek positions as well to save space
    n_LRanges = []
    for from_, to, value in LRanges:
        LFound = get_L_seek(value)
        target = LFound[0] if len(LFound) == 1 else LFound
        n_LRanges.append((from_, to, target))

    # Write to disk (the order of the writes is significant)
    DRtn = {
        'LRanges': write_json(f, n_LRanges),
        'LSeek': write_array(f, LSeek),
        'LAmount': write_array(f, LAmount),
        'LWords': write_array(f, LWords),
        'LIgnoreRanges': write_json(f, LIgnoreRanges),
    }
    return DRtn
예제 #8
0
def write_integer_keys_index(f, key, DData, prefix=None):
    '''
    Write a browsable index which groups codepoints by an integer
    value (e.g. a grade number or a frequency).

    `DData` maps each ordinal to a value accepted by `int()`, or to a
    list/tuple of which only the first item is used. `DData` itself
    is left unmodified. `key` is unused.

    More than 50 distinct values (usually frequencies):
      the values are split into ranges of width max/50 (at least 1),
      starting at the minimum, labelled "<from> - <to>" and preceded
      by `prefix` if one is given. For example:
      * Frequencies 0 - 49
      * Frequencies 50 - 99
      ...etc

    Otherwise (usually grades, which have few possibilities):
      the codepoints of each value are divided into pages of up to
      100, labelled "<value> (<first> - <last>)" with 1-based
      positions. For example:
      * 1 (1 - 100)
      * 1 (101 - 178)
      * 2 (1 - 100)
      ...etc
      NOTE(review): `prefix` is not applied to these labels - confirm
      whether that is intended.

    Only non-empty ranges/pages are written.
    Returns whatever `write_arrays` returns.
    '''
    # Normalise every value to an int once, in a
    # local dict so the caller's data isn't touched
    DValues = {}
    for ord_ in DData:
        value = DData[ord_]
        if isinstance(value, (list, tuple)):
            # HACK: Use only the first value if there are multiple grades etc
            # FIXME: Support multiple values!
            value = value[0]
        DValues[ord_] = int(value)

    DRtn = {}
    if not DValues:
        # Nothing to index
        return write_arrays(f, DRtn)

    SValues = set(DValues.values())

    if len(SValues) > 50:
        # TODO: Make it a bit less "linear" than dividing by 50?
        min_ = min(SValues)
        max_ = max(SValues)
        # Never let the step drop below 1, or the ranges wouldn't advance
        step = max(max_ // 50, 1)

        # Sort the ordinals into their ranges in a single pass
        DBuckets = {}
        for ord_, value in DValues.items():
            i_bucket = (value - min_) // step
            if i_bucket not in DBuckets:
                DBuckets[i_bucket] = get_int_array()
            DBuckets[i_bucket].append(ord_)

        for i_bucket in sorted(DBuckets):
            from_ = min_ + i_bucket * step
            to = from_ + step - 1

            label = '%s - %s' % (from_, to)
            if prefix:
                label = '%s %s' % (prefix, label)
            DRtn[label] = DBuckets[i_bucket]

    else:
        # Group the ordinals by value in a single pass
        DByValue = {}
        for ord_, value in DValues.items():
            DByValue.setdefault(value, []).append(ord_)

        for value in sorted(DByValue):
            LOrds = DByValue[value]

            for from_ in range(0, len(LOrds), 100):
                LPage = LOrds[from_:from_ + 100]

                # TODO: Add "Page" etc for indices?
                # The label gives the 1-based positions of the first
                # and last item which are actually on this page
                label = '%s (%s - %s)' % (
                    value, from_ + 1, from_ + len(LPage))

                L = get_int_array()
                for ord_ in LPage:
                    L.append(ord_)
                DRtn[label] = L

    # Write out to disk using unsigned integer arrays
    return write_arrays(f, DRtn)
예제 #9
0
def write_indices(f, key, DOrds):
    """
    Indices [Storing Page Positions as Either Numeric or Char Data]
    Variables: DArrays

    `indice_tools.parse_indices` supplies the (name, type) pairs of
    the arrays to create. For every ordinal yielded by `iter_ranges`
    the first value dict of that ordinal is stored in the arrays it
    names; further value dicts go to DMultiVals, keyed by the ordinal
    as a string. Ordinals without data get a blank in every array.

    Blank values are a vertical tab in char arrays and 0 in integer
    arrays; real integers are stored +1 to keep 0 free.

    If no single ordinals remain after range compression, the fill
    loop is skipped and the arrays are written empty (rather than
    failing on `max()` of an empty mapping).

    Raises Exception for an unknown array type. Returns a dict with
    the results of the individual `write_arrays`/`write_json` calls.
    """
    LArrays, DOrds = indice_tools.parse_indices(key, DOrds)
    LRanges, DOrds = compress_ord_ranges(DOrds)
    LIgnoreRanges = get_char_gaps(DOrds)

    # Create the various arrays
    DArrays = {}
    for name, typ in LArrays:
        if typ == 'char':
            # [\v] indicates None
            DArrays[name] = get_uni_array()
        elif typ == 'integer':
            # [+1] to allow 0 for None
            DArrays[name] = get_int_array()
        else:
            raise Exception("Unknown indice type %s" % typ)

    def is_char_array(LArray):
        return LArray.typecode in ('u', 'c')

    def append_blank(LArray):
        LArray.append('\v' if is_char_array(LArray) else 0)

    # Fill the data gaps
    DMultiVals = {}
    if DOrds:
        for ord_ in iter_ranges(LIgnoreRanges, max(DOrds)):
            if not (ord_ in DOrds and DOrds[ord_]):
                # Append blank values
                for k in DArrays:
                    append_blank(DArrays[k])
                continue

            LValues = DOrds[ord_]
            D = LValues[0]

            # Add to DMultiVals if more than one value
            if len(LValues) > 1:
                assert str(ord_) not in DMultiVals
                DMultiVals[str(ord_)] = LValues[1:]

            # NOTE(review): only the arrays named in `D` get an entry -
            # presumably `parse_indices` gives every dict all names; confirm
            for k in D:
                value = D[k]
                LArray = DArrays[k]

                if value in (None, ''):
                    append_blank(LArray)
                    continue

                try:
                    if is_char_array(LArray):
                        assert len(value) == 1
                        # WARNING: StrArray's only allow indexing of single ASCII
                        # chars, as they're encoded using utf-8 (!)

                        # Hopefully that's all it will need, though
                        LArray.append(str(value))
                    else:
                        LArray.append(int(value)+1)
                except Exception:
                    # Report which value was at fault, then re-raise
                    print(('Error on value: %s key: %s typecode: %s' % (value, k, LArray.typecode)))
                    raise

    # Write to disk
    DRtn = {}
    DRtn['DArrays'] = write_arrays(f, [(name, DArrays[name]) for name, typ in LArrays])
    DRtn['LIgnoreRanges'] = write_json(f, LIgnoreRanges)
    DRtn['LRanges'] = write_json(f, LRanges)
    DRtn['DMultiVals'] = write_json(f, DMultiVals)
    return DRtn