Exemple #1
0
 def _encodeInterps4Type(self, typenum, interpsList):
     """Encode all interpretations of one type as a length-prefixed record.

     The result is the encoded type number, followed by the length of the
     payload (as produced by ``htons``), followed by the payload itself.

     The payload starts with a "compress byte" computed from the sets of
     orth case patterns, lemma case patterns and prefix cut lengths found
     in ``interpsList``. Whenever one of those properties is reported as
     already encoded in the compress byte, it is omitted from the
     per-interpretation data.

     Per interpretation (in ``getSortKey()`` order) the payload holds:
     optional orth case pattern, optional prefix cut length, cut length,
     serialized suffix to add, optional lemma case pattern, tag number,
     name number and qualifiers.

     ``interpsList`` is iterated several times, so it must be a re-iterable
     collection rather than a one-shot iterator.
     """
     res = bytearray()
     res.extend(self._encodeTypeNum(typenum))

     encodedInterpsList = bytearray()

     # Feed generators straight into set() instead of building temporary lists.
     orthCasePatterns = set(tuple(interp.orthCasePattern) for interp in interpsList)
     lemmaCasePatterns = set(tuple(interp.encodedForm.casePattern) for interp in interpsList)
     prefixCuts = set(interp.encodedForm.prefixCutLength for interp in interpsList)

     encodedInterpsList.append(self._encodeCompressByte(orthCasePatterns, lemmaCasePatterns, prefixCuts))

     # The three predicates below depend only on the sets computed above,
     # so they are evaluated once here instead of once per interpretation.
     # NOTE(review): assumes the predicates are side-effect free - confirm.
     orthInCompressByte = self._casePatternsAreEncodedInCompressByte(orthCasePatterns)

     if not orthInCompressByte:
         # Explicit table of orth case patterns: one count byte, then the patterns.
         minOrthCasePatterns = self._getMinOrthCasePatterns(interpsList)
         encodedInterpsList.append(len(minOrthCasePatterns))
         for casePattern in minOrthCasePatterns:
             encodedInterpsList.extend(self._encodeCasePattern(casePattern))

     prefixCutsInCompressByte = self._prefixCutsAreEncodedInCompressByte(prefixCuts)
     lemmaInCompressByte = self._casePatternsAreEncodedInCompressByte(lemmaCasePatterns)

     for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
         encodedForm = interp.encodedForm
         if not orthInCompressByte:
             encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
         if not prefixCutsInCompressByte:
             encodedInterpsList.append(encodedForm.prefixCutLength)
         encodedInterpsList.append(encodedForm.cutLength)
         encodedInterpsList.extend(serializeString(encodedForm.suffixToAdd))
         if not lemmaInCompressByte:
             encodedInterpsList.extend(self._encodeCasePattern(encodedForm.casePattern))
         encodedInterpsList.extend(htons(interp.tagnum))
         encodedInterpsList.append(interp.namenum)
         encodedInterpsList.extend(htons(interp.qualifiers))

     # Length prefix first, then the payload it describes.
     res.extend(htons(len(encodedInterpsList)))
     res.extend(encodedInterpsList)
     return res
Exemple #2
0
 def _encodeInterps4Type(self, typenum, interpsList):
     """Encode every interpretation of one type as a length-prefixed record.

     Output layout: the encoded type number, the payload length (as
     produced by ``htons``), then the payload. For each interpretation,
     taken in ``getSortKey()`` order, the payload holds: serialized
     homonym id, serialized prefix to add, cut length, serialized suffix
     to add, tag number, name number and qualifiers.
     """
     out = bytearray()
     out.extend(self._encodeTypeNum(typenum))

     # Deterministic output: always emit interpretations in sort-key order.
     ordered = list(interpsList)
     ordered.sort(key=lambda item: item.getSortKey())

     payload = bytearray()
     for item in ordered:
         payload.extend(serializeString(item.homonymId))
         form = item.encodedForm
         payload.extend(serializeString(form.prefixToAdd))
         payload.append(form.cutLength)
         payload.extend(serializeString(form.suffixToAdd))
         payload.extend(htons(item.tagnum))
         payload.append(item.namenum)
         payload.extend(htons(item.qualifiers))

     # The length prefix precedes the payload it describes.
     out.extend(htons(len(payload)))
     out.extend(payload)
     return out
Exemple #3
0
    def _doEncodeData(self, interpsList):
        """Encode a frozenset of interpretations into one length-prefixed blob.

        The interpretations are grouped with ``_groupInterpsByType``; each
        group is encoded with ``_encodeInterps4Type`` and the encoded groups
        are concatenated in the iteration order of the grouping. The
        concatenation is then prefixed with its own length (as produced by
        ``htons``).

        ``interpsList`` must be a ``frozenset`` (checked with an assertion).
        """
        assert isinstance(interpsList, frozenset)

        segnum2Interps = self._groupInterpsByType(interpsList)

        res = bytearray()

        # A distinct loop variable keeps the ``interpsList`` parameter intact,
        # so no ``del`` is needed afterwards. ``items()`` is available on
        # both Python 2 and Python 3 mappings.
        for typenum, typeInterps in segnum2Interps.items():
            res.extend(self._encodeInterps4Type(typenum, typeInterps))

        return htons(len(res)) + res
Exemple #4
0
    def transitionsData2bytearray(self, state):
        """Serialize the outgoing transitions of ``state`` into a bytearray.

        Transitions are taken from ``state.transitionsMap`` in sorted order.
        Each one contributes: the segment number, a flag byte (1 if
        ``shiftOrth`` is truthy, otherwise 0), and the target state's offset
        encoded with ``htons``.

        Every target offset is checked against ``MAX_FSA_SIZE`` through
        ``exceptions.validate`` (presumably raising when the condition is
        false - confirm against that helper).
        """
        encoded = bytearray()
        for transitionKey, target in sorted(state.transitionsMap.items()):
            segnum, shiftOrth = transitionKey
            encoded.append(segnum)
            encoded.append(1 if shiftOrth else 0)
            targetOffset = target.offset
            exceptions.validate(
                targetOffset <= MAX_FSA_SIZE,
                u'Segmentation rules are too big and complicated'
                + u'- the resulting automaton would exceed its max size which is %d'
                % MAX_FSA_SIZE)
            encoded.extend(htons(targetOffset))
        return encoded
Exemple #5
0
 def _serializeSeparatorsList(self):
     """Serialize ``self.separatorsList`` into a bytearray.

     The output is the number of separators encoded with
     ``serializationUtils.htons``, followed by every separator in sorted
     order, each encoded with ``serializationUtils.htonl``.
     """
     separators = self.separatorsList
     out = bytearray()
     # Count first, then the entries themselves.
     out.extend(serializationUtils.htons(len(separators)))
     for codepoint in sorted(separators):
         out.extend(serializationUtils.htonl(codepoint))
     return out