def _encodeInterps4Type(self, typenum, interpsList):
    """Serialize all interpretations of one type into a single record.

    The record consists of the encoded type number, the payload length
    (as emitted by ``htons``) and the payload itself.  The payload starts
    with a "compress byte" computed from the sets of orth case patterns,
    lemma case patterns and prefix cut lengths found in ``interpsList``.
    Fields which the ``_...AreEncodedInCompressByte`` helpers report as
    already covered by that byte are left out of the per-interpretation
    entries.

    :param typenum: type number shared by the interpretations
    :param interpsList: collection of interpretation objects; it is
        iterated several times
    :return: ``bytearray`` with the serialized record
    """
    record = bytearray()
    record.extend(self._encodeTypeNum(typenum))

    # Distinct values occurring in this group; they drive the compress byte.
    orthPatterns = set(tuple(item.orthCasePattern) for item in interpsList)
    lemmaPatterns = set(
        tuple(item.encodedForm.casePattern) for item in interpsList)
    cutLengths = set(
        item.encodedForm.prefixCutLength for item in interpsList)

    payload = bytearray()
    payload.append(
        self._encodeCompressByte(orthPatterns, lemmaPatterns, cutLengths))

    # Optional header: the minimal orth case patterns, preceded by their count.
    if not self._casePatternsAreEncodedInCompressByte(orthPatterns):
        minPatterns = self._getMinOrthCasePatterns(interpsList)
        payload.append(len(minPatterns))
        for pattern in minPatterns:
            payload.extend(self._encodeCasePattern(pattern))

    # NOTE(review): the three "...AreEncodedInCompressByte" checks below do
    # not depend on the loop variable; they could be evaluated once before
    # the loop if the helpers are side-effect free -- confirm before hoisting.
    ordered = sorted(interpsList, key=lambda entry: entry.getSortKey())
    for entry in ordered:
        if not self._casePatternsAreEncodedInCompressByte(orthPatterns):
            payload.extend(self._encodeCasePattern(entry.orthCasePattern))
        form = entry.encodedForm
        if not self._prefixCutsAreEncodedInCompressByte(cutLengths):
            payload.append(form.prefixCutLength)
        payload.append(form.cutLength)
        payload.extend(serializeString(form.suffixToAdd))
        if not self._casePatternsAreEncodedInCompressByte(lemmaPatterns):
            payload.extend(self._encodeCasePattern(form.casePattern))
        payload.extend(htons(entry.tagnum))
        payload.append(entry.namenum)
        payload.extend(htons(entry.qualifiers))

    record.extend(htons(len(payload)))
    record.extend(payload)
    return record
def _encodeInterps4Type(self, typenum, interpsList):
    """Serialize all interpretations of one type into a single record.

    The record consists of the encoded type number, the payload length
    (as emitted by ``htons``) and the payload.  For every interpretation,
    taken in ``getSortKey()`` order, the payload holds: the homonym id,
    the prefix to add, the cut length, the suffix to add, the tag number,
    the name number and the qualifiers.

    :param typenum: type number shared by the interpretations
    :param interpsList: collection of interpretation objects
    :return: ``bytearray`` with the serialized record
    """
    record = bytearray()
    record.extend(self._encodeTypeNum(typenum))

    payload = bytearray()
    ordered = sorted(interpsList, key=lambda entry: entry.getSortKey())
    for entry in ordered:
        payload.extend(serializeString(entry.homonymId))
        form = entry.encodedForm
        payload.extend(serializeString(form.prefixToAdd))
        payload.append(form.cutLength)
        payload.extend(serializeString(form.suffixToAdd))
        payload.extend(htons(entry.tagnum))
        payload.append(entry.namenum)
        payload.extend(htons(entry.qualifiers))

    # The length prefix precedes the per-interpretation data.
    record.extend(htons(len(payload)))
    record.extend(payload)
    return record
def _doEncodeData(self, interpsList):
    """Encode a frozenset of interpretations, one record per type.

    The interpretations are grouped with ``_groupInterpsByType``; each
    group is serialized by ``_encodeInterps4Type`` and the records are
    concatenated in the iteration order of the grouping.  The result is
    prefixed with the total length of the records as emitted by ``htons``.

    :param interpsList: ``frozenset`` of interpretation objects
    :return: ``htons(len(records)) + records``
    :raises AssertionError: if ``interpsList`` is not a ``frozenset``
    """
    assert isinstance(interpsList, frozenset)

    interpsByType = self._groupInterpsByType(interpsList)

    encoded = bytearray()
    # ``items()`` instead of the Python-2-only ``iteritems()``: same pairs in
    # the same order on Python 2, and it also works on Python 3.  The loop
    # variable has its own name so it no longer shadows the parameter.
    for typenum, typeInterps in interpsByType.items():
        encoded.extend(self._encodeInterps4Type(typenum, typeInterps))

    return htons(len(encoded)) + encoded
def transitionsData2bytearray(self, state):
    """Serialize the outgoing transitions of ``state``.

    The items of ``state.transitionsMap`` are visited in sorted order.
    Each transition contributes the segment number, a flag byte (1 when
    ``shiftOrth`` is truthy, 0 otherwise) and the target state's offset
    as emitted by ``htons``.

    Every offset is checked against ``MAX_FSA_SIZE`` through
    ``exceptions.validate`` before it is written.

    :param state: object exposing ``transitionsMap``, a mapping from
        ``(segnum, shiftOrth)`` to the next state
    :return: ``bytearray`` with the serialized transitions
    """
    encoded = bytearray()
    for transition in sorted(state.transitionsMap.items()):
        (segnum, shiftOrth), nextState = transition
        encoded.append(segnum)
        encoded.append(1 if shiftOrth else 0)

        targetOffset = nextState.offset
        withinLimit = targetOffset <= MAX_FSA_SIZE
        # ``%`` binds tighter than ``+``, so only the second literal is
        # formatted -- this matches the original expression.
        message = (u'Segmentation rules are too big and complicated'
                   + u'- the resulting automaton would exceed its max size which is %d'
                   % MAX_FSA_SIZE)
        exceptions.validate(withinLimit, message)

        encoded.extend(htons(targetOffset))
    return encoded
def _serializeSeparatorsList(self):
    """Serialize ``self.separatorsList``.

    The output is the number of separators as emitted by
    ``serializationUtils.htons``, followed by every separator in
    ascending order, each as emitted by ``serializationUtils.htonl``.

    :return: ``bytearray`` with the serialized list
    """
    encodeCount = serializationUtils.htons
    encodeItem = serializationUtils.htonl

    serialized = bytearray()
    serialized.extend(encodeCount(len(self.separatorsList)))

    # presumably the separators are code points (original name: ``cp``)
    orderedSeparators = sorted(self.separatorsList)
    for separator in orderedSeparators:
        serialized.extend(encodeItem(separator))
    return serialized