def __init__( self, userPassword, ownerPassword=None, canPrint=1, canModify=1, canCopy=1, canAnnotate=1, strength=40 ):
    """Encryption settings to apply to a Canvas or a BaseDocTemplate.

    userPassword / ownerPassword: the user and owner passwords of the
    encrypted pdf (the owner password defaults to the user password
    downstream when omitted).
    canPrint / canModify / canCopy / canAnnotate: permission flags that
    apply when the document is opened with only the user password; the
    owner password unlocks every action regardless of the flags.
    strength: 40 selects the revision-2 (40-bit) algorithm, 128 the
    revision-3 (128-bit) one.

    Note that the security provided by standard pdf encryption (and even
    more so by the permission flags) is very weak.
    """
    # normalise both passwords to byte strings before any hashing
    if isUnicodeType(ownerPassword):
        ownerPassword = ownerPassword.encode('utf-8')
    if isUnicodeType(userPassword):
        userPassword = userPassword.encode('utf-8')
    self.ownerPassword = ownerPassword
    self.userPassword = userPassword
    # NOTE(review): a strength other than 40/128 leaves self.revision
    # unset and fails later with AttributeError — confirm callers only
    # ever pass these two values.
    if strength == 40:
        self.revision = 2
    elif strength == 128:
        self.revision = 3
    self.canPrint = canPrint
    self.canModify = canModify
    self.canCopy = canCopy
    self.canAnnotate = canAnnotate
    # filled in later when the encryption dictionary is prepared
    self.O = self.U = self.P = self.key = None
def encodePDF(key, objectNumber, generationNumber, string, revision=2):
    """Encode a PDF string or stream with RC4 (PDF spec Algorithm 3.1).

    key              encryption key derived by encryptionkey()
    objectNumber     object number of the enclosing PDF object
    generationNumber generation number of the enclosing PDF object
    string           the byte string / stream content to encrypt
    revision         2 (40-bit) or 3 (128-bit) standard security handler

    Returns the encrypted byte string.
    """
    newkey = key
    if isUnicodeType(newkey):
        newkey = newkey.encode('utf-8')
    # Extend the key with the 3 low-order bytes of the object number,
    # low-order byte first.
    n = objectNumber
    for i in range(3):
        # BUGFIX: the previous code appended chr(n & 0xff).encode('utf-8'),
        # which produces TWO bytes for values 0x80-0xff and corrupts the
        # per-object key.  bytes(bytearray([...])) appends the single raw
        # byte on both Python 2 and Python 3.
        newkey += bytes(bytearray([n & 0xff]))
        n = n >> 8
    # ...and with the 2 low-order bytes of the generation number.
    n = generationNumber
    for i in range(2):
        newkey += bytes(bytearray([n & 0xff]))
        n = n >> 8
    md5output = hashlib.md5(newkey).digest()
    if revision == 2:
        key = md5output[:10]    # 5-byte key + 5 extension bytes
    elif revision == 3:
        key = md5output         # all 16 bytes
    from reportlab.lib.arciv import ArcIV
    encrypted = ArcIV(key).encode(string)
    if DEBUG:
        print('encodePDF(%s,%s,%s,%s,%s)==>%s' % tuple(
            map(lambda x: hexText(str(x)), (key, objectNumber, generationNumber, string, revision, encrypted))))
    return encrypted
def computeO(userPassword, ownerPassword, revision):
    """Compute the /O (owner password) entry of the encryption
    dictionary, following PDF spec Algorithm 3.3."""
    from reportlab.lib.arciv import ArcIV
    assert revision in (2, 3), "Unknown algorithm revision %s" % revision
    if isUnicodeType(userPassword):
        userPassword = userPassword.encode("utf-8")
    if isUnicodeType(ownerPassword):
        ownerPassword = ownerPassword.encode("utf-8")
    if ownerPassword in (None, ""):
        # no owner password supplied: fall back to the user password
        ownerPassword = userPassword
    # pad both passwords out and truncate to exactly 32 bytes
    ownerPad = (ownerPassword + PadString)[0:32]
    userPad = (userPassword + PadString)[:32]
    digest = hashlib.md5(ownerPad).digest()
    if revision == 2:
        O = ArcIV(digest[:5]).encode(userPad)
    elif revision == 3:
        # revision 3: re-hash the digest 50 times, then run RC4 twenty
        # times with keys successively XOR-ed with the iteration count
        for i in range(50):
            digest = hashlib.md5(digest).digest()
        digest = digest[:16]
        O = userPad
        for i in range(20):
            O = ArcIV(xorKey(i, digest)).encode(O)
    if DEBUG:
        print(
            "computeO(%s,%s,%s)==>%s"
            % tuple(map(lambda x: hexText(str(x)), (userPassword, ownerPassword, revision, O)))
        )
    return O
def __init__(self, userPassword, ownerPassword=None, canPrint=1, canModify=1, canCopy=1, canAnnotate=1, strength=40):
    '''Define the encryption properties used while creating a pdf
    document; apply the resulting object to a Canvas or a
    BaseDocTemplate.

    userPassword sets the user password, ownerPassword the owner
    password of the encrypted pdf.  The flags canPrint, canModify,
    canCopy and canAnnotate control what a reader may do when only the
    user password was supplied; opening with the owner password permits
    everything.  strength 40 maps to algorithm revision 2, strength 128
    to revision 3.  The protection offered is known to be very weak.
    '''
    # byte strings are required for the password hashing done later
    if isUnicodeType(userPassword):
        userPassword = userPassword.encode('utf-8')
    if isUnicodeType(ownerPassword):
        ownerPassword = ownerPassword.encode('utf-8')
    self.userPassword = userPassword
    self.ownerPassword = ownerPassword
    # NOTE(review): strengths other than 40/128 silently leave
    # self.revision undefined — confirm no caller relies on that.
    if strength == 40:
        self.revision = 2
    elif strength == 128:
        self.revision = 3
    self.canPrint = canPrint
    self.canModify = canModify
    self.canCopy = canCopy
    self.canAnnotate = canAnnotate
    # computed later during document preparation
    self.O = self.U = self.P = self.key = None
def handle_data(self, data):
    """Accumulate character data into whichever element is currently open.

    At most one of the _cur* holders is active at a time, so the elif
    chain dispatches to the first truthy / non-None holder; the order of
    the tests is significant and must not be changed.
    """
    #the only data should be paragraph text, preformatted para
    #text, 'string text' for a fixed string on the page,
    #or table data
    if not type(data) is str:
        # normalise incoming bytes to text (Python 3 parsers may hand us bytes)
        data = data.decode('utf-8')
    if self._curPara:
        self._curPara.rawtext = self._curPara.rawtext + data
    elif self._curPrefmt:
        self._curPrefmt.rawtext = self._curPrefmt.rawtext + data
    elif self._curPyCode:
        self._curPyCode.rawtext = self._curPyCode.rawtext + data
    elif self._curString:
        # NOTE(review): data was already decoded above, so this re-check
        # looks redundant — kept as-is for safety.
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curString.text = self._curString.text + data
    elif self._curTable:
        self._curTable.rawBlocks.append(data)
    elif self._curTitle != None:
        # need to allow empty strings,
        # hence explicitly testing for None
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curTitle = self._curTitle + data
    elif self._curAuthor != None:
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curAuthor = self._curAuthor + data
    elif self._curSubject != None:
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curSubject = self._curSubject + data
def computeO(userPassword, ownerPassword, revision):
    '''Derive the owner-password /O value for the standard security
    handler (PDF spec Algorithm 3.3).'''
    from reportlab.lib.arciv import ArcIV
    assert revision in (2, 3), 'Unknown algorithm revision %s' % revision
    if isUnicodeType(userPassword):
        userPassword = userPassword.encode('utf-8')
    if isUnicodeType(ownerPassword):
        ownerPassword = ownerPassword.encode('utf-8')
    if ownerPassword in (None, ''):
        # an absent owner password defaults to the user password
        ownerPassword = userPassword
    # both passwords are padded and cut to a fixed 32 bytes
    ownerPad = ownerPassword + PadString
    ownerPad = ownerPad[0:32]
    password = userPassword + PadString
    userPad = password[:32]
    digest = hashlib.md5(ownerPad).digest()
    if revision == 2:
        O = ArcIV(digest[:5]).encode(userPad)
    elif revision == 3:
        # 50 extra md5 rounds, then 20 RC4 passes with XOR-varied keys
        for i in range(50):
            digest = hashlib.md5(digest).digest()
        digest = digest[:16]
        O = userPad
        for i in range(20):
            thisKey = xorKey(i, digest)
            O = ArcIV(thisKey).encode(O)
    if DEBUG:
        print('computeO(%s,%s,%s)==>%s' % tuple(
            map(lambda x: hexText(str(x)),
                (userPassword, ownerPassword, revision, O))))
    return O
def encodePDF(key, objectNumber, generationNumber, string, revision=2):
    """Encrypt a string or stream for a PDF object with RC4 (PDF spec
    Algorithm 3.1).

    key              document encryption key from encryptionkey()
    objectNumber     PDF object number of the enclosing object
    generationNumber PDF generation number of the enclosing object
    string           the bytes to encrypt
    revision         2 (40-bit) or 3 (128-bit) security handler revision

    Returns the encrypted bytes.
    """
    newkey = key
    if isUnicodeType(newkey):
        newkey = newkey.encode("utf-8")
    # extend the key with 3 low-order bytes of the object number
    # (low-order byte first)
    n = objectNumber
    for i in range(3):
        # BUGFIX: chr(n & 0xFF).encode('utf-8') yields TWO bytes for
        # 0x80-0xFF, corrupting the key; append the single raw byte in a
        # way that works on both Python 2 and Python 3.
        newkey += bytes(bytearray([n & 0xFF]))
        n = n >> 8
    # extend 2 bytes of the generationNumber
    n = generationNumber
    for i in range(2):
        newkey += bytes(bytearray([n & 0xFF]))
        n = n >> 8
    md5output = hashlib.md5(newkey).digest()
    if revision == 2:
        key = md5output[:10]   # 5 key bytes + 5 extension bytes
    elif revision == 3:
        key = md5output        # all 16 bytes
    from reportlab.lib.arciv import ArcIV
    encrypted = ArcIV(key).encode(string)
    if DEBUG:
        print(
            "encodePDF(%s,%s,%s,%s,%s)==>%s"
            % tuple(map(lambda x: hexText(str(x)), (key, objectNumber, generationNumber, string, revision, encrypted)))
        )
    return encrypted
def encryptionkey(password, OwnerKey, Permissions, FileId1, revision=2):
    """Derive the RC4 encryption key (PDF spec Algorithm 3.2).

    password     user password (bytes or unicode)
    OwnerKey     the /O value produced by computeO()
    Permissions  32-bit signed permission flags (the /P entry)
    FileId1      first string of the fileid array
    revision     2 -> 40-bit key, 3 -> 128-bit key
    """
    if isUnicodeType(password):
        password = password.encode("utf-8")
    # pad, then truncate, to exactly 32 bytes
    password = (password + PadString)[:32]
    # serialise the permission flags as 4 bytes, low-order byte first
    permValues = [(Permissions >> (8 * i)) & 0xFF for i in range(4)]
    if sys.version_info[0] == 3:
        permissionsString = bytes(permValues)
    else:
        permissionsString = b"".join([chr(i) for i in permValues])
    if isUnicodeType(OwnerKey):
        OwnerKey = OwnerKey.encode("utf-8")
    if isUnicodeType(permissionsString):
        permissionsString = permissionsString.encode("utf-8")
    if isUnicodeType(FileId1):
        FileId1 = FileId1.encode("utf-8")
    # md5 over padded password + O entry + permissions + file id
    digester = hashlib.md5(password)
    digester.update(OwnerKey)
    digester.update(permissionsString)
    digester.update(FileId1)
    md5output = digester.digest()
    if revision == 2:
        key = md5output[:5]
    elif revision == 3:
        # revision 3 algorithm - re-hash the digest 50 times
        for x in range(50):
            md5output = hashlib.md5(md5output).digest()
        key = md5output[:16]
    if DEBUG:
        print(
            "encryptionkey(%s,%s,%s,%s,%s)==>%s"
            % tuple(map(lambda x: hexText(str(x)), (password, OwnerKey, Permissions, FileId1, revision, key)))
        )
    return key
def encryptionkey(password, OwnerKey, Permissions, FileId1, revision=2):
    '''Compute the document encryption key from the padded user
    password, the /O entry, the permission flags and the first file id
    string (PDF spec Algorithm 3.2).'''
    if isUnicodeType(password):
        password = password.encode('utf-8')
    password = password + PadString
    # truncate to 32 bytes
    password = password[:32]
    # translate permissions to a 4-byte string, low order byte first
    p = Permissions
    permBytes = []
    for _ in range(4):
        permBytes.append(p & 0xff)
        p >>= 8
    if sys.version_info[0] == 3:
        permissionsString = bytes(permBytes)
    else:
        permissionsString = b''.join([chr(i) for i in permBytes])
    if isUnicodeType(OwnerKey):
        OwnerKey = OwnerKey.encode('utf-8')
    if isUnicodeType(permissionsString):
        permissionsString = permissionsString.encode('utf-8')
    if isUnicodeType(FileId1):
        FileId1 = FileId1.encode('utf-8')
    md5 = hashlib.md5(password)  # avoid shadowing the builtin 'hash'
    md5.update(OwnerKey)
    md5.update(permissionsString)
    md5.update(FileId1)
    md5output = md5.digest()
    if revision == 2:
        key = md5output[:5]
    elif revision == 3:
        # revision 3 algorithm - loop 50 times
        for x in range(50):
            md5output = hashlib.md5(md5output).digest()
        key = md5output[:16]
    if DEBUG:
        print('encryptionkey(%s,%s,%s,%s,%s)==>%s' % tuple(
            map(lambda x: hexText(str(x)),
                (password, OwnerKey, Permissions, FileId1, revision, key))))
    return key
def computeU(encryptionkey, encodestring=PadString, revision=2, documentId=None):
    """Compute the /U (user password check) entry of the encryption
    dictionary (PDF spec Algorithms 3.4 / 3.5)."""
    from reportlab.lib.arciv import ArcIV
    if revision == 2:
        # revision 2: a single RC4 pass over the padding string
        result = ArcIV(encryptionkey).encode(encodestring)
    elif revision == 3:
        assert documentId is not None, "Revision 3 algorithm needs the document ID!"
        docIdBytes = documentId.encode("utf-8") if isUnicodeType(documentId) else documentId
        h = hashlib.md5(PadString)
        h.update(docIdBytes)
        tmp = ArcIV(encryptionkey).encode(h.digest())
        # 19 further RC4 passes, each with the key XOR-ed with the pass number
        for n in range(1, 20):
            tmp = ArcIV(xorKey(n, encryptionkey)).encode(tmp)
        # pad the result out to 32 bytes with NULs
        while len(tmp) < 32:
            tmp = tmp + b"\000"
        result = tmp
    if DEBUG:
        print(
            "computeU(%s,%s,%s,%s)==>%s"
            % tuple(map(lambda x: hexText(str(x)), (encryptionkey, encodestring, revision, documentId, result)))
        )
    return result
def _py_stringWidth(self, text, size, encoding='utf-8'):
    """Return the width of *text* in points at the given font *size*."""
    if not isUnicodeType(text):
        # a falsy encoding argument still falls back to utf-8
        text = text.decode(encoding or 'utf-8')
    charWidths = self.face.charWidths
    default = self.face.defaultWidth
    # sum the per-glyph widths (font units), substituting the face
    # default for unknown code points, then scale to points
    total = 0
    for ch in text:
        total += charWidths.get(ord(ch), default)
    return 0.001 * size * total
def _py_unicode2T1(utext, fonts):
    '''Return a list of (font, byte-string) pairs covering *utext*.

    Text is encoded with the first font in *fonts*; spans that font
    cannot encode are retried against the remaining (substitution)
    fonts, and rendered with the notdef glyph when none can.
    '''
    result = []
    font = fonts[0]
    substitutes = fonts[1:]
    enc = font.encName
    if 'UCS-2' in enc:
        enc = 'UTF16'
    while utext:
        try:
            encoded = utext.encode(enc) if isUnicodeType(utext) else utext
            result.append((font, encoded))
            break
        except UnicodeEncodeError as e:
            # e.args[2:4] delimit the first unencodable span
            i0, il = e.args[2:4]
            if i0:
                # the leading run encodes fine with this font
                result.append((font, utext[:i0].encode(enc)))
            if substitutes:
                result.extend(_py_unicode2T1(utext[i0:il], substitutes))
            else:
                result.append((_notdefFont, _notdefChar * (il - i0)))
            utext = utext[il:]
    return result
def stringWidth(self, text, size, encoding=None):
    """Measure width per unicode character, never per byte."""
    if not isUnicodeType(text):
        text = text.decode('utf8')
    get = self.unicodeWidths.get
    # unknown characters are assumed 1000 font units wide
    return size * 0.001 * sum(get(uch, 1000) for uch in text)
def _py_unicode2T1(utext, fonts):
    '''Split *utext* into a list of (font, string) pairs, falling back
    through the substitution *fonts* and finally the notdef glyph for
    characters the fonts cannot encode.'''
    R = []
    font, rest = fonts[0], fonts[1:]
    enc = font.encName
    if 'UCS-2' in enc:
        enc = 'UTF16'
    while utext:
        try:
            if isUnicodeType(utext):
                s = utext.encode(enc)
            else:
                s = utext
            R.append((font, s))
            break
        except UnicodeEncodeError as e:
            # start/end of the first span this encoding cannot handle
            i0, il = e.args[2:4]
            if i0:
                R.append((font, utext[:i0].encode(enc)))
            if rest:
                R.extend(_py_unicode2T1(utext[i0:il], rest))
            else:
                R.append((_notdefFont, _notdefChar * (il - i0)))
            utext = utext[il:]
    return R
def computeU(encryptionkey, encodestring=PadString, revision=2, documentId=None):
    '''Derive the user-password /U value for the encryption dictionary
    (PDF spec Algorithms 3.4 and 3.5).'''
    from reportlab.lib.arciv import ArcIV
    if revision == 2:
        result = ArcIV(encryptionkey).encode(encodestring)
    elif revision == 3:
        assert documentId is not None, "Revision 3 algorithm needs the document ID!"
        h = hashlib.md5(PadString)
        if isUnicodeType(documentId):
            h.update(documentId.encode('utf-8'))
        else:
            h.update(documentId)
        tmp = h.digest()
        tmp = ArcIV(encryptionkey).encode(tmp)
        # nineteen more RC4 passes with XOR-modified keys
        for n in range(1, 20):
            thisKey = xorKey(n, encryptionkey)
            tmp = ArcIV(thisKey).encode(tmp)
        # NUL-pad to the fixed 32-byte length
        while len(tmp) < 32:
            tmp += b'\000'
        result = tmp
    if DEBUG:
        print('computeU(%s,%s,%s,%s)==>%s' % tuple(
            map(lambda x: hexText(str(x)),
                (encryptionkey, encodestring, revision, documentId, result))))
    return result
def _py_stringWidth(self, text, size, encoding='utf-8'):
    "Calculate the rendered width of *text* at *size* points"
    if not isUnicodeType(text):
        text = text.decode(encoding or 'utf-8')  # encoding defaults to utf-8
    lookup = self.face.charWidths.get
    dw = self.face.defaultWidth
    # per-character widths in font units, scaled by size/1000
    return 0.001 * size * sum([lookup(ord(u), dw) for u in text])
def formatForPdf(self, text):
    """Encode *text* as big-endian UTF-16 without a BOM and escape the
    result for embedding in a PDF string."""
    from codecs import utf_16_be_encode
    if not isUnicodeType(text):
        text = text.decode('utf8')
    # utf_16_be_encode returns (bytes, length-consumed)
    utf16, _consumed = utf_16_be_encode(text)
    return _escape(utf16)
def _AsciiHexEncode(input):
    """Encode *input* using PDF ASCII-Hex coding.

    A verbose encoding used for binary data within a PDF file: each
    binary byte becomes two ASCII hex digits, terminated by '>'.
    Helper function used by images.
    """
    if isUnicodeType(input):
        input = input.encode('utf-8')
    # b2a_hex produces the full hex body in one call; append the
    # ASCII-Hex end-of-data marker
    return binascii.b2a_hex(input) + b'>'
def handle_cdata(self, data):
    """Append CDATA content to the currently open element.

    Mirrors handle_data: exactly one _cur* holder is active at a time
    and the elif order is significant.
    NOTE(review): unlike handle_data there is no _curTitle branch and no
    up-front decode of *data* — confirm titles never receive CDATA and
    that callers always pass text here.
    """
    #just append to current paragraph text, so we can quote XML
    if self._curPara:
        self._curPara.rawtext = self._curPara.rawtext + data
    elif self._curPrefmt:
        self._curPrefmt.rawtext = self._curPrefmt.rawtext + data
    elif self._curPyCode:
        self._curPyCode.rawtext = self._curPyCode.rawtext + data
    elif self._curString:
        # bytes arriving under Python 3 are normalised to text first
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curString.text = self._curString.text + data
    elif self._curTable:
        self._curTable.rawBlocks.append(data)
    elif self._curAuthor != None:
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curAuthor = self._curAuthor + data
    elif self._curSubject != None:
        if sys.version_info[0] == 3 and not isUnicodeType(data):
            data = data.decode('utf-8')
        self._curSubject = self._curSubject + data
def _py_stringWidth(self, text, size, encoding='utf8'):
    """Pure-Python ("purist") width calculation for Type-1 fonts.

    The practical approach is to use the stringWidth function, which may
    be swapped in for one written in C.
    """
    if not isUnicodeType(text):
        text = text.decode(encoding)
    total = 0
    is_py3 = sys.version_info[0] == 3
    # unicode2T1 maps the text onto this font plus its substitutions
    for f, t in unicode2T1(text, [self] + self.substitutionFonts):
        if is_py3:
            # t is bytes: iterating yields ints directly
            total += sum([f.widths[c] for c in t])
        else:
            # t is a str: iterating yields 1-char strings
            total += sum([f.widths[ord(c)] for c in t])
    return total * 0.001 * size
def _py_stringWidth(self, text, size, encoding='utf8'):
    """This is the "purist" approach to width: sum the glyph widths over
    every (font, chunk) pair produced by unicode2T1.  The practical
    approach is to use the stringWidth function, which may be swapped in
    for one written in C."""
    if not isUnicodeType(text):
        text = text.decode(encoding)
    w = 0
    for f, t in unicode2T1(text, [self] + self.substitutionFonts):
        # Python 3 yields ints from bytes; Python 2 yields characters
        w += sum([f.widths[c] for c in t]) if sys.version_info[0] == 3 \
             else sum([f.widths[ord(c)] for c in t])
    return w * 0.001 * size
def splitString(self, text, doc, encoding='utf-8'):
    """Splits text into a number of chunks, each of which belongs to a
    single subset.  Returns a list of tuples (subset, string).  Use
    subset numbers with getSubsetInternalName.  Doc is needed for
    distinguishing subsets when building different documents at the same
    time.

    Subset-local codes are allocated sequentially per document in
    state.nextCode; the allocation below is order-sensitive.
    """
    asciiReadable = self._asciiReadable
    try:
        state = self.state[doc]
    except KeyError:
        # first use of this font with this document: start fresh state
        state = self.state[doc] = TTFont.State(asciiReadable)
    curSet = -1
    cur = []
    results = []
    if not isUnicodeType(text):
        text = text.decode(encoding or 'utf-8')  # encoding defaults to utf-8
    assignments = state.assignments
    subsets = state.subsets
    for code in map(ord, text):
        if code in assignments:
            n = assignments[code]
        else:
            # allocate a new subset-local code for this character
            if state.frozen:
                raise pdfdoc.PDFError(
                    "Font %s is already frozen, cannot add new character U+%04X" % (self.fontName, code))
            n = state.nextCode
            if n & 0xFF == 32:
                # make code 32 always be a space character
                if n != 32:
                    subsets[n >> 8].append(32)
                state.nextCode += 1
                n = state.nextCode
            state.nextCode += 1
            assignments[code] = n
            if n > 32:
                if not (n & 0xFF):
                    # crossed into a new 256-glyph subset
                    subsets.append([])
                subsets[n >> 8].append(code)
            else:
                subsets[0][n] = code
        if (n >> 8) != curSet:
            # subset changed: flush the run accumulated so far
            if cur:
                results.append((curSet, ''.join(map(chr, cur))))
            curSet = (n >> 8)
            cur = []
        cur.append(n & 0xFF)
    if cur:
        results.append((curSet, ''.join(map(chr, cur))))
    return results
def makeStream(self):
    """Assemble the generated TTF font file and return it as a byte
    string (sfnt header, table directory, padded table data)."""
    buf = getBytesIO()
    emit = buf.write
    numTables = len(self.tables)
    # binary-search parameters required by the sfnt header
    searchRange, entrySelector = 1, 0
    while searchRange * 2 <= numTables:
        searchRange *= 2
        entrySelector += 1
    searchRange *= 16
    rangeShift = numTables * 16 - searchRange

    # Header
    emit(pack(">lHHHH", 0x00010000, numTables, searchRange,
              entrySelector, rangeShift))

    # Table directory, tags in sorted order
    tables = sorted(self.tables.items())  # XXX is this the correct order?
    offset = 12 + numTables * 16  # directory ends here; table data follows
    for tag, data in tables:
        if tag == 'head':
            # remember where 'head' lands so its checkSumAdjustment
            # field can be patched below
            head_start = offset
        checksum = calcChecksum(data)
        if isUnicodeType(tag):
            tag = tag.encode('utf-8')
        emit(tag)
        emit(pack(">LLL", checksum, offset, len(data)))
        offset += (len(data) + 3) & ~3  # tables are 4-byte aligned

    # Table data, zero-padded to 4-byte boundaries
    for tag, data in tables:
        padded = data + b"\0\0\0"
        emit(padded[:len(padded) & ~3])

    # patch the whole-font checksum into head.checkSumAdjustment
    checksum = add32(0xB1B0AFBA, -calcChecksum(buf.getvalue()))
    buf.seek(head_start + 8)
    emit(pack('>L', checksum))
    return buf.getvalue()
def makeStream(self):
    "Finishes the generation and returns the TTF file as a string"
    stm = getBytesIO()
    write = stm.write
    numTables = len(self.tables)
    # compute the sfnt header search fields
    searchRange = 1
    entrySelector = 0
    while searchRange * 2 <= numTables:
        searchRange = searchRange * 2
        entrySelector = entrySelector + 1
    searchRange = searchRange * 16
    rangeShift = numTables * 16 - searchRange

    # Header
    write(pack(">lHHHH", 0x00010000, numTables, searchRange, entrySelector, rangeShift))

    # Table directory
    tables = sorted(self.tables.items())  # XXX is this the correct order?
    offset = 12 + numTables * 16
    head_offset = None
    for tag, data in tables:
        if tag == 'head':
            # the 'head' table holds checkSumAdjustment at offset 8,
            # patched at the end of this method
            head_offset = offset
        write(tag.encode('utf-8') if isUnicodeType(tag) else tag)
        write(pack(">LLL", calcChecksum(data), offset, len(data)))
        # advance past the table, rounded up to a 4-byte boundary
        offset = offset + ((len(data) + 3) & ~3)

    # Table data (zero-padded to multiples of four bytes)
    for tag, data in tables:
        data += b"\0\0\0"
        write(data[:len(data) & ~3])

    checksum = calcChecksum(stm.getvalue())
    checksum = add32(0xB1B0AFBA, -checksum)
    stm.seek(head_offset + 8)
    write(pack('>L', checksum))
    return stm.getvalue()
def splitString(self, text, doc, encoding='utf-8'):
    """Splits text into a number of chunks, each of which belongs to a
    single subset.  Returns a list of tuples (subset, string).  Use
    subset numbers with getSubsetInternalName.  Doc is needed for
    distinguishing subsets when building different documents at the same
    time.

    NOTE: per-document allocation state (assignments / subsets /
    nextCode) is mutated in order; the branch order below matters.
    """
    asciiReadable = self._asciiReadable
    try:
        state = self.state[doc]
    except KeyError:
        # no state yet for this document: create it
        state = self.state[doc] = TTFont.State(asciiReadable)
    curSet = -1
    cur = []
    results = []
    if not isUnicodeType(text):
        text = text.decode(encoding or 'utf-8')  # encoding defaults to utf-8
    assignments = state.assignments
    subsets = state.subsets
    for code in map(ord,text):
        if code in assignments:
            n = assignments[code]
        else:
            # unseen character: give it the next free subset code
            if state.frozen:
                raise pdfdoc.PDFError("Font %s is already frozen, cannot add new character U+%04X" % (self.fontName, code))
            n = state.nextCode
            if n&0xFF==32:
                # make code 32 always be a space character
                if n!=32: subsets[n >> 8].append(32)
                state.nextCode += 1
                n = state.nextCode
            state.nextCode += 1
            assignments[code] = n
            if n>32:
                # a zero low byte means a new 256-entry subset starts here
                if not(n&0xFF): subsets.append([])
                subsets[n >> 8].append(code)
            else:
                subsets[0][n] = code
        if (n >> 8) != curSet:
            # moved to a different subset: flush the accumulated run
            if cur:
                results.append((curSet, ''.join(map(chr,cur))))
            curSet = (n >> 8)
            cur = []
        cur.append(n & 0xFF)
    if cur:
        results.append((curSet,''.join(map(chr,cur))))
    return results
def dumbSplit(word, widths, maxWidths):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters. This would do for Chinese.
    It returns a list of (text, extraSpace) items where text is a Unicode string,
    and extraSpace is the points of unused space available on the line. This is a
    structure which is fairly easy to display, and supports 'backtracking'
    approaches after the fact.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0, u'Hell'], [30, u'o']]
    """
    # _more holds deliberately disabled doctests (it is a plain string,
    # not part of the docstring, so doctest never sees it)
    _more = """
#>>> dumbSplit(u'Hello', [10]*5, 4) # less than one character
#(u'', u'Hello')
# this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
>>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
>>> dumbSplit(jtext, [10]*11, 30) #
(u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
"""
    if not isinstance(maxWidths, (list, tuple)):
        maxWidths = [maxWidths]
    assert isUnicodeType(word)
    lines = []
    i = widthUsed = lineStartPos = 0
    maxWidth = maxWidths[0]
    nW = len(word)
    while i < nW:
        w = widths[i]
        c = word[i]
        widthUsed += w
        i += 1
        if widthUsed > maxWidth + _FUZZ and widthUsed > 0:
            # the character at i-1 overflows the current line
            extraSpace = maxWidth - widthUsed
            if ord(c) < 0x3000:
                # we appear to be inside a non-Asian script section.
                # (this is a very crude test but quick to compute).
                # This is likely to be quite rare so the speed of the
                # code below is hopefully not a big issue. The main
                # situation requiring this is that a document title
                # with an english product name in it got cut.

                # we count back and look for
                #  - a space-like character
                #  - reversion to Kanji (which would be a good split point)
                #  - in the worst case, roughly half way back along the line
                limitCheck = (lineStartPos + i) >> 1  # (arbitrary taste issue)
                for j in range(i - 1, limitCheck, -1):
                    cj = word[j]
                    if category(cj) == 'Zs' or ord(cj) >= 0x3000:
                        k = j + 1
                        if k < i:
                            j = k + 1
                            extraSpace += sum(widths[j:i])
                            w = widths[k]
                            c = word[k]
                            i = j
                        break
                # end of English-within-Asian special case

            # we are pushing this character back, but
            # the most important of the Japanese typography rules
            # if this character cannot start a line, wrap it up to this line so it hangs
            # in the right margin. We won't do two or more though - that's unlikely and
            # would result in growing ugliness.
            # and increase the extra space
            # bug fix contributed by Alexander Vasilenko <*****@*****.**>
            if c not in ALL_CANNOT_START and i > lineStartPos + 1:
                # otherwise we need to push the character back
                # the i>lineStart+1 condition ensures progress
                i -= 1
                extraSpace += w

            #lines.append([maxWidth-sum(widths[lineStartPos:i]), word[lineStartPos:i].strip()])
            lines.append([extraSpace, word[lineStartPos:i].strip()])
            try:
                maxWidth = maxWidths[len(lines)]
            except IndexError:
                maxWidth = maxWidths[-1]  # use the last one
            lineStartPos = i
            widthUsed = 0

    # any characters left?
    if widthUsed > 0:
        lines.append([maxWidth - widthUsed, word[lineStartPos:]])
    return lines
def dumbSplit(word, widths, maxWidths):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters. This would do for Chinese.
    It returns a list of (text, extraSpace) items where text is a Unicode string,
    and extraSpace is the points of unused space available on the line. This is a
    structure which is fairly easy to display, and supports 'backtracking'
    approaches after the fact.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0, u'Hell'], [30, u'o']]
    """
    # _more is a plain string of disabled doctests, intentionally kept
    # outside the docstring so doctest ignores it
    _more = """
#>>> dumbSplit(u'Hello', [10]*5, 4) # less than one character
#(u'', u'Hello')
# this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
>>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
>>> dumbSplit(jtext, [10]*11, 30) #
(u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
"""
    if not isinstance(maxWidths,(list,tuple)):
        maxWidths = [maxWidths]
    assert isUnicodeType(word)
    lines = []
    i = widthUsed = lineStartPos = 0
    maxWidth = maxWidths[0]
    nW = len(word)
    while i<nW:
        w = widths[i]
        c = word[i]
        widthUsed += w
        i += 1
        if widthUsed > maxWidth + _FUZZ and widthUsed>0:
            # character c (at index i-1) does not fit on the line
            extraSpace = maxWidth - widthUsed
            if ord(c)<0x3000:
                # we appear to be inside a non-Asian script section.
                # (this is a very crude test but quick to compute).
                # This is likely to be quite rare so the speed of the
                # code below is hopefully not a big issue. The main
                # situation requiring this is that a document title
                # with an english product name in it got cut.

                # we count back and look for
                #  - a space-like character
                #  - reversion to Kanji (which would be a good split point)
                #  - in the worst case, roughly half way back along the line
                limitCheck = (lineStartPos+i)>>1        #(arbitrary taste issue)
                for j in range(i-1,limitCheck,-1):
                    cj = word[j]
                    if category(cj)=='Zs' or ord(cj)>=0x3000:
                        k = j+1
                        if k<i:
                            j = k+1
                            extraSpace += sum(widths[j:i])
                            w = widths[k]
                            c = word[k]
                            i = j
                        break
                #end of English-within-Asian special case

            #we are pushing this character back, but
            #the most important of the Japanese typography rules
            #if this character cannot start a line, wrap it up to this line so it hangs
            #in the right margin. We won't do two or more though - that's unlikely and
            #would result in growing ugliness.
            #and increase the extra space
            #bug fix contributed by Alexander Vasilenko <*****@*****.**>
            if c not in ALL_CANNOT_START and i>lineStartPos+1:
                #otherwise we need to push the character back
                #the i>lineStart+1 condition ensures progress
                i -= 1
                extraSpace += w

            #lines.append([maxWidth-sum(widths[lineStartPos:i]), word[lineStartPos:i].strip()])
            lines.append([extraSpace, word[lineStartPos:i].strip()])
            try:
                maxWidth = maxWidths[len(lines)]
            except IndexError:
                maxWidth = maxWidths[-1]  # use the last one
            lineStartPos = i
            widthUsed = 0

    #any characters left?
    if widthUsed > 0:
        lines.append([maxWidth - widthUsed, word[lineStartPos:]])
    return lines