def build_ahc(self):
    if len(self.anchors) > 6:
        self.logger.warn("More than six anchors in file %r. "
                         "Some links may not work properly." % self.item.href)
    data = StringIO()
    data.write(codepoint_to_chr(len(self.anchors)).encode('utf-8'))
    for anchor, offset in self.anchors:
        data.write(codepoint_to_chr(len(anchor)).encode('utf-8'))
        data.write(anchor)
        data.write(pack('<I', offset))
    return data.getvalue()
def dump_hex(self, src, length=16):
    ''' Diagnostic '''
    FILTER = ''.join([(len(repr(codepoint_to_chr(x))) == 3) and codepoint_to_chr(x) or '.'
                      for x in range(256)])
    N = 0
    result = ''
    while src:
        s, src = src[:length], src[length:]
        hexa = ' '.join(["%02X" % ord(x) for x in s])
        s = s.translate(FILTER)
        result += "%04X %-*s %s\n" % (N, length*3, hexa, s)
        N += length
    print(result)
def get_tweaks_docs(self):
    path = self.a(self.j(self.SRC, '..', 'resources', 'default_tweaks.py'))
    with open(path, 'rb') as f:
        raw = f.read().decode('utf-8')
    msgs = []
    lines = list(raw.splitlines())
    for i, line in enumerate(lines):
        if line.startswith('#:'):
            msgs.append((i, line[2:].strip()))
            j = i
            block = []
            while True:
                j += 1
                line = lines[j]
                if not line.startswith('#'):
                    break
                block.append(line[1:].strip())
            if block:
                msgs.append((i+1, '\n'.join(block)))
    ans = []
    for lineno, msg in msgs:
        ans.append('#: %s:%d' % (path, lineno))
        slash = codepoint_to_chr(92)
        msg = msg.replace(slash, slash*2).replace('"', r'\"').replace(
            '\n', r'\n').replace('\r', r'\r').replace('\t', r'\t')
        ans.append('msgid "%s"' % msg)
        ans.append('msgstr ""')
        ans.append('')
    return '\n'.join(ans)
def do_map(m, points):
    base = 0xf000
    limit = len(m) + base
    for p in points:
        if base < p < limit:
            yield m[p - base]
        else:
            yield codepoint_to_chr(p)
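A minimal usage sketch (the glyph table and code points are invented): points falling strictly inside the private-use window starting at 0xf000 are looked up in the table, everything else passes through unchanged.

    from polyglot.builtins import codepoint_to_chr

    glyphs = 'ab'  # lookup table, indexed by p - 0xf000
    # note the strict comparison: p == 0xf000 itself falls through to chr()
    print(''.join(do_map(glyphs, [0xf001, 0x41])))  # -> 'bA'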
def __unicode_process(self, token):
    # change scope in
    if token == r'\{':
        self.__uc_value.append(self.__uc_value[-1])
        # basic error handling
        self.__reini_utf8_counters()
        return token
    # change scope out
    elif token == r'\}':
        self.__uc_value.pop()
        self.__reini_utf8_counters()
        return token
    # add a uc control
    elif token[:3] == '\\uc':
        self.__uc_value[-1] = int(token[3:])
        self.__reini_utf8_counters()
        return token
    # bin data to skip
    elif self.__uc_bin:
        self.__uc_bin = False
        return ''
    # uc char to remove
    elif self.__uc_char:
        # handle \bin tag in case of uc char to skip
        # (raw string needed: '\b' would be a backspace character)
        if token[:4] == r'\bin':
            self.__uc_char -= 1
            self.__uc_bin = True
            return ''
        elif token[:1] == '\\':
            self.__uc_char -= 1
            return ''
        else:
            return self.__remove_uc_chars(0, token)
    # go for real \u token
    match_obj = self.__utf_exp.match(token)
    if match_obj is not None:
        self.__reini_utf8_counters()
        # get value and handle negative case
        uni_char = int(match_obj.group(1))
        uni_len = len(match_obj.group(0))
        if uni_char < 0:
            uni_char += 65536
        uni_char = codepoint_to_chr(uni_char).encode(
            'ascii', 'xmlcharrefreplace').decode('ascii')
        self.__uc_char = self.__uc_value[-1]
        # there is only a unicode char
        if len(token) <= uni_len:
            return uni_char
        # a unicode char and something else;
        # must be after, as the text is split on \
        # necessary? maybe for \bin?
        elif not self.__uc_char:
            return uni_char + token[uni_len:]
        # if not uc0 and chars
        else:
            return uni_char + self.__remove_uc_chars(uni_len, token)
    # default
    return token
def write(self, *values):
    for value in values:
        if isinstance(value, numbers.Integral):
            try:
                value = codepoint_to_chr(value)
            except OverflowError:
                self.logger.warn('unicode_type overflow for integer:', value)
                value = u'?'
        self.buf.write(value.encode('utf-8'))
def pdf_serialize(self, stream):
    raw = self.encode('ascii')
    if len(raw) > 126:
        raise ValueError('Name too long: %r' % self)
    raw = bytearray(raw)
    sharp = ord(b'#')
    buf = (
        codepoint_to_chr(x).encode('ascii') if 33 < x < 126 and x != sharp
        else '#{:x}'.format(x).encode('ascii')
        for x in raw)
    stream.write(b'/' + b''.join(buf))
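A behaviour sketch, assuming this method lives on the unicode_type subclass Name (as in calibre's PDF writer): printable bytes pass through, while '#' and anything outside 33..125 are written as hex escapes per PDF name syntax.

    import io

    stream = io.BytesIO()
    Name('A B#').pdf_serialize(stream)
    print(stream.getvalue())  # -> b'/A#20B#23' (space -> #20, '#' -> #23)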
def entityref(c):
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]
    try:
        cp = name2cp(c)
    except KeyError:
        return "&" + c
    else:
        return codepoint_to_chr(cp)
def charref(name):
    if name[0] in ['x', 'X']:
        c = int(name[1:], 16)
    else:
        c = int(name)
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]
    else:
        return codepoint_to_chr(c)
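entityref() and charref() are the two halves of an HTML entity decoder; a hedged sketch of a typical re.sub driver (the wrapper name and patterns are assumptions, not from this file; UNICODE_SNOB, unifiable and unifiable_n are module globals assumed in scope):

    import re

    def unescape_entities(text):
        text = re.sub(r'&#(x?[0-9a-fA-F]+);', lambda m: charref(m.group(1)), text)
        return re.sub(r'&([a-zA-Z][a-zA-Z0-9]*);', lambda m: entityref(m.group(1)), text)

    # With UNICODE_SNOB set: unescape_entities('&#x2014; &amp; &#65;') -> '\u2014 & A'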
def escape_funcs():
    global escape, unescape
    if escape is None:
        escapem = {('\\' + x): codepoint_to_chr(i + 1) for i, x in enumerate('\\${}')}
        escape_pat = re.compile('|'.join(map(re.escape, escapem)))
        escape = lambda x: escape_pat.sub(lambda m: escapem[m.group()],
                                          x.replace(r'\\', '\x01'))
        unescapem = {v: k[1] for k, v in iteritems(escapem)}
        unescape_pat = re.compile('|'.join(unescapem))
        unescape = lambda x: unescape_pat.sub(lambda m: unescapem[m.group()], x)
    return escape, unescape
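Usage sketch: escape() swaps backslash-escaped metacharacters for control-code placeholders so downstream template parsing can treat $, { and } as markup; unescape() later restores the literal characters (without the backslashes).

    esc, unesc = escape_funcs()
    protected = esc(r'price \$5 in \{USD\}')  # escaped chars become \x01..\x04 placeholders
    # ... run the template machinery on `protected` ...
    print(unesc(protected))  # -> 'price $5 in {USD}'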
def _build_manifest(self):
    states = ['linear', 'nonlinear', 'css', 'images']
    manifest = dict((state, []) for state in states)
    for item in self._oeb.manifest.values():
        if item.spine_position is not None:
            key = 'linear' if item.linear else 'nonlinear'
            manifest[key].append(item)
        elif item.media_type in OEB_STYLES:
            manifest['css'].append(item)
        elif item.media_type in LIT_IMAGES:
            manifest['images'].append(item)
    data = StringIO()
    data.write(pack('<Bc', 1, '\\'))
    offset = 0
    for state in states:
        items = manifest[state]
        items.sort()
        data.write(pack('<I', len(items)))
        for item in items:
            id, media_type = item.id, item.media_type
            if media_type in OEB_DOCS:
                # Needs to have 'html' in media-type
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            href = urlunquote(item.href)
            item.offset = offset \
                if state in ('linear', 'nonlinear') else 0
            data.write(pack('<I', item.offset))
            entry = [codepoint_to_chr(len(id)), unicode_type(id),
                     codepoint_to_chr(len(href)), unicode_type(href),
                     codepoint_to_chr(len(media_type)), unicode_type(media_type)]
            for value in entry:
                data.write(value.encode('utf-8'))
            data.write('\0')
            offset += item.size
    self._add_file('/manifest', data.getvalue())
def fixup(m, rm=rm, rchar=rchar):
    text = m.group(0)
    if text[:2] == "&#":
        # character reference
        try:
            if text[:3] == "&#x":
                return codepoint_to_chr(int(text[3:-1], 16))
            else:
                return codepoint_to_chr(int(text[2:-1]))
        except ValueError:
            pass
    else:
        # named entity
        try:
            text = codepoint_to_chr(name2codepoint[text[1:-1]])
        except KeyError:
            pass
    if rm:
        return rchar  # replace by char
    return text  # leave as is
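fixup() is the match handler of the classic entity-unescaping recipe; a sketch of the enclosing driver (the wrapper name and pattern are assumptions; the import is the Python 3 path):

    import re
    from html.entities import name2codepoint  # provides name2codepoint used by fixup

    def unescape(text, rm=False, rchar=''):
        # delegates to the fixup() defined above
        return re.sub(r'&#?\w+;', lambda m: fixup(m, rm, rchar), text)

    # unescape('&#8212; &mdash; &#x2014;') -> '\u2014 \u2014 \u2014'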
def mkitaiji(self, src, dst):
    dic = {}
    for line in open(src, "rb"):
        line = line.decode('utf-8').strip()
        if line.startswith(';;'):  # skip comment
            continue
        if re.match(r"^$", line):
            continue
        pair = re.sub(r'\\u([0-9a-fA-F]{4})',
                      lambda x: codepoint_to_chr(int(x.group(1), 16)), line)
        dic[pair[0]] = pair[1]
    from calibre.utils.serialize import msgpack_dumps
    with open(dst, 'wb') as f:
        f.write(msgpack_dumps(dic))
def create_sfnt(self, text_item):
    get_table = partial(self.qt_hack.get_sfnt_table, text_item)
    try:
        ans = Font(Sfnt(get_table))
    except UnsupportedFont as e:
        raise UnsupportedFont('The font %s is not a valid sfnt. Error: %s' % (
            text_item.font().family(), e))
    glyph_map = self.qt_hack.get_glyph_map(text_item)
    gm = {}
    ans.ignore_glyphs = set()
    for uc, glyph_id in enumerate(glyph_map):
        if glyph_id not in gm:
            gm[glyph_id] = codepoint_to_chr(uc)
        if uc in (0xad, 0x200b):
            ans.ignore_glyphs.add(glyph_id)
    ans.full_glyph_map = gm
    return ans
def __init__(self, metrics, num, objects, compress):
    self.metrics, self.compress = metrics, compress
    self.is_otf = self.metrics.is_otf
    self.subset_tag = str(
        re.sub('.', lambda m: codepoint_to_chr(int(m.group()) + ord('A')),
               oct(num).replace('o', ''))).rjust(6, 'A')
    self.font_stream = FontStream(metrics.is_otf, compress=compress)
    try:
        psname = metrics.postscript_name
    except Exception:
        psname = uuid4()
    self.font_descriptor = Dictionary({
        'Type': Name('FontDescriptor'),
        'FontName': Name('%s+%s' % (self.subset_tag, psname)),
        'Flags': 0b100,  # Symbolic font
        'FontBBox': Array(metrics.pdf_bbox),
        'ItalicAngle': metrics.post.italic_angle,
        'Ascent': metrics.pdf_ascent,
        'Descent': metrics.pdf_descent,
        'CapHeight': metrics.pdf_capheight,
        'AvgWidth': metrics.pdf_avg_width,
        'StemV': metrics.pdf_stemv,
    })
    self.descendant_font = Dictionary({
        'Type': Name('Font'),
        'Subtype': Name('CIDFontType' + ('0' if metrics.is_otf else '2')),
        'BaseFont': self.font_descriptor['FontName'],
        'FontDescriptor': objects.add(self.font_descriptor),
        'CIDSystemInfo': Dictionary({
            'Registry': String('Adobe'),
            'Ordering': String('Identity'),
            'Supplement': 0,
        }),
    })
    if not self.is_otf:
        self.descendant_font['CIDToGIDMap'] = Name('Identity')
    self.font_dict = Dictionary({
        'Type': Name('Font'),
        'Subtype': Name('Type0'),
        'Encoding': Name('Identity-H'),
        'BaseFont': self.descendant_font['BaseFont'],
        'DescendantFonts': Array([objects.add(self.descendant_font)]),
    })
    self.used_glyphs = set()
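The subset-tag expression is dense; a worked example of the same arithmetic (num invented): each octal digit of the object number is shifted into the letters A..H and left-padded with 'A' to the six characters PDF requires for subset prefixes. codepoint_to_chr is chr on Python 3.

    import re

    num = 11
    tag = re.sub('.', lambda m: chr(int(m.group()) + ord('A')),
                 oct(num).replace('o', '')).rjust(6, 'A')
    print(tag)  # oct(11) -> '0o13' -> '013' -> 'ABD' -> padded to 'AAAABD'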
def read_utf8_char(bytes, pos):
    c = ord(bytes[pos:pos+1])
    mask = 0x80
    if c & mask:
        elsize = 0
        while c & mask:
            mask >>= 1
            elsize += 1
        if (mask <= 1) or (mask == 0x40):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
    else:
        elsize = 1
    if elsize > 1:
        if elsize + pos > len(bytes):
            raise LitError('Invalid UTF8 character: %s' % repr(bytes[pos]))
        c &= (mask - 1)
        for i in range(1, elsize):
            b = ord(bytes[pos+i:pos+i+1])
            if (b & 0xC0) != 0x80:
                raise LitError(
                    'Invalid UTF8 character: %s' % repr(bytes[pos:pos+i]))
            c = (c << 6) | (b & 0x3F)
    return codepoint_to_chr(c), pos + elsize
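A quick check of the decoder on a three-byte sequence (input invented): the lead byte 0xE2 announces three bytes, and the continuation bytes contribute six payload bits each.

    data = u'\u20ac!'.encode('utf-8')    # b'\xe2\x82\xac!'
    ch, pos = read_utf8_char(data, 0)
    assert (ch, pos) == (u'\u20ac', 3)   # one scalar decoded, 3 bytes consumed
    ch, pos = read_utf8_char(data, pos)
    assert (ch, pos) == (u'!', 4)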
def binary_to_text_inner(self, bin, buf, stack):
    (depth, tag_name, current_map, dynamic_tag, errors,
     in_censorship, is_goingdown, state, flags) = stack.pop()
    if state == 'close tag':
        if not tag_name:
            raise LitError('Tag ends before it begins.')
        buf.write(encode(u''.join(('</', tag_name, '>'))))
        dynamic_tag = 0
        tag_name = None
        state = 'text'
    while self.cpos < len(bin):
        c, self.cpos = read_utf8_char(bin, self.cpos)
        oc = ord(c)
        if state == 'text':
            if oc == 0:
                state = 'get flags'
                continue
            elif c == '\v':
                c = '\n'
            elif c == '>':
                c = '&gt;&gt;'
            elif c == '<':
                c = '&lt;&lt;'
            buf.write(encode(c))
        elif state == 'get flags':
            if oc == 0:
                state = 'text'
                continue
            flags = oc
            state = 'get tag'
        elif state == 'get tag':
            state = 'text' if oc == 0 else 'get attr'
            if flags & FLAG_OPENING:
                tag = oc
                buf.write(b'<')
                if not (flags & FLAG_CLOSING):
                    is_goingdown = True
                if tag == 0x8000:
                    state = 'get custom length'
                    continue
                if flags & FLAG_ATOM:
                    if not self.tag_atoms or tag not in self.tag_atoms:
                        raise LitError(
                            "atom tag %d not in atom tag list" % tag)
                    tag_name = self.tag_atoms[tag]
                    current_map = self.attr_atoms
                elif tag < len(self.tag_map):
                    tag_name = self.tag_map[tag]
                    current_map = self.tag_to_attr_map[tag]
                else:
                    dynamic_tag += 1
                    errors += 1
                    tag_name = '?' + codepoint_to_chr(tag) + '?'
                    current_map = self.tag_to_attr_map[tag]
                    print('WARNING: tag %s unknown' % codepoint_to_chr(tag))
                buf.write(encode(tag_name))
            elif flags & FLAG_CLOSING:
                if depth == 0:
                    raise LitError('Extra closing tag %s at %d' % (
                        tag_name, self.cpos))
                break
        elif state == 'get attr':
            in_censorship = False
            if oc == 0:
                state = 'text'
                if not is_goingdown:
                    tag_name = None
                    dynamic_tag = 0
                    buf.write(b' />')
                else:
                    buf.write(b'>')
                    frame = (depth, tag_name, current_map, dynamic_tag,
                             errors, in_censorship, False, 'close tag', flags)
                    stack.append(frame)
                    frame = (depth + 1, None, None, 0, 0,
                             False, False, 'text', 0)
                    stack.append(frame)
                    break
            else:
                if oc == 0x8000:
                    state = 'get attr length'
                    continue
                attr = None
                if current_map and oc in current_map and current_map[oc]:
                    attr = current_map[oc]
                elif oc in self.attr_map:
                    attr = self.attr_map[oc]
                if not attr or not isinstance(attr, string_or_bytes):
                    raise LitError(
                        'Unknown attribute %d in tag %s' % (oc, tag_name))
                if attr.startswith('%'):
                    in_censorship = True
                    state = 'get value length'
                    continue
                buf.write(b' ' + encode(attr) + b'=')
                if attr in ['href', 'src']:
                    state = 'get href length'
                else:
                    state = 'get value length'
        elif state == 'get value length':
            if not in_censorship:
                buf.write(b'"')
            count = oc - 1
            if count == 0:
                if not in_censorship:
                    buf.write(b'"')
                in_censorship = False
                state = 'get attr'
                continue
            state = 'get value'
            if oc == 0xffff:
                continue
            if count < 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
        elif state == 'get value':
            if count == 0xfffe:
                if not in_censorship:
                    buf.write(encode('%s"' % (oc - 1)))
                in_censorship = False
                state = 'get attr'
            elif count > 0:
                if not in_censorship:
                    if c == '"':
                        c = '&quot;'
                    elif c == '<':
                        c = '&lt;'
                    if isinstance(c, unicode_type):
                        c = c.encode('ascii', 'xmlcharrefreplace')
                    buf.write(c)
                count -= 1
            if count == 0:
                if not in_censorship:
                    buf.write(b'"')
                in_censorship = False
                state = 'get attr'
        elif state == 'get custom length':
            count = oc - 1
            if count <= 0 or count > len(bin) - self.cpos:
                raise LitError('Invalid character count %d' % count)
            dynamic_tag += 1
            state = 'get custom'
            tag_name = ''
        elif state == 'get custom':
            tag_name += c
            count -= 1
            if count == 0:
                buf.write(encode(tag_name))
                state = 'get attr'
        elif state == 'get attr length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            buf.write(b' ')
            state = 'get custom attr'
        elif state == 'get custom attr':
            buf.write(encode(c))
            count -= 1
            if count == 0:
                buf.write(b'=')
                state = 'get value length'
        elif state == 'get href length':
            count = oc - 1
            if count <= 0 or count > (len(bin) - self.cpos):
                raise LitError('Invalid character count %d' % count)
            href = ''
            state = 'get href'
        elif state == 'get href':
            href += c
            count -= 1
            if count == 0:
                doc, frag = urldefrag(href[1:])
                path = self.item_path(doc)
                if frag:
                    path = '#'.join((path, frag))
                path = urlnormalize(path)
                buf.write(encode(u'"%s"' % path))
                state = 'get attr'
def uni(match):
    try:
        return codepoint_to_chr(int(match.group(1)))
    except Exception:
        return '?'
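uni() is shaped like a re.sub callback for decimal character references; a hedged sketch of a driver (the pattern is an assumption, not from this file):

    import re

    print(re.sub(r'&#(\d+);', uni, 'caf&#233;'))  # -> 'café'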
import re, socket

from mechanize import URLError

from calibre.ebooks.metadata.book.base import Metadata
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import codepoint_to_chr, unicode_type, range
from polyglot.urllib import parse_qs, quote_plus

URL = \
    "http://ww2.kdl.org/libcat/WhatsNext.asp?AuthorLastName={0}&AuthorFirstName=&SeriesName=&BookTitle={1}&CategoryID=0&cmdSearch=Search&Search=1&grouping="

_ignore_starts = u'\'"' + u''.join(codepoint_to_chr(x)
        for x in list(range(0x2018, 0x201e)) + [0x2032, 0x2033])


def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode_type):
        title = title.encode('utf-8')
    title = quote_plus(title)
    author = authors[0].strip()
def _replace_unicode(match):
    codepoint = int(match.group(1), 16)
    if codepoint > sys.maxunicode:
        codepoint = 0xFFFD
    return codepoint_to_chr(codepoint)
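A sketch of how this replacer might be driven for CSS-style hex escapes (the pattern is an assumption; the function itself needs `sys` imported at module level):

    import re
    import sys

    print(re.sub(r'\\([0-9a-fA-F]{1,6})', _replace_unicode, r'content: "\201C"'))
    # -> content: "\u201c"; code points above sys.maxunicode collapse to U+FFFD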
class Parser(object):

    def __init__(self):
        self.current_token = 0
        self.tokens = None

    OPCODE = 1
    WORD = 2
    QUOTED_WORD = 3
    EOF = 4
    REPLACEMENTS = tuple(('\\' + x, codepoint_to_chr(i + 1)) for i, x in enumerate('\\"()'))

    # Had to translate named constants to numeric values
    lex_scanner = re.Scanner([
        (r'[()]', lambda x, t: (Parser.OPCODE, t)),
        (r'@.+?:[^")\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
        (r'[^"()\s]+', lambda x, t: (Parser.WORD, unicode_type(t))),
        (r'".*?((?<!\\)")', lambda x, t: (Parser.QUOTED_WORD, t[1:-1])),
        (r'\s+', None)
    ], flags=re.DOTALL)

    def token(self, advance=False):
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return res

    def lcase_token(self, advance=False):
        if self.is_eof():
            return None
        res = self.tokens[self.current_token][1]
        if advance:
            self.current_token += 1
        return icu_lower(res)

    def token_type(self):
        if self.is_eof():
            return self.EOF
        return self.tokens[self.current_token][0]

    def is_eof(self):
        return self.current_token >= len(self.tokens)

    def advance(self):
        self.current_token += 1

    def tokenize(self, expr):
        # Strip out escaped backslashes, quotes and parens so that the
        # lex scanner doesn't get confused. We put them back later.
        for k, v in self.REPLACEMENTS:
            expr = expr.replace(k, v)
        tokens = self.lex_scanner.scan(expr)[0]

        def unescape(x):
            for k, v in self.REPLACEMENTS:
                x = x.replace(v, k[1:])
            return x

        return [(tt, unescape(tv) if tt in (self.WORD, self.QUOTED_WORD) else tv)
                for tt, tv in tokens]

    def parse(self, expr, locations):
        self.locations = locations
        self.tokens = self.tokenize(expr)
        self.current_token = 0
        prog = self.or_expression()
        if not self.is_eof():
            raise ParseException(_('Extra characters at end of search'))
        return prog

    def or_expression(self):
        lhs = self.and_expression()
        if self.lcase_token() == 'or':
            self.advance()
            return ['or', lhs, self.or_expression()]
        return lhs

    def and_expression(self):
        lhs = self.not_expression()
        if self.lcase_token() == 'and':
            self.advance()
            return ['and', lhs, self.and_expression()]
        # Account for the optional 'and'
        if ((self.token_type() in [self.WORD, self.QUOTED_WORD] or
                self.token() == '(') and self.lcase_token() != 'or'):
            return ['and', lhs, self.and_expression()]
        return lhs

    def not_expression(self):
        if self.lcase_token() == 'not':
            self.advance()
            return ['not', self.not_expression()]
        return self.location_expression()

    def location_expression(self):
        if self.token_type() == self.OPCODE and self.token() == '(':
            self.advance()
            res = self.or_expression()
            if self.token_type() != self.OPCODE or self.token(advance=True) != ')':
                raise ParseException(_('missing )'))
            return res
        if self.token_type() not in (self.WORD, self.QUOTED_WORD):
            raise ParseException(_('Invalid syntax. Expected a lookup name or a word'))
        return self.base_token()

    def base_token(self):
        if self.token_type() == self.QUOTED_WORD:
            return ['token', 'all', self.token(advance=True)]
        words = self.token(advance=True).split(':')
        # The complexity here comes from having colon-separated search
        # values. That forces us to check that the first "word" in a colon-
        # separated group is a valid location. If not, then the token must
        # be reconstructed. We also have the problem that locations can be
        # followed by quoted strings that appear as the next token, and that
        # tokens can be a sequence of colons.

        # We have a location if there is more than one word and the first
        # word is in locations. This check could produce a "wrong" answer if
        # the search string is something like 'author: "foo"' because it
        # will be interpreted as 'author:"foo"'. I am choosing to accept the
        # possible error. The expression should be written '"author:" foo'
        if len(words) > 1 and words[0].lower() in self.locations:
            loc = words[0].lower()
            words = words[1:]
            if len(words) == 1 and self.token_type() == self.QUOTED_WORD:
                return ['token', loc, self.token(advance=True)]
            return ['token', icu_lower(loc), ':'.join(words)]
        return ['token', 'all', ':'.join(words)]
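A usage sketch (the locations list is invented; icu_lower, _ and ParseException are calibre builtins/imports assumed to be in scope):

    p = Parser()
    print(p.parse('author:tolkien and not title:"the hobbit"', ['author', 'title']))
    # -> ['and', ['token', 'author', 'tolkien'],
    #            ['not', ['token', 'title', 'the hobbit']]]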
def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=[],
                   inhead=False, preserve=False):
    if not isinstance(elem.tag, string_or_bytes):
        # Don't emit any comments or raw entities
        return
    nsrmap = copy.copy(nsrmap)
    attrib = dict(elem.attrib)
    style = self.stylizer.style(elem) if self.stylizer else None
    for key, value in elem.nsmap.items():
        if value not in nsrmap or nsrmap[value] != key:
            xmlns = ('xmlns:' + key) if key else 'xmlns'
            attrib[xmlns] = value
            nsrmap[value] = key
    tag = prefixname(elem.tag, nsrmap)
    tag_offset = self.buf.tell()
    if tag == 'head':
        inhead = True
    flags = FLAG_OPENING
    if not elem.text and len(elem) == 0:
        flags |= FLAG_CLOSING
    if inhead:
        flags |= FLAG_HEAD
    if style and self.is_block(style):
        flags |= FLAG_BLOCK
    self.write(0, flags)
    tattrs = self.tattrs[0]
    if tag in self.tags:
        index = self.tags[tag]
        self.write(index)
        if self.tattrs[index]:
            tattrs = self.tattrs[index]
    else:
        self.write(FLAG_CUSTOM, len(tag) + 1, tag)
    last_break = self.page_breaks[-1][0] if self.page_breaks else None
    if style and last_break != tag_offset \
       and style['page-break-before'] in PAGE_BREAKS:
        self.page_breaks.append((tag_offset, list(parents)))
    for attr, value in attrib.items():
        attr = prefixname(attr, nsrmap)
        if attr in ('href', 'src'):
            value = urlnormalize(value)
            path, frag = urldefrag(value)
            if self.item:
                path = self.item.abshref(path)
            prefix = codepoint_to_chr(3)
            if path in self.manifest.hrefs:
                prefix = codepoint_to_chr(2)
                value = self.manifest.hrefs[path].id
                if frag:
                    value = '#'.join((value, frag))
            value = prefix + value
        elif attr in ('id', 'name'):
            self.anchors.append((value, tag_offset))
        elif attr.startswith('ms--'):
            attr = '%' + attr[4:]
        elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
            value = CSS_MIME
        if attr in tattrs:
            self.write(tattrs[attr])
        else:
            self.write(FLAG_CUSTOM, len(attr) + 1, attr)
        try:
            self.write(ATTR_NUMBER, int(value) + 1)
        except ValueError:
            self.write(len(value) + 1, value)
    self.write(0)
    old_preserve = preserve
    if style:
        preserve = (style['white-space'] in ('pre', 'pre-wrap'))
    xml_space = elem.get(XML('space'))
    if xml_space == 'preserve':
        preserve = True
    elif xml_space == 'normal':
        preserve = False
    if elem.text:
        if preserve:
            self.write(elem.text)
        elif len(elem) == 0 or not elem.text.isspace():
            self.write(COLLAPSE.sub(' ', elem.text))
        # else: de nada
    parents.append(tag_offset)
    child = cstyle = nstyle = None
    for next in chain(elem, [None]):
        if self.stylizer:
            nstyle = None if next is None else self.stylizer.style(next)
        if child is not None:
            if not preserve \
               and (inhead or not nstyle or
                    self.is_block(cstyle) or self.is_block(nstyle)) \
               and child.tail and child.tail.isspace():
                child.tail = None
            self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
        child, cstyle = next, nstyle
    parents.pop()
    preserve = old_preserve
    if not flags & FLAG_CLOSING:
        self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
    if elem.tail and tag != 'html':
        tail = elem.tail
        if not preserve:
            tail = COLLAPSE.sub(' ', tail)
        self.write(tail)
    if style and style['page-break-after'] not in ('avoid', 'auto'):
        self.page_breaks.append((self.buf.tell(), list(parents)))
    ans = None  # invalid tweak value
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)' % ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans


_ignore_starts = '\'"' + ''.join(codepoint_to_chr(x)
        for x in list(range(0x2018, 0x201e)) + [0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match:
        try:
            prep = match.group(1)
def process_phtml(self, d, paragraph_offsets=[]):
    html = u'<p id="p0">'
    offset = 0
    paragraph_open = True
    link_open = False
    need_set_p_id = False
    p_num = 1
    font_specifier_close = ''

    while offset < len(d):
        if not paragraph_open:
            if need_set_p_id:
                html += u'<p id="p%s">' % p_num
                p_num += 1
                need_set_p_id = False
            else:
                html += u'<p>'
            paragraph_open = True

        c = ord(d[offset])
        # PHTML "functions"
        if c == 0x0:
            offset += 1
            c = ord(d[offset])
            # Page link begins
            # 2 Bytes
            # record ID
            if c == 0x0a:
                offset += 1
                id = struct.unpack('>H', d[offset:offset+2])[0]
                if id in self.uid_text_secion_number:
                    html += '<a href="%s.html">' % id
                    link_open = True
                offset += 1
            # Targeted page link begins
            # 3 Bytes
            # record ID, target
            elif c == 0x0b:
                offset += 3
            # Paragraph link begins
            # 4 Bytes
            # record ID, paragraph number
            elif c == 0x0c:
                offset += 1
                id = struct.unpack('>H', d[offset:offset+2])[0]
                offset += 2
                pid = struct.unpack('>H', d[offset:offset+2])[0]
                if id in self.uid_text_secion_number:
                    html += '<a href="%s.html#p%s">' % (id, pid)
                    link_open = True
                offset += 1
            # Targeted paragraph link begins
            # 5 Bytes
            # record ID, paragraph number, target
            elif c == 0x0d:
                offset += 5
            # Link ends
            # 0 Bytes
            elif c == 0x08:
                if link_open:
                    html += '</a>'
                    link_open = False
            # Set font
            # 1 Byte
            # font specifier
            elif c == 0x11:
                offset += 1
                specifier = d[offset]
                html += font_specifier_close
                # Regular text
                if specifier == 0:
                    font_specifier_close = ''
                # h1
                elif specifier == 1:
                    html += '<h1>'
                    font_specifier_close = '</h1>'
                # h2
                elif specifier == 2:
                    html += '<h2>'
                    font_specifier_close = '</h2>'
                # h3
                elif specifier == 3:
                    html += '<h3>'
                    font_specifier_close = '</h3>'
                # h4
                elif specifier == 4:
                    html += '<h4>'
                    font_specifier_close = '</h4>'
                # h5
                elif specifier == 5:
                    html += '<h5>'
                    font_specifier_close = '</h5>'
                # h6
                elif specifier == 6:
                    html += '<h6>'
                    font_specifier_close = '</h6>'
                # Bold
                elif specifier == 7:
                    html += '<b>'
                    font_specifier_close = '</b>'
                # Fixed-width
                elif specifier == 8:
                    html += '<tt>'
                    font_specifier_close = '</tt>'
                # Small
                elif specifier == 9:
                    html += '<small>'
                    font_specifier_close = '</small>'
                # Subscript
                elif specifier == 10:
                    html += '<sub>'
                    font_specifier_close = '</sub>'
                # Superscript
                elif specifier == 11:
                    html += '<sup>'
                    font_specifier_close = '</sup>'
            # Embedded image
            # 2 Bytes
            # image record ID
            elif c == 0x1a:
                offset += 1
                uid = struct.unpack('>H', d[offset:offset+2])[0]
                html += '<img src="images/%s.jpg" />' % uid
                offset += 1
            # Set margin
            # 2 Bytes
            # left margin, right margin
            elif c == 0x22:
                offset += 2
            # Alignment of text
            # 1 Byte
            # alignment
            elif c == 0x29:
                offset += 1
            # Horizontal rule
            # 3 Bytes
            # 8-bit height, 8-bit width (pixels), 8-bit width (%, 1-100)
            elif c == 0x33:
                offset += 3
                if paragraph_open:
                    html += u'</p>'
                    paragraph_open = False
                html += u'<hr />'
            # New line
            # 0 Bytes
            elif c == 0x38:
                if paragraph_open:
                    html += u'</p>\n'
                    paragraph_open = False
            # Italic text begins
            # 0 Bytes
            elif c == 0x40:
                html += u'<i>'
            # Italic text ends
            # 0 Bytes
            elif c == 0x48:
                html += u'</i>'
            # Set text color
            # 3 Bytes
            # 8-bit red, 8-bit green, 8-bit blue
            elif c == 0x53:
                offset += 3
            # Multiple embedded image
            # 4 Bytes
            # alternate image record ID, image record ID
            elif c == 0x5c:
                offset += 3
                uid = struct.unpack('>H', d[offset:offset+2])[0]
                html += '<img src="images/%s.jpg" />' % uid
                offset += 1
            # Underline text begins
            # 0 Bytes
            elif c == 0x60:
                html += u'<u>'
            # Underline text ends
            # 0 Bytes
            elif c == 0x68:
                html += u'</u>'
            # Strike-through text begins
            # 0 Bytes
            elif c == 0x70:
                html += u'<s>'
            # Strike-through text ends
            # 0 Bytes
            elif c == 0x78:
                html += u'</s>'
            # 16-bit Unicode character
            # 3 Bytes
            # alternate text length, 16-bit unicode character
            elif c == 0x83:
                offset += 3
            # 32-bit Unicode character
            # 5 Bytes
            # alternate text length, 32-bit unicode character
            elif c == 0x85:
                offset += 5
            # Begin custom font span
            # 6 Bytes
            # font page record ID, X page position, Y page position
            elif c == 0x8e:
                offset += 6
            # Adjust custom font glyph position
            # 4 Bytes
            # X page position, Y page position
            elif c == 0x8c:
                offset += 4
            # Change font page
            # 2 Bytes
            # font record ID
            elif c == 0x8a:
                offset += 2
            # End custom font span
            # 0 Bytes
            elif c == 0x88:
                pass
            # Begin new table row
            # 0 Bytes
            elif c == 0x90:
                pass
            # Insert table (or table link)
            # 2 Bytes
            # table record ID
            elif c == 0x92:
                offset += 2
            # Table cell data
            # 7 Bytes
            # 8-bit alignment, 16-bit image record ID, 8-bit columns,
            # 8-bit rows, 16-bit text length
            elif c == 0x97:
                offset += 7
            # Exact link modifier
            # 2 Bytes
            # Paragraph Offset (The Exact Link Modifier modifies a Paragraph Link or
            # Targeted Paragraph Link function to specify an exact byte offset within
            # the paragraph. This function must be followed immediately by the
            # function it modifies).
            elif c == 0x9a:
                offset += 2
        elif c == 0xa0:
            html += '&nbsp;'
        else:
            html += codepoint_to_chr(c)
        offset += 1
        if offset in paragraph_offsets:
            need_set_p_id = True
            if paragraph_open:
                html += u'</p>\n'
                paragraph_open = False

    if paragraph_open:
        html += u'</p>'

    return html
def uni(match):
    return codepoint_to_chr(int(match.group(1)))
def unescape_entity(m):
    try:
        return codepoint_to_chr(name2codepoint[m.group(1)])
    except KeyError:
        return m.group(0)  # use as is
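The usual call site pairs this with a named-entity pattern; a hedged sketch (pattern assumed; import shown with the Python 3 path):

    import re
    from html.entities import name2codepoint

    print(re.sub(r'&(\w+);', unescape_entity, '&copy; 2024 &bogus;'))
    # -> '© 2024 &bogus;'  (unknown entities are left untouched)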