def iterlinks(self, name, get_line_numbers=True):
    '''
    Iterate over all links in name.

    If get_line_numbers is True, yields results of the form
    (link, line_number, offset), where line_number is the line at which
    the link occurs and offset is the number of characters from the start
    of that line. Note that offset could actually encompass several lines
    if not zero. If get_line_numbers is False, only the link itself is
    yielded.

    Which links are reported depends on the file:

      * the OPF (name == self.opf_name): every ``href`` attribute
      * media type in OEB_DOCS: whatever the module-level iterlinks()
        reports for the parsed document
      * media type in OEB_STYLES: the URLs found in the stylesheet
      * the media type guessed for ``toc.ncx``: every ``src`` attribute

    Any other file yields nothing.
    '''
    media_type = self.mime_map.get(name, guess_type(name))

    if name == self.opf_name:
        for elem in self.opf_xpath('//*[@href]'):
            href = elem.get('href')
            yield (href, elem.sourceline, 0) if get_line_numbers else href
        return

    # Lower-cased once; deliberately computed only after the OPF branch so
    # that branch never touches media_type.
    mt = media_type.lower()

    if mt in OEB_DOCS:
        # Inside a method body the bare name ``iterlinks`` resolves to the
        # module-level function of that name, not to this method.
        for el, attr, link, pos in iterlinks(self.parsed(name)):
            yield (link, el.sourceline, pos) if get_line_numbers else link
    elif mt in OEB_STYLES:
        if get_line_numbers:
            # Read and decode everything up front so the file is closed
            # before the first yield. Yielding from inside the ``with``
            # would keep the handle open for as long as the consumer holds
            # on to the (possibly never exhausted) generator.
            with self.open(name, 'rb') as f:
                raw = self.decode(f.read())
            # Normalize line endings before positions are computed.
            raw = raw.replace('\r\n', '\n').replace('\r', '\n')
            position = PositionFinder(raw)
            is_in_comment = CommentFinder(raw)
            for link, offset in itercsslinks(raw):
                # Skip links at offsets that CommentFinder reports as
                # being inside a comment.
                if not is_in_comment(offset):
                    lnum, col = position(offset)
                    yield link, lnum, col
        else:
            for link in getUrls(self.parsed(name)):
                yield link
    elif mt == guess_type('toc.ncx'):
        for elem in self.parsed(name).xpath('//*[@src]'):
            src = elem.get('src')
            yield (src, elem.sourceline, 0) if get_line_numbers else src
def _exception_message(err):
    ''' Return err.message when the exception has one, otherwise str(err). '''
    # Exceptions only carry a ``message`` attribute on Python 2; fall back
    # to the string form so that error reporting itself cannot fail.
    msg = getattr(err, 'message', None)
    return str(err) if msg is None else msg


def check_xml_parsing(name, mt, raw):
    '''
    Try to parse raw as strict XML and return a list of the problems found.

    :param name: Name of the file being checked; passed to every error
                 object that is created.
    :param mt: Media type of the file. Selects the parse error class
               (HTMLParseError if mt is in OEB_DOCS, XMLParseError
               otherwise) and enables the root namespace check for
               OEB_DOCS.
    :param raw: The file contents as a bytestring.
    :return: A list of error objects, empty if nothing was found. If
             parsing fails the list ends with the parse/decode error and
             the namespace check is not performed.
    '''
    # Normalize line endings so reported positions are consistent.
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
    eraw = entity_pat.sub(eproc, raw)
    parser = XMLParser(recover=False)
    errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
    errors = []

    if eproc.ok_named_entities:
        errors.append(NamedEntities(name))
    if eproc.bad_entities:
        # Positions are looked up in raw (before entity substitution).
        position = PositionFinder(raw)
        for offset, ent in eproc.bad_entities:
            lnum, col = position(offset)
            errors.append(BadEntity(ent, name, lnum, col))

    try:
        root = fromstring(eraw, parser=parser)
    except UnicodeDecodeError:
        return errors + [DecodeError(name)]
    except XMLSyntaxError as err:
        try:
            line, col = err.position
        except Exception:
            # No usable position information; report the error without it.
            line = col = None
        return errors + [errcls(_exception_message(err), name, line, col)]
    except Exception as err:
        return errors + [errcls(_exception_message(err), name)]

    if mt in OEB_DOCS:
        root_ns = root.nsmap.get(root.prefix, None)
        if root_ns != XHTML_NS:
            errors.append(BadNamespace(name, root_ns))

    return errors