Esempio n. 1
0
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name. If get_line_numbers is True this
     yields results of the form (link, line_number, offset), where
     line_number is the line at which the link occurs and offset is the
     number of characters from the start of the line. Note that offset
     could actually encompass several lines if not zero. If
     get_line_numbers is False only the bare link is yielded.

     Which links are reported depends on what name is: the OPF (every href
     attribute), a document whose media type is in OEB_DOCS, a stylesheet
     whose media type is in OEB_STYLES, or a file with the same media type
     as toc.ncx (every src attribute). Any other file yields nothing. '''
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath('//*[@href]'):
             # Attribute links found via XPath are always reported with an
             # offset of 0
             yield (elem.get('href'), elem.sourceline,
                    0) if get_line_numbers else elem.get('href')
         return

     # Normalise the case once; deliberately done after the OPF branch,
     # which never needs the media type
     media_type = media_type.lower()
     if media_type in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type in OEB_STYLES:
         if get_line_numbers:
             # Read and decode inside the with block so the file is closed
             # before the first yield: a generator that is suspended (or
             # abandoned by its consumer) must not keep the handle open
             with self.open(name, 'rb') as f:
                 raw = self.decode(f.read())
             # Normalise line endings so offsets map onto consistent
             # line/column positions
             raw = raw.replace('\r\n', '\n').replace('\r', '\n')
             position = PositionFinder(raw)
             is_in_comment = CommentFinder(raw)
             for link, offset in itercsslinks(raw):
                 # Links located inside comments are not reported
                 if not is_in_comment(offset):
                     lnum, col = position(offset)
                     yield link, lnum, col
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             yield (elem.get('src'), elem.sourceline,
                    0) if get_line_numbers else elem.get('src')
Esempio n. 2
0
def check_xml_parsing(name, mt, raw):
    '''Strictly parse raw as XML and return a list of the problems found.

    :param name: Name of the file being checked; passed to every error
        object that is created.
    :param mt: Media type of the file. Decides whether parse failures are
        reported as HTMLParseError (mt in OEB_DOCS) or XMLParseError, and
        whether the namespace of the root element is checked.
    :param raw: The file contents as a bytestring.
    :return: A list of error objects; empty when no problem was detected.
        If parsing fails, the entity errors collected so far are returned
        together with a single error describing the failure.
    '''
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    # Get rid of entities as named entities trip up the XML parser
    eproc = EntitityProcessor(mt)
    eraw = entity_pat.sub(eproc, raw)
    parser = XMLParser(recover=False)
    errcls = HTMLParseError if mt in OEB_DOCS else XMLParseError
    errors = []
    if eproc.ok_named_entities:
        errors.append(NamedEntities(name))
    if eproc.bad_entities:
        # Offsets were recorded while substituting over raw, so positions
        # are resolved against raw rather than the processed eraw
        position = PositionFinder(raw)
        for offset, ent in eproc.bad_entities:
            lnum, col = position(offset)
            errors.append(BadEntity(ent, name, lnum, col))

    def describe(err):
        # Exceptions only have a .message attribute on Python 2; fall back
        # to str() so building the report cannot itself raise
        # AttributeError
        msg = getattr(err, 'message', None)
        return str(err) if msg is None else msg

    try:
        root = fromstring(eraw, parser=parser)
    except UnicodeDecodeError:
        return errors + [DecodeError(name)]
    except XMLSyntaxError as err:
        try:
            line, col = err.position
        except Exception:
            # Position information is best-effort; report the error
            # without it rather than failing
            line = col = None
        return errors + [errcls(describe(err), name, line, col)]
    except Exception as err:
        return errors + [errcls(describe(err), name)]

    if mt in OEB_DOCS:
        # Namespace bound to the prefix of the root element, if any
        root_ns = root.nsmap.get(root.prefix, None)
        if root_ns != XHTML_NS:
            errors.append(BadNamespace(name, root_ns))

    return errors