def handle_endtag(self, tag):
    """Close ``tag`` and flush the collected text as the matching segment.

    Ignorable tags and ``br`` are skipped.  An open ``p``, ``li`` or table
    cell that was never closed explicitly is closed implicitly when a
    different end tag arrives.

    Raises:
        HTMLParseError: if no element is open (including after an implicit
            close emptied the stack), or if ``tag`` is open further down
            the stack but is not the innermost element.
    """
    if self.isIgnorableTag(tag) or tag == "br":
        return
    if not self.tags:
        raise HTMLParseError("End tag without open elements", self.getpos())
    openTag = self.tags.pop()
    if tag != openTag:
        # Elements that may be left unclosed are closed implicitly here.
        implied = True
        if openTag == "p":
            self.appendParagraph()
        elif openTag == "li":
            self.appendListItem()
        elif self.isTableCell(openTag):
            # TODO: appendUnstructuredText
            pass
        else:
            implied = False
        if implied:
            # Popping blindly used to raise IndexError when the implicitly
            # closed element was the only one on the stack.
            if not self.tags:
                raise HTMLParseError("End tag without open elements",
                                     self.getpos())
            openTag = self.tags.pop()
    if tag != openTag and tag in ["p", "td"]:
        # maybe an unclosed p or td was closed after all, but there were
        # forbidden elements within it
        self.appendOther()
        self.tags.append(openTag)
        return
    if tag != openTag:
        if tag not in self.tags:
            # ignore this tag completely and restore previous state
            self.tags.append(openTag)
            return
        raise HTMLParseError("End tag does not match start tag",
                             self.getpos())
    if openTag in ["h1", "h2", "h3", "h4", "h5", "h6", "caption"]:
        self.segments.append((SEGMENT_TYPE_HEADING, self.getData()))
    elif openTag in ["li", "dt", "dd"]:
        self.appendListItem()
    elif openTag == "p":
        self.appendParagraph()
    elif not self.isNonContentTag(tag):
        self.appendOther()
def __init__(self, tagstack, endtag, position=(None, None)):
    """Build the error for a close tag that matches no open tag.

    Args:
        tagstack: sequence of currently open tag names (may be empty).
        endtag: name of the offending close tag; stored as ``self.endtag``.
        position: position tuple forwarded to ``HTMLParseError``.
    """
    self.endtag = endtag
    if tagstack:
        if len(tagstack) == 1:
            msg = "Open tag <%s> does not match close tag </%s>" % (
                tagstack[0], endtag)
        else:
            # str.join replaces the deprecated string.join() function,
            # which no longer exists in Python 3.
            msg = "Open tags <%s> do not match close tag </%s>" % (
                ">, <".join(tagstack), endtag)
    else:
        msg = "No tags are open to match </%s>" % endtag
    HTMLParseError.__init__(self, msg, position)
def __init__(self, tagstack, endtag, position=(None, None)):
    """Create the error raised when a close tag matches no open tag.

    The message lists the open tags in ``tagstack`` (or states that none
    are open); ``endtag`` is kept on the instance as ``self.endtag``.
    """
    self.endtag = endtag
    if not tagstack:
        msg = 'No tags are open to match </%s>' % endtag
    elif len(tagstack) == 1:
        only_open = tagstack[0]
        msg = 'Open tag <%s> does not match close tag </%s>' % (
            only_open, endtag)
    else:
        open_tags = '>, <'.join(tagstack)
        msg = 'Open tags <%s> do not match close tag </%s>' % (
            open_tags, endtag)
    HTMLParseError.__init__(self, msg, position)
def __init__(self, tagstack, endtag, position=(None, None)):
    """Build the error for a close tag that matches no open tag.

    Args:
        tagstack: sequence of currently open tag names (may be empty).
        endtag: name of the offending close tag; stored as ``self.endtag``.
        position: position tuple forwarded to ``HTMLParseError``.
    """
    self.endtag = endtag
    if tagstack:
        if len(tagstack) == 1:
            msg = ('Open tag <%s> does not match close tag </%s>'
                   % (tagstack[0], endtag))
        else:
            # Use the str.join method: the string.join() function is
            # deprecated and was removed in Python 3.
            msg = ('Open tags <%s> do not match close tag </%s>'
                   % ('>, <'.join(tagstack), endtag))
    else:
        msg = 'No tags are open to match </%s>' % endtag
    HTMLParseError.__init__(self, msg, position)
def handle_data(self, data):
    """Reject any non-whitespace text in elements.

    Whitespace-only data is silently accepted; anything else raises
    HTMLParseError quoting the first five characters of the text.
    """
    if data.strip():
        raise HTMLParseError(
            "Text not allowed at this level: %r" % data[:5],
            position=self.getpos())
def handle_starttag(self, tag, attrs):
    """Open ``tag``, implicitly closing elements it terminates.

    A still-open ``p`` (for tags accepted by ``isCloseP``), ``li`` (for a
    new ``li``) or table cell (for a new table cell) is popped and its
    text flushed first.  ``br`` only adds a space to the collected data.

    Raises:
        HTMLParseError: "Nesting error" when ``acceptData()`` returns a
            non-None value and neither list exception applies.
    """
    if self.isIgnorableTag(tag):
        return

    # Implicit closes; the top of the stack is re-read after every pop.
    if self.isCloseP(tag) and self.tags and self.tags[-1] == "p":
        self.tags.pop()
        self.appendParagraph()
    if tag == "li" and self.tags and self.tags[-1] == "li":
        self.tags.pop()
        self.appendListItem()
    if (self.isTableCell(tag) and self.tags
            and self.isTableCell(self.tags[-1])):
        self.tags.pop()
        self.appendOther()

    if tag == "br":
        # A line break never goes on the stack; it is just whitespace.
        self.data = self.data + u" "
        return
    if self.isContentTag(tag):
        self.appendOther()

    li_inside_ul = False
    content_allowed = self.allowContentInList()
    if not content_allowed:
        li_inside_ul = (tag == "li" and bool(self.tags)
                        and self.tags[-1] == "ul")
    if not content_allowed and not li_inside_ul:
        if self.acceptData() is not None:
            raise HTMLParseError("Nesting error", self.getpos())
    self.tags.append(tag)
def handle_endtag(self, tag):
    """Close the currently open tag and clear the per-tag state.

    Raises:
        HTMLParseError: ('Error2') when no tag is open or ``tag`` is not
            the one currently open.
    """
    current = self.__curr_tag
    if current is None or current != tag:
        raise HTMLParseError('Error2', self.getpos())
    self.__curr_bio = None
    self.__curr_tag = None
    self.__curr_attrs = None
def handle_starttag(self, tag, attrs):
    """Record ``tag`` as the single currently open tag.

    Stores the tag and its attributes and sets the BIO marker to 'B'.

    Raises:
        HTMLParseError: ('Error1') when another tag is already open.
    """
    if self.__curr_tag is not None:
        raise HTMLParseError('Error1', self.getpos())
    self.__curr_bio = 'B'
    self.__curr_tag = tag
    self.__curr_attrs = attrs
def _pop_empty(self):
    """Fold an empty element on top of the DOM stack into its parent.

    Does nothing when only the bottom element remains or when the top
    element is not flagged ``is_empty``.

    Raises:
        HTMLParseError: if the parent is not an instance of the popped
            element's ``_consume_in``.
    """
    stack = self._dom_stack
    if len(stack) <= 1 or not stack[-1].is_empty:
        return
    closed = stack.pop()
    parent = stack[-1]
    if not isinstance(parent, closed._consume_in):
        raise HTMLParseError(
            "A <%s> element cannot be consumed in a <%s>"
            % (closed.tag, parent.tag),
            position=closed.pos)
    parent.consume(closed)
def error(self, message):
    """Turn a parser error into HTMLParseError, tolerating cut-off entities.

    Messages starting with "EOF in middle of entity or char ref" are
    ignored; every other message is raised with the current position.
    """
    # TODO: remove this dependency with HTMLParser. Use a lib that allows
    # parsing of malformed HTML. The reason for this check is that parsing
    # fails when a (non-HTML) parameter contains a '&'.
    tolerated = message.startswith("EOF in middle of entity or char ref")
    if not tolerated:
        raise HTMLParseError(message, self.getpos())
def handle_charref(self, name):
    """Append the character for a numeric reference to ``self.data``.

    ``name`` is the reference body: decimal digits, or hexadecimal digits
    prefixed with ``x``/``X``.

    Raises:
        HTMLParseError: when the code point is outside 1..65533.
    """
    is_hex = name.lower().startswith(u"x")
    codepoint = int(name[1:], 16) if is_hex else int(name)
    if not (0 < codepoint <= 65533):
        raise HTMLParseError("Unknown character reference", self.getpos())
    self.data = self.data + unichr(codepoint)
def feed(self, data):
    """Reset the parser, keeping ``path`` and ``quit_on_done``, then parse.

    An HTMLParseError is swallowed only when ``quit_on_done`` is set and
    the error message contains "DONE PROCESSING"; otherwise it is
    re-raised with the same message and the parser's current position.

    Raises:
        HTMLParseError: for any parse error that is not the tolerated
            "DONE PROCESSING" case.
    """
    try:
        # reset() is expected to clear these, so save and restore them.
        quit_on_done_backup = self.quit_on_done
        path_backup = self.path
        self.reset()
        self.path = path_backup
        self.quit_on_done = quit_on_done_backup
        HTMLParser.feed(self, data)
    except HTMLParseError as msg:  # "as" form works on Python 2.6+ and 3
        if not self.quit_on_done or "DONE PROCESSING" not in msg.msg:
            raise HTMLParseError(msg.msg, self.getpos())
def handle_starttag(self, tag, attrs):
    """Capture the JSON payload of the ``parsely-page`` meta tag.

    Only the first ``<meta name="parsely-page">`` is considered: once
    ``self.ppage`` is set, later tags are ignored.  The payload is read
    from the ``content`` attribute, falling back to ``value``.

    Raises:
        HTMLParseError: ("bad ppage") when the payload is not valid JSON.
    """
    if self.ppage is not None or tag != "meta":
        return
    attrs = dict(attrs)
    if attrs.get("name") != "parsely-page":
        return
    ppage = attrs.get("content", attrs.get("value"))
    if not ppage:
        return
    try:
        self.ppage = json.loads(ppage)
    except (ValueError, TypeError):
        # Only decoding failures are translated; a bare ``except`` would
        # also have trapped KeyboardInterrupt and SystemExit.
        raise HTMLParseError("bad ppage")  # bad ppage
def close(self):
    """Finish parsing and fold every still-open element into its parent.

    After the base-class ``close()``, elements left on the DOM stack are
    popped one by one, a warning about the missing close tag is logged,
    and each is consumed by the element below it.

    Raises:
        HTMLParseError: if an element's parent is not an instance of its
            ``_consume_in``.
    """
    super(BaseDPOParser, self).close()
    # consume remaining element
    stack = self._dom_stack
    while len(stack) > 1:
        closed = stack.pop()
        parent = stack[-1]
        if not isinstance(parent, closed._consume_in):
            raise HTMLParseError(
                "A <%s> element cannot be consumed in a <%s>"
                % (closed.tag, parent.tag),
                position=closed.pos)
        where = self.getpos()
        self.logger.warning(
            "Missing </%s> tag at end of stream %d:%d ", closed.tag, *where)
        parent.consume(closed)
def handle_endtag(self, tag):
    """Close ``tag``, folding it and any unclosed descendants into parents.

    Elements are popped from the DOM stack and consumed by the element
    below them until the one whose ``tag`` matches has been consumed.
    A stray ``</html>`` with nothing but the root open is ignored.

    Raises:
        HTMLParseError: for any other close tag at the root, when the
            stack is exhausted before a match, or when a parent is not an
            instance of the popped element's ``_consume_in``.
    """
    stack = self._dom_stack
    if len(stack) < 2:
        if tag != 'html':
            raise HTMLParseError(
                "Invalid closing tag </%s> at root element" % tag,
                position=self.getpos())
        return
    while stack:
        closed = stack.pop()
        if not stack:
            raise HTMLParseError(
                "Syntax error at element <%s>" % closed.tag,
                position=self.getpos())
        parent = stack[-1]
        if not isinstance(parent, closed._consume_in):
            raise HTMLParseError(
                "A <%s> element cannot be consumed in a <%s>"
                % (closed.tag, parent.tag or parent._name),
                position=closed.pos)
        parent.consume(closed)
        if closed.tag == tag:
            return
def translate_value(self, value):
    """Map a rating label to the matching rating constant on ``self``.

    Raises:
        HTMLParseError: when ``value`` is not one of the known labels.
    """
    known = (
        ("VERY GOOD", "VERY_GOOD"),
        ("GOOD", "GOOD"),
        ("FAIR", "FAIR"),
        ("POOR", "POOR"),
        ("VERY POOR", "VERY_POOR"),
        ("HAZARDOUS", "HAZARDOUS"),
    )
    for label, attr_name in known:
        if value == label:
            return getattr(self, attr_name)
    raise HTMLParseError("Unknown rating value: %s" % value)
def handle_data(self, data):
    """Store a text node as a day, a kind, or a dish name in ``self.menu``.

    Tabs, newlines and carriage returns are removed and surrounding
    spaces stripped; empty text is ignored.  The ``day`` / ``kind`` /
    ``name`` flags decide where the text goes:

    * day:  starts a new ``menu[day]`` dict and clears the flag.
    * kind: starts a new ``menu[day][kind]`` list and clears the flag.
    * name: appends the text to ``menu[day][kind]``.

    Raises:
        DuplicatedDayError: when the day already has a non-empty entry.
        HTMLParseError: when a kind or name arrives before the day/kind
            it belongs to was parsed.
    """
    data = re.sub('\t|\n|\r', '', data).strip(' ')
    if not data:
        return

    menu = self.menu
    if self.day:
        self.day_value = data
        if menu.get(data):
            raise DuplicatedDayError
        menu[data] = {}
        self.day = False
        return

    if self.kind:
        self.kind_value = data
        if not self.day_value:
            raise HTMLParseError('No day found for this kind')
        if menu.get(self.day_value) is None:
            raise HTMLParseError('%s didn\'t get parsed' % self.day_value)
        menu[self.day_value][data] = []
        self.kind = False
        return

    if not self.name:
        return
    if not self.day_value:
        raise HTMLParseError('No day found for this name')
    if not self.kind_value:
        raise HTMLParseError('No kind found for this name')
    if menu.get(self.day_value) is None:
        raise HTMLParseError('%s didn\'t get parsed' % self.day_value)
    if menu[self.day_value].get(self.kind_value) is None:
        raise HTMLParseError('%s didn\'t get parsed' % self.kind_value)

    dishes = menu[self.day_value][self.kind_value]
    # The data handler breaks when it hits '&', but we don't want that to
    # happen, so the broken words are concatenated again here.
    if len(dishes) > 1 and dishes[-1] == '&':
        and_symbol = dishes.pop()
        first = dishes.pop()
        data = ' '.join([first, and_symbol, data])
    dishes.append(data)
def feed(self, data):
    """Parse ``data`` and insist that no tag is left open afterwards.

    Raises:
        HTMLParseError: ('Error3') when a tag is still open after the
            base-class ``feed`` has processed ``data``.
    """
    HTMLParser.feed(self, data)
    if self.__curr_tag is None:
        return
    raise HTMLParseError('Error3', self.getpos())
def link_extractor(html):
    """Parse ``html`` with lxml, translating parse failures.

    Raises:
        HTMLParseError: wrapping an ``lxml.etree.ParseError``, carrying
            its message and position.
    """
    # NOTE(review): the visible body only parses the document and returns
    # nothing; presumably link extraction from ``tree`` follows - confirm.
    try:
        tree = lxml.html.document_fromstring(html)
    except lxml.etree.ParseError as e:  # "as" form: Python 2.6+ and 3
        raise HTMLParseError(str(e), e.position)
def __init__(self, tag, position=(None, None)):
    """Create the error for a close tag that should be removed.

    ``tag`` is stored as ``self.tag``; ``position`` is forwarded to
    ``HTMLParseError``.
    """
    self.tag = tag
    HTMLParseError.__init__(
        self, 'Close tag </%s> should be removed' % tag, position)
def handle_entityref(self, name):
    """Resolve a named entity and pass the character to ``handle_data``.

    Raises:
        HTMLParseError: when ``name`` is not in
            ``html_entities.name2codepoint``.
    """
    codepoint = html_entities.name2codepoint.get(name)
    if codepoint is not None:
        self.handle_data(six.unichr(codepoint))
        return
    raise HTMLParseError(
        "Invalid HTML entity ref: &%s;" % name, position=self.getpos())
def handle_pi(self, data):
    """Reject processing instructions.

    Raises:
        HTMLParseError: always, with the parser's current position.
    """
    # TODO: include statement
    # (The former trailing ``raise NotImplementedError`` could never be
    # reached after the unconditional raise below and has been dropped.)
    raise HTMLParseError(
        "Processing instruction not allowed at this level",
        position=self.getpos())
def handle_decl(self, decl):
    """Reject declarations.

    Raises:
        HTMLParseError: always, with the parser's current position.
    """
    message = "Declaration not allowed at this level"
    raise HTMLParseError(message, position=self.getpos())
def __init__(self, msg, position=(None, None)):
    """Initialise the error by delegating to ``HTMLParseError``.

    Args:
        msg: error message.
        position: position tuple forwarded unchanged.
    """
    HTMLParseError.__init__(
        self,
        msg,
        position,
    )
def feed(self, html):
    """Parse ``html`` strictly: leftover unparsed input is an error.

    Raises:
        HTMLParseError: ("unknown error") when ``self.rawdata`` is not
            empty after the base-class ``feed`` returns.
    """
    # BBB: Python 2.7 is more tolerant to broken HTML.
    # For the moment, be strict to behave like Python 2.6.
    HTMLParser.feed(self, html)
    unparsed = self.rawdata
    if unparsed:
        raise HTMLParseError("unknown error", self.getpos())
def __init__(self, tagstack, tag, position=(None, None)):
    """Create the error for a tag not allowed inside the innermost open tag.

    ``tag`` is stored as ``self.tag``; the message names the last entry
    of ``tagstack`` as the enclosing element.
    """
    self.tag = tag
    enclosing = tagstack[-1]
    HTMLParseError.__init__(
        self, 'Tag <%s> is not allowed in <%s>' % (tag, enclosing), position)
def __str__(self):
    """Return the string form produced by ``HTMLParseError.__str__``."""
    return HTMLParseError.__str__(self)
def close(self):
    """Verify that every tag recorded in ``self.results`` was closed.

    Raises:
        HTMLParseError: naming the first tag found in ``self.results``
            that is missing from ``self.closed``.
    """
    closed_tags = self.closed
    for tag in self.results.keys():
        if tag in closed_tags:
            continue
        raise HTMLParseError('Tag %s is not closed' % tag)
def error(self, msg):
    """Raise ``HTMLParseError`` for ``msg`` at the parser's current position."""
    where = self.getpos()
    raise HTMLParseError(msg, where)
def unknown_decl(self, data):
    """Reject unknown declarations.

    Raises:
        HTMLParseError: always, with the parser's current position.
    """
    message = "Declaration not allowed at this level"
    raise HTMLParseError(message, position=self.getpos())
def handle_endtag(self, tag):
    """Pop ``tag`` off the tag stack and publish any completed table data.

    Open tags are discarded from the stack until one equal to ``tag`` is
    removed.  Depending on the tag and table nesting this then:

    * collects a finished cell text of the main table row,
    * publishes one rating per region when a ratings row ends,
    * publishes the worst Sydney rating when the first level-2 table ends,
    * leaves the Sydney forecast table, or
    * publishes the Sydney forecast rating from a forecast cell.

    Raises:
        HTMLParseError: when no open tag matches ``tag``.
    """
    # Unwind until the matching open tag has been removed.
    while self.tag_stack:
        if tag == self.tag_stack.pop():
            break
    else:
        raise HTMLParseError("Unmatched end tag: %s" % tag)

    if tag == "td" and self.tables_instack() == 2 and self.in_main_table_row:
        if self.read_row_text:
            self.read_row_text = False
            self.row_texts.append(" ".join(self.row_text))
            self.row_text = []
    elif (tag == "tr" and self.tables_instack() == 2
            and self.in_main_table_row):
        self.in_main_table_row = False
        if self.this_row_ratings:
            for region, rating_val in zip(self.last_row_texts,
                                          self.row_texts):
                rating = PollutionRating(rating_val)
                in_sydney = "Sydney" in region
                scope = "syd:rt" if in_sydney else "nsw:rt"
                for list_name in ("regions", "region_1"):
                    add_statistic_list_item(
                        "air_pollution", scope, list_name,
                        rating.display(), self.sort_order,
                        label=region, traffic_light_code=rating.tlc())
                if in_sydney and rating > self.sydney_worst:
                    self.sydney_worst = rating
                # NOTE(review): assumes the sort order advances once per
                # region - confirm against the original indentation.
                self.sort_order += 10
    elif (tag == "table" and self.tables_instack() == 1
            and self.level_2_table_number == 1):
        for list_name in ("regions", "region_1"):
            add_statistic_list_item(
                "air_pollution", "nsw:rt", list_name,
                self.sydney_worst.display(), 10,
                label="Sydney",
                traffic_light_code=self.sydney_worst.tlc())
    elif (tag == "table" and self.tables_instack() == 1
            and self.level_2_table_number == 2):
        self.in_syd_forecast_table = False
    elif tag == "td" and self.in_syd_forecast_table:
        if self.row_text:
            val = " ".join(self.row_text)
            try:
                rating = PollutionRating(val)
                for scope in ("nsw:rt", "syd:rt"):
                    set_statistic_data(
                        "air_pollution", scope, "sydney_forecast",
                        rating.display(),
                        traffic_light_code=rating.tlc())
            except HTMLParseError:
                # Cells that fail to build a rating are skipped.
                pass
            self.row_text = []
def getFeedLink(self):
    """Return the feed link of the most preferred available type.

    Raises:
        HTMLParseError: ('Feed not found') when ``self.feed_links`` has no
            entry for any of ``self.ACCEPT_TYPES``.
    """
    # Links are prioritised according to the order of ACCEPT_TYPES.
    links = self.feed_links
    for accepted in self.ACCEPT_TYPES:
        if accepted in links:
            return links[accepted]
    raise HTMLParseError('Feed not found')