Example #1
0
	def handle_endtag(self, tag):
		"""Handle a closing tag by reconciling it with the stack of open tags.

		Ignorable tags and "br" are skipped. Otherwise the innermost open tag
		is popped from self.tags. If it does not match and is a "p", "li" or
		table cell, that element is treated as implicitly closed and the next
		open tag is popped. Once the tags match, the collected data is stored
		according to the kind of element that was closed.

		Raises:
			HTMLParseError: if no element is open, or if the end tag does not
				match the innermost open tag but is still present further
				down the stack.
		"""
		if self.isIgnorableTag(tag) or tag == "br":
			return
		if not self.tags:
			raise HTMLParseError("End tag without open elements", self.getpos())
		openTag = self.tags.pop()
		if tag != openTag:
			# The innermost element may have been left unclosed: close it
			# implicitly and continue with the element below it.
			# NOTE(review): the second pop() is not guarded; it looks like it
			# would raise IndexError if the stack is empty here -- confirm.
			if openTag == "p":
				self.appendParagraph()
				openTag = self.tags.pop()
			elif openTag == "li":
				self.appendListItem()
				openTag = self.tags.pop()
			elif self.isTableCell(openTag):
				# TODO: appendUnstructuredText
				openTag = self.tags.pop()
		if tag != openTag and tag in ["p", "td"]:
			# maybe an unclosed p or td was closed after all, but there were forbidden elements within it
			self.appendOther()
			self.tags.append(openTag)
			return
		if tag != openTag:
			if tag not in self.tags:
				# ignore this tag completely and restore previous state
				self.tags.append(openTag)
				return
			# The tag is open further down the stack, so the nesting is broken.
			raise HTMLParseError("End tag does not match start tag", self.getpos())
		# Tags match: store the collected data according to the element kind.
		if openTag in ["h1", "h2", "h3", "h4", "h5", "h6", "caption"]:
			self.segments.append((SEGMENT_TYPE_HEADING, self.getData()))
		elif openTag in ["li", "dt", "dd"]:
			self.appendListItem()
		elif openTag == "p":
			self.appendParagraph()
		elif not self.isNonContentTag(tag):
			self.appendOther()
Example #2
0
 def __init__(self, tagstack, endtag, position=(None, None)):
     """Build the error for a close tag that does not match the open tags.

     Args:
         tagstack: sequence of currently open tag names (may be empty).
         endtag: name of the offending close tag; stored as ``self.endtag``.
         position: position value forwarded to ``HTMLParseError.__init__``.
     """
     self.endtag = endtag
     if not tagstack:
         msg = "No tags are open to match </%s>" % endtag
     elif len(tagstack) == 1:
         msg = "Open tag <%s> does not match close tag </%s>" % (tagstack[0], endtag)
     else:
         # str.join works on Python 2 and 3; string.join() was removed in
         # Python 3.
         msg = "Open tags <%s> do not match close tag </%s>" % (">, <".join(tagstack), endtag)
     HTMLParseError.__init__(self, msg, position)
Example #3
0
 def __init__(self, tagstack, endtag, position=(None, None)):
     """Create the error raised when a close tag matches no open tag.

     The message depends on whether zero, one or several tags are open.
     ``endtag`` is stored on the instance; ``position`` is forwarded to
     ``HTMLParseError.__init__``.
     """
     self.endtag = endtag
     if not tagstack:
         msg = 'No tags are open to match </%s>' % endtag
     elif len(tagstack) == 1:
         only_open = tagstack[0]
         msg = 'Open tag <%s> does not match close tag </%s>' % (
             only_open, endtag)
     else:
         open_list = '>, <'.join(tagstack)
         msg = 'Open tags <%s> do not match close tag </%s>' % (
             open_list, endtag)
     HTMLParseError.__init__(self, msg, position)
Example #4
0
 def __init__(self, tagstack, endtag, position=(None, None)):
     """Build the error for a close tag that does not match the open tags.

     Args:
         tagstack: sequence of currently open tag names (may be empty).
         endtag: name of the offending close tag; stored as ``self.endtag``.
         position: position value forwarded to ``HTMLParseError.__init__``.
     """
     self.endtag = endtag
     if not tagstack:
         msg = 'No tags are open to match </%s>' % endtag
     elif len(tagstack) == 1:
         msg = ('Open tag <%s> does not match close tag </%s>' %
                (tagstack[0], endtag))
     else:
         # str.join works on Python 2 and 3; string.join() was removed in
         # Python 3.
         msg = ('Open tags <%s> do not match close tag </%s>' %
                ('>, <'.join(tagstack), endtag))
     HTMLParseError.__init__(self, msg, position)
Example #5
0
 def handle_data(self, data):
     """Refuse character data unless it consists of whitespace only.

     Raises:
         HTMLParseError: when *data* contains a non-whitespace character;
             the message shows the first five characters of *data*.
     """
     if data.strip():
         snippet = data[:5]
         raise HTMLParseError("Text not allowed at this level: %r" % snippet,
                              position=self.getpos())
Example #6
0
 def handle_starttag(self, tag, attrs):
     """Handle an opening tag and push it onto the stack of open tags.

     A still-open "p", "li" or table cell on top of the stack is closed
     implicitly when the new tag requires it. "br" only adds a space to
     the collected data and is not pushed.

     Raises:
         HTMLParseError: for a content tag in a place where it is not
             accepted ("Nesting error").
     """
     if self.isIgnorableTag(tag):
         return

     # Implicitly close elements whose end tag was omitted.
     if self.isCloseP(tag) and self.tags and self.tags[-1] == "p":
         self.tags.pop()
         self.appendParagraph()
     if tag == "li" and self.tags and self.tags[-1] == "li":
         self.tags.pop()
         self.appendListItem()
     if (self.isTableCell(tag) and self.tags
             and self.isTableCell(self.tags[-1])):
         self.tags.pop()
         self.appendOther()

     if tag == "br":
         self.data = self.data + u" "
         return

     if self.isContentTag(tag):
         self.appendOther()
         if not self.allowContentInList():
             li_inside_ul = (tag == "li" and bool(self.tags)
                             and self.tags[-1] == "ul")
             if not li_inside_ul and self.acceptData() is not None:
                 raise HTMLParseError("Nesting error", self.getpos())
     self.tags.append(tag)
Example #7
0
 def handle_endtag(self, tag):
     """Close the currently tracked tag and clear the tracking state.

     Raises:
         HTMLParseError: ('Error2') when no tag is being tracked or the
             close tag differs from the tracked one.
     """
     tracked = self.__curr_tag
     if tracked is None or tracked != tag:
         raise HTMLParseError('Error2', self.getpos())
     self.__curr_bio = None
     self.__curr_tag = None
     self.__curr_attrs = None
Example #8
0
 def handle_starttag(self, tag, attrs):
     """Start tracking *tag* together with its attributes.

     Raises:
         HTMLParseError: ('Error1') when another tag is already being
             tracked.
     """
     if self.__curr_tag is not None:
         raise HTMLParseError('Error1', self.getpos())
     self.__curr_bio = 'B'
     self.__curr_tag = tag
     self.__curr_attrs = attrs
Example #9
0
 def _pop_empty(self):
     """Pop the top element if it is empty and hand it to its parent.

     Nothing happens when only one element is on the stack or the top
     element is not empty.

     Raises:
         HTMLParseError: when the new top of the stack is not an instance
             of the popped element's ``_consume_in``.
     """
     stack = self._dom_stack
     if len(stack) <= 1 or not stack[-1].is_empty:
         return
     closed = stack.pop()
     parent = stack[-1]
     if not isinstance(parent, closed._consume_in):
         message = "A <%s> element cannot be consumed in a <%s>" % (
             closed.tag, parent.tag)
         raise HTMLParseError(message, position=closed.pos)
     parent.consume(closed)
Example #10
0
    def error(self, message):
        """Raise HTMLParseError for *message*, except for one tolerated case.

        Messages starting with "EOF in middle of entity or char ref" are
        silently ignored.
        """
        # TODO: remove this dependency on HTMLParser. Use a lib that allows
        # parsing of malformed HTML. The check below exists because parsing
        # fails when a (non HTML) parameter contains a '&'.
        if not message.startswith("EOF in middle of entity or char ref"):
            raise HTMLParseError(message, self.getpos())
Example #11
0
	def handle_charref(self, name):
		"""Append the character for a numeric character reference to self.data.

		*name* is read as hexadecimal when it starts with "x" or "X",
		otherwise as decimal.

		Raises:
			HTMLParseError: when the code point is outside 1..65533.
		"""
		isHex = name.lower().startswith(u"x")
		nameInt = int(name[1:], 16) if isHex else int(name)
		if not (0 < nameInt <= 65533):
			raise HTMLParseError("Unknown character reference", self.getpos())
		self.data = self.data + unichr(nameInt)
Example #12
0
 def feed(self, data):
     """Reset the parser, then feed *data* to HTMLParser.

     ``path`` and ``quit_on_done`` are saved before ``reset()`` and
     restored afterwards.

     Raises:
         HTMLParseError: re-raised with the current position, unless
             ``quit_on_done`` is set and the message contains
             "DONE PROCESSING" (in that case the error is swallowed).
     """
     # Save the settings that are restored after reset().
     quit_on_done_backup = self.quit_on_done
     path_backup = self.path
     try:
         self.reset()
         self.path = path_backup
         self.quit_on_done = quit_on_done_backup
         HTMLParser.feed(self, data)
     except HTMLParseError as msg:
         # "except X as e" is valid on Python 2.6+ and Python 3; the old
         # "except X, e" form is a syntax error on Python 3.
         if not self.quit_on_done or "DONE PROCESSING" not in msg.msg:
             raise HTMLParseError(msg.msg, self.getpos())
Example #13
0
    def handle_starttag(self, tag, attrs):
        """Extract the JSON payload of the "parsely-page" meta tag.

        Only the first ``<meta name="parsely-page">`` is used: once
        ``self.ppage`` is set, later tags are ignored. The payload is taken
        from the ``content`` attribute, falling back to ``value``.

        Raises:
            HTMLParseError: when the payload is not valid JSON.
        """
        if self.ppage is not None or tag != "meta":
            return

        attrs = dict(attrs)
        if attrs.get("name") != "parsely-page":
            return

        ppage = attrs.get("content", attrs.get("value"))
        if not ppage:
            return

        try:
            self.ppage = json.loads(ppage)
        except (TypeError, ValueError):
            # json.loads signals bad input with ValueError (JSONDecodeError
            # is a subclass) or TypeError. A bare "except:" would also
            # swallow KeyboardInterrupt and SystemExit.
            raise HTMLParseError("bad ppage")  # bad ppage
Example #14
0
 def close(self):
     """Close the parser and fold elements still open into their parents.

     Each element left on the stack (except the bottom one) is popped,
     a warning about its missing end tag is logged, and it is consumed by
     the element below it.

     Raises:
         HTMLParseError: when an element below is not an instance of the
             popped element's ``_consume_in``.
     """
     super(BaseDPOParser, self).close()
     # consume remaining element
     while len(self._dom_stack) > 1:
         closed = self._dom_stack.pop()
         parent = self._dom_stack[-1]
         if not isinstance(parent, closed._consume_in):
             message = "A <%s> element cannot be consumed in a <%s>" % (
                 closed.tag, parent.tag)
             raise HTMLParseError(message, position=closed.pos)
         self.logger.warning(
             "Missing </%s> tag at end of stream %d:%d ",
             closed.tag, *self.getpos())
         parent.consume(closed)
Example #15
0
 def handle_endtag(self, tag):
     """Close elements on the stack up to and including the one named *tag*.

     Every popped element is consumed by the element below it. A closing
     ``</html>`` with fewer than two elements on the stack is ignored.

     Raises:
         HTMLParseError: for any other close tag at the root element, when
             the stack runs empty before *tag* is found, or when an
             element cannot be consumed by the one below it.
     """
     if len(self._dom_stack) < 2:
         if tag != 'html':
             raise HTMLParseError(
                 "Invalid closing tag </%s> at root element" % tag,
                 position=self.getpos())
         return

     matched = False
     while not matched and self._dom_stack:
         closed = self._dom_stack.pop()
         if not self._dom_stack:
             raise HTMLParseError(
                 "Syntax error at element <%s>" % closed.tag,
                 position=self.getpos())
         parent = self._dom_stack[-1]
         if not isinstance(parent, closed._consume_in):
             message = "A <%s> element cannot be consumed in a <%s>" % (
                 closed.tag, parent.tag or parent._name)
             raise HTMLParseError(message, position=closed.pos)
         parent.consume(closed)
         matched = closed.tag == tag
Example #16
0
 def translate_value(self, value):
     """Map a textual rating to the matching constant on this object.

     Raises:
         HTMLParseError: when *value* is not one of the known ratings.
     """
     known_ratings = (
         ("VERY GOOD", "VERY_GOOD"),
         ("GOOD", "GOOD"),
         ("FAIR", "FAIR"),
         ("POOR", "POOR"),
         ("VERY POOR", "VERY_POOR"),
         ("HAZARDOUS", "HAZARDOUS"),
     )
     for label, attr_name in known_ratings:
         if value == label:
             return getattr(self, attr_name)
     raise HTMLParseError("Unknown rating value: %s" % value)
Example #17
0
    def handle_data(self, data):
        """Store text as a day, a kind or a dish name in ``self.menu``.

        Tabs and line breaks are removed and surrounding spaces stripped;
        empty text is ignored. The ``day``, ``kind`` and ``name`` flags
        decide how the text is stored:

        * day: becomes a new key of ``self.menu``;
        * kind: becomes a new, empty list under the current day;
        * name: is appended to the list of the current day and kind.

        Raises:
            DuplicatedDayError: when the day already has a non-empty entry.
            HTMLParseError: when a kind or name arrives before the day or
                kind it belongs to was parsed.
        """
        data = re.sub('\t|\n|\r', '', data).strip(' ')
        if not data:
            return

        menu = self.menu

        if self.day:
            self.day_value = data
            if menu.get(self.day_value):
                raise DuplicatedDayError
            menu[self.day_value] = {}
            self.day = False
            return

        if self.kind:
            self.kind_value = data
            if not self.day_value:
                raise HTMLParseError('No day found for this kind')
            if menu.get(self.day_value) is None:
                raise HTMLParseError('%s didn\'t get parsed' % self.day_value)
            menu[self.day_value][self.kind_value] = []
            self.kind = False
            return

        if not self.name:
            return

        if not self.day_value:
            raise HTMLParseError('No day found for this name')
        if not self.kind_value:
            raise HTMLParseError('No kind found for this name')
        if menu.get(self.day_value) is None:
            raise HTMLParseError('%s didn\'t get parsed' % self.day_value)
        if menu[self.day_value].get(self.kind_value) is None:
            raise HTMLParseError('%s didn\'t get parsed' % self.kind_value)

        dishes = menu[self.day_value][self.kind_value]

        # The data handler breaks when it hits '&', but we don't want
        # that to happen, so the broken words are joined back together.
        if len(dishes) > 1 and dishes[-1] == '&':
            and_symbol = dishes.pop()
            first = dishes.pop()
            data = ' '.join([first, and_symbol, data])

        dishes.append(data)
0
 def feed(self, data):
     """Feed *data* to HTMLParser and require that no tag is left tracked.

     Raises:
         HTMLParseError: ('Error3') when a tag is still being tracked
             after the data was processed.
     """
     HTMLParser.feed(self, data)
     still_tracking = self.__curr_tag is not None
     if still_tracking:
         raise HTMLParseError('Error3', self.getpos())
 def link_extractor(html):
     """Parse *html* with lxml, reporting parse failures as HTMLParseError.

     Raises:
         HTMLParseError: wrapping the message and position of the
             underlying ``lxml.etree.ParseError``.
     """
     # NOTE(review): the parsed tree is not used in the visible code; the
     # rest of this function appears to be missing -- confirm.
     try:
         tree = lxml.html.document_fromstring(html)
     except lxml.etree.ParseError as e:
         # "except X as e" is valid on Python 2.6+ and Python 3; the old
         # "except X, e" form is a syntax error on Python 3.
         raise HTMLParseError(str(e), e.position)
Example #20
0
 def __init__(self, tag, position=(None, None)):
     """Create the error for a close tag that should not be present.

     *tag* is stored on the instance; *position* is forwarded to
     ``HTMLParseError.__init__``.
     """
     self.tag = tag
     HTMLParseError.__init__(
         self, 'Close tag </%s> should be removed' % tag, position)
Example #21
0
 def handle_entityref(self, name):
     """Pass the character for a named entity to ``handle_data``.

     Raises:
         HTMLParseError: when *name* is not in
             ``html_entities.name2codepoint``.
     """
     codepoint = html_entities.name2codepoint.get(name)
     if codepoint is not None:
         self.handle_data(six.unichr(codepoint))
         return
     raise HTMLParseError("Invalid HTML entity ref: &%s;" % name,
                          position=self.getpos())
Example #22
0
 def handle_pi(self, data):
     """Reject processing instructions.

     Raises:
         HTMLParseError: always, with the current parser position.
     """
     # TODO: include statement
     # The former trailing "raise NotImplementedError" could never run
     # after the unconditional raise below, so it was removed.
     raise HTMLParseError(
         "Processing instruction not allowed at this level",
         position=self.getpos())
Example #23
0
 def handle_decl(self, decl):
     """Reject declarations: always raises HTMLParseError."""
     message = "Declaration not allowed at this level"
     raise HTMLParseError(message, position=self.getpos())
Example #24
0
	def __init__(self, msg, position=(None, None)):
		"""Initialise the error by delegating to HTMLParseError.__init__."""
		HTMLParseError.__init__(
			self,
			msg,
			position,
		)
Example #25
0
 def feed(self, html):
     """Feed *html* to HTMLParser and fail if unparsed input remains.

     Raises:
         HTMLParseError: ("unknown error") when ``self.rawdata`` is not
             empty after feeding.
     """
     # BBB: Python 2.7 is more tolerant to broken HTML.
     #      For the moment, be strict to behave like Python 2.6.
     HTMLParser.feed(self, html)
     if not self.rawdata:
         return
     raise HTMLParseError("unknown error", self.getpos())
Example #26
0
 def __init__(self, tagstack, tag, position=(None, None)):
     """Create the error for a tag not allowed inside the innermost tag.

     *tag* is stored on the instance. The last entry of *tagstack* is
     named in the message, so *tagstack* must not be empty.
     """
     self.tag = tag
     enclosing = tagstack[-1]
     HTMLParseError.__init__(
         self, 'Tag <%s> is not allowed in <%s>' % (tag, enclosing), position)
Example #27
0
 def __init__(self, tagstack, tag, position=(None, None)):
     """Report that *tag* may not appear inside the innermost open tag.

     Stores *tag* on the instance and forwards the message and *position*
     to ``HTMLParseError.__init__``. *tagstack* must not be empty because
     its last entry is used in the message.
     """
     self.tag = tag
     parent = tagstack[-1]
     msg = 'Tag <%s> is not allowed in <%s>' % (tag, parent)
     HTMLParseError.__init__(self, msg, position)
Example #28
0
 def __init__(self, tag, position=(None, None)):
     """Report a close tag that should be removed.

     Stores *tag* on the instance and forwards the message and *position*
     to ``HTMLParseError.__init__``.
     """
     text = 'Close tag </%s> should be removed' % tag
     self.tag = tag
     HTMLParseError.__init__(self, text, position)
Example #29
0
	def __str__(self):
		"""Return the text produced by HTMLParseError.__str__."""
		# Indentation now uses tabs only: the original mixed spaces and tabs
		# on one line, which Python 3 rejects with TabError.
		return HTMLParseError.__str__(self)
Example #30
0
 def close(self):
     """Check that every tag recorded in ``self.results`` was closed.

     Raises:
         HTMLParseError: for the first tag found in ``self.results`` that
             is missing from ``self.closed``.
     """
     for name in self.results.keys():
         if name in self.closed:
             continue
         raise HTMLParseError('Tag %s is not closed' % name)
Example #31
0
 def error(self, msg):
     """Raise HTMLParseError for *msg* at the current parser position."""
     position = self.getpos()
     raise HTMLParseError(msg, position)
Example #32
0
 def unknown_decl(self, data):
     """Reject unknown declarations: always raises HTMLParseError."""
     message = "Declaration not allowed at this level"
     raise HTMLParseError(message, position=self.getpos())
Example #33
0
 def handle_endtag(self, tag):
     """Handle a closing tag and publish air pollution data when due.

     The tag stack is unwound until an entry equal to *tag* is popped.
     After that, depending on *tag* and the table-tracking state:

     * "td" in a main-table row: the collected cell text is stored in
       ``self.row_texts``;
     * "tr" in a main-table row: when ``self.this_row_ratings`` is set,
       one rating per region is published as a statistic list item;
     * "table" (first level-2 table): the worst Sydney rating is
       published under the label "Sydney";
     * "table" (second level-2 table): the forecast-table flag is cleared;
     * "td" in the forecast table: the Sydney forecast is published.

     Raises:
         HTMLParseError: when no entry on the tag stack matches *tag*.
     """
     # Pop entries until the matching one is found; entries above it are
     # discarded.
     popped_ok = False
     while self.tag_stack:
         if tag == self.tag_stack.pop():
             popped_ok = True
             break
     if not popped_ok:
         raise HTMLParseError("Unmatched end tag: %s" % tag)
     if tag == "td" and self.tables_instack(
     ) == 2 and self.in_main_table_row:
         # End of a cell: keep its text, joined with spaces.
         if self.read_row_text:
             self.read_row_text = False
             self.row_texts.append(" ".join(self.row_text))
             self.row_text = []
     elif tag == "tr" and self.tables_instack(
     ) == 2 and self.in_main_table_row:
         self.in_main_table_row = False
         if self.this_row_ratings:
             # Pair each text of the previous row (used as region label)
             # with the rating text of this row.
             for region, rating_val in zip(self.last_row_texts,
                                           self.row_texts):
                 rating = PollutionRating(rating_val)
                 if "Sydney" in region:
                     # Regions naming Sydney are published under "syd:rt",
                     # once as "regions" and once as "region_1".
                     add_statistic_list_item(
                         "air_pollution",
                         "syd:rt",
                         "regions",
                         rating.display(),
                         self.sort_order,
                         label=region,
                         traffic_light_code=rating.tlc())
                     add_statistic_list_item(
                         "air_pollution",
                         "syd:rt",
                         "region_1",
                         rating.display(),
                         self.sort_order,
                         label=region,
                         traffic_light_code=rating.tlc())
                     # Remember the greatest Sydney rating seen so far.
                     if rating > self.sydney_worst:
                         self.sydney_worst = rating
                 else:
                     # All other regions are published under "nsw:rt".
                     add_statistic_list_item(
                         "air_pollution",
                         "nsw:rt",
                         "regions",
                         rating.display(),
                         self.sort_order,
                         label=region,
                         traffic_light_code=rating.tlc())
                     add_statistic_list_item(
                         "air_pollution",
                         "nsw:rt",
                         "region_1",
                         rating.display(),
                         self.sort_order,
                         label=region,
                         traffic_light_code=rating.tlc())
                 self.sort_order += 10
     elif tag == "table" and self.tables_instack(
     ) == 1 and self.level_2_table_number == 1:
         # First level-2 table finished: publish the worst Sydney rating
         # under "nsw:rt" with the fixed sort order 10.
         add_statistic_list_item("air_pollution",
                                 "nsw:rt",
                                 "regions",
                                 self.sydney_worst.display(),
                                 10,
                                 label="Sydney",
                                 traffic_light_code=self.sydney_worst.tlc())
         add_statistic_list_item("air_pollution",
                                 "nsw:rt",
                                 "region_1",
                                 self.sydney_worst.display(),
                                 10,
                                 label="Sydney",
                                 traffic_light_code=self.sydney_worst.tlc())
     elif tag == "table" and self.tables_instack(
     ) == 1 and self.level_2_table_number == 2:
         self.in_syd_forecast_table = False
     elif tag == "td" and self.in_syd_forecast_table:
         if self.row_text:
             val = " ".join(self.row_text)
             try:
                 rating = PollutionRating(val)
                 set_statistic_data("air_pollution",
                                    "nsw:rt",
                                    "sydney_forecast",
                                    rating.display(),
                                    traffic_light_code=rating.tlc())
                 set_statistic_data("air_pollution",
                                    "syd:rt",
                                    "sydney_forecast",
                                    rating.display(),
                                    traffic_light_code=rating.tlc())
             except HTMLParseError:
                 # Best effort: the cell is skipped on HTMLParseError.
                 # NOTE(review): presumably raised by PollutionRating for
                 # text that is not a rating -- confirm.
                 pass
             self.row_text = []
Example #34
0
	def getFeedLink(self):
		"""Return the feed link of the most preferred available type.

		Raises:
			HTMLParseError: when none of the types in ACCEPT_TYPES has an
				entry in self.feed_links.
		"""
		# links are prioritised by the order of ACCEPT_TYPES
		for accepted in self.ACCEPT_TYPES:
			if accepted not in self.feed_links:
				continue
			return self.feed_links[accepted]
		raise HTMLParseError('Feed not found')