Example #1
0
    def _return_tree_splitted_in_files(self):
        """
        Serialize the tree as several files and return what the persist
        helpers returned for each of them (the serialization paths, according
        to the original notes of this method).

        Steps:
         - Look for the observations node and the slices node and keep them
         - Remove both of them from the original tree
         - Walk the observations, collecting them in a fresh observations node
            - as soon as that node holds _MAX_OBSERVATIONS_ALLOWED
              observations, persist it and start a new empty node
            - when the walk ends, persist the last node if it is not empty
         - If the original slices node has children, persist one more file
           holding the slices. No need to control the number of slices.

        Every persisted observations node holds between 1 and
        _MAX_OBSERVATIONS_ALLOWED observations: no empty file is produced and
        only the last file may be partially filled.

        self._tree is left without its observations and slices; it is not
        restored because there is no reason to do it.

        :return: list with one entry per persisted file, observation files
                 first and the slices file (if any) last
        """
        result = []

        # Getting original obs and slices. list(node) snapshots the children,
        # so moving them to other nodes below cannot disturb the iteration.
        original_obs_node = self._get_observations_node_of_a_tree(self._tree)
        every_obs = list(original_obs_node)
        original_sli_node = self._get_slices_node_of_a_tree(self._tree)

        # Removing obs and slices from the original tree
        self._remove_slices_and_obs_from_the_original_tree()

        # Putting groups of obs in the tree and serializing.
        # Observations are consumed last-to-first, the order this method has
        # always emitted them in.
        temporal_observations_node = Element(ModelToXMLTransformer.OBSERVATIONS)
        for observation in reversed(every_obs):
            temporal_observations_node.append(observation)
            # Flush AFTER appending, so a full node holds exactly the maximum
            if len(temporal_observations_node) >= self._MAX_OBSERVATIONS_ALLOWED:
                result.append(self._persist_tree_with_obs_node(temporal_observations_node))
                temporal_observations_node = Element(ModelToXMLTransformer.OBSERVATIONS)
        if len(temporal_observations_node):  # partially filled last group
            result.append(self._persist_tree_with_obs_node(temporal_observations_node))

        # Managing slices: nothing to add if the original tree hadn't got any
        if len(original_sli_node):
            result.append(self._persist_tree_with_sli_node(original_sli_node))

        return result
Example #2
0
class ParseTreeBuilder(object):
	'''This class supplies an alternative for xml.etree.ElementTree.TreeBuilder
	which cleans up the tree on the fly while building it. The main use
	is to normalize the tree that is produced by the editor widget, but it can
	also be used on other "dirty" interfaces.

	This builder takes care of the following issues:
		- Inline tags ('emphasis', 'strong', 'h', etc.) can not span multiple lines
		- Tags can not contain only whitespace
		- Tags can not be empty (with the exception of the 'img' tag)
		- There should be an empty line before each 'h', 'p' or 'pre'
		  (with the exception of the first tag in the tree)
		- The 'p' and 'pre' elements should always end with a newline ('\\n')
		- Each 'p', 'pre' and 'h' should be postfixed with a newline ('\\n')
		  (as a result 'p' and 'pre' are followed by an empty line, the
		  'h' does not end in a newline itself, so it is different)
		- Newlines ('\\n') after a <li> element are removed (optional)
		- The element '_ignore_' is silently ignored

	Usage follows the TreeBuilder pattern: call start(), data() and end()
	for each tag and finally close() to obtain the root element. The
	root element must have the tag 'zim-tree'.
	'''

	def __init__(self, remove_newlines_after_li=True):
		'''Initialize an empty builder.

		remove_newlines_after_li must evaluate true: the assert below
		rejects any other value, keeping the newlines is still a TODO.
		'''
		assert remove_newlines_after_li, 'TODO'
		self._stack = [] # stack of elements for open tags
		self._last = None # last element opened or closed
		self._data = [] # buffer with data
		self._tail = False # True if we are after an end tag
		# Track line ends on flushed data; starts with "2" so the
		# check is ok for the first top level element
		self._seen_eol = 2

	def start(self, tag, attrib=None):
		'''Open a new element and return it.

		Buffered data is flushed first; before an 'h' the flushed text
		is made to end with two newlines, before a 'p' or 'pre' with one.
		A missing "level" on 'h' defaults to 1 and a missing "href" on
		'link' defaults to "404", both with a logged warning. The
		defaults are set on a copy, the dict passed in by the caller
		is never modified.

		For the tag '_ignore_' nothing is opened and the last element
		opened or closed so far is returned. The first element opened
		must be 'zim-tree' (asserted).
		'''
		if tag == '_ignore_':
			return self._last
		elif tag == 'h':
			self._flush(need_eol=2)
		elif tag in ('p', 'pre'):
			self._flush(need_eol=1)
		else:
			self._flush()
		#~ print 'START', tag

		if tag == 'h':
			if not (attrib and 'level' in attrib):
				logger.warning('Missing "level" attribute for heading')
				# copy, do not write the default into the caller's dict
				attrib = dict(attrib or {})
				attrib['level'] = 1
		elif tag == 'link':
			if not (attrib and 'href' in attrib):
				logger.warning('Missing "href" attribute for link')
				# copy, do not write the default into the caller's dict
				attrib = dict(attrib or {})
				attrib['href'] = "404"
		# TODO check other mandatory properties !

		if attrib:
			self._last = Element(tag, attrib)
		else:
			self._last = Element(tag)

		if self._stack:
			self._stack[-1].append(self._last)
		else:
			assert tag == 'zim-tree', 'root element needs to be "zim-tree"'
		self._stack.append(self._last)

		self._tail = False
		return self._last

	def end(self, tag):
		'''Close the innermost open element and return it.

		Buffered data is flushed first; for 'p' and 'pre' the flushed
		text is made to end with a newline. The tag must match the
		innermost open element (asserted).

		A non-root element that ends up without children and without
		non-whitespace text is removed from its parent again, except
		for 'img'. Whitespace it contained is moved in front of it, and
		the text that precedes it in the tree is pulled back into the
		data buffer so that following data is joined with it on the
		next flush. The removed element is still returned.

		For the tag '_ignore_' nothing happens and None is returned.
		'''
		if tag == '_ignore_':
			return None
		elif tag in ('p', 'pre'):
			self._flush(need_eol=1)
		else:
			self._flush()
		#~ print 'END', tag

		self._last = self._stack[-1]
		assert self._last.tag == tag, \
			"end tag mismatch (expected %s, got %s)" % (self._last.tag, tag)
		self._tail = True

		if len(self._stack) > 1 and not (tag == 'img'
		or (self._last.text and not self._last.text.isspace())
		or len(self._last) ):
			# purge empty tags
			if self._last.text and self._last.text.isspace():
				self._append_to_previous(self._last.text)

			empty = self._stack.pop()
			self._stack[-1].remove(empty)
			children = list(self._stack[-1])
			if children:
				# continue after the previous sibling: take its tail
				# back into the buffer, it is written again on flush
				self._last = children[-1]
				if self._last.tail is not None:
					self._data = [self._last.tail]
					self._last.tail = None
			else:
				# no sibling left: continue in the text of the parent
				self._last = self._stack[-1]
				self._tail = False
				if self._last.text is not None:
					self._data = [self._last.text]
					self._last.text = None

			return empty

		else:
			return self._stack.pop()

	def data(self, text):
		'''Buffer text; it is written to the tree on the next flush,
		which happens when a tag is started or ended.
		'''
		assert isinstance(text, basestring) # Python 2 builtin name
		self._data.append(text)

	def _flush(self, need_eol=0):
		'''Write the buffered data to the text or tail of the last
		element, normalizing newlines on the way.

		need_eol is the minimum number of newlines the text flushed
		so far must end with; missing newlines are appended.
		'''
		# need_eol makes sure previous data ends with \n

		#~ print 'DATA:', self._data
		text = ''.join(self._data)

		# Fix trailing newlines
		# (without new text the count of the previous flush stays valid)
		if text:
			m = count_eol_re.search(text)
			if m: self._seen_eol = len(m.group(0))
			else: self._seen_eol = 0

		if need_eol > self._seen_eol:
			text += '\n' * (need_eol - self._seen_eol)
			self._seen_eol = need_eol

		# Fix prefix newlines
		if self._tail and self._last.tag in ('h', 'p') \
		and not text.startswith('\n'):
			# text after a closed 'h' or 'p' must start with a newline
			if text:
				text = '\n' + text
			else:
				text = '\n'
				self._seen_eol = 1
		elif self._tail and self._last.tag == 'li' \
		and text.startswith('\n'):
			# drop one newline directly after a closed 'li'
			text = text[1:]
			if not text.strip('\n'):
				# text was newlines only, so one less line end was seen
				self._seen_eol -=1

		if text:
			assert not self._last is None, 'data seen before root element'
			self._data = []

			# Tags that are not allowed to have newlines
			if not self._tail and self._last.tag in (
			'h', 'emphasis', 'strong', 'mark', 'strike', 'code'):
				# assume no nested tags in these types ...
				if self._seen_eol:
					# trailing newlines go back into the buffer instead
					# of into the element, they are handled by the next flush
					text = text.rstrip('\n')
					self._data.append('\n' * self._seen_eol)
					self._seen_eol = 0
				lines = text.split('\n')

				# Every line but the last gets an element of its own:
				# the current element is closed with a '\n' tail and
				# a new element with the same tag and attributes
				# replaces it on the stack. Blank lines are moved
				# in front of the element.
				for line in lines[:-1]:
					assert self._last.text is None, "internal error (text)"
					assert self._last.tail is None, "internal error (tail)"
					if line and not line.isspace():
						self._last.text = line
						self._last.tail = '\n'
						attrib = self._last.attrib.copy()
						self._last = Element(self._last.tag, attrib)
						self._stack[-2].append(self._last)
						self._stack[-1] = self._last
					else:
						self._append_to_previous(line + '\n')

				assert self._last.text is None, "internal error (text)"
				self._last.text = lines[-1]
			else:
				# TODO split paragraphs

				if self._tail:
					assert self._last.tail is None, "internal error (tail)"
					self._last.tail = text
				else:
					assert self._last.text is None, "internal error (text)"
					self._last.text = text
		else:
			self._data = []


	def close(self):
		'''Return the root element.

		Asserts that all tags have been ended and that the last element
		closed is the 'zim-tree' root.
		'''
		assert len(self._stack) == 0, 'missing end tags'
		assert not self._last is None and self._last.tag == 'zim-tree', 'missing root element'
		return self._last

	def _append_to_previous(self, text):
		'''Add text before current element

		The current element is taken to be the last child of the second
		element from the top of the stack. The text is appended to the
		tail of the sibling before it, or to the text of the parent
		when there is no such sibling.
		'''
		parent = self._stack[-2]
		children = list(parent)[:-1] # all siblings before the current element
		if children:
			if children[-1].tail:
				children[-1].tail = children[-1].tail + text
			else:
				children[-1].tail = text
		else:
			if parent.text:
				parent.text = parent.text + text
			else:
				parent.text = text