def rewrite_links(self, node, callback_func):
    """Rewrite the links found in a document or element subtree.

    `node` must be a DOM document or element node.  A document node is
    handled by delegating to its document element.  For every match
    reported by `self.match_element(node)`, `callback_func` is called with
    the current URL and the matched criterion, and the URL it returns is
    written back to the element text or to the matched attribute.  Child
    elements are then processed recursively.

    Raises AssertionError if `node` is neither a document nor an element,
    or if `match_element` yields an unknown match type.
    """
    assert node.nodeType in (node.DOCUMENT_NODE, node.ELEMENT_NODE)

    # A document has no links of its own; descend into its root element.
    if node.nodeType == node.DOCUMENT_NODE:
        return self.rewrite_links(node.documentElement, callback_func)

    # Rewrite the URLs carried by this element itself.
    for match_kind, match_info, criterion in self.match_element(node):
        if match_kind == self.MATCHED_ELEMENT:
            # The URL is the (stripped) text content of the element.
            old_url = getChildText(node).strip()
            replaceChildText(node, callback_func(old_url, criterion))
            continue
        if match_kind == self.MATCHED_ATTRIBUTE:
            # The URL lives in a namespace-qualified attribute.
            ns_uri, local_name = match_info
            old_url = node.getAttributeNS(ns_uri, local_name)
            node.setAttributeNS(ns_uri, local_name,
                                callback_func(old_url, criterion))
            continue
        raise AssertionError("Unrecognized m_type")

    # Recurse into child elements; other node types are skipped.
    for child in node.childNodes:
        if child.nodeType != node.ELEMENT_NODE:
            continue
        self.rewrite_links(child, callback_func)
def _load_content(self, page_generator):
    """Collect the Atom entry data embedded in a page and store it.

    Looks for exactly one <atom:entry> inside the page's <head>.  When
    there is none, the stored entry data for the page is cleared and the
    method returns.  Otherwise a dict with the keys 'atom:entry',
    'summary', 'body', 'title' and 'path_info' is built, passed through
    `_early_process_entry`, and handed to `_write_entry_data`.

    Raises FGValueError if the page has more than one Atom entry or more
    than one element with class="feed-summary".  Unpacking raises
    ValueError when <head>, <body> or <title> is not present exactly once.
    """
    content = page_generator.content

    # Locate the single <head> and <body> elements of the page.
    [head_el] = page_generator.content.getElementsByTagName('head')
    [body_el] = page_generator.content.getElementsByTagName('body')

    # The <atom:entry> element is expected inside <head>.
    atom_entries = head_el.getElementsByTagNameNS(ATOM_NAMESPACE, 'entry')
    if not atom_entries:
        # No Atom feed entry on this page: drop any stored data and stop.
        self._clear_entry_data(page_generator)
        return
    if len(atom_entries) > 1:
        # There should only be one atom:entry
        raise FGValueError("Too many Atom entries in %s" % (page_generator.path_info.source_filename,))
    source_entry = atom_entries[0]

    entry_data = {}

    # Copy the entry into a scratch document so that it can be serialised
    # on its own with all namespace information included.
    holder = minidom.parseString('<dummy/>')
    entry_copy = holder.importNode(source_entry, True)
    holder.documentElement.appendChild(entry_copy)
    normalize_namespaces(entry_copy)
    entry_data['atom:entry'] = entry_copy.toxml()

    # Pull out the page summary (if any).  This must happen before the
    # body is serialised, because the lookup is made with remove=True and
    # so modifies the page content (un-setting class="feed-summary").
    summaries = list(find_elements_with_class(content, "feed-summary", remove=True))
    if len(summaries) > 1:
        # There should only be one element with class="feed-summary"
        raise FGValueError('Too many elements have class="feed-summary" in %s' % (page_generator.path_info.source_filename,))
    entry_data['summary'] = summaries[0].toxml() if summaries else None

    # Page body, title and path information.
    entry_data['body'] = body_el.toxml()
    [title_el] = getChildElementsNS(head_el, EMPTY_NAMESPACE, 'title')
    entry_data['title'] = getChildText(title_el)
    entry_data['path_info'] = page_generator.path_info

    # Early processing happens in place, then the entry goes to disk.
    self._early_process_entry(page_generator, entry_data)
    self._write_entry_data(page_generator, entry_data)
def _filter_set_title(pg):
    """Copy the title text of `pg.content` into the <title> of `pg.page`.

    The text is read from the single <title> child of the single <head>
    child of `pg.content.documentElement`.  Every existing child of the
    single <title> element in `pg.page` is removed and replaced by one text
    node holding that text.

    Raises ValueError (from tuple unpacking) if any of those elements is
    not present exactly once.
    """
    # Extract title text from pg.content
    (head_element,) = getChildElementsNS(pg.content.documentElement, EMPTY_NAMESPACE, 'head')
    (source_title,) = getChildElementsNS(head_element, EMPTY_NAMESPACE, 'title')
    title_text = getChildText(source_title)

    # Replace children of the <title> element in pg.page
    (page_title,) = pg.page.getElementsByTagName('title')
    # Iterate over a snapshot: removeChild() mutates the live childNodes
    # list, and removing while iterating it directly skips every other
    # child, leaving stale nodes behind.
    for child in list(page_title.childNodes):
        page_title.removeChild(child)
    page_title.appendChild(pg.page.createTextNode(title_text))
def _handle_maxima_element(self, page_generator, element):
    """Return the Maxima expression placeholder for `element`.

    The element's child text is passed as the expression.  `force_img` is
    true exactly when the element's 'force' attribute equals 'img'.
    `page_generator` is not used by this handler.
    """
    expression = getChildText(element)
    wants_image = element.getAttribute('force') == 'img'
    return self.maxima_expression_placeholder(expression, force_img=wants_image)
def _handle_math_element(self, page_generator, element):
    """Pass the text of a math element to `self.math_placeholder`.

    The element's child text is passed as the expression.  `force_img` is
    true exactly when the element's 'force' attribute equals 'img'.
    `page_generator` is not used by this handler.  Always returns None.
    """
    expression = getChildText(element)
    wants_image = element.getAttribute('force') == 'img'
    # NOTE(review): the result of math_placeholder() is discarded here,
    # whereas _handle_maxima_element returns its placeholder.  Confirm
    # whether a `return` was intended or math_placeholder() performs the
    # replacement itself.
    self.math_placeholder(expression, force_img=wants_image)
def _handle_news_element(self, page_generator, c_newsElement):
    """Expand a news element into one <div> per Atom feed entry.

    The Atom feed at `self._feed_path_info.output_filename` is parsed, and
    for each <atom:entry> (at most `limit` of them when the element has a
    non-empty 'limit' attribute) the element's <template> child is copied
    into a new <div> with these parameters: 'title', 'published',
    'updated', 'href' and 'summaryDiv'.

    This handler never returns normally; the result is delivered by
    raising ReplaceWithNode with the generated <div> root.

    Raises:
        ValueError: if `self._feed_url` is None, if 'limit' is not an
            integer, or (from unpacking) if a template, title or summary
            is not present exactly once.
        RuntimeError: if an entry has no <link rel="alternate"> of the
            page content type without an hreflang.
        AssertionError: if a title is not type="text" or a summary is not
            type="xhtml".
        ReplaceWithNode: always, on success.
    """
    if self._feed_url is None:
        raise ValueError("news-here element found before set_news_feed called")

    # Find the template element
    (c_templateElement,) = getChildElementsNS(c_newsElement, NEWS_NAMESPACE, 'template')

    # Create the result document
    result_doc = minidom.parseString("<div/>")

    # Load the Atom feed.  The context manager closes the file handle
    # deterministically instead of leaving it to the garbage collector.
    with open(self._feed_path_info.output_filename, "rb") as feed_file:
        feed = minidom.parseString(feed_file.read())

    # Get the content-type of page links
    page_content_type = self._framework.plugins['vars'].vars['page_content_type']

    # Get the maximum number of entries (if any)
    limit_attr = c_newsElement.getAttribute('limit')
    limit = int(limit_attr) if limit_attr else None

    for i, f_entryElement in enumerate(getChildElementsNS(feed.documentElement, ATOM_NAMESPACE, 'entry')):
        # Don't output more than the specified number of articles
        if limit is not None and i >= limit:
            break

        params = {}

        # Get entry title
        (f_titleElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'title')
        assert f_titleElement.getAttribute('type') == 'text'
        params['title'] = getChildText(f_titleElement)

        # Get entry publication/update dates (first matching element)
        params['published'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'published'))[0])
        params['updated'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'updated'))[0])

        # Get entry <link rel="alternate" type="text/html">
        for f_linkElement in getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'link'):
            rel = f_linkElement.getAttribute('rel')
            link_type = f_linkElement.getAttribute('type')
            hreflang = f_linkElement.getAttribute('hreflang')
            if rel == "alternate" and link_type == page_content_type and not hreflang:
                params['href'] = f_linkElement.getAttribute('href')
                break
        else:
            # The loop finished without finding a suitable link.
            raise RuntimeError("link not found")

        # Get entry summary: exactly one element child is expected inside
        # the xhtml <summary>.
        (f_summaryElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'summary')
        assert f_summaryElement.getAttribute('type') == 'xhtml'
        (f_summaryDiv,) = (n for n in f_summaryElement.childNodes if n.nodeType == n.ELEMENT_NODE)
        params['summaryDiv'] = f_summaryDiv

        # Create per-entry <div> element
        r_divElement = result_doc.createElement('div')
        result_doc.documentElement.appendChild(r_divElement)

        # Copy the template to the result
        self.__copy_template_to_result(c_templateElement, r_divElement, params)

    # Namespace normalization
    normalize_namespaces(result_doc.documentElement)

    # Replace the placeholder
    raise ReplaceWithNode(result_doc.documentElement)
def _early_process_entry(self, page_generator, entry):
    """Perform early in-place processing of an entry.

    `entry` is a dict holding at least 'atom:entry' (serialised XML),
    'summary' (serialised XML or a false value), 'body' (serialised XML),
    'title' and 'path_info'.  The method:

    * stores the entry's 'id', 'published' and 'updated' values in `entry`
      (dates passed through `atom_datetime_to_utc`), creating an <updated>
      element from the published date when none exists;
    * adds a <title type="text"> and a <link rel="alternate"> element when
      they are missing;
    * rewrites URLs in the entry, summary and body with
      `always_absolute=True`;
    * appends <summary type="xhtml"> (only when `entry['summary']` is
      truthy) and <content type="xhtml"> elements, deleting the consumed
      'summary' / 'body' keys from `entry`;
    * replaces `entry['atom:entry']` with the updated serialised document.

    Raises FGValueError if more than one matching <link rel="alternate">
    exists, and ValueError (from unpacking) if <id> or <published> is not
    present exactly once or <updated> is present more than once.
    """
    entryDocument = minidom.parseString(entry['atom:entry'])
    entryElement = entryDocument.documentElement
    page_content_type = self._framework.plugins['vars'].vars['page_content_type']

    def append_xhtml_child(atom_tag, entry_key):
        """Move the XHTML stored in entry[entry_key] into a new Atom element."""
        fragment_doc = minidom.parseString(entry[entry_key])

        # Rewrite URLs in the fragment
        rewrite_links(fragment_doc.documentElement, HTML_CRITERIA,
                      entry['path_info'].target_url,
                      entry['path_info'].base_url,
                      always_absolute=True)

        # Create the Atom wrapper element (<summary> or <content>)
        wrapper = entryDocument.createElementNS(ATOM_NAMESPACE, atom_tag)
        wrapper.setAttribute('type', 'xhtml')
        entryElement.appendChild(wrapper)

        # Create XHTML <div> element
        div = entryDocument.createElementNS(XHTML_NAMESPACE, 'div')
        div.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
        wrapper.appendChild(div)

        # Add data.  importNode() copies each node, so the list being
        # iterated is not modified.
        for child in fragment_doc.documentElement.childNodes:
            div.appendChild(entryDocument.importNode(child, True))

        # Elements with no namespace become XHTML elements
        substitute_namespaces(div, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

        # Clean up: release the scratch document and drop the consumed key.
        fragment_doc.unlink()
        del entry[entry_key]

    # Extract the 'id' of the entry
    (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
    entry['id'] = getChildText(idElement).strip()

    # Extract and normalize the 'published' date of the entry
    (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
    entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

    # Extract and normalize the 'updated' date of the entry; create it if
    # it doesn't exist.
    updated_elements = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
    if updated_elements:
        (updatedElement,) = updated_elements  # there should be only one
    else:
        # Create an <updated> element using the 'published' date
        updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
        replaceChildText(updatedElement, entry['published'])
        entryElement.appendChild(updatedElement)
    entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

    # Create a <title> element if one does not already exist.
    if not tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title')):
        titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
        titleElement.setAttribute('type', 'text')
        titleElement.appendChild(entryDocument.createTextNode(entry['title']))
        entryElement.appendChild(titleElement)

    # Create a <link rel="alternate"> element if one does not already exist.
    linkElement = None
    for candidate in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
        rel = candidate.getAttribute('rel')
        link_type = candidate.getAttribute('type')
        hreflang = candidate.getAttribute('hreflang')
        if rel == "alternate" and link_type == page_content_type and not hreflang:
            if linkElement is not None:
                raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                    page_content_type, hreflang, page_generator.path_info.source_filename,))
            linkElement = candidate
    if linkElement is None:
        linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
        linkElement.setAttribute('rel', 'alternate')
        linkElement.setAttribute('href', page_generator.path_info.target_url)
        linkElement.setAttribute('type', page_content_type)
        entryElement.appendChild(linkElement)

    # Rewrite URLs in the atom:entry element
    rewrite_links(entryElement, ATOM_CRITERIA,
                  page_generator.path_info.target_url,
                  page_generator.path_info.base_url,
                  always_absolute=True)

    # Add a <summary> element, if applicable
    if entry['summary']:
        append_xhtml_child('summary', 'summary')

    # Add a <content> element
    append_xhtml_child('content', 'body')

    # Perform xmlns normalization
    normalize_namespaces(entryDocument.documentElement, strip_dups=True)

    # Update the new atom:entry document
    entry['atom:entry'] = entryDocument.toxml()