def _parse_main_text(self) -> None:
    """
    Parse the main text of the article.

    Clears the previously collected text elements, images, links and videos, then walks the direct children
    of the <section class="mainText"> element and converts each of them with the matching _process_* method.
    :return: None
    :raises WrongFormatException: If the main text section is missing or contains an unexpected child.
    """
    # Start from a clean state so parsing the same document again does not duplicate its content.
    self._main_text_elements.clear()
    self._text_images.clear()
    self._links.clear()
    self._videos.clear()

    text_section = self._parsed_html.find(name='section', attrs={'class': 'mainText'})
    if text_section is None:
        # find() returns None when nothing matches. Report that as a format problem instead of letting
        # the loop below fail with an AttributeError on None.
        raise WrongFormatException(Strings.exception_html_syntax_error)

    child: Tag
    for child in text_section.children:
        # These can be p, ul, h3, h4, div
        if child.name == 'p':
            # These can be text, span, strong, a, br
            self._main_text_elements.append(self._process_p(child))
        elif child.name == 'ul':
            self._main_text_elements.append(self._process_ul(child))
        elif child.name in ('h3', 'h4'):
            self._main_text_elements.append(self._process_h(child))
        elif child.name == 'div':
            # The element that follows the opening div decides whether it holds an image or a video.
            first_element = child.next
            if first_element.name == 'a':
                image = self._process_img(child)
                self._text_images.append(image)
                self._main_text_elements.append(image)
            elif first_element.name == 'iframe':
                video = self._process_iframe(child)
                self._videos.append(video)
                self._main_text_elements.append(video)
        else:
            raise WrongFormatException(Strings.exception_html_syntax_error)
def _parse_self(self) -> None:
    """
    Parse this css document and get the colors defined in it.

    A rule counts as a text color definition when its selector starts with '.' and it has exactly two
    declarations named 'color' and 'display', in that order. The selector without its leading dots is stored
    together with the color in self._str_to_color_dict.
    :return: None
    :raises WrongFormatException: If a text color equals Colour(0, 0, 255, 255), which is reported as reserved.
    """
    parser = tinycss.make_parser('page3')
    stylesheet = parser.parse_stylesheet_file(self._file_path)
    for rule in stylesheet.rules:
        # The rule list can also contain at-rules (@import, @media, @page). Those are not class rules and
        # have no selector with as_css(), so skip them instead of failing with an AttributeError.
        if getattr(rule, 'at_keyword', None) is not None:
            continue
        selector_css = rule.selector.as_css()
        if not selector_css.startswith('.') or len(rule.declarations) != 2:
            continue

        dec_names = []
        dec_color = None
        for declaration in rule.declarations:
            dec_names.append(declaration.name)
            if declaration.name == 'color':
                dec_color = webcolors.hex_to_rgb(declaration.value.as_css())

        # This means we have a text color declaration
        if dec_names != ['color', 'display']:
            continue
        color = Colour(dec_color.red, dec_color.green, dec_color.blue)
        if color == Colour(0, 0, 255, 255):
            raise WrongFormatException(Strings.exception_reserved_blue + ': ' + str(color))
        self._str_to_color_dict[selector_css.lstrip('.')] = color
def translate_str_color(self, name: str) -> Colour:
    """
    Look up the color that was loaded from css under the given name.

    :param name: the name of the color.
    :return: the color stored for that name in self._str_to_color_dict.
    :raises WrongFormatException: if no color is stored under that name.
    """
    known_colors = self._str_to_color_dict
    try:
        found = known_colors[name]
    except KeyError as _:
        message = Strings.exception_unrecognized_color + ': ' + str(name)
        raise WrongFormatException(message)
    return found
def _parse_meta_description(self) -> None:
    """
    Read the content of the description meta tag and store it in self._meta_description.

    :return: None
    :raises WrongFormatException: if the document does not contain exactly one description meta tag
    with a content attribute.
    """
    wanted_attributes = {'name': 'description', 'content': True}
    found_tags = self._parsed_html.find_all(name='meta', attrs=wanted_attributes)
    if len(found_tags) != 1:
        raise WrongFormatException(Strings.exception_parse_multiple_descriptions)
    self._meta_description = found_tags[0]['content']
def determine_menu_section_and_menu_item(self) -> None:
    """
    Find out which menu this article belongs in.

    Asks every menu in self._menus for an item with this article's file name. The first item found is linked
    back to this article and its menu is stored as the article's menu section.
    :return: None
    :raises WrongFormatException: If the article is not found in any menu.
    """
    # Forget any earlier result first. Without this an empty menu collection never reassigns the item, so
    # a stale item from a previous run would hide the fact that the article is in no menu.
    self._menu_item = None
    file_name = self.get_filename()
    for menu in self._menus.values():
        self._menu_item = menu.find_item_by_file_name(file_name)
        if self._menu_item:
            self._menu_item.set_article(self)
            # We do not want the items to start as modified when the document is loaded, but set makes them
            # modified
            self._menu_item.set_modified(False)
            self._menu_section = menu
            break
    if not self._menu_item:
        raise WrongFormatException(Strings.exception_menu_item_missing + ' for: ' + file_name)
def translate_color_str(self, color: Colour) -> str:
    """
    Find the css name under which the given wx.Colour was loaded.

    :param color: the wx.Colour to translate.
    :return: The css name of the color.
    :raises WrongFormatException: if the color is not one of the loaded colors.
    """
    # Two special cases are reported as black: the empty paragraph, which should be black by default, and
    # blue, which remains as the color attribute of a deleted link for some reason.
    if color == NullColour or color == BLUE:
        return Strings.color_black

    # wx.Colour can not be used as a key, so the stored colors are compared one by one.
    for css_name, known_color in self._str_to_color_dict.items():
        if known_color == color:
            break
    else:
        raise WrongFormatException(Strings.exception_unrecognized_color + ': ' + str(color))
    return css_name
def _parse_meta_keywords(self) -> None:
    """
    Read the content of the keywords meta tag and store it in self._meta_keywords.

    The content is split on commas and every keyword is stripped of surrounding whitespace.
    :return: None
    :raises WrongFormatException: if the document does not contain exactly one keywords meta tag
    with a content attribute.
    """
    wanted_attributes = {'name': 'keywords', 'content': True}
    found_tags = self._parsed_html.find_all(name='meta', attrs=wanted_attributes)
    if len(found_tags) != 1:
        raise WrongFormatException(Strings.exception_parse_multiple_keywords)
    raw_keywords = found_tags[0]['content'].split(',')
    self._meta_keywords = [keyword.strip() for keyword in raw_keywords]
def _process_h(self, h: Tag) -> Heading:
    """
    Process a 'h' tag in the text.

    The heading text is also appended to self._plain_text, followed by a newline.
    :param h: The beautiful soup h element, either h3 or h4.
    :return: An Heading instance
    :raises WrongFormatException: If the element is neither h3 nor h4.
    """
    if h.name == 'h3':
        size = Heading.SIZE_H3
    elif h.name == 'h4':
        size = Heading.SIZE_H4
    else:
        raise WrongFormatException(Strings.exception_html_syntax_error)

    # .string is None when the heading is empty or has more than one child node. str(None) would put the
    # literal text 'None' into the document, so fall back to the combined text of the heading.
    single_string = h.string
    text = str(single_string) if single_string is not None else h.get_text()

    # The h element can have a color class, otherwise the heading is black.
    color = h.attrs['class'][0] if h.has_attr('class') else 'black'

    self._plain_text += text + '\n'
    return Heading(Text(text, color=color), size)
def _process_p(self, p: Tag) -> Paragraph:
    """
    Turn a 'p' tag of the text into a Paragraph.

    Every child is first handed to self._process_visual_tags. A child for which that call returns a false
    value has to be a link, which is added to the paragraph and recorded in self._links; its text is also
    appended to self._plain_text.
    :param p: The beautiful soup p element.
    :return: a Paragraph instance
    :raises WrongFormatException: If a child is not handled as a visual tag and is not an 'a' element.
    """
    paragraph = Paragraph()
    node: Tag
    # The children can be text, span, strong, a, br
    for node in p.children:
        if self._process_visual_tags(node, paragraph, False):
            continue
        if node.name != 'a':
            raise WrongFormatException(Strings.exception_html_syntax_error)
        self._plain_text += node.string
        link = Link(str(node.string), node.attrs['href'], node.attrs['title'], self._articles,
                    self._working_directory)
        paragraph.add_element(link)
        self._links.append(link)
    return paragraph
def _prepare_documents(self, path: str) -> None:
    """
    Go through all supposed whitebear files in a directory and create the document instances for them.

    styles.css is loaded first, so the CSS document can be passed into the articles for color translation.
    Every *.html file has to be readable and writeable and is validated against the article, menu and index
    schemas, in that order. Menus are parsed immediately, articles and the index only after all files were
    recognized.
    :param path: Path to the supposed whitebear root directory.
    :return: None
    :raises AccessException: If styles.css is missing, or a file is not readable or not writeable.
    :raises UnrecognizedFileException: If a file can not be parsed or validated.
    :raises WrongFormatException: If parsing an article or the index fails with an IndexError.
    """
    file: str
    # Parse CSS, so we can send the instance into articles for color translation.
    file = os.path.join(path, 'styles.css')
    if not os.path.isfile(file):
        raise AccessException(Strings.exception_access_css)
    if not os.access(file, os.R_OK) or not os.access(file, os.W_OK):
        raise AccessException(Strings.exception_access_html + " " + file)
    self._css_document = WhitebearDocumentCSS(os.path.basename(file), os.path.realpath(file))

    # Check all html files in directory are readable and writable
    # glob already returns every match prefixed with path. Joining path onto it again would duplicate the
    # directory for a relative path, which made every file look missing and get skipped silently.
    for file in glob.glob(path + '/*.html'):
        if not os.path.isfile(file):
            continue
        if not os.access(file, os.R_OK) or not os.access(file, os.W_OK):
            raise AccessException(Strings.exception_access_html + " " + file)

        filename: str = os.path.basename(file)
        file_path: str = os.path.realpath(file)
        try:
            xml_doc = html.parse(file)
            if self._xmlschema_article.validate(xml_doc):
                self._article_documents[filename] = WhitebearDocumentArticle(file_path,
                                                                             self._menu_documents,
                                                                             self._article_documents,
                                                                             self._css_document)
            elif self._xmlschema_menu.validate(xml_doc):
                menu = WhitebearDocumentMenu(file_path, self._menu_documents)
                menu.parse_self()
                self._menu_documents[filename] = menu
            elif self._xmlschema_index.validate(xml_doc):
                self._index_document = WhitebearDocumentIndex(file_path, self._menu_documents,
                                                              self._article_documents)
            elif 'google' in filename or '404' in filename:
                # Skip known non-editable files
                continue
            else:
                raise UnrecognizedFileException(Strings.exception_file_unrecognized + ' ' + filename)
        except (XMLSyntaxError, ValueError) as e:
            raise UnrecognizedFileException(Strings.exception_html_syntax_error + '\n' + str(e) + '\n'
                                            + file) from e

    # Parse all articles after we have recognized and parsed all menu pages.
    # NOTE(review): if no file validated as the index, self._index_document keeps whatever value it had
    # before this call - confirm it is initialised elsewhere.
    for article in self._article_documents.values():
        try:
            article.parse_self()
            article.set_index_document(self._index_document)
        except IndexError as e:
            raise WrongFormatException(Strings.exception_broken_html + ': ' + article.get_path()) from e

    try:
        # Parse index.
        self._index_document.parse_self()
    except IndexError as e:
        raise WrongFormatException(Strings.exception_broken_html + ': '
                                   + self._index_document.get_path()) from e