예제 #1
0
 def _parse_main_text(self) -> None:
     """
     Parse the main text of the article.
     :return: None
     """
     self._main_text_elements.clear()
     self._text_images.clear()
     self._links.clear()
     self._videos.clear()
     text_section = self._parsed_html.find(name='section',
                                           attrs={'class': 'mainText'})
     child: Tag
     for child in text_section.children:
         # These can be p, ul, h3, h4, div
         if child.name == 'p':
             # These can be text, span, strong, a, br
             self._main_text_elements.append(self._process_p(child))
         elif child.name == 'ul':
             self._main_text_elements.append(self._process_ul(child))
         elif child.name == 'h3' or child.name == 'h4':
             self._main_text_elements.append(self._process_h(child))
         elif child.name == 'div':
             if child.next.name == 'a':
                 image = self._process_img(child)
                 self._text_images.append(image)
                 self._main_text_elements.append(image)
             elif child.next.name == 'iframe':
                 video = self._process_iframe(child)
                 self._videos.append(video)
                 self._main_text_elements.append(video)
         else:
             raise WrongFormatException(Strings.exception_html_syntax_error)
    def _parse_self(self) -> None:
        """
        Read the css file of this document and register the text colors it defines.

        Only rules whose selector starts with '.' and that consist of exactly the two declarations
        'color' and 'display', in this order, are treated as text colors. Each one is stored in the
        color dictionary under the selector without its leading dots.
        :return: None
        :raises WrongFormatException: if a text color equals Colour(0, 0, 255, 255).
        """
        css_parser = tinycss.make_parser('page3')
        sheet = css_parser.parse_stylesheet_file(self._file_path)

        for rule in sheet.rules:
            if not rule.selector.as_css().startswith('.'):
                continue
            if len(rule.declarations) != 2:
                continue
            names = []
            rgb = None
            for declaration in rule.declarations:
                names.append(declaration.name)
                if declaration.name == 'color':
                    # Converted for every two-declaration class rule, even if it is rejected below.
                    rgb = webcolors.hex_to_rgb(declaration.value.as_css())
            # Anything but color followed by display is not a text color declaration.
            if names != ['color', 'display']:
                continue
            text_color = Colour(rgb.red, rgb.green, rgb.blue)
            if text_color == Colour(0, 0, 255, 255):
                raise WrongFormatException(
                    Strings.exception_reserved_blue + ': ' + str(text_color))
            class_name = rule.selector.as_css().lstrip('.')
            self._str_to_color_dict[class_name] = text_color
 def translate_str_color(self, name: str) -> Colour:
     """
     Look up the wx.Colour that is registered for a CSS color name.
     :param name: the css name of the color.
     :return: the wx.Colour stored under the name.
     :raises WrongFormatException: if the name is not in the color dictionary.
     """
     try:
         colour = self._str_to_color_dict[name]
     except KeyError:
         # Unknown names are reported as a format problem together with the offending name.
         raise WrongFormatException(Strings.exception_unrecognized_color +
                                    ': ' + str(name))
     else:
         return colour
 def _parse_meta_description(self) -> None:
     """
     Parse the meta description of this document and save it into an instance variable.
     :return: None
     :raises WrongFormatException: if there are more than one description tags.
     """
     description = self._parsed_html.find_all(name='meta',
                                              attrs={
                                                  'name': 'description',
                                                  'content': True
                                              })
     if len(description) == 1:
         self._meta_description = description[0]['content']
     else:
         raise WrongFormatException(
             Strings.exception_parse_multiple_descriptions)
예제 #5
0
 def determine_menu_section_and_menu_item(self) -> None:
     """
     Search the menus for the item of this article and remember both the item and its menu.

     The first menu that has an item for this file name wins. The article is attached to that
     item and the item is reset to unmodified.
     :return: None
     :raises WrongFormatException: If no menu item is set after the search.
     """
     for candidate in self._menus.values():
         found = candidate.find_item_by_file_name(self.get_filename())
         self._menu_item = found
         if not found:
             continue
         found.set_article(self)
         # Attaching the article marks the item as modified, which is not wanted right after loading.
         found.set_modified(False)
         self._menu_section = candidate
         break
     if not self._menu_item:
         raise WrongFormatException(Strings.exception_menu_item_missing +
                                    ' for: ' + self.get_filename())
 def translate_color_str(self, color: Colour) -> str:
     """
     Find the css color name that belongs to a wx.Colour.
     :param color: the wx.Colour to translate.
     :return: The css name of the color.
     :raises WrongFormatException: if no loaded css color equals the given color.
     """
     # Two special cases translate to black: the NullColour of an empty paragraph and the blue
     # that remains as the color attribute after a link was deleted.
     if color == NullColour or color == BLUE:
         return Strings.color_black
     # The dictionary is keyed by name, so the matching value has to be searched for.
     for css_name, known_color in self._str_to_color_dict.items():
         if known_color == color:
             return css_name
     raise WrongFormatException(Strings.exception_unrecognized_color +
                                ': ' + str(color))
 def _parse_meta_keywords(self) -> None:
     """
     Parse the meta keywords of this document and save it into an instance variable.
     :return: None
     :raises WrongFormatException: if there are more than one keyword tags.
     """
     keywords = self._parsed_html.find_all(name='meta',
                                           attrs={
                                               'name': 'keywords',
                                               'content': True
                                           })
     if len(keywords) == 1:
         self._meta_keywords = [
             word.strip() for word in keywords[0]['content'].split(',')
         ]
     else:
         raise WrongFormatException(
             Strings.exception_parse_multiple_keywords)
예제 #8
0
 def _process_h(self, h: Tag) -> Heading:
     """
     Turn a 'h3' or 'h4' tag of the text into a Heading and add its text to the plain text.
     :param h: The beautiful soup h element.
     :return: A Heading instance
     :raises WrongFormatException: if the tag is neither h3 nor h4.
     """
     sizes = {'h3': Heading.SIZE_H3, 'h4': Heading.SIZE_H4}
     size = sizes.get(h.name, 0)
     if size == 0:
         raise WrongFormatException(Strings.exception_html_syntax_error)
     # NOTE(review): h.string is converted with str() as is - confirm headings never contain
     # nested tags, otherwise the text may not be what is expected.
     text = str(h.string)
     # The h element can have a color class, the first class is used, black is the default.
     color = h.attrs['class'][0] if h.has_attr('class') else 'black'
     self._plain_text += text + '\n'
     return Heading(Text(text, color=color), size)
예제 #9
0
 def _process_p(self, p: Tag) -> Paragraph:
     """
     Turn a 'p' tag of the text into a Paragraph.

     Children that _process_visual_tags reports as handled need no further work here. Every
     other child has to be a link, which is added to the paragraph and to the list of links.
     :param p: The beautiful soup p element.
     :return: a Paragraph instance
     :raises WrongFormatException: if a child is neither handled as a visual tag nor an 'a' tag.
     """
     result = Paragraph()
     child: Tag
     for child in p.children:
         # These can be text, span, strong, a, br
         if self._process_visual_tags(child, result, False):
             continue
         if child.name != 'a':
             raise WrongFormatException(
                 Strings.exception_html_syntax_error)
         self._plain_text += child.string
         link = Link(str(child.string), child.attrs['href'],
                     child.attrs['title'], self._articles,
                     self._working_directory)
         result.add_element(link)
         self._links.append(link)
     return result
예제 #10
0
    def _prepare_documents(self, path: str) -> None:
        """
        Go through all supposed whitebear files in a directory and create their document instances.

        styles.css is loaded first so the css document can be handed to the articles. Every *.html
        file in the directory is then validated against the article, menu and index schemas and
        stored under its file name (articles and menus) or as the index document. Menus are parsed
        as soon as they are recognized, articles and the index only after all files were sorted.
        All files have to be readable and writeable.
        :param path: Path to the supposed whitebear root directory.
        :return: None
        :raises AccessException: if styles.css is missing or a file is not readable or not writeable.
        :raises UnrecognizedFileException: if a file can not be parsed or validated.
        :raises WrongFormatException: if parsing an article or the index raises an IndexError.
        """
        # Parse CSS, so we can send the instance into articles for color translation.
        file: str = os.path.join(path, 'styles.css')
        if not os.path.isfile(file):
            raise AccessException(Strings.exception_access_css)
        if not os.access(file, os.R_OK) or not os.access(file, os.W_OK):
            raise AccessException(Strings.exception_access_html + " " + file)
        self._css_document = WhitebearDocumentCSS(os.path.basename(file), os.path.realpath(file))

        # Check all html files in directory are readable and writable.
        # glob already returns every match prefixed with path, so the result is used as is. Joining
        # it with path again would duplicate a relative directory and make every file look missing.
        for file in glob.glob(os.path.join(path, '*.html')):
            if not os.path.isfile(file):
                continue
            if not os.access(file, os.R_OK) or not os.access(file, os.W_OK):
                raise AccessException(Strings.exception_access_html + " " + file)
            filename: str = os.path.basename(file)
            file_path: str = os.path.realpath(file)
            try:
                xml_doc = html.parse(file)
                if self._xmlschema_article.validate(xml_doc):
                    self._article_documents[filename] = WhitebearDocumentArticle(file_path,
                                                                                 self._menu_documents,
                                                                                 self._article_documents,
                                                                                 self._css_document)
                elif self._xmlschema_menu.validate(xml_doc):
                    menu = WhitebearDocumentMenu(file_path, self._menu_documents)
                    menu.parse_self()
                    self._menu_documents[filename] = menu
                elif self._xmlschema_index.validate(xml_doc):
                    self._index_document = WhitebearDocumentIndex(file_path, self._menu_documents,
                                                                  self._article_documents)
                else:
                    # Skip known non-editable files
                    if 'google' in filename or '404' in filename:
                        continue
                    raise UnrecognizedFileException(Strings.exception_file_unrecognized + ' ' + filename)
            except (XMLSyntaxError, ValueError) as e:
                raise UnrecognizedFileException(Strings.exception_html_syntax_error + '\n' + str(e) + '\n' +
                                                file) from e

        # Parse all articles after we have recognized and parsed all menu pages.
        for article in self._article_documents.values():
            try:
                article.parse_self()
                article.set_index_document(self._index_document)
            except IndexError as e:
                raise WrongFormatException(Strings.exception_broken_html + ': ' + article.get_path()) from e
        # NOTE(review): if no file validated as the index, self._index_document keeps whatever value
        # it had before this call - confirm it can not be None here.
        try:
            # Parse index.
            self._index_document.parse_self()
        except IndexError as e:
            raise WrongFormatException(Strings.exception_broken_html + ': ' +
                                       self._index_document.get_path()) from e