Esempio n. 1
0
    def load_jmdict(self, xml_filepath: str) -> None:
        """Load data from a JMdict XML file.

        The XML file's modification time is handed to
        _load_from_shelf_if_newer first; if that call reports success, the
        XML file is not parsed. Otherwise the XML is parsed, the text form
        and MeCab decomposition maps are rebuilt from its entries, and
        _write_to_shelf is called.

        Args:
            xml_filepath: Path to a JMdict XML file.

        Raises:
            ResourceLoadError: There was an issue with the passed JMdict XML
                file that prevented it from being loaded.
        """
        # The existence check has to come before os.path.getmtime, because
        # getmtime raises OSError for a missing path and would otherwise hide
        # the ResourceLoadError that this method documents.
        if not os.path.exists(xml_filepath):
            utils.log_and_raise(
                _log, ResourceLoadError,
                'JMdict file not found at "{}"'.format(xml_filepath))

        xml_last_modified_time = os.path.getmtime(xml_filepath)
        if self._load_from_shelf_if_newer(xml_last_modified_time):
            return

        _log.debug('Reading JMdict XML file at "%s"', xml_filepath)
        tree = ElementTree.parse(xml_filepath)
        _log.debug('Reading of JMdict XML file complete')

        # Both maps go from a key to every entry object sharing that key.
        self._entry_map = defaultdict(list)
        self._mecab_decomp_map = defaultdict(list)
        for entry_element in tree.getroot():
            for entry_obj in self._parse_entry_xml(entry_element):
                mecab_decomp = self._get_mecab_decomb(entry_obj)
                self._mecab_decomp_map[mecab_decomp].append(entry_obj)
                self._entry_map[entry_obj.text_form].append(entry_obj)

        self._set_max_entry_lens()
        self._write_to_shelf()
Esempio n. 2
0
    def get(self, query: Query) -> Optional[SearchResultPage]:
        """Look up the cached first page of search results for a query.

        The page is read from the "query:<query string>" key, and each
        result's article is then filled in from its "article:<id>" key.

        Args:
            query: Query whose cached first page of results is wanted.

        Returns:
            The cached first page of search results for the query, or None
            if no page is cached for the query.

        Raises:
            DataAccessError: The page is cached, but the article key for one
                of its results is missing from the cache.
        """
        page_key = f'query:{query.query_str}'
        serialized_page = self._redis_client.get(page_key)
        if serialized_page is None:
            return None

        result_page = SearchResultPage(query=query)
        serialize.deserialize_search_results(serialized_page, result_page)
        for search_result in result_page.search_results:
            article_id = search_result.article.database_id
            article_key = f'article:{article_id}'
            serialized_article = self._redis_client.get(article_key)
            if serialized_article is None:
                utils.log_and_raise(
                    _log, DataAccessError,
                    f'Article key for ID "{article_id}" not found '
                    f'in first cache')
            serialize.deserialize_article(
                serialized_article, search_result.article)

        return result_page
Esempio n. 3
0
def _get_ipadic_neologd_version() -> str:
    """Return version of ipadic-NEologd used by this module.

    The version for ipadic-NEologd will be in the form of a date "yyyy.mm.dd".
    For example, 2019.06.11 for the ipadic-NEologd generated on June 11th,
    2019.

    The version is taken from the first line of the change log file that
    matches _IPADIC_NEOLOGD_VERSION_REGEX at the start of the line. The change
    log is looked up inside the directory obtained through the
    _IPADIC_NEOLOGD_GIT_DIR_ENV_VAR setting.

    Raises:
        ResourceLoadError: The change log file does not exist or has no line
            with version info.
    """
    git_dir = utils.get_value_from_env_variable(
        _IPADIC_NEOLOGD_GIT_DIR_ENV_VAR)
    changelog_path = os.path.join(git_dir, _IPADIC_NEOLOGD_CHANGELOG_FILENAME)

    if not os.path.exists(changelog_path):
        utils.log_and_raise(
            _log, ResourceLoadError,
            'ipadic-NEologd change log file not found at "{}"'.format(
                changelog_path))

    match = None
    # Explicit encoding so that reading does not depend on the locale's
    # default encoding.
    with open(changelog_path, 'r', encoding='utf-8') as changelog_file:
        for line in changelog_file:
            match = re.match(_IPADIC_NEOLOGD_VERSION_REGEX, line)
            if match is not None:
                break

    if match is None:
        utils.log_and_raise(
            _log, ResourceLoadError,
            'ipadic-NEologd change log file at "{}" does not contain version '
            'info'.format(changelog_path))

    return '{}.{}.{}'.format(match.group(1), match.group(2), match.group(3))
Esempio n. 4
0
    def _parse_body_text(self, article_tag: Tag) -> Optional[str]:
        """Extract the body text of an NHK article from its HTML.

        Collects the divs found for each id in _ARTICLE_BODY_IDS, then the
        divs found for each class in _ARTICLE_BODY_CLASSES, passes each div
        to _parse_body_div, and joins the non-empty results with a blank
        line between sections.

        Args:
            article_tag: Tag containing NHK article HTML.

        Returns:
            The parsed body text sections from article_tag joined together.

        Raises:
            HtmlParsingError: No body text section could be parsed out of
                article_tag.
        """
        candidate_divs: List[Tag] = []
        for body_id in self._ARTICLE_BODY_IDS:
            id_matches = article_tag.find_all('div', id=body_id)
            _log.debug('Found %s with id "%s"', len(id_matches), body_id)
            candidate_divs.extend(id_matches)

        for body_class in self._ARTICLE_BODY_CLASSES:
            class_matches = article_tag.find_all('div', class_=body_class)
            _log.debug(
                'Found %s with class "%s"', len(class_matches), body_class)
            candidate_divs.extend(class_matches)

        sections = []
        for div in candidate_divs:
            section_text = self._parse_body_div(div)
            # Divs that yield no text contribute nothing to the body.
            if section_text is None or len(section_text) == 0:
                continue
            sections.append(section_text)

        if not sections:
            utils.log_and_raise(
                _log, HtmlParsingError,
                'No body text sections in: "{}"'.format(article_tag))

        return '\n\n'.join(sections)
Esempio n. 5
0
    def _parse_count_string(self, count_str: str,
                            count_regex: Pattern) -> Optional[int]:
        """Parse a data count string for a series into an int.

        Also checks if the count string indicates that the count is hidden by
        preference of the author of the series.

        Args:
            count_str: String containing a count of some data from a Kakuyomu
                series page.
            count_regex: Pattern that the count string should match to be
                valid. The pattern must contain one group that captures the
                count number portion of the count string.

        Returns:
            The parsed count value as an int. If the count string indicates
            that the count is hidden by author preference, returns None
            instead.

        Raises:
            HtmlParsingError: The count string did not match the given pattern
                and was not the string indicating the count is hidden by author
                preference.
        """
        if count_str == self._INFO_LIST_HIDDEN_DATA_STRING:
            return None

        match = re.match(count_regex, count_str)
        if match is None:
            utils.log_and_raise(
                _log, HtmlParsingError,
                'Count string "{}" does not match pattern {}'.format(
                    count_str, count_regex))

        return int(match.group(1).replace(',', ''))
Esempio n. 6
0
def _get_jmdict_version() -> str:
    """Return version of JMdict currently used by this module.

    The version for JMdict will be in the form of a date "yyyy.mm.dd". For
    example, 2019.06.11 for the JMdict generated on June 11th, 2019.

    The version is taken from the first line of the JMdict XML file that
    matches _JMDICT_FILE_VERSION_REGEX at the start of the line. The file
    path is obtained through the _JMDICT_XML_FILEPATH_ENV_VAR setting.

    Raises:
        ResourceLoadError: The JMdict XML file does not exist or has no line
            with version info.
    """
    jmdict_xml_filepath = utils.get_value_from_env_variable(
        _JMDICT_XML_FILEPATH_ENV_VAR)

    if not os.path.exists(jmdict_xml_filepath):
        utils.log_and_raise(
            _log, ResourceLoadError,
            'JMdict XML file not found at "{}"'.format(jmdict_xml_filepath))

    match = None
    # JMdict XML is distributed as UTF-8 and is full of Japanese text, so
    # decode it explicitly instead of relying on the locale's default
    # encoding, which fails on non-UTF-8 locales.
    with open(jmdict_xml_filepath, 'r', encoding='utf-8') as jmdict_file:
        for line in jmdict_file:
            match = re.match(_JMDICT_FILE_VERSION_REGEX, line)
            if match is not None:
                break

    if match is None:
        utils.log_and_raise(
            _log, ResourceLoadError,
            'JMdict XML file at "{}" does not contain version info'.format(
                jmdict_xml_filepath))

    return '{}.{}.{}'.format(match.group(1), match.group(2), match.group(3))
Esempio n. 7
0
def _get_mecab_version() -> str:
    """Return version of MeCab on the system.

    The version is whatever "mecab-config --version" prints to stdout, with
    surrounding whitespace stripped.

    Raises:
        ResourceLoadError: mecab-config could not be run or exited with a
            non-zero return code.
    """
    try:
        output = subprocess.run(['mecab-config', '--version'],
                                capture_output=True)
    except OSError:
        # subprocess.run raises (e.g. FileNotFoundError) instead of returning
        # a non-zero code when the executable itself cannot be started, which
        # is the usual case when MeCab is not installed.
        output = None

    if output is None or output.returncode != 0:
        utils.log_and_raise(_log, ResourceLoadError,
                            'mecab is not available on this system')

    mecab_version = output.stdout.decode(sys.stdout.encoding).strip()
    return mecab_version
Esempio n. 8
0
    def _parse_entry_xml(self,
                         entry: ElementTree.Element) -> List[JMdictEntry]:
        """Parse every representation of the word in a JMdict XML entry.

        A Japanese word can often be written in several ways (kanji and
        kana), and a JMdict entry lists each written form as its own child
        element. One JMdictEntry object is created per such element, and the
        sense information from the entry's sense elements is then merged
        into those objects through _add_sense_data.

        Args:
            entry: An XML entry element from a JMdict XML file.

        Returns:
            A list with one JMdictEntry for each representation element of
            the given entry.

        Raises:
            ResourceLoadError: The passed entry had malformed JMdict XML, so
                it could not be parsed.
        """
        representations = []
        senses = []
        for child in entry:
            if child.tag in self._REPR_ELEMENT_TAGS:
                representation = JMdictEntry()
                # The entry ID lives on the entry itself, not on the
                # representation element, so it is read from the parent.
                self._parse_text_elements(
                    representation, entry, [self._ENTRY_ID_TAG],
                    required=True)
                self._parse_text_elements(
                    representation, child,
                    [self._REPR_TEXT_FORM_TAG[child.tag]],
                    required=True)
                self._parse_text_elements(
                    representation, child,
                    self._REPR_OPTIONAL_TAGS[child.tag],
                    required=False)
                representations.append(representation)
                continue

            if child.tag == self._SENSE_ELEMENT_TAG:
                sense = self._JMdictSense()
                self._parse_text_elements(
                    sense, child, self._SENSE_OPTIONAL_TAGS, required=False)
                senses.append(sense)
                continue

            # The entry ID element is the only other child that is expected.
            if child.tag != self._ENTRY_ID_TAG:
                entry_str = ElementTree.tostring(entry).decode('utf-8')
                utils.log_and_raise(
                    _log, ResourceLoadError,
                    'Malformed JMdict XML. Unknown tag "{}" found with "{}" '
                    'tag: "{}"'.format(child.tag, entry.tag, entry_str))

        self._add_sense_data(representations, senses)
        return representations
Esempio n. 9
0
    def wrapper_require_write_permission(*args, **kwargs):
        # The first positional argument is the object whose access_mode
        # decides whether the wrapped update operation is allowed to run.
        accessor = args[0]
        if not accessor.access_mode.has_update_permission():
            operation_name = utils.get_full_name(func)
            mode_name = accessor.access_mode.name
            utils.log_and_raise(
                _log, DataAccessPermissionError,
                'Update operation "{}" was attempted with only {} '
                'permission'.format(operation_name, mode_name))

        return func(*args, **kwargs)
Esempio n. 10
0
    def _raise_if_no_text(self, element: ElementTree.Element,
                          parent_element: ElementTree.Element) -> None:
        """Raise ResourceLoadError if no accessible text in element."""
        if element.text is not None and len(element.text) > 0:
            return

        parent_str = ElementTree.tostring(parent_element).decode('utf-8')
        utils.log_and_raise(
            _log, ResourceLoadError,
            'Malformed JMdict XML. No accessible text within "{}" element: '
            '"{}"'.format(element.tag, parent_str))
Esempio n. 11
0
    def max_mecab_decomp_len(self) -> int:
        """Max len of a MeCab decomposition of the loaded JMdict entries.

        Exposed through an accessor so that the value stays read-only.

        Raises:
            ResourceNotReadyError: The value has not been set yet, meaning
                that this object was used before any JMdict data was loaded.
        """
        max_len = self._max_mecab_decomp_len
        if max_len is None:
            utils.log_and_raise(
                _log, ResourceNotReadyError,
                'JMdict object used before loading any JMdict data.')

        return max_len
Esempio n. 12
0
    def _has_news_video(self, article_page_soup: BeautifulSoup) -> bool:
        """Return True if there is a news video on the article page.

        The check is a search for _HAS_VIDEO_REGEX in the text of the script
        tag selected from under the page's main tag.

        Raises:
            HtmlParsingError: The selected script tag has no text.
        """
        main_tag = html.select_one_descendant_by_tag(article_page_soup, 'main')
        script_tag = html.select_one_descendant_by_tag(main_tag, 'script')

        script_text = script_tag.string
        if script_text is None:
            utils.log_and_raise(
                _log, HtmlParsingError,
                'No text in article JSON script tag in: "{}"'.format(main_tag))

        video_match = re.search(self._HAS_VIDEO_REGEX, script_text)
        return video_match is not None
Esempio n. 13
0
def test_log_and_raise(caplog):
    """Check that utils.log_and_raise logs the message and then raises."""
    class ExpectedError(Exception):
        pass

    logger_name = 'test'
    logger = logging.getLogger(logger_name)
    with pytest.raises(ExpectedError) as raised:
        utils.log_and_raise(logger, ExpectedError, ERROR_LOG_MESSAGE)

    # The raised exception carries the message as its first argument.
    assert raised.type is ExpectedError
    assert raised.value.args[0] == ERROR_LOG_MESSAGE

    # Exactly one record is logged, at ERROR level, on the given logger.
    assert len(caplog.records) == 1
    expected_record = (logger_name, logging.ERROR, ERROR_LOG_MESSAGE)
    assert caplog.record_tuples[0] == expected_record
Esempio n. 14
0
    def _parse_json_datetime_str(self, dt_str: str) -> datetime:
        """Parse a datetime string from NHK article metadata json.

        The datetime strings in NHK article metadata json are stored as JST,
        so the parsed datetime is converted to UTC before being returned.

        Raises:
            ValueError: dt_str does not match _NHK_JSON_DATETIME_FORMAT.
        """
        datetime_format = self._NHK_JSON_DATETIME_FORMAT
        try:
            jst_datetime = datetime.strptime(dt_str, datetime_format)
        except ValueError:
            # Raised again through log_and_raise so the failure is logged.
            utils.log_and_raise(
                _log, ValueError,
                'Failed to parse NHK json datetime "{}" using format '
                '"{}"'.format(dt_str, datetime_format))

        return utils.convert_jst_to_utc(jst_datetime)
Esempio n. 15
0
    def _find_all_raise_if_none(
            self, tag: str,
            parent_element: ElementTree.Element) -> List[ElementTree.Element]:
        """Find all tag elements in parent, and raises error if none.

        Raises ResourceLoadError if no tag elements are found.
        """
        elements = parent_element.findall(tag)
        if len(elements) == 0:
            parent_str = ElementTree.tostring(parent_element).decode('utf-8')
            utils.log_and_raise(
                _log, ResourceLoadError,
                'Malformed JMdict XML. No "{}" element within "{}" element: '
                '"{}"'.format(tag, parent_element.tag, parent_str))

        return elements
Esempio n. 16
0
    def parse(self,
              text: str,
              text_offset: int = 0) -> List[FoundJpnLexicalItem]:
        """Return the lexical items found by MeCab in the text.

        MeCab will give exactly one lexical item interpretation (its best
        guess) for each lexical item found in the text.

        Args:
            text: The text to parse with MeCab for lexical items.
            text_offset: Offset that this text starts at if the text is
                part of a larger body of text.

        Returns:
            One found lexical item per MeCab token, in token order. The
            position given for each item is its offset within text plus
            text_offset.

        Raises:
            TextAnalysisError: MeCab gave an unexpected output when parsing the
                text. This covers both a token with an unexpected number of
                tags and a token whose text cannot be located in the input.
        """
        mecab_out = self._mecab_tagger.parse(text)
        parsed_tokens = self._parse_mecab_output(mecab_out)

        offset = 0
        found_lexical_items = []
        for parsed_token_tags in parsed_tokens:
            if (len(parsed_token_tags) == 1
                    and parsed_token_tags[0] == self._END_OF_SECTION_MARKER):
                continue

            if len(parsed_token_tags) not in self._EXPECTED_TOKEN_TAG_COUNTS:
                utils.log_and_raise(
                    _log, TextAnalysisError,
                    'Unexpected number of MeCab tags ({}) for token {} in '
                    '"{}"'.format(len(parsed_token_tags), parsed_token_tags,
                                  text))

            # Adjust offset to account for MeCab skipping some white space
            # characters. str.find is used rather than stepping one character
            # at a time so that a token missing from the text is reported as
            # an error instead of looping forever.
            token_text = parsed_token_tags[0]
            token_start = text.find(token_text, offset)
            if token_start == -1:
                utils.log_and_raise(
                    _log, TextAnalysisError,
                    'MeCab token "{}" not found at or after offset {} in '
                    '"{}"'.format(token_text, offset, text))
            offset = token_start

            interp = self._create_mecab_interp(parsed_token_tags)
            fli = self._create_found_lexical_item(parsed_token_tags, interp,
                                                  text_offset + offset)
            offset += len(token_text)
            found_lexical_items.append(fli)

        return found_lexical_items
Esempio n. 17
0
    def _parse_search_result_datetime(self, datetime_str: str) -> datetime:
        """Parse a datetime string from the search results page.

        The string is parsed with _SEARCH_RESULT_DATETIME_FORMAT, its seconds
        are set to 0, and the result is passed through
        utils.convert_jst_to_utc.

        Raises:
            HtmlParsingError: The datetime string could not be parsed.
        """
        datetime_format = self._SEARCH_RESULT_DATETIME_FORMAT
        try:
            parsed = datetime.strptime(datetime_str, datetime_format)
        except ValueError:
            utils.log_and_raise(
                _log, HtmlParsingError,
                'Failed to parse search result datetime string "{}" using '
                'format "{}"'.format(datetime_str, datetime_format))

        # Search result datetime strings do not include seconds, so the
        # seconds are explicitly pinned to 0.
        without_seconds = parsed.replace(second=0)
        return utils.convert_jst_to_utc(without_seconds)
Esempio n. 18
0
    def _parse_series_episode_metadatas(
            self, series_page_soup: BeautifulSoup,
            series_blog: JpnArticleBlog) -> List[JpnArticle]:
        """Parse the episode metadatas for a series from its homepage.

        Args:
            series_page_soup: A BeautifulSoup initialized with the content from
                a series homepage.
            series_blog: Blog info for this series.

        Returns:
            A list of the article metadatas for all episodes listed on the
            series homepage.
        """
        table_of_contents_items = self._select_table_of_contents_items(
            series_page_soup)

        article_metas = []
        ep_order_num = 1
        section_order_num = 0
        section_ep_order_num = 1
        section_name = None
        for item in table_of_contents_items:
            if self._is_section_li(item):
                section_order_num += 1
                section_ep_order_num = 1
                section_name = html.parse_valid_child_text(item).strip()
            elif self._is_episode_li(item):
                article_meta = self._parse_table_of_contents_episode(
                    item, series_blog, ep_order_num, section_name,
                    section_order_num, section_ep_order_num)
                article_metas.append(article_meta)
                ep_order_num += 1
                section_ep_order_num += 1
            else:
                utils.log_and_raise(
                    _log, HtmlParsingError,
                    'Unrecognized list item "{}" in table of contents: '
                    '"{}"'.format(item, series_page_soup))

        return article_metas
Esempio n. 19
0
    def get_containing_sentence(
            self, item_pos: ArticleTextPosition) -> Tuple[str, int]:
        """Get the sentence containing the lexical item at item_pos.

        Args:
            item_pos: The position whose containing sentence to get.

        Returns:
            (containing sentence, offset of containing sentence in article)

        Raises:
            MissingDataError: full_text is not set on this object.
        """
        if self.full_text is None:
            utils.log_and_raise(
                _log, MissingDataError,
                'full_text is not set, so cannot get containing sentences in '
                '{!r}'.format(self))

        sentence_start = utils.find_jpn_sentence_start(
            self.full_text, item_pos.start)
        item_end = item_pos.start + item_pos.len
        sentence_end = utils.find_jpn_sentence_end(self.full_text, item_end)

        # The sentence end index is inclusive, hence the + 1 in the slice.
        sentence = self.full_text[sentence_start:sentence_end + 1]
        return (sentence, sentence_start)
Esempio n. 20
0
    def _get_mecab_neologd_dict_path(self) -> str:
        """Find the path to the NEologd dict in the system.

        The path is the _MECAB_NEOLOGD_DIR_NAME directory inside the
        directory printed by "mecab-config --dicdir".

        Returns:
            The path to the directory containing the NEologd dictionary.

        Raises:
            ResourceLoadError: mecab-config could not be run successfully, or
                the NEologd dictionary directory does not exist.
        """
        # subprocess.run raises OSError (e.g. FileNotFoundError) instead of
        # returning a non-zero code when the executable cannot be started, so
        # that case is folded into the same error as a failed run.
        try:
            output = subprocess.run(['mecab-config', '--version'],
                                    capture_output=True)
        except OSError:
            output = None
        if output is None or output.returncode != 0:
            utils.log_and_raise(
                _log, ResourceLoadError,
                'MeCab is not installed on this system, so the '
                'mecab-ipadic-NEologd dictionary cannot be used')

        try:
            output = subprocess.run(['mecab-config', '--dicdir'],
                                    capture_output=True)
        except OSError:
            output = None
        if output is None or output.returncode != 0:
            utils.log_and_raise(
                _log, ResourceLoadError,
                'MeCab dictionary directory could not be retrieved, so the '
                'mecab-ipadic-NEologd dictionary cannot be used')

        neologd_path = os.path.join(
            output.stdout.decode(sys.stdout.encoding).strip(),
            self._MECAB_NEOLOGD_DIR_NAME)
        if not os.path.exists(neologd_path):
            utils.log_and_raise(
                _log, ResourceLoadError,
                'mecab-ipadic-NEologd is not installed on this system, so the '
                'mecab-ipadic-NEologd dictionary cannot be used')

        return neologd_path
Esempio n. 21
0
    def contains_entry(self, entry: Union[str, Tuple[str, ...]]) -> bool:
        """Check whether entry is one of the loaded JMdict entries.

        Args:
            entry: Value to look for in the loaded JMdict entries. A string
                is looked up as an entry text form; anything else (a tuple)
                is looked up as a Mecab decomposition.

        Returns:
            True if the entry is in the loaded JMdict entries, False
            otherwise.

        Raises:
            ResourceNotReadyError: JMdict data has not been loaded into this
                JMdict object yet.
        """
        maps_loaded = (self._entry_map is not None
                       and self._mecab_decomp_map is not None)
        if not maps_loaded:
            utils.log_and_raise(
                _log, ResourceNotReadyError,
                'JMdict object used before loading any JMdict data.')

        if isinstance(entry, str):
            lookup_map = self._entry_map
        else:
            lookup_map = self._mecab_decomp_map
        return entry in lookup_map
Esempio n. 22
0
    def get_entries(
            self, entry: Union[str, Tuple[str, ...]]) -> List[JMdictEntry]:
        """Get the list of JMdict entries that match the given entry.

        Args:
            entry: Value to get matching JMdict entries for. A string is
                matched against entry text forms; anything else (a tuple) is
                matched against Mecab decompositions.

        Returns:
            A list of the matching JMdict entries, which is empty if nothing
            matches.

        Raises:
            ResourceNotReadyError: JMdict data has not been loaded into this
                JMdict object yet.
        """
        maps_loaded = (self._entry_map is not None
                       and self._mecab_decomp_map is not None)
        if not maps_loaded:
            utils.log_and_raise(
                _log, ResourceNotReadyError,
                'JMdict object used before loading any JMdict data.')

        if isinstance(entry, str):
            lookup_map = self._entry_map
        else:
            lookup_map = self._mecab_decomp_map
        return lookup_map.get(entry, [])
Esempio n. 23
0
def _raise_parsing_error(error_msg: str) -> None:
    """Raise and log error encountered during HTML parsing.

    Delegates to utils.log_and_raise with this module's logger and
    HtmlParsingError as the exception type.

    Args:
        error_msg: Message passed on to utils.log_and_raise.
    """
    utils.log_and_raise(_log, HtmlParsingError, error_msg)