def _can_expand_right(self, pos: ArticleTextPosition) -> bool:
    """Return True if the sample text at pos can be expanded to the right.

    Returns False when:
        - The end of the article is immediately to the right of pos.
        - pos starts inside the article title and expanding right would
            move from inside the title to outside of it.
        - The index immediately right of pos is already recorded in the
            used sentences for this preview.
    """
    right_index = pos.start + pos.len
    article_text = self._article.full_text
    if right_index == len(article_text):
        return False

    # Expansion from inside the title to outside it is not allowed
    title_len = len(self._article.title)
    if pos.start < title_len:
        boundary = utils.find_jpn_sentence_start(article_text, right_index)
        # Step left over any newline characters before comparing the
        # boundary against the title length.
        while boundary > 0 and article_text[boundary] == '\n':
            boundary -= 1
        if boundary >= title_len:
            return False

    return right_index not in self._used_sentences
def group_text_positions_by_sentence(
    self, text_positions: List[ArticleTextPosition]
) -> List[SentenceGroup]:
    """Group a list of text positions by their containing sentences.

    Args:
        text_positions: List of text positions in this article.

    Returns:
        A list of (sentence position, contained text positions) tuples.
        The tuples are in the order their sentences were first reached
        while walking the text positions in ascending start order.
    """
    grouped: SentenceGroupMap = defaultdict(list)
    sentence_end = -1
    for text_pos in sorted(text_positions, key=attrgetter('start')):
        # Only look up new sentence bounds once a position starts past
        # the end of the sentence currently being filled.
        if text_pos.start > sentence_end:
            sentence_start = utils.find_jpn_sentence_start(
                self.full_text, text_pos.start
            )
            # NOTE(review): the end lookup begins at the index just past
            # text_pos; confirm this is what find_jpn_sentence_end
            # expects when text_pos ends on a sentence's last character.
            sentence_end = utils.find_jpn_sentence_end(
                self.full_text, text_pos.start + text_pos.len
            )

        sentence_len = sentence_end - sentence_start + 1
        sentence_key = ArticleTextPosition(sentence_start, sentence_len)
        grouped[sentence_key].append(text_pos)

    return [
        (sentence_pos, tuple(members))
        for sentence_pos, members in grouped.items()
    ]
def _get_left_sentence_segs(
    self, pos: ArticleTextPosition
) -> Tuple[List[PreviewSampleTextSegment], int]:
    """Get the segments and start index of the sentence left of pos.

    Args:
        pos: The position whose left neighboring sentence to get.

    Returns:
        (sample segments spanning from the left sentence start up to the
        start of pos, start index of that left sentence)
    """
    # Look up the sentence start from the character just before pos.
    sentence_start = utils.find_jpn_sentence_start(
        self._article.full_text, pos.start - 1
    )
    found = self._sentence_found_positions_map.get(sentence_start, ())

    # The span covers the text from the sentence start up to pos.start.
    left_span = ArticleTextPosition(
        sentence_start, pos.start - sentence_start
    )
    segments = self._create_sample_segments(left_span, found)
    return (segments, sentence_start)
def get_containing_sentence(
    self, item_pos: ArticleTextPosition
) -> Tuple[str, int]:
    """Get the sentence containing the lexical item at item_pos.

    If full_text is None, utils.log_and_raise is called with
    MissingDataError before any sentence lookup is attempted.

    Args:
        item_pos: The position whose containing sentence to get.

    Returns:
        (containing sentence, offset of containing sentence in article)
    """
    if self.full_text is None:
        utils.log_and_raise(
            _log, MissingDataError,
            'full_text is not set, so cannot get containing sentences in '
            '{!r}'.format(self)
        )

    sentence_start = utils.find_jpn_sentence_start(
        self.full_text, item_pos.start
    )
    # NOTE(review): the end lookup begins at the index just past
    # item_pos; confirm this is what find_jpn_sentence_end expects when
    # the item ends on a sentence's last character.
    sentence_end = utils.find_jpn_sentence_end(
        self.full_text, item_pos.start + item_pos.len
    )

    # sentence_end is treated as inclusive, hence the + 1 on the slice.
    sentence = self.full_text[sentence_start:sentence_end + 1]
    return (sentence, sentence_start)
def _can_expand_left(self, pos: ArticleTextPosition) -> bool:
    """Return True if the sample text at pos can be expanded to the left.

    Returns False when:
        - pos starts at the very beginning of the article.
        - pos starts at or after the end of the title while the sentence
            to its left starts inside the title.
        - The start index of the sentence to the left is already
            recorded in the used sentences for this preview.
    """
    if pos.start == 0:
        return False

    sentence_start = utils.find_jpn_sentence_start(
        self._article.full_text, pos.start - 1
    )

    # Expansion from outside the title to inside it is not allowed
    title_len = len(self._article.title)
    if sentence_start < title_len <= pos.start:
        return False

    return sentence_start not in self._used_sentences
def assert_sentence_start_ends(text, ends):
    """Assert find_jpn_sentence_(start|end) works for all of the text.

    Every character index of the text is passed to both functions, and
    the results are compared to the sentence the index should belong to.

    Args:
        text: A string of text to use for the test.
        ends: The indexes of the ending character of every sentence in
            text in ascending order.
    """
    cursor = 0
    expected_start, expected_end = 0, ends[cursor]
    for char_index in range(len(text)):
        if char_index > expected_end:
            # Passed the end of the current sentence, so this index is
            # the first character of the next one.
            cursor += 1
            expected_start, expected_end = char_index, ends[cursor]

        assert utils.find_jpn_sentence_start(text, char_index) == (
            expected_start)
        assert utils.find_jpn_sentence_end(text, char_index) == (
            expected_end)