def get_words_with_coordinates(self):
    """Yield (word, line_index, word_index) for every word from the start position onward."""
    with open(self.path, 'r') as f:
        for line_index, line in enumerate(f):
            if line_index < self._start_line:
                continue
            if line_index == self._start_line:
                # On the starting line, skip the words before the starting word.
                current_line = tokenize(line, with_newline=True)[self._start_word:]
            else:
                current_line = tokenize(line, with_newline=True)
            for word_index, word in enumerate(current_line):
                yield word, line_index, word_index
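# Usage sketch (hypothetical): `reader` stands for an instance of the class these
# methods belong to; the name is assumed, not taken from the source.
#
#     for word, line_index, word_index in reader.get_words_with_coordinates():
#         print(f'{line_index}:{word_index} -> {word!r}')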
def _load_buffer(self):
    result = list()
    with open(self.path) as f:
        for index, line in enumerate(f):
            if index > self._line_number:
                break  # Reached far end of buffer
            if index == self._line_number - self._bufsize or index == 0:
                self._buffer_start = index  # Reached near end of buffer
            if index > self._line_number - self._bufsize:
                if index == self.start_line:
                    result.extend(tokenize(line, with_newline=True)[:self.start_word])
                else:
                    result.extend(tokenize(line, with_newline=True))
    return reversed(result)
def _load_lines(self):
    with open(self.path) as f:
        for index, line in enumerate(f):
            self._lines[index] = tokenize(line)
def test_tokenize_newline(self):
    assert helper.tokenize('', with_newline=True)[-1] == '\n'
def test_tokenize(self, line, result):
    # `line` and `result` are expected to be supplied by test parametrization
    # (e.g. pytest.mark.parametrize) elsewhere in the test module.
    assert helper.tokenize(line) == result
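# Minimal sketch of the `tokenize` helper these methods and tests rely on. This is an
# assumption about its behaviour (whitespace splitting, optional trailing newline token),
# not the actual implementation from the source.
def tokenize(line, with_newline=False):
    # Split the line into whitespace-separated words, dropping the trailing newline.
    words = line.rstrip('\n').split()
    if with_newline:
        # Append an explicit newline token so line boundaries survive tokenization,
        # even for empty lines (consistent with test_tokenize_newline above).
        words.append('\n')
    return words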