Example no. 1
0
 def get_words_with_coordinates(self):
     """Yield ``(word, line_index, word_index)`` triples from the file.

     Iteration begins at ``self._start_line``; on that line the first
     ``self._start_word`` words are skipped, and every later line is
     tokenized in full. Indices are zero-based positions in the file /
     tokenized line.
     """
     with open(self.path, 'r') as source:
         for row, text in enumerate(source):
             if row < self._start_line:
                 continue  # before the starting position
             words = tokenize(text, with_newline=True)
             if row == self._start_line:
                 # Drop the words preceding the start word on the first line.
                 words = words[self._start_word:]
             for col, token in enumerate(words):
                 yield token, row, col
Example no. 2
0
 def _load_buffer(self):
     """Collect tokens of the last ``self._bufsize`` lines up to the cursor.

     Scans the file up to ``self._line_number`` (inclusive) and gathers the
     tokens of the lines inside the buffer window. On the cursor line
     (``self.start_line``) only the words before ``self.start_word`` are
     kept. Side effect: ``self._buffer_start`` records the first buffered
     line index. Returns the collected tokens in reverse order.
     """
     tokens = list()
     # First line index that still falls inside the buffer window.
     window_edge = self._line_number - self._bufsize
     with open(self.path) as source:
         for row, text in enumerate(source):
             if row > self._line_number:
                 break  # past the far end of the buffer
             if row == window_edge or row == 0:
                 # Near end of buffer (clamped to the file start).
                 self._buffer_start = row
             if row > window_edge:
                 line_tokens = tokenize(text, with_newline=True)
                 if row == self.start_line:
                     # On the cursor line, keep only words before the cursor.
                     line_tokens = line_tokens[:self.start_word]
                 tokens.extend(line_tokens)
     return reversed(tokens)
Example no. 3
0
 def _load_lines(self):
     """Tokenize every line of ``self.path`` into ``self._lines``.

     Each entry is keyed by its zero-based line number.
     """
     with open(self.path) as handle:
         for number, raw in enumerate(handle):
             self._lines[number] = tokenize(raw)
Example no. 4
0
 def test_tokenize_newline(self):
     """With ``with_newline=True`` the final token must be the newline."""
     tokens = helper.tokenize('', with_newline=True)
     assert tokens[-1] == '\n'
Example no. 5
0
 def test_tokenize(self, line, result):
     """Parametrized check: tokenizing ``line`` yields exactly ``result``."""
     tokens = helper.tokenize(line)
     assert tokens == result