def test_correct_length(self):
    # For every (file, contents) pair from self.data(), each corpus view
    # must report the same number of items as the reference tokenization
    # of the contents.
    for path, contents in self.data():
        # Whitespace-delimited tokens vs. a plain str.split().
        view = StreamBackedCorpusView(path, read_whitespace_block)
        actual = len(view)
        expected = len(contents.split())
        self.assertEqual(actual, expected)

        # Line blocks vs. the fixture's line tokenizer.
        view = StreamBackedCorpusView(path, read_line_block)
        actual = len(view)
        expected = len(self.linetok.tokenize(contents))
        self.assertEqual(actual, expected)
def test_correct_values(self):
    # For every (file, contents) pair from self.data(), each corpus view
    # must yield exactly the items of the reference tokenization, in order.
    for path, contents in self.data():
        # Whitespace-delimited tokens vs. a plain str.split().
        view = StreamBackedCorpusView(path, read_whitespace_block)
        actual = list(view)
        expected = contents.split()
        self.assertEqual(actual, expected)

        # Line blocks vs. the fixture's line tokenizer.
        view = StreamBackedCorpusView(path, read_line_block)
        actual = list(view)
        expected = self.linetok.tokenize(contents)
        self.assertEqual(actual, expected)
def docs(self, fileids=None):
    """
    Build one C{StreamBackedCorpusView} per selected corpus file and
    join them with C{concat}.

    Each view reads its file with C{self._read_block}, using the
    encoding paired with that file by C{self.abspaths}.

    @param fileids: A list of corpus files; forwarded unchanged to
        C{self.abspaths}.
    @type fileids: C{list} of C{str} or regular expression
    @return: A list of corpus document strings.
    @rtype: C{list} of C{StreamBackedCorpusView}
    """
    views = []
    # The second positional argument asks abspaths for (path, encoding)
    # pairs rather than bare paths.
    for path, file_encoding in self.abspaths(fileids, True):
        views.append(
            StreamBackedCorpusView(path, self._read_block, encoding=file_encoding)
        )
    return concat(views)
def verbs(self):
    """
    @return: a C{StreamBackedCorpusView} over the verbs file
        (C{self._verbsfile}), read with C{read_line_block} in that
        file's encoding; it acts as a list of all verb lemmas in this
        corpus.
    """
    verbs_path = self.abspath(self._verbsfile)
    verbs_encoding = self.encoding(self._verbsfile)
    return StreamBackedCorpusView(
        verbs_path, read_line_block, encoding=verbs_encoding
    )