Esempio n. 1
0
    def test_correct_length(self):
        # Every corpus view must report the same number of items as the
        # reference tokenization of the raw file contents.

        for path, text in self.data():
            # Whitespace-delimited view: compare against str.split().
            view = StreamBackedCorpusView(path, read_whitespace_block)
            observed = len(view)
            self.assertEqual(observed, len(text.split()))

            # Line-delimited view: compare against the line tokenizer.
            view = StreamBackedCorpusView(path, read_line_block)
            observed = len(view)
            self.assertEqual(observed, len(self.linetok.tokenize(text)))
Esempio n. 2
0
    def test_correct_values(self):
        # Every corpus view must yield exactly the items produced by the
        # reference tokenization of the raw file contents, in order.

        for path, text in self.data():
            # Whitespace-delimited view: compare against str.split().
            view = StreamBackedCorpusView(path, read_whitespace_block)
            observed = list(view)
            self.assertEqual(observed, text.split())

            # Line-delimited view: compare against the line tokenizer.
            view = StreamBackedCorpusView(path, read_line_block)
            observed = list(view)
            self.assertEqual(observed, self.linetok.tokenize(text))
Esempio n. 3
0
 def docs(self, fileids=None):
     """
     Create one C{StreamBackedCorpusView} per selected corpus file, each
     reading blocks with C{self._read_block} and decoding with that
     file's encoding, and combine the views with C{concat}.

     @return: The concatenation (via C{concat}) of the per-file views.
     @rtype: whatever C{concat} returns for a list of
         C{StreamBackedCorpusView}
     @param fileids: The corpus files to read; passed straight through
         to C{self.abspaths}.
     @type fileids: C{list} of C{str} or regular expression
     """
     per_file_views = []
     # The second positional argument asks abspaths for (path, encoding)
     # pairs rather than bare paths.
     for path, encoding in self.abspaths(fileids, True):
         view = StreamBackedCorpusView(
             path, self._read_block, encoding=encoding
         )
         per_file_views.append(view)
     return concat(per_file_views)
Esempio n. 4
0
 def verbs(self):
     """
     @return: a corpus view over the verbs file (C{self._verbsfile}),
         read with C{read_line_block} and decoded with the encoding
         registered for that file; it acts as a list of all verb lemmas
         in this corpus.
     """
     verbs_fileid = self._verbsfile
     verbs_path = self.abspath(verbs_fileid)
     verbs_encoding = self.encoding(verbs_fileid)
     return StreamBackedCorpusView(
         verbs_path, read_line_block, encoding=verbs_encoding
     )