Example #1
0
 def __init__(self, corpus_file, encoding, aligned, group_by_sent,
              word_tokenizer, sent_tokenizer, alignedsent_block_reader):
     """
     Record this view's settings, then initialize the underlying
     ``StreamBackedCorpusView``.

     :param corpus_file: Passed to ``StreamBackedCorpusView.__init__``
         as the file to read.
     :param encoding: Passed to ``StreamBackedCorpusView.__init__`` as
         the ``encoding`` keyword.
     :param aligned: Stored as ``self._aligned``.
     :param group_by_sent: Stored as ``self._group_by_sent``.
     :param word_tokenizer: Stored as ``self._word_tokenizer``.
     :param sent_tokenizer: Stored as ``self._sent_tokenizer``.
     :param alignedsent_block_reader: Stored as
         ``self._alignedsent_block_reader``.
     """
     # Tokenizers and the block reader handed in by the caller.
     self._word_tokenizer = word_tokenizer
     self._sent_tokenizer = sent_tokenizer
     self._alignedsent_block_reader = alignedsent_block_reader
     # Remaining settings.
     self._aligned, self._group_by_sent = aligned, group_by_sent
     # The base initializer runs only after every attribute is in place.
     StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
Example #2
0
 def __init__(self, *args, **kwargs):
     """
     Initialize the view and move its start position past the heading
     block of the file.

     All positional and keyword arguments are forwarded unchanged to
     ``StreamBackedCorpusView.__init__``.
     """
     StreamBackedCorpusView.__init__(self, *args, **kwargs)
     # _open() makes self._stream available for the reads below.
     self._open()
     stream = self._stream
     # Read the heading block; its contents are discarded.
     read_blankline_block(stream)
     # The position right after the heading becomes the start position.
     self._filepos = [stream.tell()]
Example #3
0
 def __init__(self, corpus_file, encoding, aligned, group_by_sent,
              word_tokenizer, sent_tokenizer, alignedsent_block_reader):
     """
     Keep the caller's settings on the instance and set up the
     ``StreamBackedCorpusView`` base for ``corpus_file``.

     :param corpus_file: Forwarded to the base initializer.
     :param encoding: Forwarded to the base initializer as ``encoding``.
     :param aligned: Kept as ``self._aligned``.
     :param group_by_sent: Kept as ``self._group_by_sent``.
     :param word_tokenizer: Kept as ``self._word_tokenizer``.
     :param sent_tokenizer: Kept as ``self._sent_tokenizer``.
     :param alignedsent_block_reader: Kept as
         ``self._alignedsent_block_reader``.
     """
     self._alignedsent_block_reader = alignedsent_block_reader
     self._sent_tokenizer = sent_tokenizer
     self._word_tokenizer = word_tokenizer
     self._group_by_sent = group_by_sent
     self._aligned = aligned
     # Base-class setup comes last, once all settings are stored.
     StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
Example #4
0
	def __init__(self, *args, **kwargs):
		"""
		Set up the view so that it starts after the file's heading block.

		Positional and keyword arguments are handed on unchanged to
		``StreamBackedCorpusView.__init__``.
		"""
		StreamBackedCorpusView.__init__(self, *args, **kwargs)
		# The stream must be open before anything can be read from it.
		self._open()
		stream = self._stream
		# Read the heading block and throw the result away.
		read_blankline_block(stream)
		# Use the stream's position after the heading as the start position.
		self._filepos = [stream.tell()]
Example #5
0
    def test_correct_length(self):
        """Corpus views must report the same length as a direct tokenization."""
        for path, text in self.data():
            # Whitespace-delimited tokens.
            view = StreamBackedCorpusView(path, read_whitespace_block)
            actual = len(view)
            expected = len(text.split())
            self.assertEqual(actual, expected)

            # Line-delimited tokens.
            view = StreamBackedCorpusView(path, read_line_block)
            actual = len(view)
            expected = len(self.linetok.tokenize(text))
            self.assertEqual(actual, expected)
Example #6
0
    def test_correct_values(self):
        """Corpus views must yield the same items as a direct tokenization."""
        for path, text in self.data():
            # Whitespace-delimited tokens.
            view = StreamBackedCorpusView(path, read_whitespace_block)
            actual = list(view)
            expected = text.split()
            self.assertEqual(actual, expected)

            # Line-delimited tokens.
            view = StreamBackedCorpusView(path, read_line_block)
            actual = list(view)
            expected = self.linetok.tokenize(text)
            self.assertEqual(actual, expected)
Example #7
0
    def __init__(self, fileid, delete_on_gc=False):
        """
        Build a corpus view over the pickle corpus stored in ``fileid``.

        :param fileid: The pickle corpus file to read; forwarded to
            ``StreamBackedCorpusView.__init__`` with ``encoding=None``.
        :param delete_on_gc: When true, ``fileid`` is to be deleted once
            this object is garbage-collected.  This method only records
            the flag as ``self._delete_on_gc``.
        """
        # Remember the flag before the base class is initialized.
        self._delete_on_gc = delete_on_gc
        StreamBackedCorpusView.__init__(self, fileid, encoding=None)
Example #8
0
    def __init__(self, filename, startpos=0, **kwargs):
        """
        Initialize the view over ``filename`` and read display options
        from ``kwargs``.

        :param filename: Forwarded to ``StreamBackedCorpusView.__init__``.
        :param startpos: Forwarded to the base initializer (default 0).
        :param kwargs: Optional settings, each popped from ``kwargs`` and
            stored on the instance: ``tags`` (stored as ``show_tags``,
            default True), ``disamb_only`` (True), ``mode``
            (``IPIPANCorpusView.WORDS_MODE``), ``simplify_tags`` (False),
            ``one_tag`` (True), ``append_no_space`` (False),
            ``append_space`` (False), ``replace_xmlentities`` (True).
            Any other keys are left in ``kwargs`` and not used here.
        """
        # The 2nd and 4th positional arguments of the base initializer
        # are deliberately passed as None.
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)

        # Initial reader state.
        self.position = 0
        self.in_sentence = False

        # (attribute name, keyword option, default value)
        option_table = (
            ('show_tags', 'tags', True),
            ('disamb_only', 'disamb_only', True),
            ('mode', 'mode', IPIPANCorpusView.WORDS_MODE),
            ('simplify_tags', 'simplify_tags', False),
            ('one_tag', 'one_tag', True),
            ('append_no_space', 'append_no_space', False),
            ('append_space', 'append_space', False),
            ('replace_xmlentities', 'replace_xmlentities', True),
        )
        for attr_name, option, fallback in option_table:
            setattr(self, attr_name, kwargs.pop(option, fallback))
Example #9
0
    def __init__(self, filename, startpos=0, **kwargs):
        """
        Set up the corpus view for ``filename`` and pick up optional
        settings from ``kwargs``.

        :param filename: Passed to ``StreamBackedCorpusView.__init__``.
        :param startpos: Passed to the base initializer (default 0).
        :param kwargs: Recognised keys are ``tags``, ``disamb_only``,
            ``mode``, ``simplify_tags``, ``one_tag``, ``append_no_space``,
            ``append_space`` and ``replace_xmlentities``.  Each one is
            removed from ``kwargs`` and stored as an attribute; missing
            keys fall back to the defaults written below.
        """
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)

        # Reader state starts outside any sentence, at position zero.
        self.in_sentence = False
        self.position = 0

        option = kwargs.pop  # pop(key, default): take the value or fall back

        # Options that default to True.
        self.show_tags = option('tags', True)
        self.disamb_only = option('disamb_only', True)
        self.one_tag = option('one_tag', True)
        self.replace_xmlentities = option('replace_xmlentities', True)

        # Options that default to False.
        self.simplify_tags = option('simplify_tags', False)
        self.append_no_space = option('append_no_space', False)
        self.append_space = option('append_space', False)

        # Defaults to the WORDS_MODE constant.
        self.mode = option('mode', IPIPANCorpusView.WORDS_MODE)
Example #10
0
    def __init__(self, filename, startpos=0, **kwargs):
        """
        Create the view over ``filename`` and configure it from ``kwargs``.

        :param filename: Handed to ``StreamBackedCorpusView.__init__``.
        :param startpos: Handed to the base initializer (default 0).
        :param kwargs: Optional settings.  Each recognised key is popped
            and stored on the instance under the attribute named in the
            table below (``tags`` is stored as ``show_tags``; every other
            key keeps its own name).  Unrecognised keys are not used here.
        """
        # The 2nd and 4th positional arguments of the base initializer
        # are passed as None.
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)

        # Initial reader state.
        self.position = 0
        self.in_sentence = False

        # keyword option -> (attribute name, default value)
        settings = (
            ("tags", ("show_tags", True)),
            ("disamb_only", ("disamb_only", True)),
            ("mode", ("mode", IPIPANCorpusView.WORDS_MODE)),
            ("simplify_tags", ("simplify_tags", False)),
            ("one_tag", ("one_tag", True)),
            ("append_no_space", ("append_no_space", False)),
            ("append_space", ("append_space", False)),
            ("replace_xmlentities", ("replace_xmlentities", True)),
        )
        for key, (attribute, default) in settings:
            setattr(self, attribute, kwargs.pop(key, default))
Example #11
0
 def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
     """
     Initialize the corpus view and compute the end-of-file position
     for ``fileid``, including gzip-compressed files.

     :param fileid: The corpus file to read.
     :param block_reader: Forwarded to ``StreamBackedCorpusView.__init__``.
     :param startpos: Forwarded to the base initializer (default 0).
     :param encoding: Forwarded to the base initializer
         (default ``'utf8'``).
     :raises ValueError: If the size of the file cannot be determined.
     """
     # Forward the caller's startpos and encoding.  They were previously
     # replaced by the literals 0 and 'utf8', which silently discarded any
     # non-default value; the defaults keep existing callers unchanged.
     StreamBackedCorpusView.__init__(self,
                                     fileid,
                                     block_reader=block_reader,
                                     startpos=startpos,
                                     encoding=encoding)
     try:
         if isinstance(self._fileid, GzipFileSystemPathPointer):
             if re.match(r'.*\.gz$', str(self._fileid)):
                 # For a .gz path, use the uncompressed size as the
                 # end-of-file position.
                 self._eofpos = self.getuncompressedsize(self._fileid)
             else:
                 self._eofpos = self._fileid.file_size()
         else:
             self._eofpos = os.stat(self._fileid).st_size
     except Exception as exc:
         raise ValueError('Unable to open or access %r -- %s' %
                          (fileid, exc))
Example #12
0
 def docs(self, fileids=None):
     """
     @return: The corpus documents of the selected files, obtained by
         concatenating one corpus view per file.
     @rtype: C{list} of C{StreamBackedCorpusView}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression
     """
     views = []
     # abspaths(fileids, True) yields (path, encoding) pairs.
     for path, enc in self.abspaths(fileids, True):
         view = StreamBackedCorpusView(path, self._read_block, encoding=enc)
         views.append(view)
     return concat(views)
Example #13
0
 def verbs(self):
     """
     @return: a corpus view over the verbs file, read one line block at
         a time, acting as a list of all verb lemmas in this corpus.
     """
     verbs_path = self.abspath(self._verbsfile)
     verbs_encoding = self.encoding(self._verbsfile)
     return StreamBackedCorpusView(verbs_path, read_line_block,
                                   encoding=verbs_encoding)
Example #14
0
 def __init__(self, *args, **kwargs):
     """
     Initialize the view by delegating to ``StreamBackedCorpusView``.

     Every positional and keyword argument is passed through unchanged.
     """
     StreamBackedCorpusView.__init__(self, *args, **kwargs)
	def __init__(self, *args, **kwargs):
		"""
		Initialize the view and skip the heading block of the file.

		All positional and keyword arguments are forwarded unchanged to
		``StreamBackedCorpusView.__init__``.
		"""
		StreamBackedCorpusView.__init__(self, *args, **kwargs)
		# open self._stream
		self._open()
		# skip the heading block (this line previously carried a stray
		# extra space of indentation, which is an IndentationError)
		self.read_block(self._stream)
		# reset the start position to the current position in the stream;
		# NOTE(review): without this the view presumably still starts at
		# its original position and re-reads the heading -- confirm against
		# StreamBackedCorpusView.
		self._filepos = [self._stream.tell()]
Example #16
0
 def __init__(self, *args, **kwargs):
     """
     Initialize the view, read one block from the start of the stream,
     and make the position after that block the view's start position.

     All positional and keyword arguments are forwarded unchanged to
     ``StreamBackedCorpusView.__init__``.
     """
     StreamBackedCorpusView.__init__(self, *args, **kwargs)
     # Open the stream so that a block can be read from it.
     self._open()
     # Read the first block; the result is discarded.
     self.read_block(self._stream)
     # The stream position after that read becomes the start position.
     position_after_first_block = self._stream.tell()
     self._filepos = [position_after_first_block]
Example #17
0
 def __init__(self, *args, **kwargs):
     """
     Set up the view so that it starts after the first block of the file.

     Positional and keyword arguments are handed on unchanged to
     ``StreamBackedCorpusView.__init__``.
     """
     StreamBackedCorpusView.__init__(self, *args, **kwargs)
     # The stream has to be open before read_block can consume from it.
     self._open()
     # Consume the leading block and ignore what it returns.
     self.read_block(self._stream)
     # Restart the position list at the stream's current offset.
     start_offset = self._stream.tell()
     self._filepos = [start_offset]