def __init__(
    self,
    corpus_file,
    encoding,
    aligned,
    group_by_sent,
    word_tokenizer,
    sent_tokenizer,
    alignedsent_block_reader,
):
    """
    Remember the view's configuration, then initialise the underlying
    stream-backed corpus view.

    :param corpus_file: passed positionally to
        ``StreamBackedCorpusView.__init__``.
    :param encoding: passed to ``StreamBackedCorpusView.__init__`` as the
        ``encoding`` keyword.
    :param aligned: stored as ``self._aligned``.
    :param group_by_sent: stored as ``self._group_by_sent``.
    :param word_tokenizer: stored as ``self._word_tokenizer``.
    :param sent_tokenizer: stored as ``self._sent_tokenizer``.
    :param alignedsent_block_reader: stored as
        ``self._alignedsent_block_reader``.
    """
    # The settings are stored before the base initialiser runs, exactly as
    # in the original statement order.
    self._aligned, self._group_by_sent = aligned, group_by_sent
    self._word_tokenizer, self._sent_tokenizer = word_tokenizer, sent_tokenizer
    self._alignedsent_block_reader = alignedsent_block_reader
    StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
def __init__(self, *args, **kwargs):
    """
    Initialise the view and make it start after the file's heading block.

    All positional and keyword arguments are forwarded unchanged to
    ``StreamBackedCorpusView.__init__``.
    """
    StreamBackedCorpusView.__init__(self, *args, **kwargs)

    # Make sure self._stream exists before reading from it.
    self._open()
    stream = self._stream

    # Consume the heading block; its contents are discarded.
    read_blankline_block(stream)

    # The view's first block now begins wherever the heading ended.
    self._filepos = [stream.tell()]
def test_correct_length(self):
    """Corpus views must report the same length as the tokenized file text."""
    for path, text in self.data():
        # Whitespace-delimited tokens.
        expected_word_count = len(text.split())
        word_view = StreamBackedCorpusView(path, read_whitespace_block)
        self.assertEqual(len(word_view), expected_word_count)

        # Line tokens, as produced by self.linetok.
        line_view = StreamBackedCorpusView(path, read_line_block)
        expected_line_count = len(self.linetok.tokenize(text))
        self.assertEqual(len(line_view), expected_line_count)
def test_correct_values(self):
    """Corpus views must yield the same sequence of values as the tokenized text."""
    for path, text in self.data():
        # Whitespace-delimited tokens.
        expected_words = text.split()
        word_view = StreamBackedCorpusView(path, read_whitespace_block)
        self.assertEqual(list(word_view), expected_words)

        # Line tokens, as produced by self.linetok.
        line_view = StreamBackedCorpusView(path, read_line_block)
        expected_lines = self.linetok.tokenize(text)
        self.assertEqual(list(line_view), expected_lines)
def __init__(self, fileid, delete_on_gc=False):
    """
    Build a corpus view over the pickle corpus stored in ``fileid``.

    :param fileid: The pickle corpus file to read; forwarded to
        ``StreamBackedCorpusView.__init__`` with ``encoding=None``.
    :param delete_on_gc: When true, ``fileid`` is to be deleted once
        this object is garbage-collected.  The flag is only recorded
        here; the deletion itself happens outside this method.
    """
    # Record the flag first so it is already set when the base
    # initialiser runs.
    self._delete_on_gc = delete_on_gc
    StreamBackedCorpusView.__init__(
        self,
        fileid,
        encoding=None,
    )
def __init__(self, filename, startpos=0, **kwargs):
    """
    Initialise the view and its parsing options.

    :param filename: forwarded positionally to
        ``StreamBackedCorpusView.__init__``.
    :param startpos: forwarded positionally to
        ``StreamBackedCorpusView.__init__`` (third positional argument;
        the second and fourth are passed as ``None``).
    :param kwargs: optional settings, each removed from ``kwargs`` and
        stored on the instance: ``tags`` (default True, stored as
        ``show_tags``), ``disamb_only`` (True), ``mode``
        (``IPIPANCorpusView.WORDS_MODE``), ``simplify_tags`` (False),
        ``one_tag`` (True), ``append_no_space`` (False),
        ``append_space`` (False), ``replace_xmlentities`` (True).
        Any other keys are not used by this method.
    """
    StreamBackedCorpusView.__init__(self, filename, None, startpos, None)

    # Parser state.
    self.in_sentence = False
    self.position = 0

    # (attribute name, keyword name, default) for every supported option.
    option_table = (
        ('show_tags', 'tags', True),
        ('disamb_only', 'disamb_only', True),
        ('mode', 'mode', IPIPANCorpusView.WORDS_MODE),
        ('simplify_tags', 'simplify_tags', False),
        ('one_tag', 'one_tag', True),
        ('append_no_space', 'append_no_space', False),
        ('append_space', 'append_space', False),
        ('replace_xmlentities', 'replace_xmlentities', True),
    )
    for attr_name, key, default in option_table:
        setattr(self, attr_name, kwargs.pop(key, default))
def __init__(self, filename, startpos=0, **kwargs):
    """
    Set up the underlying stream-backed view, the parser state and the
    per-view options.

    :param filename: passed through to ``StreamBackedCorpusView.__init__``.
    :param startpos: passed through to ``StreamBackedCorpusView.__init__``
        as its third positional argument (the second and fourth are
        ``None``).
    :param kwargs: recognised option keys are ``tags``, ``disamb_only``,
        ``mode``, ``simplify_tags``, ``one_tag``, ``append_no_space``,
        ``append_space`` and ``replace_xmlentities``; each is popped from
        ``kwargs`` and falls back to the default shown below when absent.
    """
    StreamBackedCorpusView.__init__(self, filename, None, startpos, None)

    # Parser state.
    self.position = 0
    self.in_sentence = False

    # Pull each option out of kwargs, falling back to its default.
    take = kwargs.pop
    self.show_tags = take("tags", True)
    self.disamb_only = take("disamb_only", True)
    self.mode = take("mode", IPIPANCorpusView.WORDS_MODE)
    self.simplify_tags = take("simplify_tags", False)
    self.one_tag = take("one_tag", True)
    self.append_no_space = take("append_no_space", False)
    self.append_space = take("append_space", False)
    self.replace_xmlentities = take("replace_xmlentities", True)
def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'):
    """
    Initialise the view and determine the end-of-file offset, taking
    gzip-compressed files into account.

    :param fileid: The corpus file; forwarded to
        ``StreamBackedCorpusView.__init__``.
    :param block_reader: Forwarded to the base initialiser.
    :param startpos: Forwarded to the base initialiser.
    :param encoding: Forwarded to the base initialiser.
    :raise ValueError: If the size of the file cannot be determined
        (any exception raised while measuring it is wrapped).
    """
    # Forward the caller's startpos/encoding.  Previously the literals
    # 0 and 'utf8' were passed here, so both parameters were silently
    # ignored.
    StreamBackedCorpusView.__init__(
        self,
        fileid,
        block_reader=block_reader,
        startpos=startpos,
        encoding=encoding,
    )
    try:
        if isinstance(self._fileid, GzipFileSystemPathPointer):
            if re.match(r'.*\.gz$', str(self._fileid)):
                # Path ends in ".gz": take the end offset from
                # self.getuncompressedsize instead of file_size().
                self._eofpos = self.getuncompressedsize(self._fileid)
            else:
                self._eofpos = self._fileid.file_size()
        else:
            self._eofpos = os.stat(self._fileid).st_size
    except Exception as exc:
        raise ValueError('Unable to open or access %r -- %s' % (fileid, exc))
def docs(self, fileids=None):
    """
    Build one corpus view per selected file and concatenate them.

    @return: The concatenation of the per-file views, each of which
        reads its file with C{self._read_block}.
    @rtype: C{list} of C{StreamBackedCorpusView}
    @param fileids: A list of corpus files.
    @type fileids: C{list} of C{str} or regular expression
    """
    views = []
    # abspaths(..., True) yields (path, encoding) pairs.
    for path, enc in self.abspaths(fileids, True):
        views.append(
            StreamBackedCorpusView(path, self._read_block, encoding=enc)
        )
    return concat(views)
def verbs(self):
    """
    @return: a corpus view over the verbs file (C{self._verbsfile}),
        read one line per block, acting as a list of all verb lemmas
        in this corpus.
    """
    verbs_path = self.abspath(self._verbsfile)
    verbs_encoding = self.encoding(self._verbsfile)
    return StreamBackedCorpusView(
        verbs_path,
        read_line_block,
        encoding=verbs_encoding,
    )
def __init__(self, *args, **kwargs):
    """
    Initialise the view by delegating to the stream-backed base class.

    Every positional and keyword argument is forwarded unchanged to
    ``StreamBackedCorpusView.__init__``; nothing else is set up here.
    """
    StreamBackedCorpusView.__init__(
        self,
        *args,
        **kwargs
    )
def __init__(self, *args, **kwargs):
    """
    Initialise the view so that it starts after the file's heading block.

    All positional and keyword arguments are forwarded unchanged to
    ``StreamBackedCorpusView.__init__``.
    """
    StreamBackedCorpusView.__init__(self, *args, **kwargs)
    # open self._stream
    self._open()
    # skip the heading block
    self.read_block(self._stream)
    # Reset the start position to the current position in the stream.
    # The other heading-skipping initialisers in this code do the same;
    # without it the first-block offset is left as the base class set it.
    # NOTE(review): assumes StreamBackedCorpusView seeks to
    # self._filepos[0] before reading the first block -- confirm.
    self._filepos = [self._stream.tell()]
def __init__(self, *args, **kwargs):
    """
    Initialise the view, consume one leading block and make the view
    start at the stream position that follows it.

    All positional and keyword arguments are forwarded unchanged to
    ``StreamBackedCorpusView.__init__``.
    """
    StreamBackedCorpusView.__init__(self, *args, **kwargs)

    # Ensure self._stream is available.
    self._open()

    # Read one block with this view's own block reader and discard it.
    self.read_block(self._stream)

    # Whatever follows that block becomes the first recorded position.
    offset_after_first_block = self._stream.tell()
    self._filepos = [offset_after_first_block]