def rollover(self):
    """Move the in-memory buffer's contents into a temporary file.

    Does nothing when ``self._rolled`` is already truthy.  Otherwise the
    current buffer's value is copied into a UTF-8 ``EncodedFile`` wrapped
    around a fresh ``TemporaryFile``, the cursor position is carried over,
    the old buffer is closed, and the new file is stored in
    ``self._buffer``.
    """
    if self._rolled:
        return

    backing_file = TemporaryFile()
    spill = EncodedFile(backing_file, data_encoding='utf-8')

    # Remember the cursor before copying so it can be restored on the
    # replacement file.
    cursor = self.buffer.tell()
    spill.write(self.buffer.getvalue())
    spill.seek(cursor)

    self.buffer.close()
    self._buffer = spill
def rollover(self):
    """Move the in-memory buffer's contents into a temporary file.

    Does nothing when ``self._rolled`` is already truthy.  Otherwise a
    ``TemporaryFile`` is created in ``self._dir``, wrapped in a UTF-8
    ``EncodedFile``, filled with the current buffer's value and positioned
    at the buffer's cursor; the old buffer is then closed and the new file
    is stored in ``self._buffer``.
    """
    if self._rolled:
        return

    backing_file = TemporaryFile(dir=self._dir)
    spill = EncodedFile(backing_file, data_encoding='utf-8')

    # Remember the cursor before copying so it can be restored on the
    # replacement file.
    cursor = self.buffer.tell()
    spill.write(self.buffer.getvalue())
    spill.seek(cursor)

    self.buffer.close()
    self._buffer = spill
def load_snippets_from_txt_file(txt_file, snippet_count, book_id):
    """Load snippet_count snippets from the given text file.

    Repeatedly seeks to a random byte offset within the middle 80% of the
    file, discards the (possibly partial) line found there, and then
    examines up to two following lines; the first one whose length is at
    least MIN_SNIPPET_SIZE is kept.

    Args:
        txt_file: Object exposing ``name`` (a path handed to
            ``os.path.getsize``) and ``file`` (the file object wrapped for
            reading).  NOTE(review): looks like a ``NamedTemporaryFile``
            style wrapper -- confirm against callers.
        snippet_count: Number of distinct snippet tuples to collect.
        book_id: Identifier stored unchanged in every returned tuple.

    Returns:
        A set of ``(text, pos, book_id)`` tuples, where ``text`` is the
        decoded, stripped line and ``pos`` is the value of
        ``enc_file.tell()`` taken just before that line was read.

    Note:
        The loop only ends once ``snippet_count`` distinct tuples have been
        collected, so it does not terminate for a file that has too few
        qualifying lines (an empty file, for example).
    """
    size = os.path.getsize(txt_file.name)
    snippets = set()
    enc_file = EncodedFile(txt_file.file, 'utf-8', errors='ignore')

    # Floor division keeps both bounds integral -- random.randint rejects
    # floats -- whichever division semantics are in effect.
    lower_bound = size // 10
    upper_bound = 9 * size // 10

    while len(snippets) < snippet_count:
        starting_byte = random.randint(lower_bound, upper_bound)
        # Ignore the first line read since the cursor may start in the middle.
        enc_file.seek(starting_byte)
        guarded_readline(enc_file)
        pos = enc_file.tell()
        for _ in range(2):
            line = guarded_readline(enc_file)
            if len(line) >= MIN_SNIPPET_SIZE:
                # Same decoding as the Python 2-only unicode(line, ...)
                # builtin, but .decode() also exists on Python 3 bytes.
                text = line.decode('utf-8', 'ignore')
                if VERBOSE:
                    print("{0} : {1}".format(txt_file.name, pos))
                snippets.add((text.strip(), pos, book_id))
                break
            pos = enc_file.tell()
    return snippets
def load_snippets_from_txt_file(txt_file, snippet_count, book_id):
    """Load snippet_count snippets from the given text file.

    Repeatedly seeks to a random byte offset within the middle 80% of the
    file, discards the (possibly partial) line found there, and then
    examines up to two following lines; the first one whose length is at
    least MIN_SNIPPET_SIZE is kept.

    Args:
        txt_file: Object exposing ``name`` (a path handed to
            ``os.path.getsize``) and ``file`` (the file object wrapped for
            reading).  NOTE(review): looks like a ``NamedTemporaryFile``
            style wrapper -- confirm against callers.
        snippet_count: Number of distinct snippet tuples to collect.
        book_id: Identifier stored unchanged in every returned tuple.

    Returns:
        A set of ``(text, pos, book_id)`` tuples, where ``text`` is the
        decoded, stripped line and ``pos`` is the value of
        ``enc_file.tell()`` taken just before that line was read.

    Note:
        The loop only ends once ``snippet_count`` distinct tuples have been
        collected, so it does not terminate for a file that has too few
        qualifying lines (an empty file, for example).
    """
    size = os.path.getsize(txt_file.name)
    snippets = set()
    enc_file = EncodedFile(txt_file.file, 'utf-8', errors='ignore')

    # Floor division keeps both bounds integral -- random.randint rejects
    # floats -- whichever division semantics are in effect.
    lower_bound = size // 10
    upper_bound = 9 * size // 10

    while len(snippets) < snippet_count:
        starting_byte = random.randint(lower_bound, upper_bound)
        # Ignore the first line read since the cursor may start in the middle.
        enc_file.seek(starting_byte)
        guarded_readline(enc_file)
        pos = enc_file.tell()
        for _ in range(2):
            line = guarded_readline(enc_file)
            if len(line) >= MIN_SNIPPET_SIZE:
                # Same decoding as the Python 2-only unicode(line, ...)
                # builtin, but .decode() also exists on Python 3 bytes.
                text = line.decode('utf-8', 'ignore')
                if VERBOSE:
                    print("{0} : {1}".format(txt_file.name, pos))
                snippets.add((text.strip(), pos, book_id))
                break
            pos = enc_file.tell()
    return snippets