Exemple #1
0
 def rollover(self):
     """Move the buffered contents into a temporary file.

     Does nothing when ``self._rolled`` is truthy. Otherwise the value of
     ``self.buffer`` is written into a UTF-8 ``EncodedFile`` wrapping a new
     ``TemporaryFile``, the buffer's current position is restored on that
     file, the old buffer is closed, and the file is stored as
     ``self._buffer``.
     """
     if self._rolled:
         return

     spill = EncodedFile(TemporaryFile(), data_encoding='utf-8')
     current = self.buffer
     offset = current.tell()
     spill.write(current.getvalue())
     # Put the cursor back where it was before the switch.
     spill.seek(offset)
     current.close()
     self._buffer = spill
Exemple #2
0
 def rollover(self):
     """Move the buffered contents into a temporary file.

     Does nothing when ``self._rolled`` is truthy. Otherwise the value of
     ``self.buffer`` is written into a UTF-8 ``EncodedFile`` wrapping a new
     ``TemporaryFile`` created with ``dir=self._dir``, the buffer's current
     position is restored on that file, the old buffer is closed, and the
     file is stored as ``self._buffer``.
     """
     if self._rolled:
         return

     backing = TemporaryFile(dir=self._dir)
     spill = EncodedFile(backing, data_encoding='utf-8')
     current = self.buffer
     offset = current.tell()
     spill.write(current.getvalue())
     # Put the cursor back where it was before the switch.
     spill.seek(offset)
     current.close()
     self._buffer = spill
Exemple #3
0
def load_snippets_from_txt_file(txt_file, snippet_count, book_id):
    """Load snippet_count snippets from the given text file.

    Repeatedly seeks to a random byte offset between 10% and 90% of the
    file size, discards the (possibly partial) line under the cursor, and
    then examines up to two following lines. The first one whose length is
    at least MIN_SNIPPET_SIZE becomes a snippet.

    Args:
        txt_file: object exposing ``name`` (path used for the size lookup
            and the verbose output) and ``file`` (the underlying stream).
        snippet_count: number of distinct snippets to collect.
        book_id: value stored as the third element of every snippet tuple.

    Returns:
        A set of ``(text, position, book_id)`` tuples, where ``text`` is the
        stripped line and ``position`` is the offset reported by ``tell()``
        just before that line was read.

    Note:
        The loop runs until ``snippet_count`` distinct snippets have been
        gathered, so it does not terminate if the file cannot supply that
        many.
    """
    size = os.path.getsize(txt_file.name)
    # Floor division keeps both bounds integral: random.randint rejects
    # non-integer floats, which true division would produce.
    low, high = size // 10, 9 * size // 10

    snippets = set()
    enc_file = EncodedFile(txt_file.file, 'utf-8', errors='ignore')
    while len(snippets) < snippet_count:
        enc_file.seek(random.randint(low, high))
        # Ignore the first line read since the cursor may start in the middle.
        guarded_readline(enc_file)

        pos = enc_file.tell()
        for _ in range(2):
            line = guarded_readline(enc_file)
            if len(line) >= MIN_SNIPPET_SIZE:
                # Decode byte strings to text; works on Python 2 and 3 alike
                # (the `unicode` builtin does not exist on Python 3).
                if isinstance(line, bytes):
                    line = line.decode('utf-8', 'ignore')
                if VERBOSE:
                    print("{0} : {1}".format(txt_file.name, pos))
                snippets.add((line.strip(), pos, book_id))
                break
            pos = enc_file.tell()

    return snippets
Exemple #4
0
def load_snippets_from_txt_file(txt_file, snippet_count, book_id):
    """Load snippet_count snippets from the given text file.

    Repeatedly seeks to a random byte offset between 10% and 90% of the
    file size, discards the (possibly partial) line under the cursor, and
    then examines up to two following lines. The first one whose length is
    at least MIN_SNIPPET_SIZE becomes a snippet.

    Args:
        txt_file: object exposing ``name`` (path used for the size lookup
            and the verbose output) and ``file`` (the underlying stream).
        snippet_count: number of distinct snippets to collect.
        book_id: value stored as the third element of every snippet tuple.

    Returns:
        A set of ``(text, position, book_id)`` tuples, where ``text`` is the
        stripped line and ``position`` is the offset reported by ``tell()``
        just before that line was read.

    Note:
        The loop runs until ``snippet_count`` distinct snippets have been
        gathered, so it does not terminate if the file cannot supply that
        many.
    """
    size = os.path.getsize(txt_file.name)
    # Floor division keeps both bounds integral: random.randint rejects
    # non-integer floats, which true division would produce.
    low, high = size // 10, 9 * size // 10

    snippets = set()
    enc_file = EncodedFile(txt_file.file, 'utf-8', errors='ignore')
    while len(snippets) < snippet_count:
        enc_file.seek(random.randint(low, high))
        # Ignore the first line read since the cursor may start in the middle.
        guarded_readline(enc_file)

        pos = enc_file.tell()
        for _ in range(2):
            line = guarded_readline(enc_file)
            if len(line) >= MIN_SNIPPET_SIZE:
                # Decode byte strings to text; works on Python 2 and 3 alike
                # (the `unicode` builtin does not exist on Python 3).
                if isinstance(line, bytes):
                    line = line.decode('utf-8', 'ignore')
                if VERBOSE:
                    print("{0} : {1}".format(txt_file.name, pos))
                snippets.add((line.strip(), pos, book_id))
                break
            pos = enc_file.tell()

    return snippets