def test_find_sentence_starts(self):
    """Verify the sentence start positions found in both fixture files.

    For each fixture file, the file is memory-mapped read-only and passed to
    ``find_sentence_starts()``. The test then seeks the file object to each
    of the first five returned positions and checks that the line read from
    there is the expected sentence. The memory map is always closed, even
    when an assertion fails, and the file is rewound after its checks pass.
    """
    fixtures = [
        (self.sentences1_file,
         ["yksi kaksi\n",
          "kolme neljä viisi\n",
          "kuusi seitsemän kahdeksan\n",
          "yhdeksän\n",
          "kymmenen\n"]),
        (self.sentences2_file,
         ["kymmenen yhdeksän\n",
          "kahdeksan seitsemän kuusi\n",
          "viisi\n",
          "neljä\n",
          "kolme kaksi yksi\n"]),
    ]

    for text_file, expected_sentences in fixtures:
        text_mmap = mmap.mmap(text_file.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            sentence_starts = find_sentence_starts(text_mmap)
            # Index explicitly (rather than zip) so that a result with too
            # few entries fails loudly instead of silently checking less.
            for index, expected in enumerate(expected_sentences):
                text_file.seek(sentence_starts[index])
                self.assertEqual(text_file.readline(), expected)
        finally:
            # Release the mapping; the original never closed it.
            text_mmap.close()

        # Leave the shared file object positioned at the beginning.
        text_file.seek(0)
def test_find_sentence_starts(self):
    """Verify the sentence start positions found in both fixture files.

    Each fixture file is memory-mapped read-only and handed to
    ``find_sentence_starts()``. The file object is then positioned at each
    of the first five returned offsets in turn, and the line read from that
    offset must equal the expected sentence. The memory map is closed in a
    ``finally`` clause so that a failing assertion does not leak it; the
    file is rewound once its checks have passed.
    """
    expected_by_file = (
        (self.sentences1_file,
         ('yksi kaksi\n',
          'kolme neljä viisi\n',
          'kuusi seitsemän kahdeksan\n',
          'yhdeksän\n',
          'kymmenen\n')),
        (self.sentences2_file,
         ('kymmenen yhdeksän\n',
          'kahdeksan seitsemän kuusi\n',
          'viisi\n',
          'neljä\n',
          'kolme kaksi yksi\n')),
    )

    for sentences_file, expected_lines in expected_by_file:
        sentences_mmap = mmap.mmap(sentences_file.fileno(),
                                   0,
                                   access=mmap.ACCESS_READ)
        try:
            sentence_starts = find_sentence_starts(sentences_mmap)
            # Explicit indexing keeps the IndexError the test would raise if
            # fewer start positions than expected were returned.
            for position, expected_line in enumerate(expected_lines):
                sentences_file.seek(sentence_starts[position])
                self.assertEqual(sentences_file.readline(), expected_line)
        finally:
            # The mapping was previously never closed.
            sentences_mmap.close()

        # Rewind the shared file object to the beginning.
        sentences_file.seek(0)
def __init__(self, files):
    """Creates a memory map of the given files and finds the sentence
    starts.

    The pointers to sentence starts will be saved in ``pointers``, where
    each element is a tuple of two indices - the first index selects the
    file from the ``mmaps`` list and the second index points to the position
    inside the file.

    Also saves in ``pointer_ranges``, for each file, a tuple of the index
    of its first pointer and one past its last pointer in ``pointers``.

    :type files: list of file objects
    :param files: input text files
    """

    self.mmaps = []
    self.pointers = []
    self.pointer_ranges = []

    # self.mmaps starts empty and grows by one per file, so the enumerate
    # index is always the position of the file's map in self.mmaps.
    for subset_index, subset_file in enumerate(files):
        # ``access=`` is the portable way to request a read-only mapping;
        # the ``prot=`` keyword is only accepted on Unix.
        subset_mmap = mmap.mmap(subset_file.fileno(),
                                0,
                                access=mmap.ACCESS_READ)
        self.mmaps.append(subset_mmap)

        logging.debug("Finding sentence start positions in %s.",
                      subset_file.name)
        sys.stdout.flush()

        # Build the full list first so that self.pointers is left untouched
        # if find_sentence_starts() raises part way through.
        pointers = [(subset_index, x)
                    for x in find_sentence_starts(subset_mmap)]

        pointers_start = len(self.pointers)
        self.pointers.extend(pointers)
        pointers_stop = len(self.pointers)
        self.pointer_ranges.append((pointers_start, pointers_stop))