Example #1
0
    def test_find_sentence_starts(self):
        """Checks that find_sentence_starts() locates the start of each line.

        Each fixture file is memory-mapped read-only and passed to
        find_sentence_starts(). The file is then positioned at each returned
        offset in turn, and the line read from that position has to equal the
        expected sentence.
        """

        fixtures = [
            (self.sentences1_file, ["yksi kaksi\n",
                                    "kolme neljä viisi\n",
                                    "kuusi seitsemän kahdeksan\n",
                                    "yhdeksän\n",
                                    "kymmenen\n"]),
            (self.sentences2_file, ["kymmenen yhdeksän\n",
                                    "kahdeksan seitsemän kuusi\n",
                                    "viisi\n",
                                    "neljä\n",
                                    "kolme kaksi yksi\n"]),
        ]

        for text_file, expected_lines in fixtures:
            text_mmap = mmap.mmap(text_file.fileno(), 0, access=mmap.ACCESS_READ)
            try:
                sentence_starts = find_sentence_starts(text_mmap)
                # Index explicitly (instead of zipping) so that a result with
                # too few offsets fails with IndexError rather than passing.
                for index, expected_line in enumerate(expected_lines):
                    text_file.seek(sentence_starts[index])
                    self.assertEqual(text_file.readline(), expected_line)
            finally:
                # Release the mapping and rewind the file even when an
                # assertion fails, so the fixture is left at position 0.
                text_mmap.close()
                text_file.seek(0)
Example #2
0
    def test_find_sentence_starts(self):
        """Verifies the offsets returned by find_sentence_starts().

        For both fixture files, a read-only memory map of the file is given to
        find_sentence_starts(). Seeking the file to the i-th returned offset
        and reading one line must yield the i-th expected sentence.
        """

        def check(text_file, expected_lines):
            """Maps ``text_file`` and compares the line found at every
            returned offset against ``expected_lines``."""
            text_mmap = mmap.mmap(text_file.fileno(),
                                  0,
                                  access=mmap.ACCESS_READ)
            try:
                sentence_starts = find_sentence_starts(text_mmap)
                # Offsets are looked up by index so that a missing offset
                # raises IndexError instead of silently shortening the test.
                for index, expected_line in enumerate(expected_lines):
                    text_file.seek(sentence_starts[index])
                    self.assertEqual(text_file.readline(), expected_line)
            finally:
                # Always close the mapping and rewind the file, also when an
                # assertion above has failed.
                text_mmap.close()
                text_file.seek(0)

        check(self.sentences1_file, [
            'yksi kaksi\n',
            'kolme neljä viisi\n',
            'kuusi seitsemän kahdeksan\n',
            'yhdeksän\n',
            'kymmenen\n',
        ])
        check(self.sentences2_file, [
            'kymmenen yhdeksän\n',
            'kahdeksan seitsemän kuusi\n',
            'viisi\n',
            'neljä\n',
            'kolme kaksi yksi\n',
        ])
    def __init__(self, files):
        """Creates a memory map of the given files and finds the sentence
        starts.

        The pointers to sentence starts will be saved in a structure where each
        element is a tuple of two indices - the first index will select the file
        from the mmaps list and the second index points to the position inside
        the file.

        Also saves in ``pointer_ranges`` an index to the first pointer and one
        past the last pointer of each file, i.e. a half-open
        ``(start, stop)`` range into ``pointers``.

        :type files: list of file objects
        :param files: input text files; each one must provide ``fileno()`` and
                      ``name``
        """

        self.mmaps = []
        self.pointers = []
        self.pointer_ranges = []

        for subset_file in files:
            # The position of the new map in ``self.mmaps`` identifies the file
            # in every pointer created below.
            subset_index = len(self.mmaps)
            # ``access=ACCESS_READ`` requests a read-only mapping on every
            # platform, whereas the ``prot`` argument is accepted on Unix only.
            subset_mmap = mmap.mmap(subset_file.fileno(),
                                    0,
                                    access=mmap.ACCESS_READ)
            self.mmaps.append(subset_mmap)

            logging.debug("Finding sentence start positions in %s.",
                          subset_file.name)
            # NOTE(review): presumably flushes pending console output before
            # the scan of the file starts - confirm whether still needed.
            sys.stdout.flush()
            pointers = [(subset_index, x)
                        for x in find_sentence_starts(subset_mmap)]
            pointers_start = len(self.pointers)
            self.pointers.extend(pointers)
            pointers_stop = len(self.pointers)
            self.pointer_ranges.append((pointers_start, pointers_stop))
    def __init__(self, files):
        """Creates a memory map of the given files and finds the sentence
        starts.

        The pointers to sentence starts will be saved in a structure where each
        element is a tuple of two indices - the first index will select the file
        from the mmaps list and the second index points to the position inside
        the file.

        Also saves in ``pointer_ranges`` an index to the first pointer and one
        past the last pointer of each file, so that
        ``pointers[start:stop]`` holds exactly the pointers of that file.

        :type files: list of file objects
        :param files: input text files; ``fileno()`` and ``name`` are read
                      from each of them
        """

        self.mmaps = []
        self.pointers = []
        self.pointer_ranges = []

        for subset_file in files:
            # Index that the map created below will get in ``self.mmaps``.
            subset_index = len(self.mmaps)
            # Use the portable ``access`` argument for a read-only mapping;
            # ``prot`` is available on Unix only.
            subset_mmap = mmap.mmap(subset_file.fileno(),
                                    0,
                                    access=mmap.ACCESS_READ)
            self.mmaps.append(subset_mmap)

            logging.debug("Finding sentence start positions in %s.",
                          subset_file.name)
            # NOTE(review): looks like this pushes out buffered console output
            # before the file is scanned - confirm that it is still required.
            sys.stdout.flush()
            pointers = [(subset_index, x)
                        for x in find_sentence_starts(subset_mmap)]
            # Record where the pointers of this file begin and end in the
            # combined list.
            pointers_start = len(self.pointers)
            self.pointers.extend(pointers)
            pointers_stop = len(self.pointers)
            self.pointer_ranges.append((pointers_start, pointers_stop))