예제 #1
0
    def big_data(self):
        """Load the on-disk compressed index into this object's lookup tables.

        If ``compressed_index.txt`` or ``coded_sources.txt`` is missing, the
        ``blocks`` directory is recreated empty and ``reload()`` is called
        before anything is read.

        ``coded_sources.txt`` is read as one ``<name> <code>`` pair per line
        and fills ``self.book_to_int`` / ``self.int_to_book``.

        ``compressed_index.txt`` is read as ``<word> <count> <sources>`` per
        line, where ``<sources>`` is a ``;``-separated list of
        ``<coded id> <pos> <pos> ...`` entries. Every word is appended to
        ``self.all_words`` and passed to ``self.compressed_all_words.add_word``;
        ``self.data[word]`` becomes a dict mapping the decoded id to its list
        of integer positions. Finally ``self.wildcard`` is updated with all
        loaded words.

        Blank lines and empty ``;`` segments (for example a last line that
        ends in ``;`` without a newline) are skipped.

        Raises:
            ValueError: if a non-blank line or segment does not have the
                layout described above, or a position is not an integer.
        """
        if not (os.path.exists('compressed_index.txt')
                and os.path.exists('coded_sources.txt')):
            if os.path.exists('blocks'):
                rmtree('blocks')
            os.mkdir('blocks')
            reload()

        next_id = None
        with open('coded_sources.txt') as f:
            for line in f:
                if not line.strip():
                    continue
                # Split on the LAST space so names containing spaces still parse.
                name, code = line.rsplit(' ', 1)
                code = int(code)
                self.book_to_int[name] = code
                self.int_to_book[code] = name
                # Track the highest code seen rather than the last one read, so
                # the next free id is correct even if the file is not sorted.
                if next_id is None or code >= next_id:
                    next_id = code + 1
        if next_id is not None:
            self.__unique_id = next_id

        with open('compressed_index.txt', 'r') as f:
            for line in f:
                if not line.strip():
                    continue

                # The count field is part of the line layout but is not used here.
                word, _count, sources = line.split(' ', 2)
                self.all_words.append(word)
                self.compressed_all_words.add_word(word)

                postings = {}
                # NOTE(review): prev is never advanced, so the offset added below
                # is always 0. If the writer stores gap-encoded ids, prev should
                # be set to code_name after each entry -- confirm against the
                # code that writes compressed_index.txt.
                prev = 0
                for book in sources.split(';'):
                    # Skip the trailing newline segment and any empty segment.
                    if not book.strip():
                        continue
                    if book[0] == ' ':
                        book = book[1:]
                    name, positions = book.split(' ', 1)
                    code_name = ByteCodec.decode(name) + prev
                    # split() with no argument tolerates trailing/double spaces.
                    postings[code_name] = [int(i) for i in positions.split()]

                self.data[word] = postings

        self.wildcard.update(self.all_words)
예제 #2
0
    def big_data(self):
        """Load the on-disk index ``index.txt`` into this object's lookup tables.

        If ``index.txt`` is missing, the ``blocks`` directory is recreated
        empty and ``reload()`` is called before anything is read.

        Each line of ``index.txt`` is read as ``<word> <count> <sources>``,
        where ``<sources>`` is a ``;``-separated list of
        ``<book name> <pos> <pos> ...`` entries. Every word is appended to
        ``self.all_words``. Book names are mapped to integer ids through
        ``self.book_to_int`` / ``self.int_to_book``; a name not seen before is
        given the current ``self.__unique_id``, which is then incremented.
        ``self.data[word]`` becomes a dict mapping book id to its list of
        integer positions. Finally ``self.wildcard`` is updated with all
        loaded words.

        Blank lines and empty ``;`` segments (for example a last line that
        ends in ``;`` without a newline) are skipped.

        Raises:
            ValueError: if a non-blank line or segment does not have the
                layout described above, or a position is not an integer.
        """
        if not os.path.exists('index.txt'):
            if os.path.exists('blocks'):
                rmtree('blocks')
            os.mkdir('blocks')
            reload()

        with open('index.txt', 'r') as f:
            for line in f:
                if not line.strip():
                    continue

                # The count field is part of the line layout but is not used here.
                word, _count, sources = line.split(' ', 2)
                self.all_words.append(word)

                postings = {}
                for book in sources.split(';'):
                    # Skip the trailing newline segment and any empty segment.
                    if not book.strip():
                        continue
                    if book[0] == ' ':
                        book = book[1:]
                    name, positions = book.split(' ', 1)

                    if name in self.book_to_int:
                        key = self.book_to_int[name]
                    else:
                        # First time this book is seen: allocate the next id.
                        key = self.__unique_id
                        self.book_to_int[name] = key
                        self.int_to_book[key] = name
                        self.__unique_id += 1

                    # split() with no argument tolerates trailing/double spaces.
                    postings[key] = [int(i) for i in positions.split()]

                self.data[word] = postings

        self.wildcard.update(self.all_words)