def big_data(self):
    """Load the compressed index and the book-code mapping from disk.

    Rebuilds the on-disk index (via ``reload``) first if either
    ``compressed_index.txt`` or ``coded_sources.txt`` is missing.
    Populates ``book_to_int`` / ``int_to_book``, ``all_words``,
    ``compressed_all_words``, ``data`` (word -> {book code: positions})
    and ``wildcard``.
    """
    # Regenerate everything if either index file is missing.
    if not (os.path.exists('compressed_index.txt') and os.path.exists('coded_sources.txt')):
        if os.path.exists('blocks'):
            rmtree('blocks')
        os.mkdir('blocks')
        reload()

    # Restore the book <-> integer-code mapping ("<name> <code>" per line).
    with open('coded_sources.txt') as f:
        for entry in f:
            name, code = entry.split(' ')
            code = int(code)
            self.book_to_int[name] = code
            self.int_to_book[code] = name
            # Keep the next fresh id ahead of every loaded code.
            self.__unique_id = code + 1

    # Each line: "<word> <count> <coded-book> <pos...>; <coded-book> <pos...>;\n"
    with open('compressed_index.txt', 'r') as f:
        for line in f:
            word, _count, sources = line.split(' ', 2)
            self.all_words.append(word)
            self.compressed_all_words.add_word(word)
            d = {}
            for book in sources.split(';'):
                if book == '\n':
                    continue
                if book[0] == ' ':
                    book = book[1:]
                name, positions = book.split(' ', 1)
                # NOTE(review): the original kept a `prev` accumulator that was
                # initialised to 0 and never advanced, so `code + prev` always
                # added nothing; the dead variable is removed here. If the book
                # codes were meant to be delta-encoded, `prev` should instead be
                # updated to the decoded code each iteration — confirm against
                # the writer of compressed_index.txt.
                code_name = ByteCodec.decode(name)
                d[code_name] = [int(i) for i in positions.split(' ')]
            self.data[word] = d

    self.wildcard.update(self.all_words)
def big_data(self):
    """Load the plain-text index from disk into memory.

    Rebuilds the on-disk index (via ``reload``) first if ``index.txt``
    is missing. Assigns a fresh integer code to each previously unseen
    book name and fills ``all_words``, ``book_to_int`` / ``int_to_book``,
    ``data`` (word -> {book code: positions}) and ``wildcard``.
    """
    # Regenerate everything if the index file is missing.
    if not os.path.exists('index.txt'):
        if os.path.exists('blocks'):
            rmtree('blocks')
        os.mkdir('blocks')
        reload()

    # Each line: "<word> <count> <book> <pos...>; <book> <pos...>;\n"
    with open('index.txt', 'r') as f:
        for line in f:
            word, _count, sources = line.split(' ', 2)
            self.all_words.append(word)
            d = {}
            for book in sources.split(';'):
                if book == '\n':
                    continue
                if book[0] == ' ':
                    book = book[1:]
                name, positions = book.split(' ', 1)
                key = self.book_to_int.get(name)
                if key is None:
                    # First sighting of this book: mint a new integer code.
                    key = self.__unique_id
                    self.book_to_int[name] = key
                    self.int_to_book[key] = name
                    self.__unique_id += 1
                d[key] = [int(i) for i in positions.split(' ')]
            self.data[word] = d

    self.wildcard.update(self.all_words)