def _save_block(self, file_num, tokens):
    """
    Save an inverted block.

    :param file_num: The file index.
    :type file_num: int
    :param tokens: List of tokens to save.
    :type tokens: list
    """
    for raw_token in tokens:
        raw_token = raw_token.strip()
        for token in tokenize(raw_token):
            if len(token) > 0:
                line = token + ' ' + str(file_num) + "\n"
                self._lines.append(line)

                # Flush the buffered lines to disk once their total size exceeds the block size
                if sum(len(buffered) for buffered in self._lines) > self.block_size:
                    self._save_line_buffer()
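# Illustrative sketch only: _save_line_buffer is called above but not shown in this excerpt,
# so the helper below is an assumption about what such a flush step typically looks like in
# block-based (BSBI/SPIMI-style) indexing, written as a standalone function rather than a
# method. The name save_line_buffer_sketch and its signature are made up for this example.
def save_line_buffer_sketch(lines, block_path, block_num):
    """Write the buffered "token doc_id" lines to a sorted block file and return the next block number."""
    import os

    if len(lines) == 0:
        return block_num

    block_file = os.path.join(block_path, str(block_num) + '.blk')
    with open(block_file, 'w') as handle:
        # Sorting each block lets the later merge step interleave blocks with a simple k-way merge.
        handle.writelines(sorted(lines))

    lines.clear()
    return block_num + 1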
def construct(self):
    """
    Construct the index.
    """
    dictionary = dict()
    postings = dict()
    last_docs = dict()
    files = dict()
    token_id = 0
    num_files = len(self._files)

    # Apply lossless compression by only saving the gaps between document ids (not the document ids themselves)
    for doc_id in range(0, num_files):
        # Loop through all documents
        file = self._files[doc_id]
        files[doc_id] = file

        with open(file, 'r') as handle:
            # Open the file and tokenize
            tokens = tokenize(handle.read())

            for token in tokens:
                # Update the dictionary and postings
                if token not in dictionary:
                    dictionary[token] = token_id
                    postings[token_id] = deque([doc_id])
                    last_docs[token_id] = doc_id
                    token_id += 1
                else:
                    found_token_id = dictionary[token]
                    gap = doc_id - last_docs[found_token_id]
                    postings[found_token_id].append(gap)
                    last_docs[found_token_id] = doc_id

        # Display the progress
        if self.show_progress:
            sys.stdout.write("\rIndex construction: " + '{0} / {1}'.format(doc_id, num_files))
            sys.stdout.flush()

    # The index construction is done
    if self.show_progress:
        sys.stdout.write("\rIndex construction: done\n")
        sys.stdout.flush()

    # Save the index
    index = (postings, dictionary, files)
    with open(self.get_index_path(), 'wb') as handle:
        pickle.dump(index, handle)
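# The postings built above store the first document id followed by gaps between consecutive
# document ids. A reader of the index has to undo that gap encoding; the helper below is a
# minimal sketch of that decoding step. It is not part of this module, and the name
# decode_gaps is made up for this example.
def decode_gaps(gap_postings):
    """Turn a gap-encoded postings list (first doc id, then gaps) back into absolute doc ids."""
    doc_ids = []
    current = 0
    for position, value in enumerate(gap_postings):
        # The first entry is an absolute document id, every later entry is a delta.
        current = value if position == 0 else current + value
        doc_ids.append(current)
    return doc_ids

# For example, decode_gaps([3, 2, 5]) returns [3, 5, 10].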
def construct(self): """ Construct the index. """ # Make a subdirectory to store the blocks if os.path.exists(self._block_path): shutil.rmtree(self._block_path) os.mkdir(self._block_path) self._lines = [] self._block_num = 0 file_num = 0 for file in self._files: file_pointer = open(file) buffered_token = '' while True: block = file_pointer.read(self.block_size + 1) if not block: break tokens = tokenize(block) # Only do something if there are found tokens if len(tokens) > 0: # Detect whether the block ends with a token ends_with_token = (tokens[-1][-1] == block[-1]) # Detect whether the block start with a token starts_with_token = (tokens[0][0] == block[0]) # Check whether the whole block is a token block_is_token = (tokens[0] == block) if block_is_token: # If the full block is a token, then just enlarge the buffer buffered_token += tokens[0] else: # Otherwise, there are at least two tokens in the block block_tokens = [] if starts_with_token: block_tokens.append(buffered_token + tokens[0]) buffered_token = '' tokens = tokens[1:] else: block_tokens.append(buffered_token) buffered_token = '' if ends_with_token: buffered_token = tokens[-1] tokens = tokens[:-1] for token in tokens: block_tokens.append(token) # Save the block tokens and increase the block number if len(block_tokens) > 0: self._save_block(file_num, block_tokens) if len(buffered_token) > 0: # If there is some left over token, save it as a block token and increase the block number self._save_block(file_num, [buffered_token]) # Do not forget to close the file file_pointer.close() # Increase the file identifier file_num += 1 # Display the progres if self.show_progress: sys.stdout.write("\rIndex construction: " + '{0} / {1}'.format(file_num, len(self._files))) sys.stdout.flush() # Save the line buffer if there is any left self._save_line_buffer() # The index construction is done if self.show_progress: sys.stdout.write("\rIndex construction: done\n") sys.stdout.flush() # Merge the blocks self._merge_blocks() # Clean up the temporary block folder shutil.rmtree(self._block_path)