def __init__(self, blockfiles, file_name="mf.txt", out_dir="./merged"):
    """
    Set up the merger with the block files to combine and the output target.

    :param blockfiles: list of paths to the inverted block files to merge
    :param file_name: name of the merged output file (default "mf.txt")
    :param out_dir: directory the merged file is written into
    """
    self.block_files = blockfiles
    self.file_name = file_name
    self.out_dir = out_dir
    # Prepare the output location before opening the merged file handle.
    self.get_out_dir()
    self.prep_output()
    merged_path = os.path.join(self.out_dir, self.file_name)
    self.out_file = core.BlockFile(merged_path)
def index(self):
    """
    Index the corpus with the SPIMI algorithm.

    Consumes (term, docid) tokens from ``self.tokens`` and accumulates
    them into an in-memory dictionary until it exceeds ``self.block_size``
    (interpreted as MiB via ``sys.getsizeof``); each full block is then
    sorted by term and written to its own block file in ``self.out_dir``.
    Repeats until the token stream is exhausted.

    NOTE(review): ``sys.getsizeof`` is shallow — it measures the dict
    object only, not the postings lists it contains; block files may be
    larger than ``self.block_size`` suggests.  TODO confirm intent.

    :return: None; paths of written blocks are appended to ``self.blocklist``
    """
    done = False
    while not done:
        block_dict = {}
        try:
            # Fill the current block until its memory budget is reached.
            while sys.getsizeof(block_dict) / 1024 / 1024 <= self.block_size:
                # next(it) instead of it.next(): valid on Python 2.6+ and 3.
                token = next(self.tokens)
                # setdefault collapses the original redundant if/else whose
                # branches both appended token[1].
                block_dict.setdefault(token[0], []).append(token[1])
        except StopIteration:
            # print() call form works on both Python 2 and 3.
            print("Parsed all tokens in all documents")
            done = True
        if not block_dict:
            # StopIteration can fire before any token landed in a fresh
            # dict; the original wrote an empty block file in that case.
            continue
        block_name = self.block_prefix + str(self.block_index) + ".txt"
        block_path = os.path.join(self.out_dir, block_name)
        out_file = core.BlockFile(block_path)
        out_file.open_file(mode="w")
        # Sort the real keys and stringify only for output; the original
        # stringified keys before lookup, a latent KeyError for non-str terms.
        for term in sorted(block_dict.keys()):
            docids = " ".join(str(doc) for doc in block_dict[term])
            out_file.write_line(str(term) + " " + docids + "\n")
        out_file.close_file()
        self.block_index += 1
        self.blocklist.append(block_path)
def prep_files(self):
    """
    Wrap every inverted block file in a BlockFile handle ahead of merging.

    :return: list of ``core.BlockFile`` instances, one per path in
        ``self.block_files``, in the same order
    """
    return [core.BlockFile(path) for path in self.block_files]
def get_index(self):
    """
    Load the merged SPIMI index file into memory to allow querying.

    Reads ``self.merge`` line by line and stores each entry's postings
    under its term in ``self.index``.

    NOTE(review): assumes ``read_line`` returns an object exposing
    ``.term`` and ``.postings``, and a falsy value at EOF — TODO confirm
    against core.BlockFile.

    :return: None; populates ``self.index``
    """
    in_file = core.BlockFile(self.merge)
    in_file.open_file()
    try:
        entry = in_file.read_line()
        while entry:
            self.index[entry.term] = entry.postings
            entry = in_file.read_line()
    finally:
        # The original leaked the open handle; always release it.
        in_file.close_file()