import os
import os.path as osp

# Dictionary, Postings, add_doc_to_index and DEBUG_LIMIT are assumed to be
# defined elsewhere in this module/package.


def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    # Documents are named by their numeric doc id, so sort them numerically.
    training_files = sorted(os.listdir(training_data_dir), key=lambda x: int(x))
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        # The "not list" tracks every doc id; it is used later to evaluate NOT.
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # Second pass: the dictionary currently records each term's line number
    # in the postings file; convert those line numbers to byte offsets so
    # lookups can seek directly. Capture f.tell() *before* reading a line so
    # the recorded offset points at the start of that line, and stop cleanly
    # at EOF without registering a spurious trailing term.
    with open(postings_file) as f:
        f.readline()  # skip the leading postings list containing all doc ids
        current_line = 1
        while True:
            byte_offset = f.tell()
            if not f.readline():
                break
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, byte_offset)
            current_line += 1
    dictionary.save()
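# A self-contained illustration (not part of the original module) of the
# offset-mapping pass above: recording f.tell() before each readline() yields
# the byte offset at which that line begins. The function name and docstring
# are assumptions introduced only for this sketch.
def line_byte_offsets(path):
    """Return the byte offset at which each line of `path` begins."""
    offsets = []
    with open(path) as f:
        while True:
            offsets.append(f.tell())
            if not f.readline():
                break
    return offsets[:-1]  # drop the EOF offset recorded after the last line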
class Engine(object):
    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, term_info):
        # The last element of a term's dictionary entry is its byte offset
        # into the postings file, or None if the term never occurred.
        if term_info[-1] is not None:
            return self.postings.list_at_offset(term_info[-1])
        return None

    def execute_query(self, reverse_polish):
        # Evaluate a query given as a deque of tokens in reverse Polish
        # order: term tokens push their postings lists onto a stack, and
        # operators pop their arguments and push the result.
        args = []
        while reverse_polish:
            token = reverse_polish.popleft()
            if not isinstance(token, Operator):
                # Term token: look it up and push its postings list.
                dterm = self.dictionary.term(token)
                args.append(self._get_postings(dterm))
            else:
                if isinstance(token, NOTOperator):
                    # NOT is evaluated against the list of all doc ids.
                    args.append(self.postings.not_list())
                # Normalize empty postings lists to None before executing.
                for i in range(len(args)):
                    if args[i] is not None and args[i]._entries_len == 0:
                        args[i] = None
                # Pop the operator's arguments off the stack, execute it,
                # and push the resulting postings list back.
                splitpoint = -token.nargs
                o_args = args[splitpoint:]
                args = args[:splitpoint] + [token.execute(o_args)]
        return args[-1]
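# A minimal usage sketch, not part of the original module. It assumes a query
# parser elsewhere in the repo produces a deque of tokens in reverse Polish
# order; `ANDOperator` is a hypothetical Operator subclass named only for
# illustration, and the file names are placeholder assumptions.
if __name__ == '__main__':
    from collections import deque

    engine = Engine('dictionary.txt', 'postings.txt')
    # 'bill AND gates' in reverse Polish: operands first, operator last.
    query = deque(['bill', 'gates', ANDOperator()])  # hypothetical operator
    result = engine.execute_query(query)
    print(result)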