Ejemplo n.º 1
0
def build_index(training_data_dir, dictionary_file, postings_file, is_debug):
    """Build the dictionary and postings files from a directory of documents.

    Every entry name in ``training_data_dir`` is parsed as an integer doc id
    and the documents are indexed in ascending doc-id order.  Once the
    postings have been saved, the postings file is read back line by line so
    that each term can be re-registered in the dictionary with the byte
    offset of its line instead of its line number.

    Args:
        training_data_dir: Directory whose entries are named by integer doc
            ids.
        dictionary_file: Path passed to ``Dictionary``.
        postings_file: Path passed to ``Postings`` and re-read here.
        is_debug: When truthy, only the first ``DEBUG_LIMIT`` documents (in
            doc-id order) are indexed.

    Raises:
        ValueError: If a directory entry's name cannot be parsed by ``int``.
    """
    training_files = sorted(os.listdir(training_data_dir), key=int)
    if is_debug:
        training_files = training_files[:DEBUG_LIMIT]

    dictionary = Dictionary(dictionary_file)
    postings = Postings(postings_file)
    for training_file in training_files:
        doc_id = int(training_file)
        doc_path = osp.join(training_data_dir, training_file)
        # Every doc id is recorded in the list used later for NOT queries.
        postings.not_list().add(doc_id)
        add_doc_to_index(doc_id, doc_path, dictionary, postings)
    postings.save()

    # turn line nos to byte offsets
    # The context manager guarantees the handle is closed even if a
    # dictionary call raises part-way through the scan.
    with open(postings_file) as f:
        current_line = 1
        f.readline()  # skip postings list containing all doc ids
        while True:
            # tell() must be taken before the line is consumed so that it
            # points at the start of the line belonging to current_line.
            term = dictionary.term_for_offset(current_line)
            dictionary.add_term(term, f.tell())
            if not f.readline():
                break
            current_line += 1
        # NOTE(review): the lookup/add pair above also runs once at EOF, for
        # a line number past the last postings line.  Order is preserved
        # as-is because Dictionary's handling of that case is not visible
        # here -- confirm it is harmless.
    dictionary.save()
Ejemplo n.º 2
0
class Engine(object):
    """Evaluate reverse-Polish queries using a Dictionary and a Postings store."""

    def __init__(self, fd, fp):
        """Create the engine.

        Args:
            fd: Passed to ``Dictionary`` together with ``load=True``.
            fp: Passed to ``Postings`` together with ``mode='r'``.
        """
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, termInfo):
        """Return the postings list for a dictionary entry, or None.

        Args:
            termInfo: Sequence whose last element is the offset handed to
                ``Postings.list_at_offset``; a trailing ``None`` means there
                is nothing to fetch.
        """
        offset = termInfo[-1]
        if offset is None:
            return None
        return self.postings.list_at_offset(offset)

    def execute_query(self, reverse_polish):
        """Evaluate a query given in reverse Polish notation.

        Args:
            reverse_polish: Queue of tokens supporting ``popleft()``.  It is
                consumed (emptied) by this call.  Tokens that are not
                ``Operator`` instances are looked up as terms.

        Returns:
            The value left on top of the evaluation stack, or ``None`` when
            the query contains no tokens.
        """
        stack = []

        while reverse_polish:
            token = reverse_polish.popleft()

            if not isinstance(token, Operator):
                # Plain term: push its postings list (None if there is none).
                entry = self.dictionary.term(token)
                stack.append(self._get_postings(entry))
                continue

            if isinstance(token, NOTOperator):
                # NOT receives the list of all doc ids as an extra operand.
                stack.append(self.postings.not_list())

            # Operators receive None in place of any empty postings list.
            stack = [
                None if (item is not None and item._entries_len == 0) else item
                for item in stack
            ]

            # Replace the operator's operands with its result.
            splitpoint = -token.nargs
            operands = stack[splitpoint:]
            stack = stack[:splitpoint] + [token.execute(operands)]

        # An empty query leaves nothing on the stack; report "no postings"
        # instead of failing with IndexError.
        return stack[-1] if stack else None