Example #1
# Dictionary, Postings, Operator and NOTOperator come from the
# surrounding project; no standard-library imports are needed here.
class Engine(object):
    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, term_info):
        # the last element of a dictionary entry is the postings offset
        if term_info[-1] is not None:
            return self.postings.list_at_offset(term_info[-1])
        return None

    def execute_query(self, reverse_polish):
        """
        Evaluates a boolean query given as a queue of tokens in
        reverse Polish (postfix) notation
        """
        args = []

        while reverse_polish:
            token = reverse_polish.popleft()

            if not isinstance(token, Operator):
                # operand: look the term up and push its postings list
                dterm = self.dictionary.term(token)
                postings_list = self._get_postings(dterm)
                args.append(postings_list)
            else:
                # NOT takes the universe list as an extra operand
                if isinstance(token, NOTOperator):
                    args.append(self.postings.not_list())
                # treat empty postings lists as None so operators can
                # short-circuit on them
                for i in range(len(args)):
                    if args[i] is not None and args[i]._entries_len == 0:
                        args[i] = None
                # pop the operator's operands and push its result
                splitpoint = -1 * token.nargs
                o_args = args[splitpoint:]
                args = args[:splitpoint] + [token.execute(o_args)]

        return args[-1]
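
A minimal usage sketch for this boolean variant, assuming the project's query parser has already produced a postfix token queue. The index file names and the ANDOperator token class are illustrative placeholders, not names confirmed by the source:

from collections import deque

# hypothetical index files and a hypothetical AND operator token
engine = Engine('dictionary.txt', 'postings.txt')
rpn = deque(['apple', 'banana', ANDOperator()])  # "apple AND banana"
result = engine.execute_query(rpn)  # postings list of matching documents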
Example #2
import heapq
import math


# Dictionary, Postings and the tf() weighting helper come from the
# surrounding project.
class Engine(object):

    NUM_RESULTS = 10

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        # min-heap on negated scores yields the highest-scoring docs first
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for _ in range(min(n, len(scores_heap)))]

    def execute_query(self, query_map):
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # find top n
        top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        return " ".join(str(x) for x in top_n_docs)
Example #3
import heapq
import math


# Dictionary, Postings and the tf() weighting helper come from the
# surrounding project.
class Engine(object):
    """
    Search engine that uses a simple vector space model to retrieve patents
    """

    NUM_RESULTS = 500

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')

    def _get_postings(self, offset):
        """
        This method gets the postings list at an offset
        """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """
        This method accumulates scores for a term
        """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """
        This method normalises scores for every document
        """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """
        This method creates a heap of the docs and picks out the top few
        """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for _ in range(min(n, len(scores_heap)))]

    def execute_query(self, query_map):
        """
        This method is called to execute a query
        """
        scores = {}
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # top-n selection is disabled here; return every scored document
        # top_n_docs = self._get_top_n_docs(scores, Engine.NUM_RESULTS)
        # return " ".join(str(x) for x in top_n_docs)

        return " ".join(str(x) for x in scores.keys())
Example #4
import copy
import heapq
import math

# module-level constants; the methods below read them as globals
NUM_RESULTS = 10
QUERY_WEIGHT = 0.5
P_FEEDBACK_WEIGHT = 0.5


# Dictionary, Postings and the tf() weighting helper come from the
# surrounding project.
class feedbackEngine(object):
    """
    Search engine that uses relevance feedback
    with a vector space model to retrieve patents
    """

    def __init__(self, fd, fp):
        self.dictionary = Dictionary(fd, load=True)
        self.postings = Postings(fp, mode='r')
        self.feedback = False

    def _get_postings(self, offset):
        """
        This method gets the postings list at an offset
        """
        return self.postings.list_at_offset(offset)

    def _accumulate_scores(self, scores, postings_list, q_wt):
        """
        This method accumulates scores for a term
        """
        for doc_id, d_tf in postings_list:
            scores[doc_id] = scores.get(doc_id, 0) + q_wt * d_tf

    def _normalize(self, scores, q_len):
        """
        This method normalises scores for every document
        """
        for doc_id in scores:
            scores[doc_id] /= (q_len * self.dictionary.doc_length(doc_id))

    def _get_top_n_docs(self, scores, n):
        """
        This method creates a heap of the docs and picks out the top few
        """
        scores_heap = [(-v, k) for k, v in scores.items()]
        heapq.heapify(scores_heap)
        return [heapq.heappop(scores_heap)[1]
                for _ in range(min(n, len(scores_heap)))]

    def relevance_feedback(self, query_map, top_n_docs):
        """
        This method expands the query based on pseudo relevance feedback
        """
        self.feedback = True
        vector_sum = {}
        term_dict = self.dictionary._terms

        # constructing the document vector for the top n docs
        for term in term_dict:
            term_offset = term_dict[term][1]

            # terms without a postings offset contribute nothing
            if term_offset is None or term is None:
                continue

            # sum the term frequencies over the documents in top_n_docs
            postings_list = self._get_postings(term_offset)
            for doc_id, d_tf in postings_list:
                if doc_id in top_n_docs:
                    vector_sum[term] = vector_sum.get(term, 0) + d_tf

        # average over the top docs to get the centroid, then apply the
        # feedback weight once
        for term in vector_sum:
            vector_sum[term] /= NUM_RESULTS
            vector_sum[term] *= P_FEEDBACK_WEIGHT

        # adding the initial query vector terms to the centroid
        for term in vector_sum:
            if term in query_map:
                vector_sum[term] += query_map[term] * QUERY_WEIGHT

        # adding the remaining terms left in the query vector
        for term in query_map:
            if term not in vector_sum:
                vector_sum[term] = query_map[term] * QUERY_WEIGHT

        # execute query with the new query vector
        return self.execute_query(vector_sum)

    def execute_query(self, query_map):
        """
        This method is called to execute a query
        """
        scores = {}
        query_map_copy = copy.deepcopy(query_map)
        for term in query_map:
            q_idf, term_offset = self.dictionary.term(term)

            # unknown term, skip everything, score 0
            if term_offset is None:
                continue

            # accumulate scores for postings list
            query_map[term] = q_wt = tf(query_map[term]) * q_idf
            postings_list = self._get_postings(term_offset)
            self._accumulate_scores(scores, postings_list, q_wt)

        # perform length normalization (query and document)
        q_len = math.sqrt(sum(x * x for x in query_map.values()))
        self._normalize(scores, q_len)

        # first pass: expand the query with pseudo relevance feedback
        if not self.feedback:
            top_n_docs = self._get_top_n_docs(scores, NUM_RESULTS)
            stringout = self.relevance_feedback(query_map_copy, top_n_docs)

        # second pass: we were called from within relevance_feedback
        else:
            # return every document scored by the expanded query
            stringout = " ".join(str(x) for x in scores.keys())

        return stringout
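
A usage sketch for the feedback variant. A single execute_query call runs both passes: the first pass picks the top NUM_RESULTS documents and hands them to relevance_feedback, which builds the Rocchio-style expanded vector and re-invokes execute_query with self.feedback set. File names and the query are placeholders:

# hypothetical index files, for illustration only
engine = feedbackEngine('dictionary.txt', 'postings.txt')
query_map = {'cold': 2, 'fusion': 1}  # raw query term counts
result_ids = engine.execute_query(query_map)  # ids after one feedback round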