Python Indexer.LoadUrls Examples

Programming Language: Python

Namespace/Package Name: indexer

Class/Type: Indexer

Method/Function: LoadUrls

Examples at hotexamples.com: 1

Python Indexer.LoadUrls - 1 example found. This is the top-rated real-world Python example of indexer.Indexer.LoadUrls extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

add_new_doc(30)

Indexer(30)

create_index(6)

create_unigram_index(3)

calculate_idf(3)

LoadIndexes(3)

close(3)

dump(3)

coords_to_indices(2)

indices_to_coords(2)

calculationSummerize(2)

add_idf_to_dictionary(2)

add_document(2)

LoadDict(2)

fix_inverted_index(2)

finish(2)

evaluate_input(1)

execute(1)

create_save_indexer_with_relevant_docs(1)

entities_and_small_big(1)

directory(1)

delete_dict_after_saving(1)

create_indexer(1)

create_dirs(1)

create_bulk_index_string(1)

finish_index(1)

CreatInvertedIndex(1)

finish_indexing(1)

get_num_spatial_nodes(1)

tokenize(1)

set_idx_fields(1)

process(1)

keys(1)

isStopword(1)

ignore_extensions(1)

get__lda__(1)

fit(1)

getStemmed(1)

getOr(1)

getAnd(1)

get(1)

generate_local_index(1)

create_block(1)

generate_global_index(1)

compute_tf(1)

createIndex(1)

add_square_Wij(1)

bp_index(1)

batch_get_feat_stacked(1)

after_indexing(1)

Example #1

Show file

class Searcher:
    """ Searcher is a class dealing with real-time querying.
        It implements ranked retrieval based on the VSM (Vector Space Model):
        every query term adds ``posting weight * query-vector weight`` to the
        score of each document in its postings.
        It also supports Phrasal Queries, by restricting scoring to the
        documents that contain the query tokens at consecutive positions.
        The text score is finally averaged with the per-document values of
        ``self.a`` (the first item returned by ``Indexer.LoadDict()``).

        Each entry of a ``postings_lists`` mapping is indexed as follows:
        ``[1]`` is the sequence of docIds, ``[2]`` the parallel sequence of
        weights and ``[3]`` the parallel sequence of per-document position
        lists.
    Args:
        dictionary_file: the file path of the dictionary
        postings_file: the file path of the postings
        expand: boolean indicator for using Query Expansion
        feedback: boolean indicator for using Relevance Feedback
        rate: the penalty rate of the pivoted normalized document length
        pivoted: boolean indicator for using pivoted normalized document length
        score: output the score of each result
    """

    # number of top ranked docs handed to the refiner for relevance feedback
    _FEEDBACK_TOP_DOCS = 5

    def __init__(self,
                 dictionary_file,
                 postings_file,
                 expand=False,
                 feedback=True,
                 rate=0.01,
                 pivoted=False,
                 score=False):

        self.dictionary_file = dictionary_file
        self.postings_file = postings_file
        # NOTE: rate and pivoted are stored, but rank() currently does not
        # apply any document length normalization.
        self.rate = rate
        self.pivoted = pivoted
        self.score = score

        self.stemmer = PorterStemmer()
        self.indexer = Indexer(dictionary_file, postings_file)
        self.refiner = Refiner(indexer=self.indexer,
                               expand=expand,
                               feedback=feedback)
        self.feedback = feedback

        # only the first item of LoadDict() is used: the per-document
        # values that _pagerank() blends into the text score
        self.a, _, _, _ = self.indexer.LoadDict()

    def search(self, query, relevant_docs):
        """ Search and return docIds ordered by relevance to the query.
        Args:
            query: the query string
            relevant_docs: relevant docs used for relevance feedback
        Returns:
            result: the list of docIds, most relevant first
            urls: the url corresponding to each result
            score: the score corresponding to each result (empty unless the
                   searcher was created with score=True)
        """
        # step 1: let refiner refine the query and get query_infos
        query_infos, postings_lists = self.refiner.refine(query, relevant_docs)

        # step 2: get candidate docs that need to rank (phrasal query)
        # step 2-1: get all the docs that contain all the terms in the query
        self._get_intersection(query_infos, postings_lists)

        # step 2-2: keep only the docs that really contain the phrase
        self._judge(query_infos, postings_lists)

        # step 3: rank documents based on VSM
        result, score = self.rank(query_infos, postings_lists)

        # step 4: relevance feedback based on the top docs, then rank again
        # with the updated query.  rank() only reads its inputs, so without
        # feedback a second pass would just reproduce the same result.
        if self.feedback:
            top_docs = result[:self._FEEDBACK_TOP_DOCS]
            self.refiner._feedback(query_infos, top_docs, postings_lists)
            result, score = self.rank(query_infos, postings_lists)

        # step 5: fetch url from result
        urls = self.indexer.LoadUrls(result)

        return result, urls, score

    def rank(self, query_infos, postings_lists):
        """ Rank the documents and return their docIds, most relevant first.
            Ties on score are broken by ascending docId.
        Args:
            query_infos: a list of instances of QueryInfo that contains the query
            postings_lists: the dictionary with terms to posting lists mapping
        Returns:
           result: the list of docIds ordered by descending score
           score: the list of the scores corresponding to the docIds
                  (empty when self.score is false)
        """
        # NOTE(review): every scored doc starts from a base value of 1, which
        # shifts the normalization done in _pagerank() -- confirm intended.
        total_scores = defaultdict(lambda: 1)

        for query_info in query_infos:
            scores = defaultdict(int)
            query_vector = query_info.query_vector
            is_phrase = query_info.is_phrase
            # candidates are only meaningful (and only read) for phrases
            candidates = query_info.candidates if is_phrase else None

            # accumulate weight * query weight for every posting of every term
            for i, term in enumerate(query_info.terms):
                postings_list = postings_lists[term]
                term_weight = query_vector[i]

                for doc, weight in zip(postings_list[1], postings_list[2]):
                    if is_phrase and doc not in candidates:
                        continue
                    scores[doc] += weight * term_weight

            # NOTE: pivoted document length normalization is not applied
            # here; the former implementation was disabled and relied on
            # self.total_doc / self.average, which __init__ never sets.

            # fold this query_info's scores into the totals
            for doc, value in scores.items():
                total_scores[doc] += value

        # use pagerank to change the weight (updates total_scores in place)
        self._pagerank(total_scores)

        # sort by (score, -docId) descending: best score first, and the
        # smaller docId first among equal scores
        ranked = sorted(((value, -doc) for doc, value in total_scores.items()),
                        reverse=True)

        result = [-negated_doc for _, negated_doc in ranked]
        score = [value for value, _ in ranked] if self.score else []

        return result, score

    def _pagerank(self, total_scores):
        """ Use array a generated from pagerank algorithm to change the weight.
            The scores are first divided by their sum, then each one is
            replaced by the mean of itself and self.a[0][doc].
            The dict is modified in place.
        Args:
            total_scores: dict of doc and score
        Returns:
            total_scores: the same dict, with processed scores
        """
        if not total_scores:
            return total_scores

        score_sum = sum(total_scores.values())

        for doc in total_scores:
            value = total_scores[doc]
            # a zero sum cannot be used for normalization; keep raw values
            if score_sum:
                value = value / score_sum
            total_scores[doc] = (value + self.a[0][doc]) / 2

        return total_scores

    def _get_intersection(self, query_infos, postings_lists):
        """ Get the intersection of docs.
            For every phrasal query_info, sets query_info.candidates to the
            set of docIds that appear in the postings of all of its terms.
            Non-phrasal query_infos are left untouched.
        Args:
            query_infos: a list of instances of QueryInfo that contains the query
            postings_lists: the dictionary with terms to posting lists mapping
        """
        for query_info in query_infos:
            if not query_info.is_phrase:
                continue

            terms = query_info.terms
            if len(terms) == 0:
                query_info.candidates = set()
                continue

            # start from the shortest postings so the working set is minimal
            doc_lists = sorted((postings_lists[term][1] for term in terms),
                               key=len)

            common = set(doc_lists[0])
            for docs in doc_lists[1:]:
                if not common:
                    break
                common.intersection_update(docs)

            # update the candidates
            query_info.candidates = common

    def _judge(self, query_infos, postings_lists):
        """ Judging whether candidate documents contain the phrase.
            For every phrasal query_info with more than one token, replaces
            query_info.candidates by the subset of candidates in which the
            tokens occur at consecutive positions, in order.
        Args:
            query_infos: a list of instances of QueryInfo that contains the query
            postings_lists: the dictionary with terms to posting lists mapping
        """
        for query_info in query_infos:
            if not query_info.is_phrase:
                continue

            tokens = query_info.tokens
            if len(tokens) <= 1:
                continue

            candidates = query_info.candidates

            # doc -> one position list per token, in token order
            positions = defaultdict(list)
            for token in tokens:
                postings_list = postings_lists[token]
                for doc, doc_positions in zip(postings_list[1],
                                              postings_list[3]):
                    if doc in candidates:
                        positions[doc].append(doc_positions)

            # Always assign, even when no candidate has any position entry,
            # so unverified candidates never survive.  A doc lacking a
            # position list for some token cannot contain the phrase.
            query_info.candidates = {
                doc
                for doc, per_token in positions.items()
                if len(per_token) == len(tokens)
                and self._contains_phrase(per_token)
            }

    @staticmethod
    def _contains_phrase(per_token_positions):
        """ Tell whether the position lists describe one consecutive run.
        Args:
            per_token_positions: one list of positions per phrase token,
                                 in phrase order
        Returns:
            True if some position p exists such that p + k is in the k-th
            list for every k, False otherwise
        """
        # possible phrase start positions still compatible with all tokens
        starts = set(per_token_positions[0])
        for offset, token_positions in enumerate(per_token_positions[1:], 1):
            starts.intersection_update(pos - offset for pos in token_positions)
            if not starts:
                return False
        return bool(starts)