Example #1
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        query_as_list = self._parser.parse_sentence(query)
        query_as_list_model_1 = list(query_as_list)  # copy so the extend() below cannot mutate the original query
        if self._model is not None:
            query_as_list_model_1 = self._model.extend_query(query_as_list)

        if self._model_1 is not None:
            query_as_list_model_2 = self._model_1.extend_query(query_as_list)
            query_as_list_model_1.extend(query_as_list_model_2)

        query_as_list = query_as_list_model_1

        docs_dict = self._indexer.get_docs_dict()
        relevant_docs, query_dict = self._relevant_docs_from_posting(
            query_as_list)
        if not relevant_docs:
            return 0, []

        relevant_docs1 = self._ranker.BM25(relevant_docs, query_as_list,
                                           docs_dict, query_dict)
        n_relevant = len(relevant_docs1)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs1, 2000)
        return n_relevant, ranked_doc_ids
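
Note: Example #1 delegates scoring to self._ranker.BM25(...), whose implementation is not shown. For orientation only, a minimal stand-alone BM25 scorer is sketched below; the parameter names (k1, b) and the inputs (per-document term frequencies, document frequencies) follow standard BM25 conventions and are not taken from this example's code.

import math


def bm25_score(query_terms, doc_tfs, doc_len, avg_doc_len, df, n_docs,
               k1=1.5, b=0.75):
    """Score one document against a query with classic BM25 (sketch).

    query_terms - iterable of query tokens
    doc_tfs     - {term: term frequency in this document}
    doc_len     - number of tokens in this document
    avg_doc_len - average document length over the corpus
    df          - {term: number of documents containing the term}
    n_docs      - total number of documents in the corpus
    """
    score = 0.0
    for term in query_terms:
        tf = doc_tfs.get(term, 0)
        if tf == 0 or term not in df:
            continue  # the term contributes nothing to this document
        idf = math.log(1 + (n_docs - df[term] + 0.5) / (df[term] + 0.5))
        norm = k1 * (1 - b + b * doc_len / avg_doc_len)
        score += idf * (tf * (k1 + 1)) / (tf + norm)
    return score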
Example #2
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        q_new_spelling, wrongWords = self.do_spelling(query_as_list)
        # print("query_as_list: ", query_as_list)
        # print("q_new_spelling: ", q_new_spelling)
        # print("wrongWords: ", wrongWords)
        query_as_list = self.deleteWrongSpelledWords(query_as_list, wrongWords)

        self.upper_lower_case(query_as_list, self._indexer)
        self.upper_lower_case(q_new_spelling, self._indexer)
        self.upper_lower_case(wrongWords, self._indexer)

        # print("query as list: ", query_as_list)
        # print("wordnet :", q_wordnet)
        # Find relevant docs
        relevant_docs = self._relevant_docs_from_posting(query_as_list +
                                                         q_new_spelling +
                                                         wrongWords)
        n_relevant = len(relevant_docs)
        # Send all to ranking
        ranked_doc_ids = Ranker.rank_relevant_docs(
            query_as_list + q_new_spelling, wrongWords, relevant_docs,
            self._indexer, k)
        return n_relevant, ranked_doc_ids
Example #3
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # wordnet
        for word in query_as_dict.copy().keys():
            syn = []
            # if word not in self._indexer.inverted_idx:
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    syn.append(lemma.name().replace('_',
                                                    ' '))  # add the synonyms
            for s in syn:
                if s not in query_as_dict and s in self._indexer.inverted_idx:
                    query_as_dict[s] = 1
                    break

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        n_relevant = len(ranked_doc_ids)
        # print("SE1 top5:")
        # print(ranked_doc_ids[:5])
        return n_relevant, ranked_doc_ids
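
Note: the loop above adds at most one in-index WordNet synonym per query term. The same pattern, distilled into a stand-alone helper (a sketch; the function name and arguments are illustrative, not from the example):

from nltk.corpus import wordnet


def expand_with_wordnet(query_terms, inverted_idx):
    """Return at most one in-index WordNet synonym per query term (sketch)."""
    added = []
    for term in query_terms:
        for synset in wordnet.synsets(term):
            hit = None
            for lemma in synset.lemmas():
                name = lemma.name().replace('_', ' ')  # multi-word lemmas use underscores
                if name != term and name not in query_terms and name in inverted_idx:
                    hit = name
                    break
            if hit:
                added.append(hit)
                break  # one synonym per term, as in the example
    return added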
Example #4
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # thesaurus
        for word in query_as_dict.copy().keys():
            synonyms = thes.synonyms(word)[1][1]
            if len(synonyms):
                syn = list(synonyms)[:30]
                for s in syn:
                    if s not in query_as_dict and s in self._indexer.inverted_idx:
                        query_as_dict[s] = 1
                        break

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        # print("SE4 top5:")
        # print(ranked_doc_ids[:5])
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
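
Note: NLTK's lin_thesaurus groups synonyms by similarity file, and thes.synonyms(word) returns (fileid, synonym set) pairs, so the [1][1] indexing above positionally selects the synonym set of simN.lsp (the noun similarities). Example #20 below reaches the same data by name, which is easier to read:

from nltk.corpus import lin_thesaurus as thes

# Passing fileid selects one similarity file directly; simN.lsp holds the
# noun similarities that the [1][1] indexing reaches positionally.
noun_synonyms = thes.synonyms("virus", fileid="simN.lsp")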
Example #5
    def search_with_extension(self, query, extension, k=None):
        query_as_dict = self.get_query_by_inverted_index(query)
        query_as_dict = self.get_extended_and_query_by_inverted_index(
            extension, query_as_dict)
        relevant_docs = self._relevant_docs_from_posting(query_as_dict.keys())
        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs,
                                                   query_as_dict, k)
        return n_relevant, ranked_doc_ids
Example #6
    def search(self, query, k):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        return n_relevant, ranked_doc_ids[:k]
Example #7
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list, entities = self._parser.parse_sentence(query)
        entities = entities.keys()
        query_as_list.extend(entities)
        query_expand = []

        keys = self._indexer.inverted_idx.keys()
        if self._model.__class__.__name__ == 'GlobalMethod':
            self._model.execute_global_method_and_generate_matrix(
                inverted_index=self._indexer.inverted_idx,
                postingDic=self._indexer.postingDict)

        for word in query_as_list:
            temp_words = self._model.expand_query(word)
            for inner_word in temp_words:
                if inner_word in keys and inner_word not in query_expand:
                    query_expand.append(str(inner_word))

        for term in query_as_list:
            if term in keys and term not in query_expand:
                query_expand.append(str(term))
            elif term.upper() in keys and term.upper() not in query_expand:
                query_expand.append(str(term.upper()))
            elif term.lower() in keys and term.lower() not in query_expand:
                query_expand.append(str(term.lower()))

        relevant_docs = self._relevant_docs_from_posting(query_expand)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
Example #8
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_object = self._parser.parse_query(query)

        relevant_docs = self._relevant_docs_from_posting(query_object)
        normalized_query = self.normalized_query(query_object)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs,
                                                   normalized_query,
                                                   self._indexer.docs_dict, k)
        return n_relevant, ranked_doc_ids
Example #9
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)

        relevant_docs = self.relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs, k)
        # print(np.percentile(list(dict(ranked_doc_ids).values()), 10))
        # print(max(dict(ranked_doc_ids).items(), key=operator.itemgetter(1)))
        return n_relevant, ranked_doc_ids
Example #10
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)
        query_as_list = self.get_list_words(query_as_dict)
        # query_as_list = self._parser.parse_sentence(query)
        # # thesaurus
        # for word in query_as_list.copy():
        #     if len(thes.synonyms(word)[1][1]):
        #         syn = list(thes.synonyms(word)[1][1])[:30]
        #         for s in syn:
        #             if s not in query_as_list and s in self._indexer.inverted_idx:
        #                 query_as_list.append(s)
        #                 break
        # # wordnet
        # for word in query_as_list.copy():
        #     syn = set()
        #     # if word not in self._indexer.inverted_idx:
        #     for synset in wordnet.synsets(word):
        #         for lemma in synset.lemmas():
        #             syn.add(lemma.name().replace('_', ' '))  # add the synonyms
        #     for s in syn:
        #         if s not in query_as_list and s in self._indexer.inverted_idx:
        #             query_as_list.append(s)
        #             break
        relevant_docs = self._relevant_docs_from_posting(query_as_list)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        # print("SE5 top5:")
        # print(ranked_doc_ids[:5])
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
Example #11
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        # Find wordNet and Thesaurus words
        q_wordnet = searcher_Wordnet.Searcher.do_wordnet(query_as_list)
        q_thesaurus = searcher_Thesaurus.Searcher.do_thesaurus(query_as_list)

        # q_new_spelling, wrongWords = searcher_Spelling.Searcher.do_spelling(query_as_list)

        # Upper lower case
        searcher_Wordnet.Searcher.upper_lower_case(query_as_list,
                                                   self._indexer)
        searcher_Wordnet.Searcher.upper_lower_case(q_wordnet, self._indexer)
        searcher_Wordnet.Searcher.upper_lower_case(q_thesaurus, self._indexer)
        # searcher_Wordnet.Searcher.upper_lower_case(wrongWords, self._indexer)
        # searcher_Wordnet.Searcher.upper_lower_case(q_new_spelling, self._indexer)

        # print("query as list: ", query_as_list)
        # print("wordnet :", q_wordnet)

        complete_query = query_as_list
        added_words = q_wordnet + q_thesaurus

        relevant_docs = self._relevant_docs_from_posting(complete_query +
                                                         added_words)
        n_relevant = len(relevant_docs)
        # send to ranking the wordNet + Thesaurus together
        ranked_doc_ids = Ranker.rank_relevant_docs(complete_query, added_words,
                                                   relevant_docs,
                                                   self._indexer, k)
        return n_relevant, ranked_doc_ids
Example #12
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_dict = self._parser.parse_query(query)

        # spell checker
        query_as_list = query_as_dict.copy().keys()
        spell = SpellChecker()
        # misspeled = spell.unknown(query_as_list)
        # for word in misspeled:
        for word in query_as_list:
            if ' ' not in word:
                # correct_word = spell.correction(word)
                correct_words = spell.candidates(word)
                correct_word = ''
                for c_word in correct_words:
                    if c_word != word and c_word in self._indexer.inverted_idx:
                        correct_word = c_word
                        break
                if len(correct_word) == 0:
                    continue
                query_as_dict[correct_word] = 1

        relevant_docs = self._relevant_docs_from_posting(query_as_dict)

        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        # print("SE3 top5:")
        # print(ranked_doc_ids[:5])
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
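
Note: Examples #12, #18, and #20 all follow the same spell-correction pattern: generate candidates, then keep only candidates that actually occur in the inverted index. A condensed sketch of that shared pattern (the helper name and arguments are illustrative):

from spellchecker import SpellChecker  # pyspellchecker package


def correct_against_index(tokens, inverted_idx):
    """Return one index-backed spelling correction per token (sketch)."""
    spell = SpellChecker()
    corrections = []
    for token in tokens:
        if token in inverted_idx:
            continue  # in-index terms are treated as correct for retrieval
        candidates = spell.candidates(token) or set()  # newer versions may return None
        for candidate in candidates:
            if candidate != token and candidate in inverted_idx:
                corrections.append(candidate)
                break  # take the first in-index candidate, as the examples do
    return corrections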
Example #13
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        query_as_list = self._model.spellCheck(query_as_list)

        relevant_docs, Ranker.query_weight = self._relevant_docs_from_posting(
            query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs(
            relevant_docs, self._indexer.get_docs_count())
        n_relevant = len(ranked_doc_ids)
        ranked_doc_ids = [doc_id for doc_id, rank in ranked_doc_ids]

        return n_relevant, ranked_doc_ids
Example #14
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self.parser.parse_sentence(query)

        if self.is_thesaurus:
            query_as_list_with_synonym = self.thesaurus_method(query_as_list[0])
            query_as_list = [query_as_list_with_synonym, None]

        relevant_docs = self.relevant_docs_from_posting(query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        if k:
            ranked_doc_ids = Ranker.retrieve_top_k(ranked_doc_ids, k)
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
Example #15
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # query_as_list = self._parser.parse_sentence(query)
        query_as_list = self._parser.parse_query(query)

        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            self._indexer.documents_dict, self._model, relevant_docs,
            query_as_list, k)
        # print(n_relevant, ranked_doc_ids)
        return n_relevant, ranked_doc_ids

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for term in query_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            for doc_id, tf in posting_list:
                match_count = relevant_docs.get(doc_id, 0)
                relevant_docs[doc_id] = match_count + 1

        min_len = min(2000, len(relevant_docs))
        relevant_docs_sorted = dict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True)[:min_len])
        return relevant_docs_sorted

    def basic_search(self, query, k=None):
        query_as_list = self._parser.parse_query(query)

        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.basic_rank_relevant_docs(
            relevant_docs, k)
        return n_relevant, ranked_doc_ids

    def wordnet_search(self, query, k=None):
        # nltk.download('wordnet')
        query_as_list = self._parser.parse_query(query)
        query_tmp = list(query_as_list)
        for term in query_tmp:
            synonyms = wordnet.synsets(term.lower())
            for synonym in synonyms:
                extra_term = synonym.lemmas()[0].name()
                if extra_term != term.lower():
                    query_as_list.append(extra_term)
                    break

        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self._ranker.basic_rank_relevant_docs(
            relevant_docs, k)
        return n_relevant, ranked_doc_ids
Example #16
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        # self._model = model
        self.parser = parser
        self.ranker = Ranker(indexer.tweet_info)
        self.inverted_index = indexer.inverted_idx
        self.firstUnion = True
        self.posting_dir = ConfigClass.get_output()
        self.DocsToRetrieve = ConfigClass.numOfDocsToRetrieve
        self.scoreLowerBoundFactor = 0.5

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - (list after extension, len of original query)
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        query_as_list = query
        relevant_docs, qLen = self.relevant_docs_from_posting(query_as_list)
        n_relevant = len(relevant_docs)
        ranked_doc_ids = self.ranker.rank_relevant_docs(
            relevant_docs, query_as_list)
        if k is not None and k > 0 and k < n_relevant:
            ranked_doc_ids = ranked_doc_ids[:k]
            return k, ranked_doc_ids

        return n_relevant, ranked_doc_ids

    """
    This function  count the amount of words from query that appear in each document.
    :param query: query
    :return: list (max size of self.DocsToRetrieve ) of relevant documents (first will be document that all terms in the query
     appeared in the tweet), and len of the query
    """

    def relevant_docs_from_posting(self, query):
        #the query comes at its Expended form
        sorted_l = []
        if len(query) == 0:  # empty query
            return [], 0
        relevantDocs_pq = PriorityQueue()
        modifiedQuery_l = copy.deepcopy(query)

        termsToRemoveFromQuery = []
        # At this point, if the query holds an entity, it contains both the words
        # that make up the entity and the entity itself as one term, e.g.
        # ['BILL', 'Gates', 'blabla', 'bla', 'Bill Gates']. If "Bill Gates" is a
        # known entity, the loop below leaves ['blabla', 'bla', 'Bill Gates'].
        for term in query:  # remove parts of entities from the query if the entity exists in the inverted index
            if " " in term:
                if term in self.inverted_index:  # entity and in inverted Index
                    # modifiedQuery_l.append(term)
                    entity_l = term.split(" ")
                    for word in entity_l:
                        termsToRemoveFromQuery.append(word.upper())
                else:  # unknown entity
                    modifiedQuery_l.remove(term)

        for word in termsToRemoveFromQuery:  # clear all occurrences of the token from modifiedQuery
            modifiedQuery_l[:] = [x for x in modifiedQuery_l if x != word]
        query = modifiedQuery_l
        if len(query) == 0:  # the cleaning above may have emptied the query
            return [], 0
        # count the original query length (words without ~ at the end)
        i = 0
        notExpandedQueryLen = 0
        word = query[i]
        while word[-1] != '~':
            notExpandedQueryLen += 1
            i += 1
            try:
                word = query[i]
            except IndexError:
                break
        if notExpandedQueryLen > ConfigClass.shortQueryLen:  # long query
            self.scoreLowerBoundFactor = ConfigClass.longQueryFactor
        else:
            self.scoreLowerBoundFactor = ConfigClass.shortQueryFactor
        minScoreForEntry = self.scoreLowerBoundFactor * notExpandedQueryLen

        # drop query terms that are not in the inverted index:
        listOfValidTerms = []
        for term in query:  # a term may be indexed in lower case, upper case, or as-is; otherwise it is skipped
            expandedToken = False
            if term[-1] == "~":
                expandedToken = True
                term = term[:-1]
            # keep the words from the query that appear in the inverted index (in either lower or upper case)
            if term.lower() in self.inverted_index:
                term = term.lower()
                if expandedToken:
                    term += "~"
            elif term.upper() in self.inverted_index:
                term = term.upper()
                if expandedToken:
                    term += "~"
            elif term in self.inverted_index:  # only for entities
                if expandedToken:
                    term += "~"
            else:
                continue
            listOfValidTerms.append(term)

        ###################################################
        #######          SEARCHING PART               #####
        ###################################################

        numOfValidTerms = len(listOfValidTerms)
        if numOfValidTerms == 0:  # no valid terms in the query
            return [], 0

        if numOfValidTerms == 1:  # only one query word was found in the inverted index
            term = listOfValidTerms[0]
            if term[-1] == "~":  # only for an expanded token
                factor = ConfigClass.expendedWordWeight
                term = term[:-1]
            else:
                factor = ConfigClass.wordFromOGQueryWeight
            nodes_l = self.inverted_index[term][2]
            for node in nodes_l:
                max_f = self.ranker.tweets_info[node.tweetID][1]
                self.ranker.tweet_SigmaWij_d[node.tweetID] = [
                    node.tf * max_f, node.Wij * factor
                ]  # node.tf * max_f recovers the raw number of appearances of the term in the tweet
                score = node.tf * max_f  # number of appearances of the query word in this tweet
                ##################################
                if score >= minScoreForEntry:
                    relevantDocs_pq.put(
                        (-score, node.tweetID)
                    )  # -score is to reverse the queue to max priority first.
                ##################################
            while len(sorted_l
                      ) < self.DocsToRetrieve and relevantDocs_pq.qsize() > 0:
                itemFromPq = relevantDocs_pq.get()
                positiveScore_tweetID_Tuple = (-itemFromPq[0], itemFromPq[1])
                sorted_l.append(positiveScore_tweetID_Tuple)
            return sorted_l, len(query)

        # query length > 1
        # initialize tweet_SigmaWij_d from the first term's node list; the remaining lists are unioned in below
        factor = ConfigClass.wordFromOGQueryWeight
        term = listOfValidTerms[0]
        if term[-1] == "~":  # check whether the first valid term is an expanded word
            term = term[:-1]
            factor = ConfigClass.expendedWordWeight
        unionList = self.inverted_index[term][2]  # list of nodes
        for node in unionList:
            self.ranker.tweet_SigmaWij_d[node.tweetID] = [1, node.Wij * factor]

        for i in range(1, len(listOfValidTerms)):
            term = listOfValidTerms[i]
            if term[-1] == "~":  # check if first word in valid terms is an expended word
                term = term[:-1]
                factor = ConfigClass.expendedWordWeight
            else:
                factor = ConfigClass.wordFromOGQueryWeight
            unionList = self.UnionLists(unionList,
                                        self.inverted_index[term][2], factor)

        for node in unionList:
            score = self.ranker.tweet_SigmaWij_d[node.tweetID][
                0]  # number of query words that appear in this tweet
            ##################################
            if score >= minScoreForEntry:
                relevantDocs_pq.put(
                    (-score, node.tweetID)
                )  # -score is to reverse the queue to max priority first.
            ##################################
        while len(sorted_l) < self.DocsToRetrieve and relevantDocs_pq.qsize(
        ) > 0:
            itemFromPq = relevantDocs_pq.get()
            positiveScore_tweetID_Tuple = (-itemFromPq[0], itemFromPq[1])
            sorted_l.append(positiveScore_tweetID_Tuple)
        return sorted_l, len(query)

    def UnionLists(
        self, listA, listB, factor
    ):  # factor is 1 for a term from the original query, 0.5 for an expanded term
        listA_len, listB_len = len(listA), len(listB)
        tweet_SigmaWij_d = self.ranker.tweet_SigmaWij_d
        a, b = 0, 0
        union_l = []
        while a < listA_len and b < listB_len:
            if listA[a] < listB[b]:
                union_l.append(listA[a])
                a += 1
            elif listB[b] < listA[a]:  # update Wij and the appearance count for this tweet
                union_l.append(listB[b])
                if listB[b].tweetID in tweet_SigmaWij_d:
                    tweet_SigmaWij_d.get(
                        listB[b].tweetID)[0] += 1  # appearance count
                    tweet_SigmaWij_d.get(
                        listB[b].tweetID
                    )[1] += listB[b].Wij * factor  # update Wij
                else:
                    tweet_SigmaWij_d[listB[b].tweetID] = [
                        1, listB[b].Wij * factor
                    ]
                b += 1
            else:
                union_l.append(listB[b])
                # update Wij with B's Wij and increment the tweet's appearance count
                tweet_SigmaWij_d.get(listB[b].tweetID)[0] += 1  # appearance count
                tweet_SigmaWij_d.get(
                    listB[b].tweetID)[1] += listB[b].Wij * factor  # update Wij
                b += 1
                a += 1

        while a < listA_len:
            union_l.append(listA[a])
            a += 1

        while b < listB_len:
            union_l.append(listB[b])
            # update Wij with B's Wij and increment the tweet's appearance count
            if listB[b].tweetID in tweet_SigmaWij_d:
                tweet_SigmaWij_d.get(listB[b].tweetID)[0] += 1  # appearance count
                tweet_SigmaWij_d.get(
                    listB[b].tweetID)[1] += listB[b].Wij * factor  # update Wij
            else:
                tweet_SigmaWij_d[listB[b].tweetID] = [1, listB[b].Wij * factor]
            b += 1

        return union_l
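
Note: UnionLists above is a standard merge of two sorted posting lists that also accumulates per-tweet scores along the way. Stripped of the scoring bookkeeping, the underlying merge reduces to this sketch:

def union_sorted(list_a, list_b):
    """Merge two sorted lists into a sorted union, collapsing duplicates (sketch)."""
    out, a, b = [], 0, 0
    while a < len(list_a) and b < len(list_b):
        if list_a[a] < list_b[b]:
            out.append(list_a[a])
            a += 1
        elif list_b[b] < list_a[a]:
            out.append(list_b[b])
            b += 1
        else:  # the same element appears in both lists; keep one copy
            out.append(list_b[b])
            a += 1
            b += 1
    out.extend(list_a[a:])  # drain whichever list still has elements
    out.extend(list_b[b:])
    return out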
Example #17
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_list = query.split(" ")
        query_as_list = self._parser.text_operation(query_list)
        # extension by WordNet
        query_list_after_word_net = self.q_word_net(query_as_list)
        # remove stop words
        query_as_list = self._parser.parse_sentence(query_list_after_word_net)
        # find the docs
        relevant_docs = self._relevant_docs_from_posting(
            query_as_list)  # returns all the relevant docs for the query

        relevant_docs = OrderedDict(
            sorted(relevant_docs.items(),
                   key=lambda item: item[1],
                   reverse=True))
        relevant_docs = dict(itertools.islice(relevant_docs.items(),
                                              2000))  # cap at 2000 docs
        relevant_docs_sort = self._ranker.rank_relevant_docs(
            relevant_docs, self._indexer, len(query_as_list))
        n_relevant = len(relevant_docs)
        if k is not None:
            relevant_docs_sort = self._ranker.retrieve_top_k(
                relevant_docs_sort, k)
        return n_relevant, relevant_docs_sort

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for word in query_as_list:
            posting_list = self._indexer.get_term_posting_list(
                word)  # get all the tweets containing this word
            tf = self._indexer.get_term_inverted_idx(word)[2]  # per-term weight, constant over the posting list
            for doc in posting_list:
                doc_id = doc[0]
                tfidf = doc[4] * tf
                if doc_id not in relevant_docs:
                    relevant_docs[doc_id] = [1, [tfidf]]
                else:
                    relevant_docs[doc_id][1].append(tfidf)
                    relevant_docs[doc_id][0] += 1

        return relevant_docs

    """
    this function expand the query by using word net 
    get query as list and add words by word net
    """

    def q_word_net(self, query):
        extend_query = []
        extend_query.extend(query)
        for word in query:
            add_new_word = False
            counter_same_word = 0
            syn_list = wn.synsets(word)
            for i in range(len(syn_list)):
                if syn_list[i].lemma_names() != []:
                    for lemma in syn_list[i].lemma_names():
                        if lemma == word:
                            continue
                        else:
                            new_word = lemma
                            if "_" not in new_word:
                                if self._indexer._is_term_exist_in_idx(
                                        new_word):
                                    extend_query.append(new_word)
                                    add_new_word = True
                                    break
                            else:  # more than one word
                                new_word_list = new_word.split("_")
                                for w in new_word_list:
                                    if self._indexer._is_term_exist_in_idx(w):
                                        extend_query.extend(new_word_list)
                                        add_new_word = True
                                        break
                        if add_new_word == True:
                            break
                        counter_same_word += 1
                        if counter_same_word > 1:
                            break
                    if add_new_word == True:
                        break
                if add_new_word == True:
                    break
                elif i > 1:
                    break

        return set(extend_query)
Example #18
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        # parse query according to the same parsing rules of the corpus
        entities = {}
        term_dict = {}
        parsed_query = self._parser.parse_sentence(query,
                                                   entities,
                                                   stemming=self.stemming)
        self._parser.parse_capital_letters(parsed_query, term_dict)
        processed_query = [*term_dict.keys()] + [*entities.keys()]

        # perform spell correction
        if self.spell_correction:

            from spellchecker import SpellChecker
            spell_checker = SpellChecker()
            corrected_terms = []

            # list all misspelled terms in the query
            misspelled_terms = spell_checker.unknown([*term_dict.keys()])
            for term in misspelled_terms:

                # only correct terms that aren't in the inverted dictionary
                # terms in the dictionary are considered correct for retrieval
                if term not in self._indexer.inverted_idx:
                    candidates = list(spell_checker.candidates(term))
                    max_to_return = min(Searcher.TOP_N, len(candidates))
                    candidates = candidates[:max_to_return]  # keep only the top candidates
                    if term in candidates:  # drop the original term if it was suggested back
                        candidates.remove(term)

                    # drop corrections that already appear in the query
                    candidates = [c for c in candidates if c not in parsed_query]

                    corrected_terms.extend(candidates)

            processed_query += corrected_terms  # extend query with corrected words

        if self.thesaurus:

            from nltk.corpus import lin_thesaurus as thes

            candidates = []
            for term in processed_query:

                synsets = thes.synonyms(term)
                for synset in synsets:
                    synonyms = [*synset[1]]
                    if len(synonyms) > 0:
                        max_to_return = min(Searcher.TOP_N, len(synonyms))
                        best_synonyms = synonyms[:max_to_return]
                        for synonym in best_synonyms:
                            if synonym != term and synonym not in processed_query and synonym in self._indexer.inverted_idx:
                                candidates.append(synonym)  # extend the query
                        break

            processed_query += candidates

        if self.wordnet:

            from nltk.corpus import wordnet

            candidates = []
            for term in processed_query:
                synsets = wordnet.synsets(term)  # retrieve the best synsets
                max_to_return = min(Searcher.TOP_N, len(synsets))
                synsets = synsets[:max_to_return]
                skip = False
                for synset in synsets:
                    for lemma in synset.lemmas()[:max_to_return]:  # possible synonyms
                        name = lemma.name()
                        if name != term and name not in processed_query:
                            # accept the synonym if any case variant is indexed
                            if (name in self._indexer.inverted_idx
                                    or name.lower() in self._indexer.inverted_idx
                                    or name.upper() in self._indexer.inverted_idx):
                                candidates.append(name)
                                skip = True
                                break

                    if skip:
                        break

            processed_query += candidates

        # dictionary for holding all relevant documents (at least one query term appeared in the document)
        # format: {document_id: score}
        relevant_docs = {}
        for term in processed_query:

            # check if term exists in inverted dictionary in either lower or upper form
            if term in self._indexer.inverted_idx:
                self.calculate_doc_scores(term, relevant_docs)
            elif term.islower() and term.upper() in self._indexer.inverted_idx:
                self.calculate_doc_scores(term.upper(), relevant_docs)
            elif term.isupper() and term.lower() in self._indexer.inverted_idx:
                self.calculate_doc_scores(term.lower(), relevant_docs)

        n_relevant = len(relevant_docs)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)

        return n_relevant, ranked_doc_ids
Example #19
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)

        if self.spellcheck:
            query_as_list = self.spell_check_query(query_as_list)

        if self.Sij_dic is not None:
            query_as_list.extend(self.expand_query_global_method(query_as_list))

        if self.word_net:
            expend = []
            for term in query_as_list:
                res = self.WordNet(term, query_as_list)
                if res is not None:
                    expend.append(res)

            if len(expend) != 0:
                query_as_list.extend(expend)

        if self.word2vec:

            if self.local:

                lst_before_extend = self._relevant_docs_from_posting(query_as_list)

                add_to_query = Ranker.compute_extend_word(self._ranker, lst_before_extend)

                query_as_list.extend(add_to_query)

                self.counter_of_terms.clear()
                self.unique_tweets_num.clear()
                self.relevant_docs.clear()

            relevant_docs = self.second(query_as_list)
            ranked_doc_ids = Ranker.rank_relevant_docs_w2v(self._ranker, self._model, query_as_list, relevant_docs)

            return len(ranked_doc_ids), ranked_doc_ids

        if self.local:
            lst_before_extend = self._relevant_docs_from_posting(query_as_list)

            add_to_query = Ranker.compute_extend_word(self._ranker, lst_before_extend)

            query_as_list.extend(add_to_query)

            self.counter_of_terms.clear()
            self.unique_tweets_num.clear()
            self.relevant_docs.clear()

            lst_After_extend = self.second(query_as_list)

            ranked_doc_ids = Ranker.rank_relevant_docs(self._ranker, lst_After_extend)

            return len(ranked_doc_ids), ranked_doc_ids

        relevant_docs = self.second(query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs(self._ranker, relevant_docs)

        return len(ranked_doc_ids), ranked_doc_ids
Example #20
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # all_dicts = self._indexer.load_index('inverted_idx.pkl')
        inverted_index = self._indexer.inverted_idx
        posting = self._indexer.postingDict
        documents = self._indexer.documents
        dict_of_methods = self._indexer.dict_of_method

        if dict_of_methods['wordnet']:
            # wordnet method
            doc_query_app = self.finished_dict(query, inverted_index)  # first parse the query words
            list_of_query = doc_query_app.keys()
            words_to_add = {}
            # get each query word its synsets and add to query the ones that in inverted index
            for word in list_of_query:
                opt = wordnet.synsets(word)
                for i in range(len(opt)):
                    check_word = opt[i].lemmas()[0].name()
                    if check_word in doc_query_app.keys() or check_word in words_to_add.keys():
                        continue
                    tested = self._indexer.check_upper_lower(inverted_index, check_word)
                    if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                        continue
                    if tested[1] is True:
                        words_to_add[tested[0]] = 0.0001
                    elif tested[1] == 'replace':
                        words_to_add[tested[0].upper()] = 0.0001
            doc_query_app.update(words_to_add)

        elif dict_of_methods['spell_correction']:
            spell = SpellChecker(case_sensitive=True)
            query_as_list = query.split()
            for index in range(len(query_as_list)):
                is_upper = False
                word = query_as_list[index]
                # if word from query not in inverted index look for correction- take the first one that is in inverted index
                if self._indexer.check_upper_lower(inverted_index, word)[1] is False:  # word not in inverted index
                    if word[0].isupper() is True:
                        is_upper = True
                    options = spell.candidates(word)
                    is_found = False
                    i = 0
                    options = list(options)
                    while i < len(options):
                        if self._indexer.check_upper_lower(inverted_index, options[i])[1] is True:
                            corrected = options[i]
                            is_found = True
                            break
                        i += 1
                    # corrected = spell.correction(word)
                    if is_found is not False and corrected != query_as_list[index]:
                        if is_upper is True:
                            corrected = corrected.capitalize()
                        query_as_list[index] = corrected
            doc_query_app = self.finished_dict(" ".join(query_as_list), inverted_index)

        elif dict_of_methods['word2vec']:
            words_to_add = {}
            doc_query_app = self.finished_dict(query, inverted_index)
            query_as_list = query.split()
            insert_new_words = []
            for word in query_as_list:
                if word in self._model.wv.vocab:
                    lst_sim_word_model = self._model.most_similar(word.lower())
                    for similiar_word in lst_sim_word_model:
                        if similiar_word[1] > 0.33:
                            insert_new_words.append(similiar_word[0])

            # if len(insert_new_words) == 0:
            #     continue
            idx = 0
            while idx < len(insert_new_words):
                if insert_new_words[idx] in doc_query_app.keys() or insert_new_words[idx] in words_to_add.keys():
                    idx += 1
                    continue
                tested = self._indexer.check_upper_lower(inverted_index, insert_new_words[idx])
                if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                    idx += 1
                    continue
                if tested[1] is True:
                    words_to_add[tested[0]] = 0.6
                    break
                elif tested[1] == 'replace':
                    words_to_add[tested[0].upper()] = 0.6
                    break
                idx += 1
            doc_query_app.update(words_to_add)

        elif dict_of_methods['thesaurus']:
            doc_query_app = self.finished_dict(query, inverted_index) #  first parse query words
            list_of_query = list(doc_query_app.keys())
            words_to_add = {}
            # get each query word its synonyms and add to query the first that is in inverted index
            stop = set(stopwords.words('english'))
            results = [thes.synonyms(i, fileid="simN.lsp") for i in list_of_query if i not in stop]
            results_as_list = list(results)
            for words in results_as_list:
                inside_list = list(words)
                if len(inside_list) == 0:
                    continue
                idx = 0
                while idx < len(inside_list):
                    if inside_list[idx] in doc_query_app.keys() or inside_list[idx] in words_to_add.keys():
                        idx += 1
                        continue
                    tested = self._indexer.check_upper_lower(inverted_index, inside_list[idx])
                    if tested[1] is False or tested[0] in doc_query_app.keys() or tested[0] in words_to_add.keys():
                        idx += 1
                        continue
                    if tested[1] is True:
                        words_to_add[tested[0]] = 0.0001
                        break
                    elif tested[1] == 'replace':
                        words_to_add[tested[0].upper()] = 0.0001
                        break
                    idx += 1
            doc_query_app.update(words_to_add)

        else:  # dict_of_methods['parser'] is True
            doc_query_app = self.finished_dict(query, inverted_index)

        if len(doc_query_app) == 0:
            return 0, []

        dict_relevant_docs = self._relevant_docs_from_posting(doc_query_app, posting)
        ranked_doc_ids = Ranker.rank_relevant_docs(dict_relevant_docs, posting, documents, doc_query_app)
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
Example #21
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer

        self._ranker = Ranker()
        self._model = model
        self._config = self._indexer.config
        self._method_class = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.load_index("idx_bench.pkl")
        query_as_list = self._parser.parse_sentence(query)[0]
        query_dict, max_tf_query = self.get_query_dict(query_as_list)
        expanded_query_dict = self._method_class.expand_query(
            query_dict, max_tf_query)
        return self.search_helper(expanded_query_dict, k,
                                  self._method_class.p_threshold,
                                  self._method_class.p_rel)

    # create {term : tf} for query
    def get_query_dict(self, tokenized_query):
        max_tf = 1
        query_dict = {}
        for index, term in enumerate(tokenized_query):
            if term not in query_dict:
                query_dict[term] = 1

            else:
                query_dict[term] += 1
                if query_dict[term] > max_tf:
                    max_tf = query_dict[term]

        for term in query_dict:
            query_dict[term] /= max_tf

        return query_dict, max_tf

    def relevant_docs_from_posting(self, query_dict, p_threshold=0):
        relevant_docs = {}
        query_vector = np.zeros(len(query_dict), dtype=float)
        full_cells_threshold = round(p_threshold * len(query_vector))

        docs_index = self.get_doc_index()  # constant across terms, so fetched once
        for idx, term in enumerate(query_dict):
            try:
                tweets_per_term = self._indexer.get_term_posting_tweets_dict(
                    term)

                for tweet_id, vals in tweets_per_term.items():
                    doc_date = docs_index[tweet_id][1]
                    if tweet_id not in relevant_docs:
                        relevant_docs[tweet_id] = [
                            np.zeros(len(query_dict), dtype=float), doc_date
                        ]

                    # Wij - update tweet vector in index of term with tf-idf
                    tf_tweet = vals[0]
                    idf_term = self._indexer.get_term_idf(term)
                    relevant_docs[tweet_id][0][idx] = tf_tweet * idf_term

                    # Wiq - update query vector in index of term with tf-idf
                    tf_query = query_dict[term]
                    query_vector[idx] = tf_query * idf_term
            except Exception:
                # term (or tweet) missing from the index; skip it
                pass

        # optimization: drop documents that match fewer query terms than the threshold
        for doc in list(relevant_docs.keys()):
            if np.count_nonzero(relevant_docs[doc][0]) < full_cells_threshold:
                del relevant_docs[doc]

        return relevant_docs, query_vector
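
The Ranker that consumes these (vector, date) pairs is not shown in this example; given the tf-idf construction above, a cosine-similarity scoring step is a plausible reading. A minimal sketch under that assumption (not the real Ranker):

import numpy as np

# Assumed ranking step: cosine similarity between each tweet's tf-idf vector
# and the query vector, highest first.
def cosine_rank(relevant_docs, query_vector):
    scores = {}
    q_norm = np.linalg.norm(query_vector)
    for tweet_id, (doc_vector, _doc_date) in relevant_docs.items():
        d_norm = np.linalg.norm(doc_vector)
        if q_norm == 0 or d_norm == 0:
            scores[tweet_id] = 0.0
        else:
            scores[tweet_id] = float(np.dot(doc_vector, query_vector) / (q_norm * d_norm))
    return sorted(scores, key=scores.get, reverse=True)
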

    def set_method_type(self, method_type):
        if method_type == '1':
            self._method_class = LocalMethod(self)
        elif method_type == '2':
            self._method_class = Thesaurus(self)
        elif method_type == '3':
            self._method_class = Wordnet(self)
        elif method_type == '4':
            self._method_class = MySpellCheker(self)
        # elif.. more methods

    def get_term_index(self):
        return self._indexer.inverted_idx_term

    def get_doc_index(self):
        return self._indexer.inverted_idx_doc

    def is_term_in_index(self, term):
        return term in self._indexer.inverted_idx_term

    def search_helper(self, query_dict, k, p_threshold=0, p_relevant=0):
        relevant_docs, query_vector = self.relevant_docs_from_posting(
            query_dict, p_threshold)
        n_relevant = len(relevant_docs)
        ranked_docs = self._ranker.rank_relevant_docs(relevant_docs,
                                                      query_vector)
        return n_relevant, self._ranker.retrieve_top_k(ranked_docs, k,
                                                       p_relevant)
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker(indexer)
        self._model = model
        self.word_net = WordNet(indexer)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # parse the query into a list of terms and a dict of entities
        query_as_list, entity_dict = self._parser.parse_sentence(query)
        entity_as_list = list(entity_dict.keys())
        # expand the query with WordNet synonyms
        query_expanded = self.word_net.query_expan(query_as_list)
        query_as_list.extend(query_expanded)

        # get the relevant doc ids and the posting info of the relevant terms
        relevant_docs_query, relevant_terms_query = self._relevant_docs_from_posting(
            query_as_list)
        relevant_docs_entity, relevant_terms_entity = self._relevant_docs_to_entity(
            entity_as_list)

        # combine the doc and entity dictionaries (shared ids keep the entity value; see the note after this method)
        full_relevant_doc = {**relevant_docs_query, **relevant_docs_entity}
        full_relevant_term = {**relevant_terms_query, **relevant_terms_entity}

        n_relevant = len(full_relevant_doc)
        # rank the combined results
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            relevant_doc=full_relevant_doc,
            relevant_terms=full_relevant_term,
            query_terms=query_as_list)
        try:
            doc_id, doc_rank = zip(*ranked_doc_ids)
        except ValueError:  # nothing to unpack when no documents were ranked
            doc_id = ()
        return n_relevant, list(doc_id)
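
A note on the {**a, **b} merge used in search above: when a doc_id appears in both dictionaries, the entity-side count overwrites the query-side count rather than adding to it. If summing were intended, a Counter merge would do it (shown only as a contrast, not what this example does):

from collections import Counter

docs_query = {'t1': 2, 't2': 1}
docs_entity = {'t2': 3}
print({**docs_query, **docs_entity})                      # {'t1': 2, 't2': 3} - entity value wins
print(dict(Counter(docs_query) + Counter(docs_entity)))  # {'t1': 2, 't2': 4} - counts summed
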

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting lists and counts, per document, how many
        query terms appear in it.
        :param query_as_list: parsed query tokens
        :return: a dictionary mapping doc_id to the number of matching query terms,
                 and a dictionary mapping each term to its posting list.
        """
        relevant_terms = {}
        relevant_docs = {}
        for term in query_as_list:
            posting_list_of_term = []
            # numeric terms are looked up as-is
            if term.isnumeric():
                posting = self._indexer.get_term_posting_list(term)
                if posting is not None:
                    posting_list_of_term.extend(posting)
            else:  # otherwise look up both the original and the case-folded form
                posting = self._indexer.get_term_posting_list(term)
                if posting is not None:
                    posting_list_of_term.extend(posting)
                posting = self._indexer.get_term_posting_list(term.casefold())
                if posting is not None:
                    posting_list_of_term.extend(posting)
            # count each matching document and keep the term's posting info
            if posting_list_of_term:
                relevant_terms[term] = posting_list_of_term
            for doc_id, tf in posting_list_of_term:
                if doc_id in relevant_docs:
                    relevant_docs[doc_id] += 1
                else:
                    relevant_docs[doc_id] = 1
        return relevant_docs, relevant_terms
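
The casefold() lookup above is slightly stronger than lower(): it also folds characters such as the German eszett, which lower() leaves untouched:

print('Straße'.casefold())  # 'strasse'
print('Straße'.lower())     # 'straße'
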

    def _relevant_docs_to_entity(self, entity_as_list):
        relevant_docs = {}
        relevant_terms = {}
        for term in entity_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            if posting_list:
                relevant_terms[term] = posting_list
                for doc_id, tf in posting_list:
                    relevant_docs[doc_id] = relevant_docs.get(doc_id, 0) + 1
        return relevant_docs, relevant_terms
Example #23
0
class Searcher:
    __slots__ = [
        '_parser', '_indexer', '_ranker', '_the_count', '_model',
        '_min_relevant', '_ext_val', '_wordnet_count'
    ]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, config, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker(config)
        self._model = model
        self._the_count = config.the_count
        self._wordnet_count = config.wordnet_count
        self._min_relevant = config.min_relevant
        self._ext_val = config.ext_val

    # assign query-term weights: original terms get 1, expansion terms add _ext_val
    def CalculateW(self, query, extenders):
        output = {term: 1 for term in query}
        for term in extenders:
            if term not in output:
                output[term] = 0
            output[term] += self._ext_val
        return output
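
    # Worked example of the weighting: original query terms score 1, expansion-only
    # terms score _ext_val, and a term appearing in both scores 1 + _ext_val.
    # With _ext_val = 0.5 (illustrative value):
    #   query     = ['covid', 'vaccine']
    #   extenders = {'vaccine', 'immunization'}
    #   output    = {'covid': 1, 'vaccine': 1.5, 'immunization': 0.5}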

    # collect synonym (and first-antonym) lemma names from WordNet, capped at _wordnet_count
    def wordNet(self, word):
        syn = set()
        for syn_set in wordnet.synsets(word):
            for lemma in syn_set.lemmas():
                syn.add(lemma.name())
                if lemma.antonyms():
                    syn.add(lemma.antonyms()[0].name())
                if len(syn) >= self._wordnet_count:
                    return syn
        return syn
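
The helper above uses NLTK's WordNet interface; the same lemma/antonym walk runs standalone as follows (requires nltk plus a one-time corpus download; the query word is arbitrary):

import nltk
nltk.download('wordnet', quiet=True)  # one-time corpus download
from nltk.corpus import wordnet

# Same traversal as wordNet() above, without the count cap.
syn = set()
for syn_set in wordnet.synsets('happy'):
    for lemma in syn_set.lemmas():
        syn.add(lemma.name())
        if lemma.antonyms():
            syn.add(lemma.antonyms()[0].name())
print(sorted(syn))  # includes synonyms such as 'felicitous' and 'glad', and the antonym 'unhappy'
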

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None, methods=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        if methods is None:  # guard: a plain search(query) call passes no methods
            methods = set()

        # spell corrections
        if 1 in methods:
            spell = SpellChecker()
            query = ' '.join(
                [spell.correction(word) for word in query.split()])

        query_terms = self._parser.Tokenize(query).keys()
        extenders = set()

        # wordNet
        if 2 in methods:
            for word in query_terms:
                for ex_word in self.wordNet(word.text):
                    extenders.add(self._parser.add_to_dict(ex_word))

        # lin_thesaurus
        if 3 in methods:
            for word in query_terms:
                for ex_word in list(thes.synonyms(
                        word.text)[1][1])[:self._the_count]:
                    extenders.add(self._parser.add_to_dict(ex_word))

        extenders = {extender for extender in extenders if extender}
        w_of_term_in_query = self.CalculateW(query_terms, extenders)

        relevant_docs = self._relevant_docs_from_posting(
            w_of_term_in_query.keys())
        ranked_doc_ids = self._ranker.rank_relevant_docs(
            relevant_docs, k, w_of_term_in_query)

        return len(ranked_doc_ids), ranked_doc_ids

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_terms):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_terms: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for term in query_terms:
            if len(term.postings) == 0:
                continue
            idf = math.log2(len(self._indexer.documents) / len(term.postings))  # idf = log2(N / df)
            for doc_id, tf in term.postings:
                if doc_id not in relevant_docs:
                    relevant_docs[doc_id] = {}
                relevant_docs[doc_id][term] = tf * idf  # w_ij: tf-idf weight of term in doc

        return {
            doc: relevant_docs[doc]
            for doc in relevant_docs if len(relevant_docs[doc]) >= min(
                self._min_relevant, len(query_terms))
        }
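
With the idf written as log2(N / df), a term that appears in 10 of 1,000 indexed documents contributes log2(100) ≈ 6.64 per unit of tf. The closing comprehension then keeps only documents that matched at least min(_min_relevant, len(query_terms)) query terms; a toy run of that filter (all names and values illustrative):

# Toy run of the final filtering step.
relevant_docs = {
    'd1': {'covid': 6.6, 'vaccine': 3.3},
    'd2': {'covid': 6.6},
}
min_relevant, n_query_terms = 2, 2
filtered = {
    doc: weights for doc, weights in relevant_docs.items()
    if len(weights) >= min(min_relevant, n_query_terms)
}
print(filtered)  # only 'd1' survives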