Example #1
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self.parser.parse_sentence(query)

        if self.is_thesaurus:
            # Expand the token list (the first element of the parsed query)
            # with thesaurus synonyms, keeping the two-element shape that the
            # posting lookup expects.
            query_as_list_with_synonym = self.thesaurus_method(query_as_list[0])
            query_as_list = [query_as_list_with_synonym, None]

        relevant_docs = self.relevant_docs_from_posting(query_as_list)
        ranked_doc_ids = Ranker.rank_relevant_docs(relevant_docs)
        if k is not None:
            ranked_doc_ids = Ranker.retrieve_top_k(ranked_doc_ids, k)
        n_relevant = len(ranked_doc_ids)
        return n_relevant, ranked_doc_ids
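
A hedged usage sketch (the Searcher construction and the parser/indexer
objects are assumed from the surrounding project; only the search call
reflects the method above):

searcher = Searcher(parser, indexer)  # hypothetical wiring
n_relevant, tweet_ids = searcher.search("covid vaccine", k=10)
# tweet_ids[0] is the most relevant result; n_relevant is at most 10 here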
Example #2
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer

        self._ranker = Ranker()
        self._model = model
        self._config = self._indexer.config
        self._method_class = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        # The index is reloaded from disk on every query here; caching it
        # after the first load would avoid the repeated I/O.
        self._indexer.load_index("idx_bench.pkl")
        query_as_list = self._parser.parse_sentence(query)[0]
        query_dict, max_tf_query = self.get_query_dict(query_as_list)
        expanded_query_dict = self._method_class.expand_query(
            query_dict, max_tf_query)
        return self.search_helper(expanded_query_dict, k,
                                  self._method_class.p_threshold,
                                  self._method_class.p_rel)

    # Build a {term: tf} mapping for the query, with term frequencies
    # normalized by the query's maximum term frequency.
    def get_query_dict(self, tokenized_query):
        max_tf = 1
        query_dict = {}
        for term in tokenized_query:
            query_dict[term] = query_dict.get(term, 0) + 1
            if query_dict[term] > max_tf:
                max_tf = query_dict[term]

        for term in query_dict:
            query_dict[term] /= max_tf

        return query_dict, max_tf
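
    # For instance, a repeated term normalizes like this (a standalone check
    # of the logic above, assuming a constructed Searcher):
    #
    #   >>> searcher.get_query_dict(["trump", "covid", "trump"])
    #   ({'trump': 1.0, 'covid': 0.5}, 2)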

    def relevant_docs_from_posting(self, query_dict, p_threshold=0):
        relevant_docs = {}
        query_vector = np.zeros(len(query_dict), dtype=float)
        full_cells_threshold = round(p_threshold * len(query_vector))
        docs_index = self.get_doc_index()

        for idx, term in enumerate(query_dict):
            try:
                tweets_per_term = self._indexer.get_term_posting_tweets_dict(
                    term)

                for tweet_id, vals in tweets_per_term.items():
                    doc_date = docs_index[tweet_id][1]
                    if tweet_id not in relevant_docs:
                        relevant_docs[tweet_id] = [
                            np.zeros(len(query_dict), dtype=float), doc_date
                        ]

                    # Wij - tweet-vector entry for this term (tf-idf)
                    tf_tweet = vals[0]
                    idf_term = self._indexer.get_term_idf(term)
                    relevant_docs[tweet_id][0][idx] = tf_tweet * idf_term

                    # Wiq - query-vector entry for this term (tf-idf)
                    tf_query = query_dict[term]
                    query_vector[idx] = tf_query * idf_term
            except Exception:
                # Term (or its doc metadata) is missing from the index; skip it.
                continue

        # Drop documents that match fewer than a p_threshold fraction of the
        # query terms (too few non-zero cells in their vectors).
        for doc in list(relevant_docs.keys()):
            if np.count_nonzero(relevant_docs[doc][0]) < full_cells_threshold:
                del relevant_docs[doc]

        return relevant_docs, query_vector
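
    # To make the threshold filter concrete (hypothetical numbers): a
    # four-term query with p_threshold = 0.5 gives
    # full_cells_threshold = round(0.5 * 4) = 2, so a document vector like
    #
    #   >>> doc_vec = np.zeros(4); doc_vec[1] = 1.7
    #   >>> np.count_nonzero(doc_vec) < 2
    #   True
    #
    # has too few matched query terms and is deleted from relevant_docs.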

    def set_method_type(self, method_type):
        if method_type == '1':
            self._method_class = LocalMethod(self)
        elif method_type == '2':
            self._method_class = Thesaurus(self)
        elif method_type == '3':
            self._method_class = Wordnet(self)
        elif method_type == '4':
            self._method_class = MySpellCheker(self)
        # elif ...: additional query-expansion methods go here

    def get_term_index(self):
        return self._indexer.inverted_idx_term

    def get_doc_index(self):
        return self._indexer.inverted_idx_doc

    def is_term_in_index(self, term):
        return term in self._indexer.inverted_idx_term

    def search_helper(self, query_dict, k, p_threshold=0, p_relevant=0):
        relevant_docs, query_vector = self.relevant_docs_from_posting(
            query_dict, p_threshold)
        n_relevant = len(relevant_docs)
        ranked_docs = self._ranker.rank_relevant_docs(relevant_docs,
                                                      query_vector)
        return n_relevant, self._ranker.retrieve_top_k(ranked_docs, k,
                                                       p_relevant)
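
Note that search relies on set_method_type having been called first;
otherwise self._method_class is still None and expand_query fails. A minimal
sketch, assuming parser and indexer are built elsewhere in the project:

searcher = Searcher(parser, indexer)
searcher.set_method_type('3')  # expand queries with the Wordnet class
n_relevant, top_ids = searcher.search("covid vaccine side effects", k=50)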
Example #3
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self.config = indexer.config
        self._parser = parser
        self._indexer = indexer
        self.number_of_docs = indexer.num_of_docs
        self._model = model
        # The index is assumed to already be loaded on the indexer.
        self.inverted_index = self._indexer.inverted_idx
        self.document_dict = self._indexer.document_dict

        self.glove_dict = self._indexer.glove_dict
        use_glove = len(self.glove_dict) > 0
        self.ranker = Ranker(self.config, use_glove)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        relevant_docs, query_glove_vec, square_w_iq = self.relevant_docs_from_posting(
            query)
        ranked_docs = self.ranker.rank_relevant_doc(relevant_docs,
                                                    query_glove_vec,
                                                    square_w_iq)
        # retrieve_top_k is assumed to return the (n_relevant, tweet_ids)
        # tuple that the docstring above promises.
        top_k = self.ranker.retrieve_top_k(ranked_docs, k)
        return top_k

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """

        term_to_indices = {}
        max_tf = 0
        query_glove_vec = np.zeros(shape=25)

        for idx, term in enumerate(query_as_list):
            if term in self.glove_dict:
                query_glove_vec += self.glove_dict[term]

            # Track the query positions of each term, known or unknown; the
            # size of a term's index set is its frequency in the query.
            term_to_indices.setdefault(term, set()).add(idx)
            if len(term_to_indices[term]) > max_tf:
                max_tf = len(term_to_indices[term])

        if query_as_list:
            query_glove_vec /= len(query_as_list)

        # A document is considered relevant only if it contains at least a
        # fraction p of the query's terms.
        p = 0.5
        min_num_of_words_to_relevant = int(len(query_as_list) * p)
        pre_doc_dict = {}
        pre_doc_dict_counter = Counter()

        relevant_docs = {}
        w_iq_square = 0
        for term, term_indices in term_to_indices.items():

            term_tf_idf = ((len(term_indices) / len(query_as_list)) *
                           self.calc_idf(term))
            w_iq_square += math.pow(term_tf_idf, 2)

            try:
                if term in self.inverted_index:
                    for tweet_id in self.inverted_index[term][1]:
                        pre_doc_dict_counter[tweet_id] += 1
                        if tweet_id not in pre_doc_dict:
                            # Cache per-tweet data once: the accumulated tf-idf
                            # numerator, the precomputed vector-norm denominator,
                            # the doc length, GloVe vector, and tweet date.
                            tf_idf_numerator = 0
                            tf_idf_denominator = math.sqrt(
                                self.document_dict[tweet_id][1])
                            tweet_doc_length = self.inverted_index.get_doc_length(
                                term, tweet_id)
                            glove_vec = self.document_dict[tweet_id][0]
                            tweet_date = self.inverted_index.get_tweet_date(
                                term, tweet_id)

                            pre_doc_dict[tweet_id] = [
                                tf_idf_numerator, tf_idf_denominator,
                                tweet_doc_length, glove_vec, tweet_date
                            ]

                        # Accumulate the document/query tf-idf dot product.
                        pre_doc_dict[tweet_id][
                            0] += self.inverted_index.get_tf_idf(
                                term, tweet_id) * term_tf_idf

                        if tweet_id not in relevant_docs and \
                                pre_doc_dict_counter[tweet_id] >= min_num_of_words_to_relevant:
                            relevant_docs[tweet_id] = pre_doc_dict[tweet_id]

            except Exception:
                print('term {} not found in posting'.format(term))

        return relevant_docs, query_glove_vec, math.sqrt(w_iq_square)
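
    # The third return value is the query's tf-idf norm, used downstream for
    # cosine normalization. For example, query weights [0.5, 0.25] give
    #
    #   >>> round(math.sqrt(0.5 ** 2 + 0.25 ** 2), 3)
    #   0.559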

    def calculate_tf(self, tweet_term_tuple):
        """
        calculates term frequency.
        :param tweet_term_tuple: tuple containing all information of the tweet of the term.
        :return:
        """
        # to calc normalize tf
        num_of_terms_in_doc = tweet_term_tuple[1]
        frequency_term_in_doc = tweet_term_tuple[2]
        tf = frequency_term_in_doc / num_of_terms_in_doc

        return tf
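
    # For example, under the tuple layout assumed here (slot 1 = number of
    # terms in the doc, slot 2 = raw frequency), a 20-term tweet containing
    # the term 4 times yields (assuming a constructed Searcher):
    #
    #   >>> searcher.calculate_tf((None, 20, 4))
    #   0.2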

    def calculate_idf_BM25(self, term_data):
        """
        calculates idf according to BM25 algorithm.
        :param term_data:
        :return:
        """
        n = self.number_of_docs
        df = term_data[0]
        idf = math.log(((n - df + 0.5) / (df + 0.5)) + 1)
        return idf

    def calc_idf(self, term):
        """
        calculates idf of term
        :param term: term
        :return:
        """
        # to calc idf
        n = self.number_of_docs
        # df = term_data[0]
        if term not in self.inverted_index:
            return 0
        df = self.inverted_index[term][0]
        idf = math.log10(n / df)
        return idf
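
To compare the two idf variants numerically (a standalone sketch): with
N = 1,000,000 documents and a term with df = 1,000, calc_idf gives
log10(1000) = 3.0, while the BM25 formula gives roughly 6.91:

import math

n, df = 1_000_000, 1_000
plain_idf = math.log10(n / df)                          # 3.0
bm25_idf = math.log(((n - df + 0.5) / (df + 0.5)) + 1)  # ~6.91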
Example #4
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use such as LSI, LDA, Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
        self.terms_searched = {}
        self.total_num_of_docs = parser.curr_idx

    ###############################################################################################
    #
    # ours
    # The term/document matrix assembled from the posting lists is the basis
    # for the helper functions below.

    # Recover absolute doc ids from gap-encoded posting entries: each entry's
    # first element is added to a running total (in place). Returns the list
    # of [doc_id, tf] pairs.
    def revocer_doc_ids(self, doc_id_tf_list):
        running_total = 0
        for entry in doc_id_tf_list:
            running_total += entry[0]
            entry[0] = running_total
        return doc_id_tf_list
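
    # A small decoding example (hypothetical input pairs, assuming a
    # constructed Searcher): gaps 5, 3, 10 become absolute doc ids 5, 8, 18,
    # and tf values pass through unchanged.
    #
    #   >>> searcher.revocer_doc_ids([[5, 2], [3, 1], [10, 4]])
    #   [[5, 2], [8, 1], [18, 4]]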

    # total_num_of_docs = N, the total number of documents in the corpus
    def _relevant_docs_from_posting(self, query_as_list, total_num_of_docs):
        """
        Loads the posting lists and collects the relevant documents per term.
        :param query_as_list: query tokens
        :return: ({term: [[tf, idf, doc_id], ...]}, relevant doc ids, file index)
        """
        terms_idf = {}
        doc_id_dict = {}
        query_as_list = self._parser.parse_all_text(
            ' '.join(query_as_list).lower())

        if self._model is not None:
            if isinstance(self._model, list):
                query_as_list_to_extend = []
                for model in self._model:
                    # A spell checker rewrites the query in place; the other
                    # models contribute expansion terms.
                    if isinstance(model, _SpellChecker):
                        query_as_list = model.improve_query(query_as_list)
                    else:
                        query_as_list_to_extend.extend(
                            model.improve_query(query_as_list))
                query_as_list = set(query_as_list_to_extend)

            else:
                try:
                    query_as_list = self._model.improve_query(query_as_list)
                except AttributeError:
                    print("Failed query expansion")


        for new_term in query_as_list:
            try:
                # Fall back to the lower-/upper-case variant if the term
                # itself is not in the index.
                if new_term not in self._indexer.term_indexer_dict:
                    if new_term.lower() in self._indexer.term_indexer_dict:
                        new_term = new_term.lower()
                    elif new_term.upper() in self._indexer.term_indexer_dict:
                        new_term = new_term.upper()

                if new_term in self._indexer.term_indexer_dict:
                    df = self._indexer.term_indexer_dict[new_term][0]
                    if df != 0:
                        terms_idf[new_term] = math.log2(
                            float(total_num_of_docs) / float(df))
                    else:
                        terms_idf[new_term] = 0

                    docs_list = self._indexer.term_indexer_dict[new_term][1]
                    doc_id_dict.update(dict(docs_list))
                    self.terms_searched[new_term] = dict(docs_list)

            except Exception:
                traceback.print_exc()

        doc_id_list = doc_id_dict.keys()
        final_dict = {}

        try:
            for term in query_as_list:
                if term in self.terms_searched:
                    idf = terms_idf[term]
                    for doc_id in doc_id_list:
                        if doc_id in self.terms_searched[term]:
                            tf = self.terms_searched[term][doc_id]
                            final_dict.setdefault(term, []).append(
                                [tf, idf, doc_id])
        except Exception:
            traceback.print_exc()

        # Also collect every term that appears in a candidate document, so
        # the ranker sees full document vectors.
        for doc_id in doc_id_list:
            for term in self._indexer.file_indexer_dict[doc_id]:
                tf = self._indexer.file_indexer_dict[doc_id][term]
                idf = math.log2(
                    float(total_num_of_docs) /
                    float(self._indexer.term_indexer_dict[term][0]))
                final_dict.setdefault(term, []).append([tf, idf, doc_id])

        return final_dict, doc_id_list, self._indexer.file_indexer_dict
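
    # Shape of the returned final_dict, with hypothetical values: each term
    # maps to [tf, idf, doc_id] triples, one per matching document, e.g.
    #
    #   {'covid':   [[3, 4.2, 17], [1, 4.2, 42]],
    #    'vaccine': [[2, 6.0, 17]]}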

    ######################################################################################################################################
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.

    def search(self, query, k=None):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, default to everything.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)

        final_dict, doc_id_list, file_indexer_dict = self._relevant_docs_from_posting(
            query_as_list, self.total_num_of_docs)

        ranked_docs_list, ranked_docs_dict = self._ranker.rank_relevant_doc(
            final_dict, doc_id_list, query_as_list, file_indexer_dict)

        ranked_docs_list_top_k = self._ranker.retrieve_top_k(
            ranked_docs_list, k)
        results_list_top_k = [
            self._parser.doc_idx_tweet_id[key]
            for key in ranked_docs_list_top_k
        ]

        return len(ranked_docs_list), results_list_top_k

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """
        relevant_docs = {}
        for term in query_as_list:
            posting_list = self._indexer.get_term_posting_list(term)
            for doc_id, tf in posting_list:
                df = relevant_docs.get(doc_id, 0)
                relevant_docs[doc_id] = df + 1
        return relevant_docs
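
A standalone sanity check of the counting logic (the posting lists are
stubbed; get_term_posting_list is assumed to yield (doc_id, tf) pairs, as
the loop above expects):

postings = {'covid': [('t1', 3), ('t2', 1)], 'vaccine': [('t2', 2)]}
relevant_docs = {}
for term in ['covid', 'vaccine']:
    for doc_id, tf in postings.get(term, []):
        relevant_docs[doc_id] = relevant_docs.get(doc_id, 0) + 1
assert relevant_docs == {'t1': 1, 't2': 2}  # 't2' matches both query terms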