Example

from nltk.corpus import wordnet

# module path assumed; Ranker is provided by the surrounding project
from ranker import Ranker

class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use, such as an LSI, LDA, or Word2Vec model.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE, AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model
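        # Note: parser and indexer are duck-typed here; the methods below rely
        # only on parser.parse_sentence(), indexer.inverted_idx,
        # indexer.postingDict, and indexer.config.get_cut_by()
        # (interface inferred from this file, not a documented contract).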

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return; defaults to all results.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        query_as_list = self._parser.parse_sentence(query)
        # expand the query with WordNet synonyms (returns a list of words)
        query_as_list = self.expand_query_wordnet(query_as_list)
        advance_query = {}  # key: term, value: tf of the term in the query
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        for term in query_as_list:
            if term in relevant_docs:
                advance_query[term] = query_as_list.count(term) / len(query_as_list)
            elif term.lower() in relevant_docs:
                advance_query[term.lower()] = query_as_list.count(term) / len(query_as_list)
        # key: doc_id, value: (number of query terms in the doc, [(term, count_in_doc), ...])
        relevant_doc_dict = self.get_relevant_doc_dict(relevant_docs)
        # keep at most the 2000 documents that match the most query terms
        relevant_doc_dict = dict(
            sorted(relevant_doc_dict.items(),
                   key=lambda item: item[1][0],
                   reverse=True)[:2000])
        if self._model is not None:
            ranked_docs = self._ranker.rank_relevant_docs(
                relevant_doc_dict, advance_query, self._indexer, self._model)
        else:
            ranked_docs = self._ranker.rank_relevant_docs(
                relevant_doc_dict, advance_query, self._indexer)
        n_relevant = len(ranked_docs)
        # honor k: return only the top-k results when requested
        if k is not None:
            ranked_docs = ranked_docs[:k]
        return n_relevant, ranked_docs
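
    # A minimal usage sketch (hypothetical objects; parser, indexer, and the
    # optional model come from the surrounding project):
    #
    #     searcher = Searcher(parser, indexer)             # no model loaded at query time
    #     n_relevant, tweet_ids = searcher.search("donald trump", k=10)
    #     print(n_relevant, tweet_ids[0])                  # id of the best-matching tweet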

    # feel free to change the signature and/or implementation of this function
    # or drop it altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        This function loads the posting list and count the amount of relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary of relevant documents mapping doc_id to document frequency.
        """

        relevant_docs = {}
        for term in query_as_list:
            try:  # collect the term's posting data
                inverted_index = self._indexer.inverted_idx
                posting_dict = self._indexer.postingDict
                try:
                    # exact-case lookup first, for terms like 'NILLI' or 'Donald Trump'
                    if inverted_index[term][1] > self._indexer.config.get_cut_by():
                        continue  # term is too frequent; skip it
                    term_data = inverted_index[term]
                    file_name = term_data[0][0]
                    term_line_in_posting = term_data[0][1]
                    origin_lines = posting_dict[file_name]
                    relevant_docs[term] = origin_lines[term_line_in_posting]
                except KeyError:
                    # fall back to the lower-cased form of the term
                    term_data = inverted_index[term.lower()]
                    file_name = term_data[0][0]
                    term_line_in_posting = term_data[0][1]
                    origin_lines = posting_dict[file_name]
                    relevant_docs[term.lower()] = origin_lines[term_line_in_posting]
            except Exception:
                pass  # neither the term nor its lowercase form is in the index
        return relevant_docs  # key: term, value: posting line (list of (count, doc_id) pairs)
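
    # Assumed on-disk index layout, inferred from the lookups above (values
    # are illustrative, not real project data):
    #
    #     indexer.inverted_idx["trump"] == [("postings_3", 17), 45210]
    #         # [(posting_file, line_in_file), document_frequency]
    #     indexer.postingDict["postings_3"][17] == [(3, "111"), (1, "222")]
    #         # one entry per doc: (count_of_term_in_doc, doc_id)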

    def get_relevant_doc_dict(self, relevant_docs):
        """
        Aggregates the per-term posting lines by document.
        :param relevant_docs: dict, key = term, value = [(count_in_doc, doc_id), ...]
        :return: dict, key = doc_id, value = (number of distinct query terms
                 appearing in the doc, [(term, count_in_doc), ...])
        """
        docs_dict = {}
        for term in relevant_docs:
            for term_count, doc_id in relevant_docs[term]:
                if doc_id in docs_dict:
                    # don't count the same term twice for a doc
                    # (guards against duplicate documents in the corpus)
                    already_counted = any(
                        term_in_doc[0] == term
                        for term_in_doc in docs_dict[doc_id][1])
                    if not already_counted:
                        docs_dict[doc_id] = (docs_dict[doc_id][0] + 1,
                                             docs_dict[doc_id][1] + [(term, term_count)])
                else:
                    docs_dict[doc_id] = (1, [(term, term_count)])
        return docs_dict
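
    # A worked example of the aggregation above (illustrative values):
    #
    #     relevant_docs = {"trump": [(3, "111"), (1, "222")],
    #                      "wall":  [(2, "111")]}
    #     get_relevant_doc_dict(relevant_docs) returns
    #         {"111": (2, [("trump", 3), ("wall", 2)]),
    #          "222": (1, [("trump", 1)])}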

    def expand_query_wordnet(self, query):
        """Expands the query with single-word WordNet synonyms found in the index."""
        expand_set = set()
        for term in query:
            for synset in wordnet.synsets(term):
                for lemma in synset.lemmas():
                    name = lemma.name()
                    if "_" in name:
                        continue  # skip multi-word lemmas
                    try:
                        # keep only synonyms that are neither too rare nor too
                        # frequent in the index (the lower bound of 10 is hardcoded)
                        df = self._indexer.inverted_idx[name][1]
                        if df < 10 or df > self._indexer.config.get_cut_by():
                            continue
                        expand_set.add(name)
                        break  # take at most one synonym per synset
                    except KeyError:
                        pass  # synonym not in the index
        for term in expand_set:
            if term not in query:
                query.append(term)
        return query
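
# A standalone demonstration of the WordNet lookup that expand_query_wordnet
# relies on. The helper below is illustrative, not part of the project API,
# and requires the nltk "wordnet" corpus to be downloaded.
if __name__ == "__main__":
    def single_word_synonyms(term):
        """Collects single-word WordNet synonyms for a term."""
        out = set()
        for synset in wordnet.synsets(term):
            for lemma in synset.lemmas():
                if "_" not in lemma.name():  # skip multi-word lemmas, as above
                    out.add(lemma.name())
        return out

    print(single_word_synonyms("happy"))  # e.g. {'happy', 'glad', 'felicitous', 'well-chosen'}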