Example #1
    def fn_search(self, query, expand_mode, stem_mode, folder_path, summ_mode):
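        # Run a single search: a regular query, a document summary (popular sentences) or an expanded query, then display the results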
        path = folder_path + path_delimiter + "stop_words.txt"
        Parser.set_stop_words_file(path)
        Parser.set_stemmer_mode(stem_mode)

        start_search = time.time()  # Time elapsed from query received until results come back
        searcher = Searcher.Searcher()

        if expand_mode == 0 and summ_mode == 0:  # regular, manual query search
            results = searcher.search(stem_mode, folder_path, final_dictionary,
                                      cache_dictionary, documents_dictionary,
                                      query)

        elif summ_mode == 1:
            doc_id = query
            results = searcher.find_popular_sentences(
                stem_mode, folder_path, final_dictionary, cache_dictionary,
                documents_dictionary, doc_id,
                get_text_from_document(doc_id, folder_path))
        else:  # We need to expand the query
            results = searcher.expand(stem_mode, folder_path, final_dictionary,
                                      cache_dictionary, documents_dictionary,
                                      query)

        file_num = len(results)  # The number of results
        end_search = time.time()
        total_time = end_search - start_search
        self.display_results(total_time, file_num, results)
Example #2
def handle_files(file_list_ref, documents_dictionary_ref):
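    # Parse (and optionally stem) every document in file_list_ref, merge the per-document term dictionaries, and write a temporary posting file for this part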
    terms_dictionary = {}

    if stem_mode:
        # Take each document's text from the list, then parse and stem it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_stemming = Stemmer.stemWithCache(Parser.start(value[1]))
            # Update the document's parameters
            __update_and_merge_dictionaries(doc_id, file_name,
                                            terms_dictionary,
                                            documents_dictionary_ref,
                                            after_stemming)
            # Merge the per-document dictionaries into one dictionary for the whole part
    else:
        # Take each document's text from the list and only parse it
        for value in file_list_ref:
            doc_id = value[0]
            file_name = value[2]
            after_parse = Parser.start(value[1])
            # Update the document's parameters
            __update_and_merge_dictionaries(doc_id, file_name,
                                            terms_dictionary,
                                            documents_dictionary_ref,
                                            after_parse)
            # Merge the per-document dictionaries into one dictionary for the whole part

    # Create a new temporary posting file for this part
    Indexer.create_temp_posting_file(terms_dictionary)
Example #3
    def data_set_Path(self, path):
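        # Store the corpus and stop-words paths and register the stop-words file with the Parser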
        global corpus_path
        print("Received corpus folder..")
        corpus_path = path + "/corpus"

        print("Received stopwords filename..")
        global __stopwords_path
        __stopwords_path = path + "/stop_words.txt"
        Parser.set_stop_words_file(__stopwords_path)
Example #4
 def start_evaluating_qry(self, searcher, q_file_path, semantic_model,
                          str_single_qry, mode_semantic, stemmer):
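     # Parse either the single query string or every <top> query in the query file, then run the searcher on the parsed terms and titles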
     self.init_helpers()
     qry_parser = Parser(self.hash_stopwords, self.hash_keywords_months,
                         self.hash_keywords_prices, self.hash_punc,
                         self.hash_punc_middle, self.hash_alphabet,
                         self.stemmer, self.hash_qry_stopwords)
     if str_single_qry == '':
         file_path = q_file_path
         skip_one = 0
         with open(file_path, 'r') as file:
             q_counter = 0
             data = file.read()
             data_list = data.split("<top>")
             del data
             for qry in data_list:
                 if skip_one == 1:
                     q_counter += 1
                     qry = "<top>" + qry
                     qry_parser.start_parse(qry, 0, semantic_model, 0,
                                            mode_semantic, stemmer)
                 else:
                     skip_one = 1
     else:
         qry_parser.start_parse(str_single_qry, 0, semantic_model, 1,
                                mode_semantic, stemmer)
     hash_titles = qry_parser.hash_titles
     hash_qry_terms = qry_parser.hash_terms
     # searcher.ranker.set_params(5, 0.05, 5, 1)
     searcher.search(hash_qry_terms, hash_titles)
Example #5
    def fn_run_query_file(self, query_file_path, stem_mode, folder_path):
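        # Run all queries from a query file: set the ranking weights, search with multi_search, and display the results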
        start_search = time.time()  # Time elapsed from query received until results come back
        path = folder_path + path_delimiter + "stop_words.txt"
        Parser.set_stop_words_file(path)
        Parser.set_stemmer_mode(stem_mode)

        # Both stemming modes currently use the same ranking weights
        Searcher.idf_weight = 0.8
        Searcher.df_weight = 0.7
        Searcher.denominator = 1.3
        Ranker.bm25_weight = 0.1
        Ranker.cos_sim_weight = 0.9
        Ranker.b = 0.6
        Ranker.k = 1.85
        Ranker.bm25_lambda = 0.25
        Ranker.bm25_idf = 0.5

        searcher = Searcher.Searcher()
        results = searcher.multi_search(
            stem_mode, folder_path, final_dictionary, cache_dictionary,
            documents_dictionary, Reader.extract_queries(query_file_path))

        file_num = len(results)  # The number of results
        end_search = time.time()
        total_time = end_search - start_search

        self.display_query_file_results(total_time, file_num, results)
Example #6
 def parse_file(self, file_path):
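     # Parse a single corpus file and pickle its term, city and document hashes into the Engine_Data folders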
     if len(self.hash_stopwords) == 0:
         self.init_helpers()
     global f_counter
     p = None
     file_terms = {}
     p_name = "#NUM_" + str(f_counter.value)
     with f_counter.get_lock():
         f_counter.value += 1
     f_start = time.time()
     p = Parser(self.hash_stopwords, self.hash_keywords_months,
                self.hash_keywords_prices, self.hash_punc,
                self.hash_punc_middle, self.hash_alphabet, self.stemmer,
                None)
     self.get_doc_from_file(file_path, p)
     for c in self.final_solution:
         while c in p.hash_terms:
             del p.hash_terms[c]
     for term in self.hash_stopwords:
         while term in p.hash_terms or term.upper() in p.hash_terms:
             del p.hash_terms[term]
     if '' in p.hash_terms:
         del p.hash_terms['']
     if len(p.hash_terms) > 1:
         with open(
                 self.post_path +
                 '/Engine_Data/temp_hash_objects/file_hash_' + p_name +
                 '.pkl', 'wb') as output:
             pickle.dump(p.hash_terms, output, pickle.HIGHEST_PROTOCOL)
         with open(
                 self.post_path +
                 '/Engine_Data/Cities_hash_objects/hash_cities' + p_name +
                 '.pkl', 'wb') as output:
             pickle.dump(p.hash_cities, output, pickle.HIGHEST_PROTOCOL)
         with open(
                 self.post_path +
                 '/Engine_Data/Docs_hash_objects/hash_docs' + p_name +
                 '.pkl', 'wb') as output:
             pickle.dump(p.hash_docs, output, pickle.HIGHEST_PROTOCOL)
     file_terms = {}
     self.vocabulary = {}
     f_end = time.time()
     time_to_file = f_end - f_start
     if f_counter.value % 20 == 0:
         p_c = float(f_counter.value)
         p_c = int(p_c * 100 / self.number_of_files)
         if p_c != self.percent:
             self.percent = p_c
             self.print_prog(p_c)
Example #7
    def multi_search(self, stem_mode, folder_path, final_dictionary,
                     cache_dictionary, documents_dictionary, query_list):
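        # Run every query tuple (number, title, description, narrative) from the query file and return a list of (query number, ranked results) pairs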
        if final_dictionary is None or cache_dictionary is None or documents_dictionary is None:
            return []
        else:
            list_for_return = []
            self.__set_folder_path(folder_path)
            self.__set_stem_mode(stem_mode)

            for query_tuple in query_list:  # iterate over the queries in the given query file
                # Unpack the query tuple: number, title, description and narrative
                query_num = query_tuple[0]
                title = query_tuple[1]
                description = query_tuple[2]
                narrative = query_tuple[3]

                narrative_sentences = Parser.sentences(narrative)
                new_narrative = ''
                for sentence in narrative_sentences:
                    if "not relevant" not in sentence:
                        new_narrative += " " + sentence
                    elif stem_mode and "are relevant" in sentence:
                        new_narrative += " " + sentence

                narrative = new_narrative
                # Define the words we want to avoid when parsing and stemming
                terms_dictionary = self.__parse_stem(title)
                description_dictionary = self.__parse_stem(description + ' ' +
                                                           narrative)
                if self.stem_mode:
                    words_to_avoid = {
                        "relev", "document", "discuss", "consid", "i.e", "issu"
                    }
                else:
                    words_to_avoid = {
                        "relevant", "documents", "document", "discuss",
                        "discussing", "information", "considered", "i.e",
                        "issues"
                    }

                description_dictionary_other = {}
                for key, value in description_dictionary.items():
                    if key in final_dictionary and key not in words_to_avoid:
                        # idf * query_tf
                        term_idf = final_dictionary[key][0][0]
                        description_dictionary_other[key] = (self.idf_weight * term_idf) + \
                                                            (self.df_weight * value)

                results_num = int(
                    len(description_dictionary_other) / self.denominator)
                description_dictionary_other = dict(
                    Counter(description_dictionary_other).most_common(
                        results_num))

                for key, value in description_dictionary_other.items():
                    if key in terms_dictionary:
                        terms_dictionary[key] += description_dictionary_other[
                            key]
                    else:
                        terms_dictionary[key] = description_dictionary_other[
                            key]

                dictionary = self.ranker.rank(
                    self.folder_path, final_dictionary, cache_dictionary,
                    documents_dictionary, terms_dictionary,
                    REGULAR_RESULTS_NUMBER, stem_mode)
                list_for_return.append((query_num, list(dictionary.keys())))

        return list_for_return
Example #8
 def __parse_stem(self, text):
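     # Parse the text with Parser, and also stem it when stem mode is on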
     if self.stem_mode:  # stem mode is True
         after_dictionary = Stemmer.stemWithCache(Parser.start(text))
     else:  # stem mode is False
         after_dictionary = Parser.start(text)
     return after_dictionary
Example #9
    def find_popular_sentences(self, stem_mode, folder_path, final_dictionary,
                               cache_dictionary, documents_dictionary, doc_id,
                               text):
        # If the dictionaries are empty, we can't continue
        if final_dictionary is None or cache_dictionary is None or documents_dictionary is None:
            return []
        else:
            self.__set_folder_path(folder_path)
            self.__set_stem_mode(stem_mode)

            # Parse and stem the text again, because fetching the values from the posting file is slow
            after_dictionary = self.__parse_stem(text)

            # We want to normalize the values with the max_tf value
            max_tf = documents_dictionary[doc_id][0]

            # Here we update the tf of the document
            for key, value in after_dictionary.items():
                after_dictionary[key] = value / max_tf

            list_of_sentence = Parser.sentences(text)  # Here we get the list of sentences from Parser

            # Here we initialize the dictionary of sentences
            sentence_dictionary = {}
            for sentence_index in range(0, len(list_of_sentence)):
                sentence_dictionary[sentence_index] = 0

            term_with_after_dictionary = {}  # maps each sentence index to its dictionary after parsing and stemming
            sentence_index = 0
            for sentence in list_of_sentence:  # for every sentence, we do the following steps
                terms_dictionary = self.__parse_stem(sentence)

                # We delete the terms that are not in the final dictionary
                keys_to_delete = []
                for term in terms_dictionary.keys():
                    if term not in final_dictionary:
                        keys_to_delete.append(term)
                for term in keys_to_delete:
                    terms_dictionary.pop(term)

                # Here we save the dictionary after parsing and stemming for later uses
                term_with_after_dictionary[sentence_index] = terms_dictionary
                sentence_index += 1

            # For every sentence we calculate the rank value with the formula tf*idf*freq
            for sentence_index, terms_dictionary in term_with_after_dictionary.items():
                if terms_dictionary is not None:
                    for term, value in terms_dictionary.items():
                        freq = terms_dictionary[term]
                        idf = final_dictionary[term][0][0]
                        tf = after_dictionary[term]
                        sentence_dictionary[sentence_index] += tf * idf * freq

            # Here we choose only the SENTENCE_NUMBER of the best sentences
            list_of_best_sentence = list(
                dict(
                    Counter(sentence_dictionary).most_common(
                        SENTENCE_NUMBER)).keys())

            # for each key in list_of_best_sentence we update the value to be the sentence
            sentence_index = 0
            for key in list_of_best_sentence:
                list_of_best_sentence[sentence_index] = list_of_sentence[key]
                sentence_index += 1

            return list_of_best_sentence
Example #10
    def Mega_Test(self, query_file_path, stem_mode, folder_path):
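        # Grid-search the BM25 parameters k and b against a relevance-judgment csv and append each configuration's score to a results csv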

        path = folder_path + path_delimiter + "stop_words.txt"
        Parser.set_stop_words_file(path)
        Parser.set_stemmer_mode(stem_mode)
        searcher = Searcher.Searcher()

        results_dict = {}
        # The dictionary containing each query id (key) and the RELEVANT documents (value, set)
        query_rel_docs = {}
        file_types = (("Comma Separated Value Document", "*.csv"), )
        rel_doc_path = askopenfilename(title="Choose result csv file",
                                       filetypes=file_types)
        file_types = (("Any file", "*.*"), )
        save_path = asksaveasfilename(
            title="Choose where to save BM25 results",
            filetypes=file_types,
            initialfile="mega_test.csv")
        with open(rel_doc_path) as rel_file:
            docs = rel_file.read()
        docs = docs.split('\n')
        docs = docs[:-1]

        for entry in docs:
            fields = entry.split(',')
            qid = fields[0]
            doc = fields[1]
            if qid in query_rel_docs:
                query_rel_docs[qid].add(doc)
            else:
                query_rel_docs[qid] = set()
                query_rel_docs[qid].add(doc)

        Searcher.denominator = 1.3  # Range [1 : 4]
        Searcher.df_weight = 0.7  # Range (0 : 1]
        Searcher.idf_weight = 0.2
        Ranker.bm25_weight = 0.05
        Ranker.cos_sim_weight = 0.95  # doesn't need a loop
        Ranker.bm25_k = 1.2  # Range: [1.2 - 2.0]
        Ranker.bm25_b = 0.1  # Range: 0.45-1.0
        Ranker.bm25_lambda = 0.2  # Range: 0-1.0

        columns = 'bm25 weight, cos sim weight, bm25 k, bm25 b, bm25 lambda, Denominator, searcher idf, searcher df, score\n'
        with open(save_path, 'a') as srp:
            srp.write(columns)

        while Ranker.bm25_k <= 2.01:
            Ranker.bm25_b = 0.1
            while Ranker.bm25_b <= 1.01:
                results = searcher.multi_search(
                    stem_mode, folder_path, final_dictionary, cache_dictionary,
                    documents_dictionary,
                    Reader.extract_queries(query_file_path))
                score = 0
                for entry in results:
                    qid = entry[0]
                    rel_set = query_rel_docs[qid]
                    returned_set = set(entry[1])
                    intersection_set = rel_set.intersection(returned_set)
                    score += len(intersection_set)

                result_score = "%f, %f, %f, %f, %f, %f, %f, %f, %d\n" % \
                               (Ranker.bm25_weight, Ranker.cos_sim_weight, Ranker.bm25_k,
                                Ranker.bm25_b, Ranker.bm25_lambda,
                                Searcher.denominator, Searcher.idf_weight, Searcher.df_weight, score)
                with open(save_path, 'a') as srp:
                    srp.write(result_score)
                Ranker.bm25_b += 0.1
            Ranker.bm25_k += 0.1
Example #11
 def startSearch(self, stemBool):
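     # Set the stemmer mode for the indexer and parser, then start indexing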
     global stem_mode
     stem_mode = stemBool
     Indexer.set_stemmer_mode(stemBool)
     Parser.set_stemmer_mode(stemBool)
     self.run()  #begin indexing!
Example #12
    def find(self, query, stem_mode):
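        # Expand the query using its Wikipedia page: weight linked terms that also appear in the page summary, and return the top terms together with the query itself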

        total_value = 0
        for x in range(0, 5):  # We want to use this value for the weights
            total_value += x

        sum_of_df = 0
        wiki_wiki = wikipediaapi.Wikipedia('en')  # Which language we want to search the term for
        page_py = wiki_wiki.page(query)  # Define the query in the file

        query_dictionary = {}  # The dictionary we will return to the user
        if page_py.exists():
            line = page_py.summary  # Here we collect the summary about the page in wiki
            if len(line) < 300:  # If wiki didn't return a specific term, we ask for the sections
                line = print_sections(page_py.sections)
            if stem_mode:
                stop_set = {'disambigu'}  # Popular words we want to avoid
                query_after = Stemmer.stemWithCache(Parser.start(query))
                terms_dictionary = Stemmer.stemWithCache(Parser.start(line))
            else:
                stop_set = {'Disambiguation'}  # Popular words we want to avoid
                query_after = Parser.start(query)
                terms_dictionary = Parser.start(line)

            concept = {}
            links = page_py.links  # Here we collect the links from the page in wiki
            for title in sorted(links.keys()):
                if stem_mode:
                    term = Stemmer.stemWithCache(
                        Parser.start(links[title].title))
                else:
                    term = Parser.start(links[title].title)

                for t, value in term.items():  # For each term in the summary dictionary, we need to check the values
                    if links[title].ns == 0 and t in terms_dictionary and \
                            t not in query_after and t not in stop_set:
                        if t not in concept:
                            concept[t] = value
                        else:
                            concept[t] += value  # we want to add the value (the df) to the dictionary

            # Here we ask only for most common query results
            query_dictionary = dict(
                Counter(concept).most_common(number_of_results))
            for term, value in query_dictionary.items():
                sum_of_df += value

            for term, value in query_dictionary.items():
                positive_value = int(total_value * value / sum_of_df) + 1
                if positive_value == 0:
                    positive_value = 1
                query_dictionary[term] = positive_value
            if len(query_after) != 0:
                query = list(query_after.keys())[0]
        else:
            print("Invalid query")

        query_dictionary[query] = number_of_results
        return query_dictionary