Example #1
    def _lauch_engine(self):
        # Create a boolean, vectorial or probabilistic search engine
        if self.config['Research_engine']['type'] == 'vectorial':
            research_engine = Vectorial_search(
                reverse_index=self.reverse_index,
                similarity=self.config['Vectorial_search']['similarity'],
            )
            query_processor = Process_query('sources/common_words', 'vectorial')

        elif self.config['Research_engine']['type'] == 'boolean':
            research_engine = Boolean_search(
                reverse_index=self.reverse_index,
                p_norm=self.config['Boolean_search']['p_norm'],
                default_similarity=self.config['Boolean_search']['default_similarity']
            )
            query_processor = Process_query('sources/common_words', 'boolean')

        elif self.config['Research_engine']['type'] == 'probabilistic':
            research_engine = Probabilistic_search(
                reverse_index=self.reverse_index,
                rsv_relevant_method=self.config['Probabilistic_search']['rsv_relevant_method']
            )

            query_processor = Process_query('sources/common_words', 'probabilistic')
        else:
            raise ValueError('Unsupported research engine type!')

        max_results_number = self.config['Research_engine']['max_results_number']
        while True:
            query = raw_input('Enter your query: ')
            t0 = time.time()
            results = research_engine.do_search(query_processor.format_query(query))
            print len(results), 'results in', time.time()-t0, 'seconds'
            if max_results_number > 0 and len(results) > max_results_number:
                results = results[:max_results_number]
                print 'Printing only the first', max_results_number, 'results:\n'

            print 'document id \t score'
            print '-----------------------------'
            for (document_id, score) in results:
                print document_id, '\t\t', score
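
    # A minimal sketch (not from the original source) of the config shape this
    # method reads. The section and key names come from the lookups above; the
    # concrete values ('cosine', 2, 0.5, 'standard', 10) are illustrative
    # assumptions only:
    #
    #   config = {
    #       'Research_engine': {'type': 'vectorial', 'max_results_number': 10},
    #       'Vectorial_search': {'similarity': 'cosine'},
    #       'Boolean_search': {'p_norm': 2, 'default_similarity': 0.5},
    #       'Probabilistic_search': {'rsv_relevant_method': 'standard'},
    #   }
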
    def run_testing(self):
        print 'Launching tests!'
        print 'Loading documents...',

        reverse_index_builder = Reverse_index_builder(
            ponderation_method=self.config['Reverse_index']['ponderation'],
            index_type=self.config['Reverse_index']['index_type'],
            save_folder_path=self.config['Reverse_index']['save_folder_path']
        )

        reverse_index = reverse_index_builder.create_reverse_index('sources/cacm.all', 'sources/common_words')
        print ' Done'

        print 'Loading test data...',
        # {query: [answer1, answer2...]}
        self.query_answer = self._parse_queries_answers(self.queries_filename, self.answers_filename)
        print ' Done'

        print 'Initializing variables...',
        time_parsing_queries = 0.
        time_doing_researches = 0.
        precision = []
        recall = []
        r_measure = []
        f_measure = []
        average_precision = []

        if self.config['Research_engine']['type'] == 'vectorial':
            search_engine = Vectorial_search(reverse_index, self.similarity_method)
        elif self.config['Research_engine']['type'] == 'boolean':
            search_engine = Boolean_search(reverse_index, self.p_norm, self.default_similarity)
        elif self.config['Research_engine']['type'] == 'probabilistic':
            search_engine = Probabilistic_search(reverse_index, self.rsv_relevant_method)
        else:
            raise ValueError('Unsupported research engine type!')
        # The stop list is the common-words file, as in _lauch_engine.
        query_processor = Process_query(stop_list_filename='sources/common_words', format_type=self.config['Research_engine']['type'])

        print ' Done'

        t0 = time.time()
        print 'Let\'s get to it! (this may take 5-10 seconds)'
        for query in self.query_answer:
            expected_answers = self.query_answer[query]

            t_init = time.time()
            processed_query = query_processor.format_query(query)
            t_parse = time.time()
            time_parsing_queries += t_parse - t_init

            answers_with_score = search_engine.do_search(processed_query)
            # Keep only the document ids, dropping the scores.
            answers = [document_id for document_id, _score in answers_with_score]

            t_query = time.time()
            time_doing_researches += t_query - t_parse

            precision.append(self._compute_precision(answers, expected_answers))
            recall.append(self._compute_recall(answers, expected_answers))
            r_measure.append(self._compute_r_measure(answers, expected_answers))
            f_measure.append(self._compute_f_measure(precision[-1], recall[-1]))
            average_precision.append(self._compute_average_precision(answers, expected_answers))

        number_of_tests = float(len(self.query_answer))
        print 'Number of queries tested:', int(number_of_tests), 'in', round(time.time() - t0, 2), 'seconds'
        print 'Average time spent processing a query:', time_parsing_queries / number_of_tests, 'seconds,',
        print 'running the search:', time_doing_researches / number_of_tests, 'seconds'
        print 'Average total time per query:', (time_doing_researches + time_parsing_queries) / number_of_tests, 'seconds'
        print """
###################################
#      PERFORMANCE MEASURES       #
###################################"""
        print 'Max Precision:', max(precision), 'average:', sum(precision) / float(len(precision))
        print 'Max Recall:', max(recall), 'average:', sum(recall) / float(len(recall))
        print 'Max F-measure:', max(f_measure), 'average:', sum(f_measure) / float(len(f_measure))
        # E-measure is 1 - F-measure, so its minimum comes from the maximum F.
        print 'Min E-measure:', 1 - max(f_measure), 'average:', 1 - sum(f_measure) / float(len(f_measure))
        print 'Max R-measure:', max(r_measure), 'average:', sum(r_measure) / float(len(r_measure))
        print 'Mean Average Precision (MAP):', sum(average_precision) / float(len(average_precision))
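
The _compute_* helpers called by run_testing are not part of this excerpt. As a point of reference, here is a minimal sketch of what they typically look like under the standard IR definitions (set-based precision and recall, precision-at-R for the R-measure, harmonic-mean F-measure, and average precision over the ranks of the relevant hits); the real implementations may differ.

    def _compute_precision(self, answers, expected_answers):
        # Fraction of returned documents that are relevant.
        if not answers:
            return 0.
        return len(set(answers) & set(expected_answers)) / float(len(answers))

    def _compute_recall(self, answers, expected_answers):
        # Fraction of relevant documents that were returned.
        if not expected_answers:
            return 0.
        return len(set(answers) & set(expected_answers)) / float(len(expected_answers))

    def _compute_r_measure(self, answers, expected_answers):
        # Precision over the first R results, R = number of relevant documents.
        r = len(expected_answers)
        if r == 0:
            return 0.
        return len(set(answers[:r]) & set(expected_answers)) / float(r)

    def _compute_f_measure(self, precision, recall):
        # Harmonic mean of precision and recall (E-measure is 1 - F).
        if precision + recall == 0:
            return 0.
        return 2 * precision * recall / (precision + recall)

    def _compute_average_precision(self, answers, expected_answers):
        # Average of precision@k over every rank k holding a relevant document.
        hits, precisions = 0, []
        for rank, document_id in enumerate(answers, 1):
            if document_id in expected_answers:
                hits += 1
                precisions.append(hits / float(rank))
        return sum(precisions) / len(precisions) if precisions else 0.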