Example #1
    def test_cache_all_data(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRTFIDF.cache_all_data()
        IRLog.get_instance().stop_log()
Example #2
    def get_summary_and_description_tfidf_squared_length(self):
        from ir_tfidf import IRTFIDF
        if self.__summary_squared_length is None or \
                self.__description_squared_length is None:
            summary, description = self.get_summary_and_description_tfidf()
            self.__summary_squared_length = \
                IRTFIDF.get_squared_length(summary)
            self.__description_squared_length = \
                IRTFIDF.get_squared_length(description)
        return (self.__summary_squared_length,
                self.__description_squared_length)
Example #3
    def test_batch_generate_tfidf(self):
        #import sys
        #sys.path.append('../bin/')
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRTFIDF.batch_generate_tfidf()
        IRLog.get_instance().stop_log()
Example #4
def server_cache(msg, res):
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRLog.get_instance().println('Server is caching data')
    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
Example #5
    def query(cls, summary, description, top_n):
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        summary_bow, description_bow = \
            IRTermCount.calculate_term_count(summary, description)
        summary_tfidf, description_tfidf = \
            IRTFIDF.calculate_tfidf_for_report_termcount(summary_bow,
                                                         description_bow)
        similarities = \
            IRTFIDF.get_top_n_similarity_over_all(summary_tfidf,
                                                  description_tfidf,
                                                  top_n)
        return similarities
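A usage note: get_top_n_similarity_over_all presumably scores every stored report against the query's tfidf vectors and keeps the best top_n. A self-contained sketch of that ranking step over plain term->weight dicts (an illustration under that assumption, not the library's implementation):

import heapq
import math

def cosine(vec_a, vec_b):
    # cosine similarity of two sparse term->weight dicts
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.items())
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

def top_n_similar(query_vec, candidates, top_n):
    # rank candidate vectors by similarity to the query; keep the best top_n
    scored = ((bug_id, cosine(query_vec, vec))
              for bug_id, vec in candidates.items())
    return heapq.nlargest(top_n, scored, key=lambda pair: pair[1])

print(top_n_similar({'firefox': 1.0, 'crash': 0.5},
                    {1: {'firefox': 1.0}, 2: {'chrome': 1.0}}, 1))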
Example #6
    def compare_and_print_tfidf(cls, title_a, report_a,
                                title_b, report_b):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        summary_field_name = IRConfig.get_instance().get('bug_summary_name')
        description_field_name = IRConfig.get_instance().get(
            'bug_description_name')
        summary_a, description_a = report_a.get_summary_and_description_tfidf()
        summary_b, description_b = report_b.get_summary_and_description_tfidf()
        IRLog.get_instance().println(
            '[TFIDF][Summary][%s][%s]' % (title_a, title_b))
        IRTFIDF.show_dict_compare(summary_a, summary_b, summary_field_name)
        IRLog.get_instance().println(
            '[TFIDF][Description][%s][%s]' % (title_a, title_b))
        IRTFIDF.show_dict_compare(description_a, description_b,
                                  description_field_name)
Example #7
    def similarity_over_all(self):
        """Calculate similarity between bug (summary, description) over
         all.

        Returns:
            dict, {bug_id -> [score, summary_score, description_score, stacktrace_score]}
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        search_time_span = 2 * 3600 * 24 * 365  # two years, in seconds

        bug_id_name = IRConfig.get_instance().get('bug_id_name')

        create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
        product_name = IRConfig.get_instance().get('bug_product_name')

        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {
                '$gt': self.get_create_ts() - search_time_span
            },
            bug_id_name: {
                '$nin': self.__exclude_report_ids
            }
        })
        result = {}
        logger.println('Comparing with %d reports.' % reports2scan.count())

        # debug output
        print self.__summary_text
        print self.__description_text

        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == self.get_dummy_bug_id():
                continue
            # we don't want to load the stacktrace when self.__stacktrace is
            # None, so we create the other report and fill in its info manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            # if self.__stacktrace is empty, we don't need to do this
            if self.get_stacktrace() is not None and \
                    self.get_stacktrace().__len__() > 0:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                    bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
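For reference, the filter passed to find above keeps only reports of the same product, created within the two-year search_time_span, and not among the excluded ids. With the config lookups resolved it has this shape (field names and values below are illustrative, not actual config values):

candidate_filter = {
    'product': 'firefox',
    'create_ts': {'$gt': 1300000000 - 2 * 3600 * 24 * 365},
    'bug_id': {'$nin': [100001, 100002]},
}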
Example #8
    def test_calculate_tfidf_for_report_termcount_bidf(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRConfig.get_instance().set('tfidf_algorithm', 'bidf')
        summary = {'firefox':5, 'chrome':12}
        description = {'max':10, 'min':30, 'fix':10}
        summary_tfidf, description_tfidf = \
            IRTFIDF.calculate_tfidf_for_report_termcount(summary, description)
        IRLog.get_instance().println('Summary')
        IRTFIDF.show_dict_compare(summary_tfidf, summary_tfidf)
        IRLog.get_instance().println('Description')
        IRTFIDF.show_dict_compare(description_tfidf, description_tfidf)
        IRLog.get_instance().stop_log()
Example #9
    def test_tfidf_asm_similarity(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF
        
        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        
        vec_a = {'firefox':1, 'chrome':1}
        vec_b = {'firefox':1, 'chrome':1, 'ie':1}
        vec_c = {'firefox':1, 'windows':1, 'linux':1}

        delta = 0.0001
        assert abs(1.0 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_b)) < delta
        assert abs(0.5 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_c)) < delta
        assert IRTFIDF.tfidf_asm_similarity(vec_a, vec_b) > \
                IRTFIDF.tfidf_asm_similarity(vec_a, vec_b, None, ['ie'], 100)
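The expected values suggest tfidf_asm_similarity is asymmetric: it measures how much of the first vector's weight is carried by terms that also appear in the second (vec_b covers both of vec_a's terms, hence 1.0; vec_c covers one of the two, hence 0.5). A minimal sketch of that reading, ignoring the squared-length, penalty-term and weight parameters of the real signature:

def asm_similarity(vec_a, vec_b):
    # fraction of vec_a's squared weight carried by terms also in vec_b
    total = float(sum(w * w for w in vec_a.values()))
    if total == 0.0:
        return 0.0
    covered = sum(w * w for t, w in vec_a.items() if t in vec_b)
    return covered / total

assert abs(asm_similarity({'firefox': 1, 'chrome': 1},
                          {'firefox': 1, 'chrome': 1, 'ie': 1}) - 1.0) < 1e-9
assert abs(asm_similarity({'firefox': 1, 'chrome': 1},
                          {'firefox': 1, 'windows': 1, 'linux': 1}) - 0.5) < 1e-9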
Example #10
    def test_get_squared_length(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary = {'firefox':0.4, 'chrome':0.6}
        assert abs(IRTFIDF.get_squared_length(summary) - 0.52) < 0.00001
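The expected value follows directly: 0.4**2 + 0.6**2 = 0.16 + 0.36 = 0.52, which suggests get_squared_length is simply the sum of squared weights. A one-line re-implementation for reference (an illustration, not the library's code):

def squared_length(vec):
    # sum of squared weights of a sparse term->weight vector
    return sum(w * w for w in vec.values())

assert abs(squared_length({'firefox': 0.4, 'chrome': 0.6}) - 0.52) < 1e-9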
Example #11
    def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms=None):
        """Get the best term which has most entropy in diff.
            

        Args:
            diff: [(set, set)], generated by get_all_reports_difference

        Retruns:
            str, The term
        """

        termcount = {}
        max_score = -1.0
        max_score_term = None
        # count the weighted occurrence of each term
        total_score = 0.0
        for index, delta in enumerate(diff):
            total_score += sim_bug_ids[index][1][0]
            # only score terms from the second set of each delta
            for term in delta[1]:
                if penalty_terms is not None and term in penalty_terms:
                    continue
                if term not in termcount:
                    termcount[term] = 0.0
                termcount[term] += sim_bug_ids[index][1][0]
        # calculate each term's score and pick the best
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount
        from ir_tfidf import IRTFIDF
        description_name = IRConfig.get_instance().get('bug_description_name')
        # debug use
        scoreboard = []
        # /debug use
        for term in termcount:
            ig_score = -2.0 * abs(float(termcount[term]) / total_score -
                                  0.5) + 1
            idf = IRTFIDF.get_unit_idf(IRDocumentCount.get_documentcount(term, \
                    description_name))
            score = ig_score * idf
            scoreboard.append((term, score, ig_score, idf))
            if score > max_score:
                max_score = score
                max_score_term = term
        scoreboard.sort(key=lambda entry: entry[1], reverse=True)
        from ir_log import IRLog
        IRLog.get_instance().println(
            'Candidate keywords: %s' %
            ','.join(['word', 'score', 'ig_score', 'idf']))
        IRLog.get_instance().println('\n'.join(
            ','.join([t[0], str(t[1]), str(t[2]), str(t[3])])
            for t in scoreboard[:10]))
        return max_score_term
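The ig_score above is a triangular function of the term's weighted fraction p = termcount[term] / total_score: the value -2 * |p - 0.5| + 1 peaks at 1.0 when p = 0.5 and falls to 0 at p = 0 or p = 1, so terms that split the similar reports evenly score highest, scaled by idf. A standalone sketch of the per-term score (with the idf passed in directly instead of looked up through IRDocumentCount):

def term_score(weighted_count, total_weight, idf):
    # -2 * |p - 0.5| + 1, scaled by idf, where p is the weighted fraction
    p = float(weighted_count) / total_weight
    ig = -2.0 * abs(p - 0.5) + 1.0
    return ig * idf

# a term covering half of the weighted mass maximizes the ig factor
assert abs(term_score(5.0, 10.0, 2.0) - 2.0) < 1e-9
assert term_score(1.0, 10.0, 2.0) < term_score(5.0, 10.0, 2.0)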
Example #12
    def test_show_dict_compare(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        
        summary_a, description_a = IRTFIDF.get_tfidf_of_bug(100000)
        summary_b, description_b = IRTFIDF.get_tfidf_of_bug(100200)
        IRLog.get_instance().println('Summary 100000 vs 100200')
        IRTFIDF.show_dict_compare(summary_a, summary_b, summary_name)
        IRLog.get_instance().println('Description 100000 vs 100200')
        IRTFIDF.show_dict_compare(description_a, description_b)
        IRLog.get_instance().println('Summary 100000 vs 100000')
        IRTFIDF.show_dict_compare(summary_a, summary_a)
        IRLog.get_instance().println('Description 100000 vs 100000')
        IRTFIDF.show_dict_compare(description_a, description_a, description_name)
Example #13
    def __update_summary_and_description_tfidf_from_termcount(self):
        from ir_tfidf import IRTFIDF

        summary_termcount, description_termcount = \
                self.get_summary_and_description_termcount()
        summary_tfidf, description_tfidf = \
            IRTFIDF.calculate_tfidf_for_report_termcount(summary_termcount,
                                                         description_termcount)
        if self.__summary_tfidf is None:
            self.__summary_tfidf = summary_tfidf
        if self.__description_tfidf is None:
            self.__description_tfidf = description_tfidf
Example #14
    def test_get_tfidf_of_bug(self):
        #import sys
        #sys.path.append('../bin/')
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTFIDF.get_tfidf_of_bug(100000)
        IRLog.get_instance().println('Summary tfidf: %s' % (str(summary)))
        IRLog.get_instance().println('Description tfidf: %s' % (str(description)))
        IRLog.get_instance().stop_log()
Example #15
    def get_tfidf(self):
        if self.__tfidf is None:
            from ir_config import IRConfig
            from ir_mongodb_helper import IRMongodbHelper
            from ir_tfidf import IRTFIDF
            description_name = IRConfig.get_instance().get(
                'bug_description_name')
            tfidf_collection = IRMongodbHelper.get_instance().get_collection(
                'bug_db_name', 'bug_tfidf_collection_name', False)
            bug_count = tfidf_collection.count()

            self.__tfidf = IRTFIDF.calculate_tfidf(
                self.get_termcount(), description_name, bug_count, None,
                'tfidf')
        return self.__tfidf
Example #16
    def get_summary_and_description_tfidf(self):
        if self.__bug_id is None:
            if self.__summary_tfidf is None or \
                    self.__description_tfidf is None:
                self.__update_summary_and_description_tfidf_from_termcount()
            return [self.__summary_tfidf, self.__description_tfidf]
        else:
            if self.__allow_cache and \
                    self.__summary_tfidf is not None and \
                    self.__description_tfidf is not None:
                return [self.__summary_tfidf, self.__description_tfidf]
            from ir_tfidf import IRTFIDF
            summary_tfidf, description_tfidf = \
                IRTFIDF.get_tfidf_of_bug(self.__bug_id)
            if self.__allow_cache:
                self.__summary_tfidf, self.__description_tfidf = \
                    summary_tfidf, description_tfidf
            return [summary_tfidf, description_tfidf]
Example #17
    def test_calculate_tfidf_for_report_termcount_tfidf(self):
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        
        summary = {'firefox':5, 'chrome':12}
        description = {'max':10, 'min':30, 'fix':10}
        summary_tfidf, description_tfidf = \
            IRTFIDF.calculate_tfidf_for_report_termcount(summary, description)
        summary_sum = 0.0
        for tfidf in summary_tfidf.values():
            summary_sum += tfidf ** 2
        description_sum = 0.0
        for tfidf in description_tfidf.values():
            description_sum += tfidf ** 2
        # each tfidf vector should have (approximately) unit squared length
        assert (summary_sum - 1.0) ** 2 < 0.00001
        assert (description_sum - 1.0) ** 2 < 0.00001
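These assertions only hold if calculate_tfidf_for_report_termcount L2-normalizes each field's vector to unit length. A minimal sketch of that normalization step (a plain re-implementation for illustration, not the library's code):

import math

def l2_normalize(vec):
    # scale a sparse term->weight dict to unit Euclidean length
    norm = math.sqrt(sum(w * w for w in vec.values()))
    if norm == 0.0:
        return dict(vec)
    return dict((term, w / norm) for term, w in vec.items())

unit = l2_normalize({'max': 10, 'min': 30, 'fix': 10})
assert abs(sum(w * w for w in unit.values()) - 1.0) < 1e-9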
Example #18
    def do_test_over_file(self, filename):
        """Do test over the file.

        Args:
            filename: str, the input file generated by
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool(
            'remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_tot_recall = 0.0
        dup_num = 0
        test_num = 0

        infile = open(filename, 'r')
        for line in infile:
            IRLog.get_instance().println('----test----')
            test_num += 1
            line = line.strip()
            new_report = IRReport.from_string(line)
            ori_report = IRReport(new_report.get_dummy_bug_id())
            #IRLog.get_instance().println('Summary')
            #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
            #                              new_report.get_summary_termcount())
            #IRLog.get_instance().println('Description')
            #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
            #                              new_report.get_description_termcount())
            # do test for single
            similarities, duplicates = new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
            IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), sim_ids, remove_self_bug_id)
            # some groups contain only one report
            if real_duplicates.__len__() == 0:
                test_num -= 1
                continue
            
            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), sim_hit, sim_nothit, real_duplicates)

            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += sim_ids.__len__()
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            if dup_ids.__len__() > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                        IRDuplicateGroup.is_in_same_duplicate_group(
                                new_report.get_dummy_bug_id(), dup_ids, remove_self_bug_id)
                precision, recall = self.__report_result(
                        new_report.get_dummy_bug_id(), dup_hit, dup_nothit, real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_tot_recall += 1 if recall > 0.0 else 0
        # general conclusion
        if dup_num == 0:
            # avoid division by zero when no duplicates were reported
            dup_num = 1.0
        IRLog.get_instance().println(','.join(
            ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
             '#dup', 'dup pre', 'dup rec', 'dup birec']))
        IRLog.get_instance().println(','.join(
            [str(test_num),
             str(sim_tot_precision / test_num),
             str(sim_tot_recall / test_num),
             str(sim_bi_tot_recall / test_num),
             str(float(sim_tot_size) / test_num),
             str(dup_num),
             str(dup_tot_precision / dup_num),
             str(dup_tot_recall / dup_num),
             str(dup_bi_tot_recall / dup_num)]))
        infile.close()
Example #19
    def similarity_with(self, other_report):
        """
        Returns:
            [float, float, float, float], [total score, summary, \
                                           description, stacktrace]
        """
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF
        from ir_gnome_st_tools import IRSTTools

        summary_ratio = IRConfig.get_instance().get_float('bug_summary_ratio')
        description_ratio = IRConfig.get_instance().get_float(
            'bug_description_ratio')
        stacktrace_ratio = IRConfig.get_instance().get_float(
            'bug_stacktrace_ratio')

        summary_tfidf_a, description_tfidf_a = \
                self.get_summary_and_description_tfidf()
        summary_tfidf_b, description_tfidf_b = \
                other_report.get_summary_and_description_tfidf()

        tfidf_algorithm = IRConfig.get_instance().get('tfidf_algorithm')
        stacktrace_algorithm = IRConfig.get_instance().get(
            'stacktrace_algorithm')
        if tfidf_algorithm == 'tfidf':
            summary_similarity = IRTFIDF.tfidf_similarity(
                summary_tfidf_a, summary_tfidf_b)
            description_similarity = IRTFIDF.tfidf_similarity(
                description_tfidf_a, description_tfidf_b)
        elif tfidf_algorithm == 'bidf':
            summary_squared_length, description_squared_length = \
                    self.get_summary_and_description_tfidf_squared_length()
            summary_similarity = IRTFIDF.tfidf_asm_similarity(
                summary_tfidf_a, summary_tfidf_b, summary_squared_length)
            description_similarity = IRTFIDF.tfidf_asm_similarity(
                description_tfidf_a, description_tfidf_b,
                description_squared_length, self.__penalty_terms)
        else:
            # guard against unknown algorithms, mirroring the
            # scoring_strategy check below; otherwise the similarity
            # variables would be undefined
            assert False, 'invalid tfidf algorithm'

        if self.__stacktrace is None or \
                self.__stacktrace.__len__() == 0 or \
                self.__stacktrace[0].__len__() == 0:
            stacktrace_similarity = 1.0
        else:
            stacktrace_similarity = IRSTTools.compare_stackinfo(
                self.get_stacktrace(), other_report.get_stacktrace(),
                stacktrace_algorithm)

        scoring_strategy = IRConfig.get_instance().get('scoring_strategy',
                                                       'heuristic')
        if scoring_strategy == 'weighted':
            score = self.__weighted_scoring(summary_similarity,
                                            description_similarity,
                                            stacktrace_similarity)
        elif scoring_strategy == 'heuristic':
            score = self.__heuristic_scoring(summary_similarity,
                                             description_similarity,
                                             stacktrace_similarity)
        elif scoring_strategy == 'distweighted':
            score = self.__distweighted_scoring(summary_similarity,
                                                description_similarity,
                                                stacktrace_similarity)
        else:
            assert False, 'invalid scoring strategy'
        return [
            score, summary_similarity, description_similarity,
            stacktrace_similarity
        ]
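The 'weighted' strategy presumably combines the three component similarities using the bug_*_ratio values loaded above; __weighted_scoring itself is not shown here, so the sketch below is an assumption for illustration only:

def weighted_score(summary_sim, description_sim, stacktrace_sim,
                   summary_ratio=0.3, description_ratio=0.5,
                   stacktrace_ratio=0.2):
    # convex combination of the three component similarities; the ratio
    # defaults are placeholders for the IRConfig 'bug_*_ratio' values
    return (summary_ratio * summary_sim +
            description_ratio * description_sim +
            stacktrace_ratio * stacktrace_sim)

assert abs(weighted_score(1.0, 1.0, 1.0) - 1.0) < 1e-9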
Example #20
    def start_shell(cls):
        """Start a shell that do recommending interactively"""
        from ir_log import IRLog
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        from ir_report import IRReport

        IRLog.get_instance().println("Starting Intereport...")
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRLog.get_instance().println("Intereport Started. Waiting for input")

        new_report = None
        while True:
            cmd = raw_input("Input command:").strip()
            if cmd == 'exit':
                IRLog.get_instance().println('Exiting')
                break
            elif cmd == 'new':
                IRLog.get_instance().println('Creating New Report')
                import time
                cur_time = -1
                while cur_time < 0:
                    try:
                        cur_time = int(
                            time.mktime(
                                time.strptime(
                                    raw_input(
                                        "Input Time (e.g., 2011-05-05): "),
                                    '%Y-%m-%d')))
                    except (ValueError, OverflowError):
                        cur_time = -1
                product = raw_input("Input Product: ")
                summary = raw_input("Summary: ")
                raw_description = raw_input("Description:\n")
                new_report = IRReport.from_string(
                    IRReport.separator.join([
                        str(cur_time),
                        product.lower(), summary, raw_description, '', ''
                    ]))
                cls.__print_report(new_report)
            elif cmd == 'do':
                IRLog.get_instance().println('Do Recommending')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.do_recommend(new_report)
            elif cmd == 'ls':
                IRLog.get_instance().println('Show Current Report')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.__print_report(new_report)
            elif cmd == 'ad':
                IRLog.get_instance().println('Appending Description')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    append_description = raw_input("Append Description:\n")
                    description = ' '.join([
                        new_report.get_description_text(), append_description
                    ])
                    dummy_report = IRReport(new_report.get_summary_text(),
                                            description)
                    dummy_report.set_stacktrace(new_report.get_stacktrace())
                    dummy_report.set_basic_info(new_report.get_create_ts(),
                                                new_report.get_product())
                    dummy_report.set_penalty_terms(
                        new_report.get_penalty_terms())
                    dummy_report.set_dummy_bug_id(
                        new_report.get_dummy_bug_id())
                    new_report = dummy_report
                    IRLog.get_instance().println('Description: %s' %
                                                 description)
            elif cmd == 'ap':
                IRLog.get_instance().println('Appending Penalties')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    raw = []
                    while raw.__len__() < 1:
                        raw = raw_input(
                            'Input Penalties (split by \',\'):').split(',')
                    from ir_term_count import IRTermCount
                    penalty = new_report.get_penalty_terms()
                    if penalty is None:
                        penalty = []
                    penalty += IRTermCount.do_stemming(raw)
                    new_report.set_penalty_terms(penalty)
                    print len(penalty), penalty
                    IRLog.get_instance().println('Penalties: %s' % \
                                                     (', '.join(penalty)))
            elif cmd == 'sd':
                IRLog.get_instance().println('Set Dummy Bug ID')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    bug_id = -1
                    while bug_id <= 0:
                        try:
                            bug_id = int(raw_input('Dummy Bug ID: '))
                        except ValueError:
                            bug_id = -1
                    new_report.set_dummy_bug_id(bug_id)
                    IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
            elif cmd == 'help':
                cls.__show_help()
            else:
                IRLog.get_instance().println('Error! Unknown command: %s' \
                                                % cmd)
                cls.__show_help()
        # end of while 1
        IRLog.get_instance().println("Bye")
Example #21
    if mode == 'file':
        test_file = sys.argv[3]
        bug_id = int(sys.argv[4])
        from ir_sim_bug_evaluator import IRSimBugEvaluator
        new_report = IRSimBugEvaluator.get_report_from_test_file(
            test_file, bug_id)
        if new_report is None:
            IRLog.get_instance().println('Error! Cannot find report %d in %s' % \
                    (bug_id, test_file))
        else:
            if sys.argv.__len__() > 5:
                from ir_term_count import IRTermCount
                penalty_terms_raw = sys.argv[5].split(',')
                penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
                IRLog.get_instance().println('%d penalty terms: %s' \
                    % (penalty_terms.__len__(), ','.join(penalty_terms)))
                new_report.set_penalty_terms(penalty_terms)
    elif mode == 'text':
        text = sys.argv[3]
        new_report = IRReport.from_string(text)
    elif mode == 'inte':
        IRRecommender.start_shell()
        exit()
    else:
        IRLog.get_instance().println('Error! Unknown mode %s' % mode)
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRRecommender.do_recommend(new_report)
    IRLog.get_instance().stop_log()
Example #32
0
    def do_test_over_file(self, filename):
        """Do test over the file.

        Args:
            filename: str, the input file which generated by 
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool(
            'remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_toto_recall = 0.0
        dup_num = 0
        test_num = 0

        infile = open(filename, 'r')
        for line in infile:
            IRLog.get_instance().println('----test----')
            test_num += 1
            line.strip()
            new_report = IRReport.from_string(line)
            ori_report = IRReport(new_report.get_dummy_bug_id())
            #IRLog.get_instance().println('Summary')
            #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
            #                              new_report.get_summary_termcount())
            #IRLog.get_instance().println('Description')
            #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
            #                              new_report.get_description_termcount())
            # do test for single
            similarities, duplicates = new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
            IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), sim_ids, remove_self_bug_id)
            # some group contain only one
            if real_duplicates.__len__() == 0:
                test_num -= 1
                continue

            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), sim_hit, sim_nothit,
                real_duplicates)

            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += sim_ids.__len__()
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            if dup_ids.__len__() > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                        IRDuplicateGroup.is_in_same_duplicate_group(
                                new_report.get_dummy_bug_id(), dup_ids, remove_self_bug_id)
                precision, recall = self.__report_result(
                    new_report.get_dummy_bug_id(), dup_hit, dup_nothit,
                    real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_toto_recall += 1 if recall > 0.0 else 0
        # general conclusion
        if dup_num == 0:
            dup_num = 1.0
        IRLog.get_instance().println(','.join(['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',\
                '#dup', 'dup pre', 'dup rec', 'dup birec']))
        IRLog.get_instance().println(','.join([str(test_num), \
                str(sim_tot_precision/test_num), str(sim_tot_recall/test_num), str(sim_bi_tot_recall/test_num), str(float(sim_tot_size)/test_num), \
                str(dup_num), \
                str(dup_tot_precision/dup_num), str(dup_tot_recall/dup_num), str(dup_bi_toto_recall/dup_num)]))
        infile.close()