Beispiel #1
0
    def __init__(self, id, dispatcher):
        """Set up the thread together with its private per-instance state.

        Args:
            id: value stored as this instance's identifier.
            dispatcher: object stored on the instance for later use.
        """
        from ir_report import IRReport

        # Run the threading.Thread initialiser on this instance.
        threading.Thread.__init__(self)
        self.__dispatcher = dispatcher
        self.__id = id
        # Begin with a report built from two empty strings.
        self.__report = IRReport('', '')
        # Bounded queue: holds at most 10 items.
        self.__msg_queue = Queue.Queue(maxsize=10)
Beispiel #2
0
    def test_top_n_similarity_over_all(self):
        """Log the result of top_n_similarity_over_all(10) for bug 100000.

        Starts the log, loads the test configuration, queries the report and
        writes whatever it returns to the log.  Nothing is asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        bug_id = 100000
        top_n = 10

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        top_matches = IRReport(bug_id).top_n_similarity_over_all(top_n)
        IRLog.get_instance().println(
            'Bugs with top similarities with bug %d: %s'
            % (bug_id, str(top_matches)))
        IRLog.get_instance().stop_log()
Beispiel #3
0
    def test_top_n_similarity_over_all(self):
        """Query bug 100000 with top_n_similarity_over_all(10) and log it.

        The test only exercises the call path (log start, configuration
        load, query, log stop); it makes no assertions about the result.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        target_bug = 100000

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        target_report = IRReport(target_bug)
        similar_bugs = target_report.top_n_similarity_over_all(10)
        summary_line = 'Bugs with top similarities with bug %d: %s' % (
            target_bug, str(similar_bugs))
        IRLog.get_instance().println(summary_line)
        IRLog.get_instance().stop_log()
Beispiel #4
0
    def test_binary_search_less(self):
        """Check binary_search_less() on a descending list and an empty list.

        With the identity function as key, searching range(10, 0, -1) for 3
        must give 8, for 11 must give 0 and for 1 must give 10; searching an
        empty list must give -1.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        report = IRReport('', '')
        identity = lambda item: item
        descending = range(10, 0, -1)
        # (expected result, array to search, value searched for)
        cases = (
            (8, descending, 3),
            (0, descending, 11),
            (10, descending, 1),
            (-1, [], 1),
        )
        for expected, array, target in cases:
            assert expected == report.binary_search_less(
                array, identity, target)
    def test_get_report_difference(self):
        """Check IRRecommender.get_report_difference() on two small reports.

        Both reports' texts and the computed differences are logged, then the
        summary difference must equal {'ghost', 'crash'} and the description
        difference must equal {'wow'}.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_recommender import IRRecommender

        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        new_report = IRReport('apple for summary', 'linux description')
        sim_report = IRReport('apple of ghost crashed', 'description linux wow')

        difference = IRRecommender.get_report_difference(new_report, sim_report)
        (diff_sum, diff_desc) = difference

        # Log the raw texts first, in the order new/sim summary, then
        # new/sim description.
        text_lines = (
            ('New summary: %s', new_report.get_summary_text),
            ('Sim summary: %s', sim_report.get_summary_text),
            ('New description: %s', new_report.get_description_text),
            ('Sim description: %s', sim_report.get_description_text),
        )
        for template, fetch_text in text_lines:
            IRLog.get_instance().println(template % (fetch_text()))

        IRLog.get_instance().println('Diff of summary: %s' % (diff_sum))
        IRLog.get_instance().println('Diff of description: %s' % (diff_desc))

        assert diff_sum == {'ghost', 'crash'}
        assert diff_desc == {'wow'}
    def __generate_single_bug(self, bug_id, drop_rate):
        """Generate an incomplete bug report text.
        
        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.
        
        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # get description and summary
        summary, description = IRText.get_summary_and_description_of_bug(
            bug_id)
        create_ts, product = IRText.get_basic_info_of_bug(bug_id)
        if drop_rate > 0.001:
            summary, description = \
                IRTermCount.create_incomplete_report(summary, description, drop_rate)
            print description
        new_report = IRReport(summary, description)
        new_report.set_stacktrace(IRText.get_stacktrace_of_bug(bug_id))
        new_report.set_dummy_bug_id(bug_id)
        new_report.set_basic_info(create_ts, product)
        return new_report
Beispiel #7
0
    def test_similarities_and_duplicates(self):
        """Log the similar and duplicate reports returned for bug 100000.

        For each of the two result lists the number of entries and the
        comma-joined first element of every entry are logged.  Nothing is
        asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        bug_id = 100000

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        similarities, duplicates = \
            IRReport(bug_id).similarities_and_duplicates()
        IRLog.get_instance().println('Report %d' % (bug_id))

        similar_ids = ','.join([str(entry[0]) for entry in similarities])
        IRLog.get_instance().println(
            '%d Similar Reports: %s' % (len(similarities), similar_ids))

        duplicate_ids = ','.join([str(entry[0]) for entry in duplicates])
        IRLog.get_instance().println(
            '%d Duplicate Reports: %s' % (len(duplicates), duplicate_ids))

        IRLog.get_instance().stop_log()
Beispiel #8
0
    def test_similarities_and_duplicates(self):
        """Run similarities_and_duplicates() for bug 100000 and log the result.

        Two lines are logged after the report header: one for the similar
        reports and one for the duplicate reports, each with the entry count
        and the first element of every entry.  The test makes no assertions.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        target = IRReport(100000)
        similarities, duplicates = target.similarities_and_duplicates()
        IRLog.get_instance().println('Report %d' % (100000))

        # Same formatting for both lists; only the template differs.
        sections = (
            ('%d Similar Reports: %s', similarities),
            ('%d Duplicate Reports: %s', duplicates),
        )
        for template, entries in sections:
            joined_ids = ','.join([str(entry[0]) for entry in entries])
            IRLog.get_instance().println(
                template % (len(entries), joined_ids))

        IRLog.get_instance().stop_log()
Beispiel #9
0
def set_report_basic_info(report, msg):
    """Build a report from ``msg`` and carry over state from ``report``.

    Args:
        report: existing report whose stacktrace, penalty terms, skip terms
            and excluded report ids are copied.
        msg: str, stripped and parsed with IRReport.from_string().

    Returns:
        The newly created report.
    """
    from ir_report import IRReport

    rebuilt = IRReport.from_string(msg.strip())

    # Copy each piece of state across, one getter/setter pair at a time.
    stacktrace = report.get_stacktrace()
    rebuilt.set_stacktrace(stacktrace)

    penalty_terms = report.get_penalty_terms()
    rebuilt.set_penalty_terms(penalty_terms)

    skip_terms = report.get_skip_terms()
    rebuilt.set_skip_terms(skip_terms)

    excluded_ids = report.get_exclude_report_ids()
    rebuilt.set_exclude_report_ids(excluded_ids)

    return rebuilt
Beispiel #10
0
 def __init__(self, id, dispatcher):
     """Initialise the thread and store the session's private state.

     Args:
         id: value kept as this instance's identifier.
         dispatcher: object kept on the instance for later use.
     """
     from ir_report import IRReport

     # Run the threading.Thread initialiser on this instance.
     threading.Thread.__init__(self)
     self.__dispatcher = dispatcher
     self.__id = id
     # The report starts out built from two empty strings.
     empty_report = IRReport('', '')
     self.__report = empty_report
     # Message queue limited to 10 entries.
     self.__msg_queue = Queue.Queue(maxsize=10)
Beispiel #11
0
def set_report_basic_info(report, msg):
    """Create a report from ``msg`` that inherits state from ``report``.

    Args:
        report: report supplying the stacktrace, penalty terms, skip terms
            and excluded report ids.
        msg: str, passed (stripped) to IRReport.from_string().

    Returns:
        The new report with the inherited state applied.
    """
    from ir_report import IRReport

    fresh_report = IRReport.from_string(msg.strip())

    # Fields transferred from the old report, in this order.
    inherited_fields = (
        'stacktrace',
        'penalty_terms',
        'skip_terms',
        'exclude_report_ids',
    )
    for field in inherited_fields:
        setter = getattr(fresh_report, 'set_' + field)
        getter = getattr(report, 'get_' + field)
        setter(getter())

    return fresh_report
    def __generate_single_bug(self, bug_id, drop_rate):
        """Generate an incomplete bug report text.
        
        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.
        
        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # get description and summary
        summary, description = IRText.get_summary_and_description_of_bug(bug_id)
        create_ts, product = IRText.get_basic_info_of_bug(bug_id)
        if drop_rate > 0.001:
            summary, description = \
                IRTermCount.create_incomplete_report(summary, description, drop_rate)
            print description
        new_report = IRReport(summary, description)
        new_report.set_stacktrace(IRText.get_stacktrace_of_bug(bug_id))
        new_report.set_dummy_bug_id(bug_id)
        new_report.set_basic_info(create_ts, product)
        return new_report
 def get_report_from_test_file(cls, filename, bug_id):
     """Return the report in ``filename`` whose dummy bug id is ``bug_id``.

     Every line is split on IRReport.separator and its sixth field
     (index 5) is parsed as the dummy bug id.

     Args:
         filename: str, path of the test file to scan.
         bug_id: int, dummy bug id to look for.

     Returns:
         The IRReport built from the first matching line (stripped), or
         None when no line matches.

     Raises:
         IndexError: a line has fewer than six fields.
         ValueError: the sixth field of a line is not an integer.
     """
     from ir_report import IRReport
     # ``with`` closes the file on every exit path, including the early
     # return below (the handle used to be leaked).
     with open(filename, 'r') as infile:
         for line in infile:
             fields = line.split(IRReport.separator)
             if int(fields[5]) == bug_id:
                 return IRReport.from_string(line.strip())
     return None
 def get_report_from_test_file(cls, filename, bug_id):
     """Look up a report by dummy bug id in a test file.

     Each line is split on IRReport.separator; the field at index 5 is
     converted to int and compared with ``bug_id``.

     Args:
         filename: str, path of the test file to scan.
         bug_id: int, dummy bug id to look for.

     Returns:
         IRReport parsed from the first matching line (stripped), or None
         if the file contains no matching line.

     Raises:
         IndexError: a line has fewer than six fields.
         ValueError: the field at index 5 is not an integer.
     """
     from ir_report import IRReport
     # The context manager guarantees the file is closed even when we
     # return from inside the loop; previously it was never closed.
     with open(filename, 'r') as infile:
         for line in infile:
             dummy_bug_id = int(line.split(IRReport.separator)[5])
             if dummy_bug_id == bug_id:
                 return IRReport.from_string(line.strip())
     return None
Beispiel #15
0
    def compare(cls, bug_a, bug_b):
        """
        Compare the calculation of two bugs (both in db).

        Prints the text of both reports, then their term counts and tfidf
        side by side, and finally the similarity score of report a against
        report b.
        """

        from ir_report import IRReport

        # Pair every report with its 'indb<id>' title.
        labelled = [('indb' + str(bug_id), IRReport(bug_id))
                    for bug_id in (bug_a, bug_b)]
        (title_a, report_a), (title_b, report_b) = labelled

        # text
        for title, report in labelled:
            cls.print_text(title, report)

        side_by_side = (title_a, report_a, title_b, report_b)
        # term frequency
        cls.compare_and_print_termcount(*side_by_side)
        # tfidf
        cls.compare_and_print_tfidf(*side_by_side)
        # similarity
        cls.print_similarity_score(report_a, report_b)
Beispiel #16
0
    def do_recommend_cmd(cls, cmd_text):
        """Run do_recommend() on a report parsed from ``cmd_text``.

        Args:
            cmd_text: str, the text follows the standard format,
                  create_ts;product;summary;raw_description

        The result of do_recommend() is not returned.
        """
        from ir_report import IRReport

        report_line = cmd_text.strip()
        parsed_report = IRReport.from_string(report_line)
        cls.do_recommend(parsed_report)
Beispiel #17
0
    def do_recommend_cmd(cls, cmd_text):
        """Parse ``cmd_text`` into a report and recommend for it.

        Args:
            cmd_text: str, the text follows the standard format,
                  create_ts;product;summary;raw_description

        Surrounding whitespace is stripped before parsing.  Nothing is
        returned; do_recommend()'s result is discarded.
        """
        from ir_report import IRReport

        stripped_text = cmd_text.strip()
        cls.do_recommend(IRReport.from_string(stripped_text))
    def test_get_report_differences(self):
        """Log the differences between a new report and three other reports.

        The other reports are one built from text and the reports for bugs
        100000 and 100200.  For every entry returned by
        IRRecommender.get_all_reports_difference() the elements at index 0
        and 1 are logged as summary and description difference.  Nothing is
        asserted.
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_recommender import IRRecommender

        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        new_report = IRReport('apple for summary', 'linux description')
        sim_reports = [
            IRReport('apple of ghost crashed', 'description linux wow')]
        for bug_id in (100000, 100200):
            sim_reports.append(IRReport(bug_id))

        all_diffs = IRRecommender.get_all_reports_difference(
            new_report, sim_reports)

        # (log template, index into the difference entry)
        parts = (('Diff of summary: %s', 0), ('Diff of description: %s', 1))
        for entry in all_diffs:
            for template, position in parts:
                IRLog.get_instance().println(template % (entry[position]))
Beispiel #19
0
    def test_create_new_report(self):
        """Check that a report built from text returns that text unchanged.

        After the two text assertions the tfidf getters are called only to
        make sure they run; their results are ignored.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        summary_text = 'Firefox crashed'
        description_text = ('When I was openning history folder, the f**king'
                            ' Firefox just crashed!')
        report = IRReport(summary_text, description_text)

        stored_summary = report.get_summary_text()
        assert summary_text == stored_summary
        stored_description = report.get_description_text()
        assert description_text == stored_description

        report.get_summary_and_description_tfidf()
        report.get_summary_and_description_tfidf_squared_length()

        IRLog.get_instance().stop_log()
Beispiel #20
0
    def test_create_new_report(self):
        """Build a report from summary/description text and verify round-trip.

        The summary and description getters must return the texts passed to
        the constructor.  The two tfidf getters are then invoked without
        inspecting what they return.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        texts = {
            'summary': 'Firefox crashed',
            'description': 'When I was openning history folder, the f**king'
                           ' Firefox just crashed!',
        }
        report = IRReport(texts['summary'], texts['description'])
        assert texts['summary'] == report.get_summary_text()
        assert texts['description'] == report.get_description_text()

        # Smoke-call the tfidf computations.
        report.get_summary_and_description_tfidf()
        report.get_summary_and_description_tfidf_squared_length()

        IRLog.get_instance().stop_log()
Beispiel #21
0
    def test_binary_search_less(self):
        """Verify binary_search_less() against fixed expectations.

        Uses the identity function as key.  On range(10, 0, -1) the search
        for 3 must return 8, for 11 must return 0 and for 1 must return 10;
        on an empty list it must return -1.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        def identity(item):
            return item

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        search = IRReport('', '').binary_search_less

        descending = range(10, 0, -1)
        assert 8 == search(descending, identity, 3)
        assert 0 == search(descending, identity, 11)
        assert 10 == search(descending, identity, 1)

        nothing = []
        assert -1 == search(nothing, identity, 1)
Beispiel #22
0
    def test_similarity_with(self):
        """Log the similarity of bugs 100000 and 100200 under several settings.

        The similarity (first element of similarity_with()'s result) is
        logged for the 'tfidf' and 'bidf' algorithms, and then, still under
        'bidf', for the 'weighted', 'heuristic' and 'distweighted' scoring
        strategies.  Nothing is asserted.

        The 'distweighted' line used to be logged with the 'Heuristic
        Scoring' label (copy-paste slip); it now carries its own label.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        bug_a, bug_b = 100000, 100200

        def configure(settings):
            # Apply (key, value) pairs to the configuration in order.
            for key, value in settings:
                IRConfig.get_instance().set(key, value)

        def log_similarity(label, first, second):
            # Log the first element of similarity_with()'s result.
            IRLog.get_instance().println(
                '%s similarity between %d and %d is %f'
                % (label, bug_a, bug_b, first.similarity_with(second)[0]))

        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        configure([('tfidf_algorithm', 'tfidf')])
        report_a = IRReport(bug_a)
        report_b = IRReport(bug_b)
        log_similarity('TFIDF', report_a, report_b)

        # Fresh report objects are created after switching the algorithm.
        configure([('tfidf_algorithm', 'bidf')])
        report_a = IRReport(bug_a)
        report_b = IRReport(bug_b)
        log_similarity('Bidf', report_a, report_b)

        # The scoring strategies below reuse the reports created under 'bidf'.
        configure([
            ('scoring_strategy', 'weighted'),
            ('bug_summary_ratio', 0.25),
            ('bug_description_ratio', 0.25),
            ('bug_stacktrace_ratio', 0.5),
        ])
        log_similarity('Bidf (Weighted Scoring)', report_a, report_b)

        configure([
            ('scoring_strategy', 'heuristic'),
            ('bug_summary_ratio', 0.5),
            ('bug_description_ratio', 0.5),
        ])
        log_similarity('Bidf (Heuristic Scoring)', report_a, report_b)

        configure([
            ('scoring_strategy', 'distweighted'),
            ('bug_summary_ratio', 0.5),
            ('bug_description_ratio', 0.5),
        ])
        log_similarity('Bidf (Distweighted Scoring)', report_a, report_b)
Beispiel #23
0
    def compare_with_sim_file(cls, bug_a, filename, bug_b):
        """Compare a bug in the database with a report read from a test file.

        Warning: bug_a acts as bug in database, bug_b acts as new report.
        The similarity score is therefore printed for the file report
        against the database report.
        """

        from ir_sim_bug_evaluator import IRSimBugEvaluator
        from ir_report import IRReport

        db_title = "indb" + str(bug_a)
        file_title = "file" + str(bug_b)
        db_report = IRReport(bug_a)
        file_report = IRSimBugEvaluator.get_report_from_test_file(
            filename, bug_b)

        # text
        for title, report in ((db_title, db_report),
                              (file_title, file_report)):
            cls.print_text(title, report)

        side_by_side = (db_title, db_report, file_title, file_report)
        # term frequency
        cls.compare_and_print_termcount(*side_by_side)
        # tfidf
        cls.compare_and_print_tfidf(*side_by_side)
        # similarity
        cls.print_similarity_score(file_report, db_report)
Beispiel #24
0
    def do_recommend(cls, new_report):

        import time
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_sentence import IRSentence

        print 'DO_RECOMMEND', str(new_report.get_stacktrace())

        IRLog.get_instance().println('Finding similar reports')
        start_t = time.time()
        sim_bug_ids, dup_bug_ids = new_report.similarities_and_duplicates()
        similar_t = time.time()
        IRLog.get_instance().println('Found %d similar reports and %d duplicate reports in %fs' \
                % (sim_bug_ids.__len__(), dup_bug_ids.__len__(), similar_t - start_t))

        sim_bugs = [IRReport(sim_bug_id[0]) for sim_bug_id in sim_bug_ids]
        duplicate_packs = [
            cls.get_report_text_by_bug_id(dup_bug_id[0])
            for dup_bug_id in dup_bug_ids
        ]
        IRLog.get_instance().println('Duplicate reports: %s' % ','.\
                join([str(dup_bug_id[0]) for dup_bug_id in dup_bug_ids]))

        IRLog.get_instance().println('Extracting key term')
        deltas = cls.get_all_reports_difference(new_report, sim_bugs)
        term = cls.get_term_by_simple_entropy(deltas, sim_bug_ids,
                                              new_report.get_penalty_terms())
        term_t = time.time()
        keyword = term
        IRLog.get_instance().println('Choose term: %s in %fs' \
                % (term, term_t - similar_t))
        # pick out candidate sentences
        max_sentences_number = 1000
        cur_sentences_number = 0
        IRLog.get_instance().println('Extracting sentences')
        candidate_sentences = []
        for index, delta in enumerate(deltas):
            if not term in delta[1]:
                continue
            # term in this report
            sentences = IRSentence.get_sentence_from_description(
                sim_bugs[index].get_description_text(),
                sim_bugs[index].get_bug_id())
            for sentence in sentences:
                if sentence.contain_term(term):
                    candidate_sentences.append(sentence)
                    cur_sentences_number += 1
            if cur_sentences_number > max_sentences_number:
                break
        sent_t = time.time()
        IRLog.get_instance().println('Extracted %d sentences from %d reports in %fs' \
                % (candidate_sentences.__len__(), deltas.__len__(),
                   sent_t - term_t))
        # cluster sentences
        IRLog.get_instance().println('Clustering sentences')
        selected_sentences_num = IRConfig.get_instance().get_int(
            'bug_sentence_number')
        if candidate_sentences.__len__() > selected_sentences_num:
            clusters, sentence_ids = IRSentence.cluster_sentences(
                candidate_sentences, selected_sentences_num)
        else:
            clusters = [x for x in xrange(candidate_sentences.__len__())]
            sentence_ids = clusters
        clust_t = time.time()
        IRLog.get_instance().println('Finished clustering in %fs' \
                % (clust_t - sent_t))
        # pick out the sentences nearest to centroid in each group
        #pick_group = set()
        #for index, cluster in enumerate(clusters):
        #    if cluster in pick_group:
        #        continue
        #    pick_group.add(cluster)
        #    if pick_group.__len__() == selected_sentences_num:
        #        break
        #    IRLog.get_instance().println("Recommend: %s" \
        #            % (candidate_sentences[index].get_text()))
        sentence_packs = []
        sentence_report_ids = []
        for sentence_id in sentence_ids:
            IRLog.get_instance().println("Recommend (Report#: %d): %s" \
                    % ( candidate_sentences[sentence_id].get_bug_id(),
                        candidate_sentences[sentence_id].get_text()) )
            sentence_packs.append(
                (candidate_sentences[sentence_id].get_bug_id(),
                 candidate_sentences[sentence_id].get_text()))
            sentence_report_ids.append(
                candidate_sentences[sentence_id].get_bug_id())
        IRLog.get_instance().println('Recommending finished in %fs' \
                % (time.time() - start_t))

        return keyword, sentence_packs, duplicate_packs
Beispiel #25
0
    def start_shell(cls):
        """Start a shell that do recommending interactively"""
        from ir_log import IRLog
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        from ir_report import IRReport

        IRLog.get_instance().println("Starting Intereport...")
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRLog.get_instance().println("Intereport Started. Waiting for input")

        new_report = None
        while 1:
            cmd = raw_input("Input command:").strip()
            if cmd == 'exit':
                IRLog.get_instance().println('Exiting')
                break
            elif cmd == 'new':
                IRLog.get_instance().println('Creating New Report')
                import time
                cur_time = -1
                while cur_time < 0:
                    try:
                        cur_time = int(
                            time.mktime(
                                time.strptime(
                                    raw_input(
                                        "Input Time (e.g., 2011-05-05): "),
                                    '%Y-%m-%d')))
                    except:
                        cur_time = -1
                product = raw_input("Input Product: ")
                summary = raw_input("Summary: ")
                raw_description = raw_input("Description:\n")
                new_report = IRReport.from_string(
                    IRReport.separator.join([
                        str(cur_time),
                        product.lower(), summary, raw_description, '', ''
                    ]))
                cls.__print_report(new_report)
            elif cmd == 'do':
                IRLog.get_instance().println('Do Recommending')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.do_recommend(new_report)
            elif cmd == 'ls':
                IRLog.get_instance().println('Show Current Report')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.__print_report(new_report)
            elif cmd == 'ad':
                IRLog.get_instance().println('Appending Description')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    append_description = raw_input("Append Description:\n")
                    description = ' '.join([
                        new_report.get_description_text(), append_description
                    ])
                    dummy_report = IRReport(new_report.get_summary_text(),
                                            description)
                    dummy_report.set_stacktrace(new_report.get_stacktrace())
                    dummy_report.set_basic_info(new_report.get_create_ts(),
                                                new_report.get_product())
                    dummy_report.set_penalty_terms(
                        new_report.get_penalty_terms())
                    dummy_report.set_dummy_bug_id(
                        new_report.get_dummy_bug_id())
                    new_report = dummy_report
                    IRLog.get_instance().println('Description: %s' %
                                                 description)
            elif cmd == 'ap':
                IRLog.get_instance().println('Appending Penalties')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    raw = []
                    while raw.__len__() < 1:
                        raw = raw_input(
                            'Input Penalties (split by \',\'):').split(',')
                    from ir_term_count import IRTermCount
                    penalty = new_report.get_penalty_terms()
                    if penalty is None:
                        penalty = []
                    penalty += IRTermCount.do_stemming(raw)
                    new_report.set_penalty_terms(penalty)
                    print len(penalty), penalty
                    IRLog.get_instance().println('Penalties: %s' % \
                                                     (', '.join(penalty)))
            elif cmd == 'sd':
                IRLog.get_instance().println('Set Dummy Bug ID')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    bug_id = -1
                    while bug_id <= 0:
                        try:
                            bug_id = int(raw_input('Dummy Bug ID: '))
                        except:
                            bug_id = -1
                    new_report.set_dummy_bug_id(bug_id)
                    IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
            elif cmd == 'help':
                cls.__show_help()
            else:
                IRLog.get_instance().println('Error! Unkown command: %s' \
                                                % cmd)
                cls.__show_help()
        # end of while 1
        IRLog.get_instance().println("Bye")
    def do_test_over_file(self, filename):
        """Do test over the file and log averaged precision/recall figures.

        Every line is parsed into a report, its similar and duplicate
        reports are computed and evaluated against the duplicate group of
        the report's dummy bug id.  Lines whose duplicate group is empty are
        not counted.  A header line and a line of comma-separated figures
        are logged at the end.

        Fixes over the previous version: the line is actually stripped
        before parsing (the result of ``line.strip()`` used to be
        discarded), the file is closed even when an error occurs, a file
        without any countable case no longer raises ZeroDivisionError, and
        '#dup' reports 0 instead of 1.0 when no case produced duplicates.

        Args:
            filename: str, the input file which generated by
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool(
            'remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_tot_recall = 0.0
        dup_num = 0
        test_num = 0

        with open(filename, 'r') as infile:
            for line in infile:
                IRLog.get_instance().println('----test----')
                new_report = IRReport.from_string(line.strip())
                dummy_bug_id = new_report.get_dummy_bug_id()
                # Only needed by the disabled diagnostics below.
                ori_report = IRReport(dummy_bug_id)
                #IRLog.get_instance().println('Summary')
                #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
                #                              new_report.get_summary_termcount())
                #IRLog.get_instance().println('Description')
                #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
                #                              new_report.get_description_termcount())
                # do test for single
                similarities, duplicates = \
                    new_report.similarities_and_duplicates()
                sim_ids = [sim[0] for sim in similarities]
                dup_ids = [dup[0] for dup in duplicates]
                IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
                IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
                # evaluate sim
                sim_hit, sim_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        dummy_bug_id, sim_ids, remove_self_bug_id)
                # some group contain only one: such a case is not counted
                if len(real_duplicates) == 0:
                    continue
                test_num += 1

                precision, recall = self.__report_result(
                    dummy_bug_id, sim_hit, sim_nothit, real_duplicates)

                sim_tot_precision += precision
                sim_tot_recall += recall
                sim_tot_size += len(sim_ids)
                # "bi" recall: 1 when at least something was recalled.
                sim_bi_tot_recall += 1 if recall > 0.0 else 0

                if len(dup_ids) > 0:
                    dup_num += 1
                    dup_hit, dup_nothit, real_duplicates = \
                        IRDuplicateGroup.is_in_same_duplicate_group(
                            dummy_bug_id, dup_ids, remove_self_bug_id)
                    precision, recall = self.__report_result(
                        dummy_bug_id, dup_hit, dup_nothit, real_duplicates)
                    dup_tot_precision += precision
                    dup_tot_recall += recall
                    dup_bi_tot_recall += 1 if recall > 0.0 else 0

        # general conclusion
        # Divide by 1 when a counter is zero so the averages come out as 0.0
        # while the reported counts stay truthful.
        sim_divisor = test_num if test_num else 1
        dup_divisor = dup_num if dup_num else 1
        IRLog.get_instance().println(','.join([
            '#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
            '#dup', 'dup pre', 'dup rec', 'dup birec']))
        IRLog.get_instance().println(','.join([
            str(test_num),
            str(sim_tot_precision / sim_divisor),
            str(sim_tot_recall / sim_divisor),
            str(sim_bi_tot_recall / sim_divisor),
            str(float(sim_tot_size) / sim_divisor),
            str(dup_num),
            str(dup_tot_precision / dup_divisor),
            str(dup_tot_recall / dup_divisor),
            str(dup_bi_tot_recall / dup_divisor)]))
Beispiel #27
0
 def get_report_text_by_bug_id(cls, id):
     """Return ``(id, summary text, description text)`` for a bug id.

     Args:
         id: bug id passed to the IRReport constructor.

     Returns:
         Tuple of the given id and the two texts returned by the report's
         get_summary_and_description_text().
     """
     from ir_report import IRReport
     texts = IRReport(id).get_summary_and_description_text()
     summary_text, description_text = texts
     return id, summary_text, description_text
Beispiel #28
0
        test_file = sys.argv[3]
        bug_id = int(sys.argv[4])
        from ir_sim_bug_evaluator import IRSimBugEvaluator
        new_report = IRSimBugEvaluator.get_report_from_test_file(
            test_file, bug_id)
        if new_report is None:
            IRLog.get_instance().println('Error! Cannot find report %d in %s' % \
                    (bug_id, test_file))
        else:
            if sys.argv.__len__() > 5:
                from ir_term_count import IRTermCount
                penalty_terms_raw = sys.argv[4].split(',')
                penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
                IRLog.get_instance().println('%d penalty terms: %s:' \
                    % (penalty_terms.__len__(), ','.join(penalty_terms)))
                new_report.set_penalty_terms(penalty_terms)
    elif mode == 'text':
        text = sys.argv[3]
        new_report = IRReport.from_string(text)
    elif mode == 'inte':
        IRRecommender.start_shell()
        exit()
    else:
        IRLog.get_instance().println('Error! Known mode %s' % mode)
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRRecommender.do_recommend(new_report)
    IRLog.get_instance().stop_log()
Beispiel #29
0
class IRSession(threading.Thread):
    """
    A session thread that serves the message packs queued for it.

    Every message pack is read as a mapping carrying a 'connection' entry,
    a 'respack' entry and a SESSION_ID entry; its remaining keys are matched
    against SET_COMMANDS and CTL_COMMANDS.
    """
    # Timeout handed to Queue.get() while waiting for the next message pack.
    # None keeps the wait unbounded.
    TIMEOUT = None

    def __init__(self, id, dispatcher):
        """Create the session.

        Args:
            id: session identifier, handed back to the dispatcher on exit.
            dispatcher: object whose remove_session(id) is called when the
                session loop ends.
        """
        from ir_report import IRReport

        threading.Thread.__init__(self)
        self.__id = id
        self.__msg_queue = Queue.Queue(maxsize=10)
        self.__report = IRReport('', '')
        self.__dispatcher = dispatcher

    def enqueue(self, msgpack):
        """Append a message pack to the session queue."""
        self.__msg_queue.put(msgpack)

    def run(self):
        """Serve queued message packs until a command breaks the session.

        The loop also ends when waiting for a message pack times out (only
        possible when TIMEOUT is not None). In both cases the session is
        removed from the dispatcher afterwards.
        """
        from ir_log import IRLog
        session_state = STATE_ALIVE
        while session_state == STATE_ALIVE:
            try:
                # Pass TIMEOUT so that the Queue.Empty branch is reachable;
                # without a timeout a blocking get() never raises it.
                msgpack = self.__msg_queue.get(True, self.TIMEOUT)
            except Queue.Empty:
                IRLog.get_instance().println('Session %d time out' % self.__id,
                                             2)
                break
            conn = msgpack['connection']
            respack = msgpack['respack']
            respack[SESSION_ID] = msgpack[SESSION_ID]
            # set phase: every SET command replaces the current report
            for key, value in msgpack.items():
                if key in SET_COMMANDS:
                    self.__report = SET_COMMANDS[key](self.__report, value)
            # do phase: CTL commands fill the response pack
            signal = SIGNAL_CONTINUE
            for key in msgpack:
                if key in CTL_COMMANDS:
                    signal = CTL_COMMANDS[key](self.__report, respack)
                    if signal == SIGNAL_BREAK:
                        session_state = STATE_EXPIRED
            self.__pack_report_info(respack)
            IRLog.get_instance().println('Send message: %s' % str(respack))
            conn.send(str(respack))
        self.__dispatcher.remove_session(self.__id)

    def __pack_report_info(self, respack):
        """Copy the current report fields and config limits into respack."""
        from ir_config import IRConfig

        report = self.__report
        respack[FEEDBACK_PRODUCT] = report.get_product()
        respack[FEEDBACK_CREATE_TS] = report.get_create_ts()
        respack[FEEDBACK_SUMMARY] = report.get_summary_text()
        respack[FEEDBACK_DESCRIPTION] = report.get_description_text()
        respack[FEEDBACK_PENALTY] = report.get_penalty_terms()
        respack[FEEDBACK_SKIP] = report.get_skip_terms()
        respack[FEEDBACK_IGNORE] = report.get_exclude_report_ids()
        respack[FEEDBACK_MAX_SENTENCES] = IRConfig.get_instance().get_int(
            'bug_sentence_number')
        respack[FEEDBACK_MAX_DUPLICATES] = IRConfig.get_instance().get_int(
            'bug_duplicate_number')
        respack[FEEDBACK_REPORT_LINK] = IRConfig.get_instance().get(
            'bugzilla_report_link')
Beispiel #30
0
class IRSession(threading.Thread):
    """
    A session thread that serves the message packs queued for it.

    Every message pack is read as a mapping carrying a 'connection' entry,
    a 'respack' entry and a SESSION_ID entry; its remaining keys are matched
    against SET_COMMANDS and CTL_COMMANDS.
    """
    # Timeout handed to Queue.get() while waiting for the next message pack.
    # None keeps the wait unbounded.
    TIMEOUT = None

    def __init__(self, id, dispatcher):
        """Create the session.

        Args:
            id: session identifier, handed back to the dispatcher on exit.
            dispatcher: object whose remove_session(id) is called when the
                session loop ends.
        """
        from ir_report import IRReport

        threading.Thread.__init__(self)
        self.__id = id
        self.__msg_queue = Queue.Queue(maxsize=10)
        self.__report = IRReport('', '')
        self.__dispatcher = dispatcher

    def enqueue(self, msgpack):
        """Append a message pack to the session queue."""
        self.__msg_queue.put(msgpack)

    def run(self):
        """Serve queued message packs until a command breaks the session.

        The loop also ends when waiting for a message pack times out (only
        possible when TIMEOUT is not None). In both cases the session is
        removed from the dispatcher afterwards.
        """
        from ir_log import IRLog
        session_state = STATE_ALIVE
        while session_state == STATE_ALIVE:
            try:
                # Pass TIMEOUT so that the Queue.Empty branch is reachable;
                # without a timeout a blocking get() never raises it.
                msgpack = self.__msg_queue.get(True, self.TIMEOUT)
            except Queue.Empty:
                IRLog.get_instance().println('Session %d time out' % self.__id,
                                             2)
                break
            conn = msgpack['connection']
            respack = msgpack['respack']
            respack[SESSION_ID] = msgpack[SESSION_ID]
            # set phase: every SET command replaces the current report
            for key, value in msgpack.items():
                if key in SET_COMMANDS:
                    self.__report = SET_COMMANDS[key](self.__report, value)
            # do phase: CTL commands fill the response pack
            signal = SIGNAL_CONTINUE
            for key in msgpack:
                if key in CTL_COMMANDS:
                    signal = CTL_COMMANDS[key](self.__report, respack)
                    if signal == SIGNAL_BREAK:
                        session_state = STATE_EXPIRED
            self.__pack_report_info(respack)
            IRLog.get_instance().println('Send message: %s' % str(respack))
            conn.send(str(respack))
        self.__dispatcher.remove_session(self.__id)

    def __pack_report_info(self, respack):
        """Copy the current report fields and config limits into respack."""
        from ir_config import IRConfig

        report = self.__report
        respack[FEEDBACK_PRODUCT] = report.get_product()
        respack[FEEDBACK_CREATE_TS] = report.get_create_ts()
        respack[FEEDBACK_SUMMARY] = report.get_summary_text()
        respack[FEEDBACK_DESCRIPTION] = report.get_description_text()
        respack[FEEDBACK_PENALTY] = report.get_penalty_terms()
        respack[FEEDBACK_SKIP] = report.get_skip_terms()
        respack[FEEDBACK_IGNORE] = report.get_exclude_report_ids()
        respack[FEEDBACK_MAX_SENTENCES] = IRConfig.get_instance().get_int(
            'bug_sentence_number')
        respack[FEEDBACK_MAX_DUPLICATES] = IRConfig.get_instance().get_int(
            'bug_duplicate_number')
        respack[FEEDBACK_REPORT_LINK] = IRConfig.get_instance().get(
            'bugzilla_report_link')
Beispiel #31
0
def set_report_info(report, msg):
    """Build a fresh report from a serialized message.

    Args:
        report: the current report; it is not read, the result replaces it.
        msg: str, serialized report text. Surrounding whitespace is
            stripped before parsing.

    Returns:
        The report produced by IRReport.from_string.
    """
    from ir_report import IRReport
    serialized = msg.strip()
    return IRReport.from_string(serialized)
Beispiel #32
0
    if mode == 'file':
        # argv[3] is the test file, argv[4] the bug id and the optional
        # argv[5] a comma-separated list of penalty terms.
        test_file = sys.argv[3]
        bug_id = int(sys.argv[4])
        from ir_sim_bug_evaluator import IRSimBugEvaluator
        new_report = IRSimBugEvaluator.get_report_from_test_file(test_file, bug_id)
        if new_report is None:
            IRLog.get_instance().println('Error! Cannot find report %d in %s' % \
                    (bug_id, test_file))
        elif len(sys.argv) > 5:
            from ir_term_count import IRTermCount
            # Penalty terms come from argv[5]; argv[4] is the bug id parsed
            # above, so splitting it never produced real terms.
            penalty_terms_raw = sys.argv[5].split(',')
            penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
            IRLog.get_instance().println('%d penalty terms: %s:' \
                % (len(penalty_terms), ','.join(penalty_terms)))
            new_report.set_penalty_terms(penalty_terms)
    elif mode == 'text':
        text = sys.argv[3]
        new_report = IRReport.from_string(text)
    elif mode == 'inte':
        IRRecommender.start_shell()
        exit()
    else:
        IRLog.get_instance().println('Error! Unknown mode %s' % mode)
        # No report can be built for an unrecognised mode.
        new_report = None
    if new_report is not None:
        # Only load the caches and recommend when a report is available;
        # otherwise just close the log below.
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRRecommender.do_recommend(new_report)
    IRLog.get_instance().stop_log()
Beispiel #33
0
def set_report_info(report, msg):
    """Parse a serialized message into a new report.

    Args:
        report: the report currently held by the caller; not consulted.
        msg: str, serialized report; leading and trailing whitespace is
            removed before it is parsed.

    Returns:
        The result of IRReport.from_string on the stripped message.
    """
    from ir_report import IRReport
    stripped_msg = msg.strip()
    parsed = IRReport.from_string(stripped_msg)
    return parsed
Beispiel #34
0
 def get_report_text_by_bug_id(cls, id):
     """Look up a report and return its id with its two text fields.

     Args:
         id: identifier passed to the IRReport constructor.

     Returns:
         A tuple (id, summary_text, description_text).
     """
     from ir_report import IRReport
     both_texts = IRReport(id).get_summary_and_description_text()
     summary, description = both_texts
     return (id, summary, description)
Beispiel #35
0
    def test_create_new_report_from_string(self):
        """Round-trip a report through to_string() and from_string().

        Builds a report with basic info, penalty terms, excluded report ids,
        a dummy bug id and skip terms, serializes it, parses it back and
        compares the getters of both reports (the description is compared
        after stripping surrounding whitespace).
        """
        from nose.tools import eq_
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_term_count import IRTermCount

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        original = IRReport(
            'Firefox crashed',
            'When I was openning history folder, the f**king'
            ' Firefox just crashed!\n')
        original.set_basic_info(12345, 'core')
        original.set_penalty_terms(IRTermCount.do_stemming(['ie', 'explore']))
        original.set_exclude_report_ids([100100])
        original.set_dummy_bug_id(12345)
        original.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))

        # serialize, then parse the serialized form back
        serialized = original.to_string()
        IRLog.get_instance().println('Serialized report: %s' % (serialized))
        restored = IRReport.from_string(serialized)

        assert restored.get_summary_text() == original.get_summary_text()
        eq_(restored.get_description_text().strip(),
            original.get_description_text().strip())
        # remaining plain-equality checks, in the same order as before
        for getter in ('get_create_ts', 'get_product', 'get_dummy_bug_id',
                       'get_penalty_terms', 'get_exclude_report_ids'):
            assert getattr(restored, getter)() == getattr(original, getter)()
        eq_(restored.get_skip_terms(), original.get_skip_terms())
        IRLog.get_instance().stop_log()
Beispiel #36
0
    def start_shell(cls):
        """Start a shell that does recommending interactively.

        Commands read from standard input:
            exit: leave the shell.
            new:  create a report from a prompted time, product, summary
                  and description.
            do:   run do_recommend on the current report.
            ls:   print the current report.
            ad:   append text to the description of the current report.
            ap:   append penalty terms to the current report.
            sd:   set the dummy bug id of the current report.
            help: show the help text.

        Every command except exit, new and help needs a report created
        with 'new' first. Unknown commands print an error and the help.
        """
        from ir_log import IRLog
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        from ir_report import IRReport

        no_report_msg = 'Error! Please create report first.'

        IRLog.get_instance().println("Starting Intereport...")
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRLog.get_instance().println("Intereport Started. Waiting for input")

        new_report = None
        while True:
            cmd = raw_input("Input command:").strip()
            if cmd == 'exit':
                IRLog.get_instance().println('Exiting')
                break
            elif cmd == 'new':
                IRLog.get_instance().println('Creating New Report')
                import time
                cur_time = -1
                while cur_time < 0:
                    try:
                        cur_time = int(time.mktime(time.strptime(
                            raw_input("Input Time (e.g., 2011-05-05): "),
                            '%Y-%m-%d')))
                    except (ValueError, OverflowError):
                        # Malformed or unrepresentable date: ask again.
                        # KeyboardInterrupt and EOFError are deliberately
                        # not caught so the prompt can be left.
                        cur_time = -1
                product = raw_input("Input Product: ")
                summary = raw_input("Summary: ")
                raw_description = raw_input("Description:\n")
                new_report = IRReport.from_string(IRReport.separator.join([
                    str(cur_time), product.lower(), summary, raw_description,
                    '', '']))
                cls.__print_report(new_report)
            elif cmd == 'do':
                IRLog.get_instance().println('Do Recommending')
                if new_report is None:
                    IRLog.get_instance().println(no_report_msg)
                else:
                    cls.do_recommend(new_report)
            elif cmd == 'ls':
                IRLog.get_instance().println('Show Current Report')
                if new_report is None:
                    IRLog.get_instance().println(no_report_msg)
                else:
                    cls.__print_report(new_report)
            elif cmd == 'ad':
                IRLog.get_instance().println('Appending Description')
                if new_report is None:
                    IRLog.get_instance().println(no_report_msg)
                else:
                    append_description = raw_input("Append Description:\n")
                    description = ' '.join([new_report.get_description_text(),
                                            append_description])
                    # Build a replacement report around the longer
                    # description and copy the other fields over.
                    dummy_report = IRReport(new_report.get_summary_text(),
                                            description)
                    dummy_report.set_stacktrace(new_report.get_stacktrace())
                    dummy_report.set_basic_info(new_report.get_create_ts(),
                                                new_report.get_product())
                    dummy_report.set_penalty_terms(new_report.get_penalty_terms())
                    dummy_report.set_dummy_bug_id(new_report.get_dummy_bug_id())
                    new_report = dummy_report
                    IRLog.get_instance().println('Description: %s' % description)
            elif cmd == 'ap':
                IRLog.get_instance().println('Appending Penalties')
                if new_report is None:
                    IRLog.get_instance().println(no_report_msg)
                else:
                    raw = []
                    while not raw:
                        answer = raw_input('Input Penalties (split by \',\'):')
                        # ''.split(',') yields [''], so blank terms have to
                        # be dropped for the re-prompt to ever trigger.
                        raw = [term for term in answer.split(',')
                               if term.strip()]
                    from ir_term_count import IRTermCount
                    penalty = new_report.get_penalty_terms()
                    if penalty is None:
                        penalty = []
                    penalty += IRTermCount.do_stemming(raw)
                    new_report.set_penalty_terms(penalty)
                    IRLog.get_instance().println('Penalties: %s' % \
                                                     (', '.join(penalty)))
            elif cmd == 'sd':
                IRLog.get_instance().println('Set Dummy Bug ID')
                if new_report is None:
                    IRLog.get_instance().println(no_report_msg)
                else:
                    bug_id = -1
                    while bug_id <= 0:
                        try:
                            bug_id = int(raw_input('Dummy Bug ID: '))
                        except ValueError:
                            # Not a number: ask again.
                            bug_id = -1
                    new_report.set_dummy_bug_id(bug_id)
                    IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
            elif cmd == 'help':
                cls.__show_help()
            else:
                IRLog.get_instance().println('Error! Unkown command: %s' \
                                                % cmd)
                cls.__show_help()
        # end of command loop
        IRLog.get_instance().println("Bye")
Beispiel #37
0
    def test_similarity_with(self):
        """Log the similarity of bugs 100000 and 100200 under several setups.

        Runs the 'tfidf' and 'bidf' algorithms first, then keeps the reports
        built for 'bidf' and switches the scoring strategy to 'weighted',
        'heuristic' and 'distweighted'. Only the first element of the
        similarity_with() result is logged; nothing is asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        def log_similarity(label, first, second):
            # All messages share one layout; only the leading label differs.
            IRLog.get_instance().println(
                '%s similarity between %d and %d is %f'
                % (label, 100000, 100200, first.similarity_with(second)[0]))

        config = IRConfig.get_instance()
        config.load('../data/test/bug_test.cfg')

        config.set('tfidf_algorithm', 'tfidf')
        report_a = IRReport(100000)
        report_b = IRReport(100200)
        log_similarity('TFIDF', report_a, report_b)

        config.set('tfidf_algorithm', 'bidf')
        report_a = IRReport(100000)
        report_b = IRReport(100200)
        log_similarity('Bidf', report_a, report_b)

        config.set('scoring_strategy', 'weighted')
        config.set('bug_summary_ratio', 0.25)
        config.set('bug_description_ratio', 0.25)
        config.set('bug_stacktrace_ratio', 0.5)
        log_similarity('Bidf (Weighted Scoring)', report_a, report_b)

        config.set('scoring_strategy', 'heuristic')
        config.set('bug_summary_ratio', 0.5)
        config.set('bug_description_ratio', 0.5)
        log_similarity('Bidf (Heuristic Scoring)', report_a, report_b)

        config.set('scoring_strategy', 'distweighted')
        config.set('bug_summary_ratio', 0.5)
        config.set('bug_description_ratio', 0.5)
        # This run used to be logged under the 'Heuristic Scoring' label,
        # which made it indistinguishable from the previous one.
        log_similarity('Bidf (Distweighted Scoring)', report_a, report_b)
    def do_test_over_file(self, filename):
        """Do test over the file.

        Every line of the file is parsed into a report, its similar and
        duplicate candidates are computed and compared with the duplicate
        group of the report's dummy bug id. Lines whose duplicate group is
        empty are not counted. A CSV header and one CSV row with the
        averaged precision/recall figures are logged at the end.

        Args:
            filename: str, the input file which was generated by
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool('remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_toto_recall = 0.0
        dup_num = 0
        test_num = 0

        # The with-block closes the file even when a test case raises.
        with open(filename, 'r') as infile:
            for line in infile:
                IRLog.get_instance().println('----test----')
                test_num += 1
                # NOTE(review): the original called line.strip() here and
                # discarded the result, so the raw line (newline included)
                # has always been parsed; kept that way - confirm whether
                # stripping was intended.
                new_report = IRReport.from_string(line)
                ori_report = IRReport(new_report.get_dummy_bug_id())
                #IRLog.get_instance().println('Summary')
                #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
                #                              new_report.get_summary_termcount())
                #IRLog.get_instance().println('Description')
                #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
                #                              new_report.get_description_termcount())
                # do test for single
                similarities, duplicates = new_report.similarities_and_duplicates()
                sim_ids = [sim[0] for sim in similarities]
                dup_ids = [dup[0] for dup in duplicates]
                IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
                IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
                # evaluate sim
                sim_hit, sim_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        new_report.get_dummy_bug_id(), sim_ids, remove_self_bug_id)
                # some group contain only one: such a case is not counted
                if len(real_duplicates) == 0:
                    test_num -= 1
                    continue

                precision, recall = self.__report_result(
                    new_report.get_dummy_bug_id(), sim_hit, sim_nothit, real_duplicates)

                sim_tot_precision += precision
                sim_tot_recall += recall
                sim_tot_size += len(sim_ids)
                sim_bi_tot_recall += 1 if recall > 0.0 else 0

                if len(dup_ids) > 0:
                    dup_num += 1
                    dup_hit, dup_nothit, real_duplicates = \
                            IRDuplicateGroup.is_in_same_duplicate_group(
                                    new_report.get_dummy_bug_id(), dup_ids, remove_self_bug_id)
                    precision, recall = self.__report_result(
                            new_report.get_dummy_bug_id(), dup_hit, dup_nothit, real_duplicates)
                    dup_tot_precision += precision
                    dup_tot_recall += recall
                    dup_bi_toto_recall += 1 if recall > 0.0 else 0
        # general conclusion
        if dup_num == 0:
            dup_num = 1.0
        # Averaging over zero counted cases used to raise ZeroDivisionError
        # (empty file, or every duplicate group empty). Divide by 1 then,
        # while still reporting the real number of cases.
        case_divisor = test_num if test_num > 0 else 1
        IRLog.get_instance().println(','.join(['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',\
                '#dup', 'dup pre', 'dup rec', 'dup birec']))
        IRLog.get_instance().println(','.join([str(test_num), \
                str(sim_tot_precision/case_divisor), str(sim_tot_recall/case_divisor), str(sim_bi_tot_recall/case_divisor), str(float(sim_tot_size)/case_divisor), \
                str(dup_num), \
                str(dup_tot_precision/dup_num), str(dup_tot_recall/dup_num), str(dup_bi_toto_recall/dup_num)]))
Beispiel #39
0
    def test_create_new_report_from_string(self):
        """Check that a report survives serialization to text and back.

        A report is filled with basic info, penalty terms, excluded report
        ids, a dummy bug id and skip terms, written out with to_string()
        and rebuilt with from_string(); the getters of both reports are
        then compared (the description after stripping whitespace).
        """
        from nose.tools import eq_
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_term_count import IRTermCount

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')

        source_report = IRReport(
            'Firefox crashed',
            'When I was openning history folder, the f**king'
            ' Firefox just crashed!\n')
        source_report.set_basic_info(12345, 'core')
        source_report.set_penalty_terms(
            IRTermCount.do_stemming(['ie', 'explore']))
        source_report.set_exclude_report_ids([100100])
        source_report.set_dummy_bug_id(12345)
        source_report.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))

        # write the report out and read it back in
        as_text = source_report.to_string()
        IRLog.get_instance().println('Serialized report: %s' % (as_text))
        loaded_report = IRReport.from_string(as_text)

        assert loaded_report.get_summary_text() == source_report.get_summary_text()
        eq_(loaded_report.get_description_text().strip(),
            source_report.get_description_text().strip())
        # plain equality on the remaining getters, original order kept
        for getter in ('get_create_ts', 'get_product', 'get_dummy_bug_id',
                       'get_penalty_terms', 'get_exclude_report_ids'):
            loaded_value = getattr(loaded_report, getter)()
            source_value = getattr(source_report, getter)()
            assert loaded_value == source_value
        eq_(loaded_report.get_skip_terms(), source_report.get_skip_terms())
        IRLog.get_instance().stop_log()