def test_cache_all_data(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRTFIDF.cache_all_data()
    IRLog.get_instance().stop_log()
def get_summary_and_description_tfidf_squared_length(self):
    from ir_tfidf import IRTFIDF
    if self.__summary_squared_length is None or \
            self.__description_squared_length is None:
        summary, description = self.get_summary_and_description_tfidf()
        self.__summary_squared_length = \
            IRTFIDF.get_squared_length(summary)
        self.__description_squared_length = \
            IRTFIDF.get_squared_length(description)
    return self.__summary_squared_length, self.__description_squared_length
def test_batch_generate_tfidf(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRTFIDF.batch_generate_tfidf()
    IRLog.get_instance().stop_log()
def server_cache(msg, res):
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRLog.get_instance().println('Server is caching data')
    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
def query(cls, summary, description, top_n):
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    summary_bow, description_bow = \
        IRTermCount.calculate_term_count(summary, description)
    summary_tfidf, description_tfidf = \
        IRTFIDF.calculate_tfidf_for_report_termcount(summary_bow,
                                                     description_bow)
    similarities = \
        IRTFIDF.get_top_n_similarity_over_all(summary_tfidf,
                                              description_tfidf, top_n)
    return similarities
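# Usage sketch (assumption): driving the same pipeline that query() wraps,
# using only the calls that appear in the method above. The config path
# mirrors the test files in this repo; the shape of the value returned by
# get_top_n_similarity_over_all() is not confirmed here, so the result is
# simply returned as-is.
def _example_query_pipeline():
    from ir_config import IRConfig
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary_bow, description_bow = IRTermCount.calculate_term_count(
        'firefox crashes on startup',
        'after upgrading, firefox segfaults when loading any page')
    summary_tfidf, description_tfidf = \
        IRTFIDF.calculate_tfidf_for_report_termcount(summary_bow,
                                                     description_bow)
    # rank the ten most similar existing reports
    return IRTFIDF.get_top_n_similarity_over_all(summary_tfidf,
                                                 description_tfidf, 10)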
def compare_and_print_tfidf(cls, title_a, report_a, title_b, report_b):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    summary_field_name = IRConfig.get_instance().get('bug_summary_name')
    description_field_name = IRConfig.get_instance().get(
        'bug_description_name')
    summary_a, description_a = report_a.get_summary_and_description_tfidf()
    summary_b, description_b = report_b.get_summary_and_description_tfidf()
    IRLog.get_instance().println('[TFIDF][Summary][%s][%s]' \
        % (title_a, title_b))
    IRTFIDF.show_dict_compare(summary_a, summary_b, summary_field_name)
    IRLog.get_instance().println('[TFIDF][Description][%s][%s]' \
        % (title_a, title_b))
    IRTFIDF.show_dict_compare(description_a, description_b,
                              description_field_name)
def similarity_over_all(self):
    """Calculate similarity between bug (summary, description) over all.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
                          stacktrace_score]}
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    logger = IRLog.get_instance()
    # only scan reports created within the last two years
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids}})
    result = {}
    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # debug output
    print self.__summary_text
    print self.__description_text
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # Because we don't want to load the stacktrace when
        # self.__stacktrace is None, we create and fill the info of the
        # other report manually.
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        # if self.__stacktrace is empty, we don't need to do this
        if self.get_stacktrace() is not None and \
                self.get_stacktrace().__len__() > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def test_calculate_tfidf_for_report_termcount_bidf(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRConfig.get_instance().set('tfidf_algorithm', 'bidf')
    summary = {'firefox': 5, 'chrome': 12}
    description = {'max': 10, 'min': 30, 'fix': 10}
    summary_tfidf, description_tfidf = \
        IRTFIDF.calculate_tfidf_for_report_termcount(summary, description)
    IRLog.get_instance().println('Summary')
    IRTFIDF.show_dict_compare(summary_tfidf, summary_tfidf)
    IRLog.get_instance().println('Description')
    IRTFIDF.show_dict_compare(description_tfidf, description_tfidf)
    IRLog.get_instance().stop_log()
def test_tfidf_asm_similarity(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    vec_a = {'firefox': 1, 'chrome': 1}
    vec_b = {'firefox': 1, 'chrome': 1, 'ie': 1}
    vec_c = {'firefox': 1, 'windows': 1, 'linux': 1}
    delta = 0.0001
    assert abs(1.0 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_b)) < delta
    assert abs(0.5 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_c)) < delta
    assert IRTFIDF.tfidf_asm_similarity(vec_a, vec_b) > \
        IRTFIDF.tfidf_asm_similarity(vec_a, vec_b, None, ['ie'], 100)
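# Reference sketch (assumption, inferred only from the asserts above, NOT
# the ir_tfidf implementation): the expected values are consistent with an
# asymmetric overlap score that measures how much of vec_a is covered by
# vec_b, normalized by vec_a's squared length, with penalty terms found in
# the other vector lowering the score. vec_a vs. its superset vec_b gives
# 1.0; vec_a vs. vec_c (one of two unit-weight terms shared) gives 0.5.
def asm_similarity_sketch(vec_a, vec_b, a_squared_length=None,
                          penalty_terms=None, penalty_weight=1.0):
    if a_squared_length is None:
        a_squared_length = sum(w * w for w in vec_a.itervalues())
    if a_squared_length == 0:
        return 0.0
    # weight of vec_a that is matched by terms also present in vec_b
    covered = sum(w * w for term, w in vec_a.iteritems() if term in vec_b)
    penalty = 0.0
    if penalty_terms is not None:
        # each penalty term present in the other vector costs penalty_weight
        penalty = penalty_weight * sum(
            1.0 for term in penalty_terms if term in vec_b)
    return max(0.0, covered / float(a_squared_length) - penalty)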
def test_get_squared_length(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary = {'firefox': 0.4, 'chrome': 0.6}
    assert abs(IRTFIDF.get_squared_length(summary) - 0.52) < 0.00001
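# Worked check (assumption about get_squared_length, consistent with the
# assert above): the "squared length" of a TF-IDF vector appears to be the
# sum of squared weights, i.e. 0.4**2 + 0.6**2 = 0.16 + 0.36 = 0.52.
def squared_length_sketch(tfidf_vector):
    # hypothetical one-line equivalent; not the ir_tfidf implementation
    return sum(weight ** 2 for weight in tfidf_vector.itervalues())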
def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms=None):
    """Get the term with the highest entropy-based score in diff.

    Args:
        diff: [(set, set)], generated by get_all_reports_difference

    Returns:
        str, the term
    """
    termcount = {}
    max_score = -1.0
    max_score_term = None
    # count the occurrence of each term
    total_score = 0.0
    for index, delta in enumerate(diff):
        total_score += sim_bug_ids[index][1][0]
        # only account for
        for term in delta[1]:
            if penalty_terms is not None and term in penalty_terms:
                continue
            if not term in termcount:
                termcount[term] = 0.0
            termcount[term] += sim_bug_ids[index][1][0]
    # calculate the value and pick the best term
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount
    from ir_tfidf import IRTFIDF
    description_name = IRConfig.get_instance().get('bug_description_name')
    # debug use
    scoreboard = []
    # /debug use
    from math import log
    for term in termcount:
        bg_score = termcount[term] / total_score
        ig_score = -2.0 * abs(float(termcount[term]) / total_score - 0.5) + 1
        idf = IRTFIDF.get_unit_idf(
            IRDocumentCount.get_documentcount(term, description_name))
        score = ig_score * idf
        scoreboard.append((term, score, ig_score, idf))
        if score > max_score:
            max_score = score
            max_score_term = term
    scoreboard.sort(cmp=lambda x, y: cmp(x[1], y[1]), reverse=True)
    from ir_log import IRLog
    IRLog.get_instance().println(
        'Candidate keywords: %s'
        % ','.join(['word', 'score', 'ig_score', 'idf']))
    IRLog.get_instance().println('\n'.join([
        ','.join([t[0], str(t[1]), str(t[2]), str(t[3])])
        for t in scoreboard[:10]]))
    return max_score_term
def test_show_dict_compare(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary_name = IRConfig.get_instance().get('bug_summary_name')
    description_name = IRConfig.get_instance().get('bug_description_name')
    summary_a, description_a = IRTFIDF.get_tfidf_of_bug(100000)
    summary_b, description_b = IRTFIDF.get_tfidf_of_bug(100200)
    IRLog.get_instance().println('Summary 100000 vs 100200')
    IRTFIDF.show_dict_compare(summary_a, summary_b, summary_name)
    IRLog.get_instance().println('Description 100000 vs 100200')
    IRTFIDF.show_dict_compare(description_a, description_b)
    IRLog.get_instance().println('Summary 100000 vs 100000')
    IRTFIDF.show_dict_compare(summary_a, summary_a)
    IRLog.get_instance().println('Description 100000 vs 100000')
    IRTFIDF.show_dict_compare(description_a, description_a, description_name)
def __update_summary_and_description_tfidf_from_termcount(self):
    from ir_tfidf import IRTFIDF
    summary_termcount, description_termcount = \
        self.get_summary_and_description_termcount()
    summary_tfidf, description_tfidf = \
        IRTFIDF.calculate_tfidf_for_report_termcount(summary_termcount,
                                                     description_termcount)
    if self.__summary_tfidf is None:
        self.__summary_tfidf = summary_tfidf
    if self.__description_tfidf is None:
        self.__description_tfidf = description_tfidf
def test_get_tfidf_of_bug(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary, description = IRTFIDF.get_tfidf_of_bug(100000)
    IRLog.get_instance().println('Summary tfidf: %s' % (str(summary)))
    IRLog.get_instance().println('Description tfidf: %s' % (str(description)))
    IRLog.get_instance().stop_log()
def get_tfidf(self):
    if self.__tfidf is None:
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_tfidf import IRTFIDF
        description_name = IRConfig.get_instance().get('bug_description_name')
        tfidf_collection = IRMongodbHelper.get_instance().get_collection(
            'bug_db_name', 'bug_tfidf_collection_name', False)
        bug_count = tfidf_collection.count()
        self.__tfidf = \
            IRTFIDF.calculate_tfidf(self.get_termcount(), description_name,
                                    bug_count, None, 'tfidf')
    return self.__tfidf
def get_summary_and_description_tfidf(self):
    if self.__bug_id is None:
        if self.__summary_tfidf is None or \
                self.__description_tfidf is None:
            self.__update_summary_and_description_tfidf_from_termcount()
        return [self.__summary_tfidf, self.__description_tfidf]
    else:
        if self.__allow_cache and \
                self.__summary_tfidf is not None and \
                self.__description_tfidf is not None:
            return [self.__summary_tfidf, self.__description_tfidf]
        from ir_tfidf import IRTFIDF
        summary_tfidf, description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(self.__bug_id)
        if self.__allow_cache:
            self.__summary_tfidf, self.__description_tfidf = \
                summary_tfidf, description_tfidf
        return [summary_tfidf, description_tfidf]
def test_calculate_tfidf_for_report_termcount_tfidf(self):
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary = {'firefox': 5, 'chrome': 12}
    description = {'max': 10, 'min': 30, 'fix': 10}
    summary_tfidf, description_tfidf = \
        IRTFIDF.calculate_tfidf_for_report_termcount(summary, description)
    summary_sum = 0.0
    for term, tfidf in summary_tfidf.items():
        summary_sum += tfidf ** 2
    description_sum = 0.0
    for term, tfidf in description_tfidf.items():
        description_sum += tfidf ** 2
    # print summary_sum, description_sum
    assert (summary_sum - 1.0) ** 2 < 0.00001
    assert (description_sum - 1.0) ** 2 < 0.00001
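# The asserts above only check that each returned vector has unit L2 norm.
# A minimal sketch of that normalization step, written as an assumption
# about what calculate_tfidf_for_report_termcount does after weighting; it
# is not the actual ir_tfidf code.
def l2_normalize_sketch(weights):
    import math
    norm = math.sqrt(sum(w * w for w in weights.itervalues()))
    if norm == 0:
        return dict(weights)
    return dict((term, w / norm) for term, w in weights.iteritems())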
def do_test_over_file(self, filename):
    """Do the test over the file.

    Args:
        filename: str, the input file generated by
            generate_incomplete_test_file.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    from ir_document_count import IRDocumentCount
    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    remove_self_bug_id = IRConfig.get_instance().get_bool(
        'remove_self_bug_id', True)
    sim_tot_precision = 0.0
    sim_tot_recall = 0.0
    sim_bi_tot_recall = 0.0
    sim_tot_size = 0
    dup_tot_precision = 0.0
    dup_tot_recall = 0.0
    dup_bi_tot_recall = 0.0
    dup_num = 0
    test_num = 0
    infile = open(filename, 'r')
    for line in infile:
        IRLog.get_instance().println('----test----')
        test_num += 1
        line = line.strip()
        new_report = IRReport.from_string(line)
        ori_report = IRReport(new_report.get_dummy_bug_id())
        #IRLog.get_instance().println('Summary')
        #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
        #                              new_report.get_summary_termcount())
        #IRLog.get_instance().println('Description')
        #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
        #                              new_report.get_description_termcount())
        # do test for a single report
        similarities, duplicates = new_report.similarities_and_duplicates()
        sim_ids = [sim[0] for sim in similarities]
        dup_ids = [dup[0] for dup in duplicates]
        IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
        IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
        # evaluate sim
        sim_hit, sim_nothit, real_duplicates = \
            IRDuplicateGroup.is_in_same_duplicate_group(
                new_report.get_dummy_bug_id(), sim_ids, remove_self_bug_id)
        # some groups contain only one report
        if real_duplicates.__len__() == 0:
            test_num -= 1
            continue
        precision, recall = self.__report_result(
            new_report.get_dummy_bug_id(), sim_hit, sim_nothit,
            real_duplicates)
        sim_tot_precision += precision
        sim_tot_recall += recall
        sim_tot_size += sim_ids.__len__()
        sim_bi_tot_recall += 1 if recall > 0.0 else 0
        if dup_ids.__len__() > 0:
            dup_num += 1
            dup_hit, dup_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), dup_ids,
                    remove_self_bug_id)
            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), dup_hit, dup_nothit,
                real_duplicates)
            dup_tot_precision += precision
            dup_tot_recall += recall
            dup_bi_tot_recall += 1 if recall > 0.0 else 0
    # general conclusion
    if dup_num == 0:
        dup_num = 1.0
    IRLog.get_instance().println(','.join(
        ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
         '#dup', 'dup pre', 'dup rec', 'dup birec']))
    IRLog.get_instance().println(','.join([
        str(test_num),
        str(sim_tot_precision / test_num), str(sim_tot_recall / test_num),
        str(sim_bi_tot_recall / test_num),
        str(float(sim_tot_size) / test_num),
        str(dup_num),
        str(dup_tot_precision / dup_num), str(dup_tot_recall / dup_num),
        str(dup_bi_tot_recall / dup_num)]))
    infile.close()
def similarity_with(self, other_report):
    """Compute the similarity between this report and other_report.

    Returns:
        [float, float, float, float], [total score, summary,
            description, stacktrace]
    """
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    from ir_gnome_st_tools import IRSTTools
    summary_ratio = IRConfig.get_instance().get_float('bug_summary_ratio')
    description_ratio = IRConfig.get_instance().get_float(
        'bug_description_ratio')
    stacktrace_ratio = IRConfig.get_instance().get_float(
        'bug_stacktrace_ratio')
    summary_tfidf_a, description_tfidf_a = \
        self.get_summary_and_description_tfidf()
    summary_tfidf_b, description_tfidf_b = \
        other_report.get_summary_and_description_tfidf()
    tfidf_algorithm = IRConfig.get_instance().get('tfidf_algorithm')
    stacktrace_algorithm = IRConfig.get_instance().get(
        'stacktrace_algorithm')
    if tfidf_algorithm == 'tfidf':
        summary_similarity = IRTFIDF.tfidf_similarity(
            summary_tfidf_a, summary_tfidf_b)
        description_similarity = IRTFIDF.tfidf_similarity(
            description_tfidf_a, description_tfidf_b)
    elif tfidf_algorithm == 'bidf':
        summary_squared_length, description_squared_length = \
            self.get_summary_and_description_tfidf_squared_length()
        summary_similarity = IRTFIDF.tfidf_asm_similarity(
            summary_tfidf_a, summary_tfidf_b, summary_squared_length)
        description_similarity = IRTFIDF.tfidf_asm_similarity(
            description_tfidf_a, description_tfidf_b,
            description_squared_length, self.__penalty_terms)
    else:
        assert False, 'invalid tfidf algorithm'
    if self.__stacktrace is None or \
            self.__stacktrace.__len__() == 0 or \
            self.__stacktrace[0].__len__() == 0:
        stacktrace_similarity = 1.0
    else:
        stacktrace_similarity = IRSTTools.compare_stackinfo(
            self.get_stacktrace(), other_report.get_stacktrace(),
            stacktrace_algorithm)
    scoring_strategy = IRConfig.get_instance().get('scoring_strategy',
                                                   'heuristic')
    if scoring_strategy == 'weighted':
        score = self.__weighted_scoring(summary_similarity,
                                        description_similarity,
                                        stacktrace_similarity)
    elif scoring_strategy == 'heuristic':
        score = self.__heuristic_scoring(summary_similarity,
                                         description_similarity,
                                         stacktrace_similarity)
    elif scoring_strategy == 'distweighted':
        score = self.__distweighted_scoring(summary_similarity,
                                            description_similarity,
                                            stacktrace_similarity)
    else:
        assert False, 'invalid scoring strategy'
    return [score, summary_similarity, description_similarity,
            stacktrace_similarity]
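# Usage sketch (assumption): scoring one incoming report against a stored
# bug with similarity_with(). The constructor forms and cache calls mirror
# what other code in this repo already does (IRReport(summary, description),
# IRReport(bug_id), IRTFIDF.cache_all_data()); bug id 100000 is simply the
# id the tests use, not a meaningful value.
def _example_similarity_with():
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRTFIDF.cache_all_data()
    new_report = IRReport('firefox crashes on startup',
                          'after upgrading, firefox segfaults')
    existing_report = IRReport(100000)
    score, summary_sim, description_sim, stacktrace_sim = \
        new_report.similarity_with(existing_report)
    print score, summary_sim, description_sim, stacktrace_sim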
def start_shell(cls):
    """Start a shell that does recommending interactively."""
    from ir_log import IRLog
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    from ir_report import IRReport
    IRLog.get_instance().println("Starting Intereport...")
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRLog.get_instance().println("Intereport Started. Waiting for input")
    new_report = None
    while 1:
        cmd = raw_input("Input command:").strip()
        if cmd == 'exit':
            IRLog.get_instance().println('Exiting')
            break
        elif cmd == 'new':
            IRLog.get_instance().println('Creating New Report')
            import time
            cur_time = -1
            while cur_time < 0:
                try:
                    cur_time = int(time.mktime(time.strptime(
                        raw_input("Input Time (e.g., 2011-05-05): "),
                        '%Y-%m-%d')))
                except:
                    cur_time = -1
            product = raw_input("Input Product: ")
            summary = raw_input("Summary: ")
            raw_description = raw_input("Description:\n")
            new_report = IRReport.from_string(IRReport.separator.join([
                str(cur_time), product.lower(), summary, raw_description,
                '', '']))
            cls.__print_report(new_report)
        elif cmd == 'do':
            IRLog.get_instance().println('Do Recommending')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.do_recommend(new_report)
        elif cmd == 'ls':
            IRLog.get_instance().println('Show Current Report')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.__print_report(new_report)
        elif cmd == 'ad':
            IRLog.get_instance().println('Appending Description')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                append_description = raw_input("Append Description:\n")
                description = ' '.join([new_report.get_description_text(),
                                        append_description])
                dummy_report = IRReport(new_report.get_summary_text(),
                                        description)
                dummy_report.set_stacktrace(new_report.get_stacktrace())
                dummy_report.set_basic_info(new_report.get_create_ts(),
                                            new_report.get_product())
                dummy_report.set_penalty_terms(new_report.get_penalty_terms())
                dummy_report.set_dummy_bug_id(new_report.get_dummy_bug_id())
                new_report = dummy_report
                IRLog.get_instance().println('Description: %s' % description)
        elif cmd == 'ap':
            IRLog.get_instance().println('Appending Penalties')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                raw = []
                while raw.__len__() < 1:
                    raw = raw_input(
                        'Input Penalties (split by \',\'):').split(',')
                from ir_term_count import IRTermCount
                penalty = new_report.get_penalty_terms()
                if penalty is None:
                    penalty = []
                penalty += IRTermCount.do_stemming(raw)
                new_report.set_penalty_terms(penalty)
                print len(penalty), penalty
                IRLog.get_instance().println('Penalties: %s' %
                                             (', '.join(penalty)))
        elif cmd == 'sd':
            IRLog.get_instance().println('Set Dummy Bug ID')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                bug_id = -1
                while bug_id <= 0:
                    try:
                        bug_id = int(raw_input('Dummy Bug ID: '))
                    except:
                        bug_id = -1
                new_report.set_dummy_bug_id(bug_id)
                IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
        elif cmd == 'help':
            cls.__show_help()
        else:
            IRLog.get_instance().println('Error! Unknown command: %s' % cmd)
            cls.__show_help()
    # end of while 1
    IRLog.get_instance().println("Bye")
if mode == 'file':
    test_file = sys.argv[3]
    bug_id = int(sys.argv[4])
    from ir_sim_bug_evaluator import IRSimBugEvaluator
    new_report = IRSimBugEvaluator.get_report_from_test_file(test_file,
                                                             bug_id)
    if new_report is None:
        IRLog.get_instance().println('Error! Cannot find report %d in %s' %
                                     (bug_id, test_file))
    else:
        if sys.argv.__len__() > 5:
            from ir_term_count import IRTermCount
            # the optional fifth argument is a comma-separated penalty list
            penalty_terms_raw = sys.argv[5].split(',')
            penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
            IRLog.get_instance().println('%d penalty terms: %s:' %
                (penalty_terms.__len__(), ','.join(penalty_terms)))
            new_report.set_penalty_terms(penalty_terms)
elif mode == 'text':
    text = sys.argv[3]
    new_report = IRReport.from_string(text)
elif mode == 'inte':
    IRRecommender.start_shell()
    exit()
else:
    IRLog.get_instance().println('Error! Unknown mode %s' % mode)
from ir_tfidf import IRTFIDF
from ir_document_count import IRDocumentCount
IRTFIDF.cache_all_data()
IRDocumentCount.cache_all_data()
IRRecommender.do_recommend(new_report)
IRLog.get_instance().stop_log()