def test_cluster_sentences(test):
    """Cluster the sentences of a sample description into 3 groups and log them.

    For every group the representative sentence selected by
    IRSentence.cluster_sentences is logged first, followed by each member.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_sentence import IRSentence

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    cluster_count = 3
    sample_bug_id = 10000
    sample_description = (
        'Version: 12.43\n'
        'Distribution: Gnome 12.03\n'
        '\n'
        'Steps to repreduce:\n'
        '1. Open firefox.\n'
        '2. Click Option\n'
        '3. Open firefox\n'
        '\n'
        'Additional information:\n'
        'This is really crazy when it crashed.')
    sentences = IRSentence.get_sentence_from_description(
        sample_description, sample_bug_id)
    group_id, selected_id = IRSentence.cluster_sentences(
        sentences, cluster_count)

    # members[g] holds the positions (within `sentences`) assigned to group g
    members = [[] for _ in range(cluster_count)]
    for position, group in enumerate(group_id):
        members[group].append(position)

    for group, positions in enumerate(members):
        representative = sentences[selected_id[group]]
        log.println('Group %d. Representative: %s' %
                    (group, representative.get_text()))
        for position in positions:
            log.println(sentences[position].get_text())
def batch_generate_term_count(cls):
    """Compute the term count of every stored text and persist it.

    Every document yielded by IRText.get_iterator({}) is converted into a
    summary and a description bag of words with cls.calculate_term_count and
    inserted into the term count collection, which is finally indexed by
    bug id and closed.
    """
    from ir_log import IRProgressBar
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # Field names, with fallbacks when the config does not define them
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name', 'bug_id')
    summary_key = config.get('bug_summary_name', 'summ')
    description_key = config.get('bug_description_name', 'desc')

    target = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'w')

    def store_term_count(bug):
        # One output document per input bug
        summary_bow, description_bow = cls.calculate_term_count(
            bug[summary_key], bug[description_key])
        target.insert({
            id_key: bug[id_key],
            summary_key: summary_bow,
            description_key: description_bow,
        })

    IRProgressBar.execute_iteration_for_cursor(
        IRText.get_iterator({}), store_term_count, "From Text to Term Count")
    target.create_index([(id_key, IRCollection.ASCENDING)])
    target.close()
def get_summary_and_description_of_bug(cls, bug_id):
    """Get summary and description of a bug, from the cache or from mongodb.

    Args:
        bug_id: int

    Returns:
        (str, str), (summary, description). Both are empty strings when no
        document matches bug_id.
    """
    # Serve cache hits first, before importing anything or touching the
    # database (same order as the other cached getters).
    if cls.__is_cache and bug_id in cls.__cache_summary_description:
        return cls.__cache_summary_description[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    text_collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    res = text_collection.find({bug_id_name: bug_id})

    summary = ''
    description = ''
    if res.count() > 0:
        # Fetch the first match once instead of once per field
        document = res[0]
        summary = document[summary_name]
        description = document[description_name]

    if cls.__is_cache:
        # Misses are cached too, as a pair of empty strings
        cls.__cache_summary_description[bug_id] = (summary, description)
    return summary, description
def test_create_new_report_from_string(self):
    """Round-trip a report through to_string()/from_string() and compare it."""
    from nose.tools import eq_
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_term_count import IRTermCount

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    # Build a report with every serializable attribute populated
    original = IRReport(
        'Firefox crashed',
        'When I was openning history folder, the f**king'
        ' Firefox just crashed!\n')
    original.set_basic_info(12345, 'core')
    original.set_penalty_terms(IRTermCount.do_stemming(['ie', 'explore']))
    original.set_exclude_report_ids([100100])
    original.set_dummy_bug_id(12345)
    original.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))

    # Serialize, then rebuild from the serialized text
    serialized = original.to_string()
    log.println('Serialized report: %s' % (serialized))
    restored = IRReport.from_string(serialized)

    assert restored.get_summary_text() == original.get_summary_text()
    # Descriptions are compared ignoring leading/trailing whitespace
    eq_(restored.get_description_text().strip(),
        original.get_description_text().strip())
    assert restored.get_create_ts() == original.get_create_ts()
    assert restored.get_product() == original.get_product()
    assert restored.get_dummy_bug_id() == original.get_dummy_bug_id()
    assert restored.get_penalty_terms() == original.get_penalty_terms()
    assert restored.get_exclude_report_ids() == \
        original.get_exclude_report_ids()
    eq_(restored.get_skip_terms(), original.get_skip_terms())
    log.stop_log()
def __assert_collection_change(self, db_cfg_name, collection_cfg_name,
                               is_finished):
    """Record in the meta collection that a collection is/was being modified.

    The meta entry of the collection gets the current time as its
    last-modified value and is_finished as its success flag. An existing
    entry is updated; otherwise a new entry is inserted.

    Args:
        db_cfg_name: str, Config name of database in config file
        collection_cfg_name: str, Config name of collection in config file
        is_finished: boolean, True when the modification has finished,
            False when it is only the intention to modify.
    """
    import time
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    db_name = config.get(db_cfg_name)
    collection_name = config.get(collection_cfg_name)
    meta_collection = self.__get_meta_collection(db_name)
    existing = self.__find_collection_in_meta(db_name, collection_name)

    has_entry = existing.count() > 0
    modified_at = int(time.time())
    if not has_entry:
        meta_collection.insert({
            self.__meta_key_name: collection_name,
            self.__meta_lastmodified_name: modified_at,
            self.__meta_success_name: is_finished
        })
        return
    meta_collection.update(
        {self.__meta_key_name: collection_name},
        {'$set': {
            self.__meta_lastmodified_name: modified_at,
            self.__meta_success_name: is_finished
        }})
def similarity_over_all(self):
    """Calculate similarity between this report and every candidate report.

    Candidates are the reports of the basic collection that have the same
    product, a creation timestamp greater than this report's timestamp minus
    the search time span (2 * 365 days, assuming timestamps are in seconds),
    and an id outside the exclude list. The report whose id equals the dummy
    bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
                          stacktrace_score]}, as returned by similarity_with
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    config = IRConfig.get_instance()
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids}
    })
    result = {}
    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # Parenthesized single-argument form: same output on Python 2, and valid
    # syntax on Python 3.
    print(self.__summary_text)
    print(self.__description_text)
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of
        # self.__stacktrace being none, we create and fill the info of
        # report manually
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        # if our own stacktrace is empty, the other one is never compared,
        # so it is not loaded
        own_stacktrace = self.get_stacktrace()
        if own_stacktrace is not None and len(own_stacktrace) > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def test_parse_info_level1(self):
    """Parse the level-1 info test file and check the stored text collection."""
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert IRConfig.get_instance() is not None
    IRText.parse_info_level1('../data/test/info_level1_test')
    IRLog.get_instance().stop_log()

    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    # Identity test against None: does not depend on how the database and
    # collection objects implement comparison or truthiness.
    assert db is not None
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert col is not None
    # in the test data, we have 1000 in total.
    # within, 40 have no resolution, 154 are incomplete
    # NOTE(review): 1000 - 40 - 154 is 806, not 833 -- presumably the two
    # categories overlap; confirm against the test data.
    assert 833 == col.count()
    assert 'gnome is full of bugs ! (100000 currently)' == \
        col.find({'bug_id': 100000})[0]["summ"]
    # No stored summary may match this pattern
    res = col.find({"summ": {'$regex': '(>)|(<)|(")|(&apo)s|(&)'}})
    assert res.count() == 0
def cache_all_data(cls):
    """Load the document count of every term into the in-memory cache.

    Enables caching on the class and stores, for each document of the
    document count collection, a (summary count, description count) pair
    keyed by the term. A count missing from a document is stored as 0.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    term_key = config.get('bug_term_name')

    cls.__is_cache = True
    source = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')

    def remember(entry):
        summary_count = entry.get(summary_key, 0)
        description_count = entry.get(description_key, 0)
        cls.__cache_document_count[entry[term_key]] = \
            (summary_count, description_count)

    IRProgressBar.execute_iteration_for_cursor(
        source.find({}), remember, "Caching Document Count")
def test_compare_stackinfo(self):
    """Compare the first stored stacktrace with every stored stacktrace.

    The 'weight' and 'max' results of IRSTTools.compare_stackinfo are logged
    for each pair; nothing is asserted about their values.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    from ir_gnome_st_tools import IRSTTools
    from ir_text import IRText

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRText.parse_info_level1('../data/test/stacktrace_test')
    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    assert db is not None
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert col is not None

    # Read the cursor once: indexing a cursor by position runs a separate
    # query for every index.
    stacktraces = [bug["stacktrace"] for bug in col.find()]
    st1 = stacktraces[0]  # IndexError on an empty collection, as before
    for st2 in stacktraces:
        result_weight = IRSTTools.compare_stackinfo(st1, st2, 'weight')
        result_max = IRSTTools.compare_stackinfo(st1, st2, 'max')
        IRLog.get_instance().println('Weight: %f, Max: %f'
                                     % (result_weight, result_max))
    IRLog.get_instance().stop_log()
def cache_all_data(cls):
    """Load summary, description and stacktrace of every bug into memory.

    Enables caching, resets both caches and fills them from every document
    of the text collection, which is closed afterwards.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    stacktrace_key = config.get('bug_stacktrace_name')

    cls.set_is_cache(True)
    source = IRCollection('bug_db_name', 'bug_text_collection_name', 'r')
    # Start from empty caches so stale entries do not survive a reload
    cls.__cache_summary_description = {}
    cls.__cache_stacktrace = {}

    def remember(bug):
        cls.__cache_summary_description[bug[id_key]] = \
            (bug[summary_key], bug[description_key])
        cls.__cache_stacktrace[bug[id_key]] = bug[stacktrace_key]

    IRProgressBar.execute_iteration_for_cursor(
        source.find(), remember, 'Caching Text Data')
    source.close()
def test_parse_info_level1(self):
    """Parse the level-1 info test file and check the stored text collection."""
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert IRConfig.get_instance() is not None
    IRText.parse_info_level1('../data/test/info_level1_test')
    IRLog.get_instance().stop_log()

    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    # Identity test against None: does not depend on how the database and
    # collection objects implement comparison or truthiness.
    assert db is not None
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert col is not None
    # in the test data, we have 1000 in total.
    # within, 40 have no resolution, 154 are incomplete
    # NOTE(review): 1000 - 40 - 154 is 806, not 833 -- presumably the two
    # categories overlap; confirm against the test data.
    assert 833 == col.count()
    assert 'gnome is full of bugs ! (100000 currently)' == \
        col.find({'bug_id': 100000})[0]["summ"]
    # No stored summary may match this pattern
    res = col.find({"summ": {'$regex': '(>)|(<)|(")|(&apo)s|(&)'}})
    assert res.count() == 0
def test_cluster_sentences(test):
    """Cluster the sentences of a sample description into 3 groups and log them.

    For every group the representative sentence selected by
    IRSentence.cluster_sentences is logged first, followed by each member.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_sentence import IRSentence

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    cluster_count = 3
    sample_bug_id = 10000
    sample_description = (
        'Version: 12.43\n'
        'Distribution: Gnome 12.03\n'
        '\n'
        'Steps to repreduce:\n'
        '1. Open firefox.\n'
        '2. Click Option\n'
        '3. Open firefox\n'
        '\n'
        'Additional information:\n'
        'This is really crazy when it crashed.')
    sentences = IRSentence.get_sentence_from_description(
        sample_description, sample_bug_id)
    group_id, selected_id = IRSentence.cluster_sentences(
        sentences, cluster_count)

    # members[g] holds the positions (within `sentences`) assigned to group g
    members = [[] for _ in range(cluster_count)]
    for position, group in enumerate(group_id):
        members[group].append(position)

    for group, positions in enumerate(members):
        representative = sentences[selected_id[group]]
        log.println('Group %d. Representative: %s' %
                    (group, representative.get_text()))
        for position in positions:
            log.println(sentences[position].get_text())
def get_termcount_of_bug(cls, bug_id):
    """Get the term counts of a bug, from the cache or from mongodb.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (termcount of summary, termcount of description).
        Both are empty dicts when no document matches bug_id.
    """
    # Serve cache hits first, before importing anything or touching the
    # database (same order as the other cached getters).
    if cls.__is_cache and bug_id in cls.__cache_term_count:
        return cls.__cache_term_count[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    termcount_collection = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'r')
    res = termcount_collection.find({bug_id_name: bug_id})

    summary = {}
    description = {}
    if res.count() > 0:
        # Fetch the first match once instead of once per field
        document = res[0]
        summary = document[summary_name]
        description = document[description_name]

    if cls.__is_cache:
        # Misses are cached too, as a pair of empty dicts
        cls.__cache_term_count[bug_id] = (summary, description)
    return summary, description
def test_get_stacktrace_text_of_bug(self):
    """Fetch the stacktrace text of bug 104400 and log it."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    sample_bug_id = 104400
    text = IRText.get_stacktrace_text_of_bug(sample_bug_id)
    log = IRLog.get_instance()
    log.println('stacktrace_text: %s' % (text))
def test_create_incomplete_report(self):
    """Derive an incomplete report and log its bag of words next to the original's."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_term_count import IRTermCount

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary = 'This is a test of calculation for single report term count.'
    description = 'This is the description of the test report. Just a test.'

    # Bag of words of the full report and of its incomplete counterpart
    summary_bow, description_bow = \
        IRTermCount.calculate_term_count(summary, description)
    inc_summary, inc_description = \
        IRTermCount.create_incomplete_report(summary, description, 0.4)
    inc_summary_bow, inc_description_bow = \
        IRTermCount.calculate_term_count(inc_summary, inc_description)

    log = IRLog.get_instance()
    log.println('Original Summary: %s' % (summary))
    log.println('Original Description: %s' % (description))
    log.println('Incomplete Summary: %s' % (inc_summary))
    log.println('Incomplete Description: %s' % (inc_description))
    log.println('Compare original BoW with incomplete BoW')

    column_header = '%16s\t%8s\t%8s'
    log.println(column_header % ('Summary', 'Ori', 'Inc'))
    IRTermCount.show_dict_compare(summary_bow, inc_summary_bow)
    log.println(column_header % ('Description', 'Ori', 'Inc'))
    IRTermCount.show_dict_compare(description_bow, inc_description_bow)
def get_tfidf_of_bug(cls, bug_id):
    """Get the TFIDF vectors of a bug, from the cache or from the database.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (TFIDF of summary, TFIDF of description). Both are
        empty dicts when no document matches bug_id.
    """
    if cls.__is_cache and bug_id in cls.__cache:
        return cls.__cache[bug_id]

    # Cache miss (or caching disabled): load from db
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    source = IRCollection('bug_db_name', 'bug_tfidf_collection_name', 'r')
    matches = source.find({id_key: bug_id})

    if matches.count() > 0:
        tfidf = (matches[0][summary_key], matches[0][description_key])
    else:
        tfidf = ({}, {})

    if cls.__is_cache:
        cls.__cache[bug_id] = tfidf
    return tfidf
def get_summary_and_description_of_bug(cls, bug_id):
    """Get summary and description of a bug, from the cache or from mongodb.

    Args:
        bug_id: int

    Returns:
        (str, str), (summary, description). Both are empty strings when no
        document matches bug_id.
    """
    # Serve cache hits first, before importing anything or touching the
    # database (same order as the other cached getters).
    if cls.__is_cache and bug_id in cls.__cache_summary_description:
        return cls.__cache_summary_description[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')
    text_collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    res = text_collection.find({bug_id_name: bug_id})

    summary = ''
    description = ''
    if res.count() > 0:
        # Fetch the first match once instead of once per field
        document = res[0]
        summary = document[summary_name]
        description = document[description_name]

    if cls.__is_cache:
        # Misses are cached too, as a pair of empty strings
        cls.__cache_summary_description[bug_id] = (summary, description)
    return summary, description
def test_get_report_difference(self):
    """Check the term difference between a new report and a similar report."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_recommender import IRRecommender

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    new_report = IRReport('apple for summary', 'linux description')
    sim_report = IRReport('apple of ghost crashed', 'description linux wow')
    diff_sum, diff_desc = IRRecommender.get_report_difference(
        new_report, sim_report)

    log = IRLog.get_instance()
    log.println('New summary: %s' % (new_report.get_summary_text()))
    log.println('Sim summary: %s' % (sim_report.get_summary_text()))
    log.println('New description: %s' % (new_report.get_description_text()))
    log.println('Sim description: %s' % (sim_report.get_description_text()))
    log.println('Diff of summary: %s' % (diff_sum))
    log.println('Diff of description: %s' % (diff_desc))

    # Expected differences, as sets of terms
    expected_summary_diff = {'ghost', 'crash'}
    expected_description_diff = {'wow'}
    assert diff_sum == expected_summary_diff
    assert diff_desc == expected_description_diff
def test_filter(self):
    """Run IRSTTools.filter over the description of every stored bug.

    Only checks that filter returns a pair for each description; the
    returned values themselves are not inspected.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    from ir_gnome_st_tools import IRSTTools
    from ir_text import IRText

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRText.parse_info_level1('../data/test/info_level1_test')
    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    assert db is not None
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert col is not None
    # Maybe a bug here:
    # The test of filter (originally) depends on parse_info_level1
    # But parse_info_level1 seems to invoke filter...
    for bug in col.find():
        # TODO: it's not correct. no stacktrace in desc
        # Unpacking keeps the check that filter returns exactly two values
        _desc, _stack = IRSTTools.filter(bug["desc"])
    IRLog.get_instance().stop_log()
def get_stacktrace_of_bug(cls, bug_id):
    """Get the stacktrace of a bug, from the cache or from mongodb.

    Args:
        bug_id: int

    Returns:
        [[str]], [[signature]]; an empty list when no document matches
        bug_id.
    """
    if cls.__is_cache and bug_id in cls.__cache_stacktrace:
        return cls.__cache_stacktrace[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    stacktrace_key = config.get('bug_stacktrace_name')
    source = IRCollection('bug_db_name', 'bug_text_collection_name', 'r')
    matches = source.find({id_key: bug_id})

    found = matches.count() > 0
    stacktrace = matches[0][stacktrace_key] if found else []

    if cls.__is_cache:
        cls.__cache_stacktrace[bug_id] = stacktrace
    return stacktrace
def calculate_tfidf_for_report_termcount(cls, summary_termcount,
                                         description_termcount):
    """Calculate the TFIDF vectors of a single report.

    Both vectors are computed with cls.calculate_tfidf against the document
    count collection, using cls.get_total_report_number() as the document
    total.

    Args:
        summary_termcount: dict, {term -> termcount}
        description_termcount: dict, {term -> termcount}

    Returns:
        (dict, dict), (tfidf of summary, tfidf of description)
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    document_counts = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')
    total_reports = cls.get_total_report_number()

    def tfidf_for(termcount, field_cfg_name):
        # The field name is resolved from the config at call time
        return cls.calculate_tfidf(
            termcount, IRConfig.get_instance().get(field_cfg_name),
            total_reports, document_counts)

    summary_tfidf = tfidf_for(summary_termcount, 'bug_summary_name')
    description_tfidf = tfidf_for(description_termcount,
                                  'bug_description_name')
    return summary_tfidf, description_tfidf
def test_create_new_report_from_string(self):
    """Round-trip a report through to_string()/from_string() and compare it."""
    from nose.tools import eq_
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_term_count import IRTermCount

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    # Build a report with every serializable attribute populated
    original = IRReport(
        'Firefox crashed',
        'When I was openning history folder, the f**king'
        ' Firefox just crashed!\n')
    original.set_basic_info(12345, 'core')
    original.set_penalty_terms(IRTermCount.do_stemming(['ie', 'explore']))
    original.set_exclude_report_ids([100100])
    original.set_dummy_bug_id(12345)
    original.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))

    # Serialize, then rebuild from the serialized text
    serialized = original.to_string()
    log.println('Serialized report: %s' % (serialized))
    restored = IRReport.from_string(serialized)

    assert restored.get_summary_text() == original.get_summary_text()
    # Descriptions are compared ignoring leading/trailing whitespace
    eq_(restored.get_description_text().strip(),
        original.get_description_text().strip())
    assert restored.get_create_ts() == original.get_create_ts()
    assert restored.get_product() == original.get_product()
    assert restored.get_dummy_bug_id() == original.get_dummy_bug_id()
    assert restored.get_penalty_terms() == original.get_penalty_terms()
    assert restored.get_exclude_report_ids() == \
        original.get_exclude_report_ids()
    eq_(restored.get_skip_terms(), original.get_skip_terms())
    log.stop_log()
def test_get_collection_status(self):
    """Check that the collection status turns successful after update_meta."""
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    db_cfg = 'bug_db_name'
    collection_cfg = 'bug_mongodb_helper_collection_name'

    dbhelper = IRMongodbHelper.get_instance()
    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    # The handle itself is not used, only kept alive for the duration of the
    # test. Presumably requesting it with True marks the collection as being
    # modified -- TODO confirm against IRMongodbHelper.get_collection.
    _collection = dbhelper.get_collection(db_cfg, collection_cfg, True)
    _ts, success = dbhelper.get_collection_status(db_cfg, collection_cfg)
    assert success == False

    db_name = config.get(db_cfg)
    collection_name = config.get(collection_cfg)
    dbhelper.update_meta(db_name, collection_name, True)
    _ts, success = dbhelper.get_collection_status(db_cfg, collection_cfg)
    assert success == True
def batch_generate_tfidf(cls):
    """Compute the TFIDF of every bug in the term count data and store it.

    Document counts are cached in memory first. Each bug's summary and
    description term counts are converted with cls.calculate_tfidf and
    inserted into the tfidf collection, which is finally indexed by bug id
    and closed.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    from ir_term_count import IRTermCount

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    algorithm = config.get('tfidf_algorithm')

    # Prepare the document count cache and the output collection
    IRDocumentCount.cache_all_data()
    output = IRCollection('bug_db_name', 'bug_tfidf_collection_name', 'w')

    term_counts = IRTermCount.get_iterator()
    total_bugs = term_counts.count()

    def field_tfidf(bug, field_key):
        # No collection is passed: document counts come from the cache
        return cls.calculate_tfidf(bug[field_key], field_key, total_bugs,
                                   None, algorithm)

    def store_tfidf(bug):
        summary_tfidf = field_tfidf(bug, summary_key)
        description_tfidf = field_tfidf(bug, description_key)
        output.insert({
            id_key: bug[id_key],
            summary_key: summary_tfidf,
            description_key: description_tfidf,
        })

    IRProgressBar.execute_iteration_for_cursor(
        term_counts, store_tfidf, "Calculating TFIDF")
    output.create_index([(id_key, IRCollection.ASCENDING)])
    output.close()
def get_tfidf_of_bug(cls, bug_id):
    """Get the TFIDF vectors of a bug, from the cache or from the database.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (TFIDF of summary, TFIDF of description). Both are
        empty dicts when no document matches bug_id.
    """
    if cls.__is_cache and bug_id in cls.__cache:
        return cls.__cache[bug_id]

    # Cache miss (or caching disabled): load from db
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    source = IRCollection('bug_db_name', 'bug_tfidf_collection_name', 'r')
    matches = source.find({id_key: bug_id})

    if matches.count() > 0:
        tfidf = (matches[0][summary_key], matches[0][description_key])
    else:
        tfidf = ({}, {})

    if cls.__is_cache:
        cls.__cache[bug_id] = tfidf
    return tfidf
def get_stacktrace_of_bug(cls, bug_id):
    """Get the stacktrace of a bug, from the cache or from mongodb.

    Args:
        bug_id: int

    Returns:
        [[str]], [[signature]]; an empty list when no document matches
        bug_id.
    """
    if cls.__is_cache and bug_id in cls.__cache_stacktrace:
        return cls.__cache_stacktrace[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    stacktrace_key = config.get('bug_stacktrace_name')
    source = IRCollection('bug_db_name', 'bug_text_collection_name', 'r')
    matches = source.find({id_key: bug_id})

    found = matches.count() > 0
    stacktrace = matches[0][stacktrace_key] if found else []

    if cls.__is_cache:
        cls.__cache_stacktrace[bug_id] = stacktrace
    return stacktrace
def test_generate_and_test_complete_test_file(self):
    """Generate 'complete_test_file' with argument 0.0 and evaluate over it."""
    from ir_config import IRConfig
    from ir_sim_bug_evaluator import IRSimBugEvaluator

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    test_file = 'complete_test_file'
    evaluator = IRSimBugEvaluator()
    evaluator.generate_test_file(test_file, 0.0)
    evaluator.do_test_over_file(test_file)
def test_cache_all_data(self):
    """Smoke test: caching all document counts runs without raising."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount

    log = IRLog.get_instance()
    log.start_log()
    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    IRDocumentCount.cache_all_data()
def test_get_summary_and_description_of_bug(self):
    """Fetch summary and description of bug 100000 and log both."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    sample_bug_id = 100000
    summary, description = \
        IRText.get_summary_and_description_of_bug(sample_bug_id)
    log = IRLog.get_instance()
    log.println('summary: %s' % (summary))
    log.println('description: %s' % (description))
def test_get_squared_length(self):
    """Check the squared length of a two-term vector (0.4 and 0.6 -> 0.52)."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    vector = {'firefox': 0.4, 'chrome': 0.6}
    squared_length = IRTFIDF.get_squared_length(vector)
    # Floating point result: compare with a tolerance instead of equality
    deviation = abs(squared_length - 0.52)
    assert deviation < 0.00001
def test_cache_all_data(self):
    """Smoke test: caching all TFIDF data runs without raising."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    log = IRLog.get_instance()
    log.start_log()
    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    IRTFIDF.cache_all_data()
    log.stop_log()
def __weighted_scoring(self, summary_similarity, description_similarity,
                       stacktrace_similarity):
    """Combine the three similarities into one weighted sum.

    The weights are the 'bug_summary_ratio', 'bug_description_ratio' and
    'bug_stacktrace_ratio' float values of the config.

    Returns:
        The sum of each similarity multiplied by its ratio.
    """
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    summary_ratio = config.get_float('bug_summary_ratio')
    description_ratio = config.get_float('bug_description_ratio')
    stacktrace_ratio = config.get_float('bug_stacktrace_ratio')

    score = summary_similarity * summary_ratio
    score = score + description_similarity * description_ratio
    score = score + stacktrace_similarity * stacktrace_ratio
    return score
def get_documentcount(cls, term, field=None, documentcount_collection=None):
    """Get the document count of a term, from the cache or from the database.

    Args:
        term: str
        field: str or None, the configured summary or description field name
        documentcount_collection: collection to query; when None the
            document count collection is opened in 'r' mode.

    Returns:
        if field is None:
            (int, int), (summary document count, description document count)
        else:
            int, the document count of the corresponding field, or 0 when
            field matches neither configured field name
    """
    def pick(counts, summary_key, description_key):
        # Select the entry of the (summary, description) pair named by field
        if field == summary_key:
            return counts[0]
        if field == description_key:
            return counts[1]
        return 0

    if cls.__is_cache and term in cls.__cache_document_count:
        cached = cls.__cache_document_count[term]
        if field is None:
            return cached
        from ir_config import IRConfig
        config = IRConfig.get_instance()
        return pick(cached,
                    config.get('bug_summary_name'),
                    config.get('bug_description_name'))

    # load from db
    from ir_mongodb_helper import IRCollection
    from ir_config import IRConfig
    config = IRConfig.get_instance()
    term_key = config.get('bug_term_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    if documentcount_collection is None:
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')
    matches = documentcount_collection.find({term_key: term})

    # A term without a stored document, or without one of the fields,
    # counts as 0 for that field
    summary = 0
    description = 0
    if matches.count() > 0:
        if summary_key in matches[0]:
            summary = matches[0][summary_key]
        if description_key in matches[0]:
            description = matches[0][description_key]

    counts = (summary, description)
    if cls.__is_cache:
        cls.__cache_document_count[term] = counts
    if field is None:
        return counts
    return pick(counts, summary_key, description_key)
def test_generate_document_count(self):
    """Smoke test: batch generation of document counts runs without raising."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount

    log = IRLog.get_instance()
    log.start_log()
    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    IRDocumentCount.batch_generate_document_count()
    log.stop_log()
def get_connection(self):
    """Return the connection, creating it on first use.

    The connection is opened with the db_host and db_port values of the
    config file (falling back to the instance defaults) and is reused by
    every later call.
    """
    if self.__connection is not None:
        return self.__connection

    import pymongo
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    host = config.get('db_host', self.__default_host)
    port = config.get_int('db_port', self.__default_port)
    self.__connection = pymongo.Connection(host, port)
    return self.__connection
def test_get_summary_and_description_of_bug(self):
    """Fetch summary and description of bug 100000 and log both."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    sample_bug_id = 100000
    summary, description = \
        IRText.get_summary_and_description_of_bug(sample_bug_id)
    log = IRLog.get_instance()
    log.println('summary: %s' % (summary))
    log.println('description: %s' % (description))
def __store_to_mongodb(cls, bug2group, group2bug):
    """Store duplicate group information into Mongodb.

    Writes one {bug id, group id} document per bug into the duplicate
    collection (indexed by bug id and by group id), then one
    {group id, group size} document per group into the duplicate group
    count collection (indexed by group id).

    Args:
        bug2group: dict, {bug_id -> group_id}
        group2bug: dict, {group_id -> [bug_id]}
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    bug_group_name = config.get('bug_group_name')

    # bug -> group collection
    duplicate_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'w')

    def iter_bug_group(bug):
        duplicate_collection.insert({
            bug_id_name: bug,
            bug_group_name: bug2group[bug]
        })

    IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                             "Store to db")
    duplicate_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    duplicate_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_collection.close()

    # duplicate group size collection; the group field uses the same
    # 'bug_group_name' config value as above
    group_size_name = config.get('bug_group_size')
    duplicate_group_count_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')

    def iter_group_bug(group):
        duplicate_group_count_collection.insert({
            bug_group_name: group,
            group_size_name: len(group2bug[group])
        })

    IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                             'Store Index')
    duplicate_group_count_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_group_count_collection.close()
def calculate_tfidf(cls, termcount, field_name, document_num,
                    documentcount_collection=None, algorithm=None):
    """Calculate TFIDF for a BoW.

    Args:
        termcount: dict, {term -> count}
        field_name: str, 'summary' or 'description', in order to get
            document count
        document_num: int, Total number of documents (not used by the code
            of this method)
        documentcount_collection: passed through to
            IRDocumentCount.get_documentcount; may be None
        algorithm: str, 'tfidf' for term-frequency and normalized tfidf.
            'bidf' for 0-1 counting without normalized
            if None, fetch config from file

    Returns:
        dict, {term -> tfidf}; an empty dict for an empty termcount

    Raises:
        ValueError: if termcount is not empty and the algorithm is neither
            'tfidf' nor 'bidf'.
    """
    # An empty bag of words has an empty TFIDF vector: nothing to look up
    if not termcount:
        return {}

    from math import sqrt
    from ir_document_count import IRDocumentCount

    if algorithm is None:
        from ir_config import IRConfig
        algorithm = IRConfig.get_instance().get('tfidf_algorithm')
    # Fail with a clear message instead of hitting an unbound local inside
    # the loop below
    if algorithm not in ('tfidf', 'bidf'):
        raise ValueError('Unknown tfidf algorithm: %r' % (algorithm,))

    # calculate raw tfidf
    tfidfs = {}
    length_2 = 0
    for term, count in termcount.items():
        documentcount = IRDocumentCount.get_documentcount(
            term, field_name, documentcount_collection)
        idf = cls.get_idf(documentcount)
        # Warning: there're two types of tf: term count or term frequency
        # We need to compare their performance
        # If we normalize the vector, we just use occurrence of term
        if algorithm == 'tfidf':
            tfidf = float(count) * idf
            length_2 += tfidf ** 2
        else:  # 'bidf'
            tfidf = (1 if count > 0 else 0) * idf
        tfidfs[term] = tfidf

    # normalize raw tfidf; vectors of (almost) zero length are left as is
    if algorithm == 'tfidf':
        length = sqrt(length_2)
        if abs(length) > 0.0001:
            for term in tfidfs:
                tfidfs[term] /= length
    return tfidfs
def __weighted_scoring(self, summary_similarity, description_similarity,
                       stacktrace_similarity):
    """Blend the three per-field similarities into a single score.

    Each similarity is multiplied by its ratio read from the config
    ('bug_summary_ratio', 'bug_description_ratio', 'bug_stacktrace_ratio')
    and the three products are added up.

    Args:
        summary_similarity: similarity of the summaries
        description_similarity: similarity of the descriptions
        stacktrace_similarity: similarity of the stacktraces

    Returns:
        The weighted sum of the three similarities.
    """
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    summary_weight = config.get_float('bug_summary_ratio')
    description_weight = config.get_float('bug_description_ratio')
    stacktrace_weight = config.get_float('bug_stacktrace_ratio')

    weighted_summary = summary_similarity * summary_weight
    weighted_description = description_similarity * description_weight
    weighted_stacktrace = stacktrace_similarity * stacktrace_weight
    return weighted_summary + weighted_description + weighted_stacktrace
def similarity_over_all(self):
    """Calculate similarity between bug (summary, description) over all.

    Candidate reports are read from the basic collection: same product as
    this report, created later than this report's creation timestamp minus
    the search time span, and not listed in the excluded report ids. The
    report whose id equals the dummy bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
               stacktrace_score]}
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    config = IRConfig.get_instance()

    # 2 * 365 days, in seconds -- assumes creation timestamps are in
    # seconds; TODO confirm against the data loader.
    search_time_span = 2 * 3600 * 24 * 365

    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name : self.get_product(),
        create_ts_name : {'$gt' : self.get_create_ts() - search_time_span},
        bug_id_name : {'$nin' : self.__exclude_report_ids}
    })

    result = {}
    logger.println('Comparing with %d reports.' % (reports2scan.count()))

    # Debug output. The parenthesised single-argument form prints the same
    # text as the Python 2 print statement and is also valid Python 3.
    print(self.__summary_text)
    print(self.__description_text)

    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of
        # self.__stacktrace being none, we create and fill the info of
        # report manually
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        # if self.__stacktrace is empty, we don't need to do this
        own_stacktrace = self.get_stacktrace()
        if own_stacktrace is not None and len(own_stacktrace) > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)

    return result
def test_tokenization(self):
    """Check IRTermCount.do_tokenization against known inputs.

    Loads the test config, then asserts that each raw string is tokenized
    into exactly the expected list of tokens.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_term_count import IRTermCount
    from nose.tools import assert_equals

    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    # (raw text, expected tokens)
    cases = [
        ('mouse-down', ['mouse-down']),
        ('set_background_color()', ['set_background_color']),
    ]
    for raw_text, expected_tokens in cases:
        assert_equals(expected_tokens, IRTermCount.do_tokenization(raw_text))
def test_batch_generate_tfidf(self):
    """Run IRTFIDF.batch_generate_tfidf against the test config.

    Logging is started before the config is loaded and stopped after the
    batch run; nothing is asserted about the result.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    log = IRLog.get_instance()
    log.start_log()

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    IRTFIDF.batch_generate_tfidf()

    log.stop_log()
def test_get_duplicate_group_information(self):
    """Log the duplicate groups returned for the queried bounds.

    Loads the test config, calls
    IRDuplicateGroup.get_duplicate_group_information with the two bounds
    below and logs the returned group ids together with those same bounds.
    Nothing is asserted about the result.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup

    # Presumably the minimum and maximum group size (the log message speaks
    # of "size between"); verify against get_duplicate_group_information.
    min_size, max_size = 3, 10

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    duplicate_group = IRDuplicateGroup()
    group_ids = duplicate_group.get_duplicate_group_information(
        min_size, max_size)

    # Report the bounds that were actually queried.
    log.println('Groups with size between %d, %d: %s' \
        % (min_size, max_size,
           ' '.join([str(group_id) for group_id in group_ids])))
    log.stop_log()
def test_top_n_similarity_over_all(self):
    """Log the result of top_n_similarity_over_all(10) for bug 100000.

    Loads the test config, builds an IRReport for the bug and logs whatever
    the call returns; nothing is asserted about the result.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    target_bug_id = 100000

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    target_report = IRReport(target_bug_id)
    top_matches = target_report.top_n_similarity_over_all(10)

    message = 'Bugs with top similarities with bug %d: %s' \
        % (target_bug_id, str(top_matches))
    log.println(message)
    log.stop_log()
def test_get_term_by_simple_entropy(self):
    """Prepare the fixture for the simple-entropy term selection test.

    Loads the test config and builds a list of (empty set, term set) pairs
    whose term sets shrink from {'a', 'b', 'c', 'd'} down to {'a'}.

    NOTE(review): the fixture is built but never passed to anything and
    nothing is asserted -- the test looks unfinished.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_recommender import IRRecommender

    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    # Term sets of size 4, 3, 2, 1 taken as prefixes of 'abcd'.
    all_terms = 'abcd'
    diff = [(set(), set(all_terms[:size])) for size in (4, 3, 2, 1)]
def test_get_bugs_in_group(self):
    """Log the bug ids that IRDuplicateGroup reports for group 1.

    Loads the test config, asks for the bugs in the group and logs them as
    a space-separated list; nothing is asserted about the result.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup

    group_id = 1

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    bug_ids = IRDuplicateGroup().get_bugs_in_group(group_id)

    header = 'Group %d has bugs: ' % (group_id)
    listing = ' '.join(str(bug_id) for bug_id in bug_ids)
    log.println(header + listing)
    log.stop_log()
def get_documentcount(cls, term, field = None, documentcount_collection = None):
    """Look up in how many documents a term occurs.

    When caching is enabled and the term is cached, the cached counts are
    used. Otherwise the counts are read from the document count collection
    (missing record or missing field counts as 0) and, when caching is
    enabled, stored in the cache.

    Args:
        term: str, the term to look up
        field: str or None, the configured summary or description field
            name; None asks for both counts
        documentcount_collection: collection to query; when None, one is
            opened with mode 'r' from the config

    Returns:
        if field is None:
            (int, int), (summary document count, description document count)
        else:
            int, the document count of the corresponding field, or 0 when
            field matches neither configured field name
    """
    # Fast path: answer from the cache.
    if cls.__is_cache and term in cls.__cache_document_count:
        cached_counts = cls.__cache_document_count[term]
        if field is None:
            return cached_counts
        from ir_config import IRConfig
        config = IRConfig.get_instance()
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        if field == summary_name:
            return cached_counts[0]
        if field == description_name:
            return cached_counts[1]
        return 0

    # Slow path: load from db.
    from ir_mongodb_helper import IRCollection
    from ir_config import IRConfig
    config = IRConfig.get_instance()
    term_name = config.get('bug_term_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')

    if documentcount_collection is None:
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')

    matches = documentcount_collection.find({term_name : term})
    summary_count = 0
    description_count = 0
    if matches.count() > 0:
        record = matches[0]
        if summary_name in record:
            summary_count = record[summary_name]
        if description_name in record:
            description_count = record[description_name]

    if cls.__is_cache:
        cls.__cache_document_count[term] = (summary_count, description_count)

    # Pick the value the caller asked for.
    if field is None:
        return summary_count, description_count
    if field == summary_name:
        return summary_count
    if field == description_name:
        return description_count
    return 0
def test_get_tfidf_of_bug(self):
    """Log the summary and description tfidf of bug 100000.

    Loads the test config, fetches both tfidf values through
    IRTFIDF.get_tfidf_of_bug and logs each one; nothing is asserted about
    the result.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    summary_tfidf, description_tfidf = IRTFIDF.get_tfidf_of_bug(100000)

    for template, tfidf in (('Summary tfidf: %s', summary_tfidf),
                            ('Description tfidf: %s', description_tfidf)):
        log.println(template % (str(tfidf)))
    log.stop_log()
def test_get_termcount_of_bug(self):
    """Check that bug 100000 has summary and description term counts.

    Loads the test config, fetches both term counts through
    IRTermCount.get_termcount_of_bug, asserts that neither is None and
    then shows each one via IRTermCount.show_dict_compare against an empty
    dict, under a logged heading.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_term_count import IRTermCount

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary, description = IRTermCount.get_termcount_of_bug(100000)

    # Identity check: the idiomatic way to test for None.
    assert summary is not None
    assert description is not None

    IRLog.get_instance().println('Summary')
    IRTermCount.show_dict_compare(summary, {})
    IRLog.get_instance().println('Description')
    IRTermCount.show_dict_compare(description, {})