def test_parse_info_level1(self):
    """Parse a level-1 info file into MongoDB and verify the stored text.

    Checks the number of surviving reports after filtering, a known
    summary value, and that no escaped HTML entities remain.
    """
    #import sys
    #sys.path.append('../bin/')
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert None != IRConfig.get_instance()
    IRText.parse_info_level1('../data/test/info_level1_test')
    IRLog.get_instance().stop_log()
    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    assert None != db
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert None != col
    # in the test data, we have 1000 in total.
    # within, 40 have no resolution, 154 are incomplete
    assert 833 == col.count()
    assert 'gnome is full of bugs ! (100000 currently)' == \
        col.find({'bug_id':100000})[0]["summ"]
    # no escaped HTML entity fragments should survive in any summary
    res = col.find({"summ":{'$regex':'(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)'}})
    assert res.count() == 0
def test_get_stacktrace_text_of_bug(self):
    """Fetch and log the stacktrace text stored for bug 104400."""
    from ir_config import IRConfig
    from ir_log import IRLog
    from ir_text import IRText
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    text = IRText.get_stacktrace_text_of_bug(104400)
    IRLog.get_instance().println('stacktrace_text: %s' % text)
def __is_collection_close(self):
    """Abort if an operation is attempted after the collection was closed."""
    if not self.__is_closed:
        return
    from ir_log import IRLog
    IRLog.get_instance().println(
        'Error! Cannot write to closed collection.')
    assert False
def test_filter(self):
    """Run IRSTTools.filter over every parsed bug description.

    Smoke test only: the filter output is not asserted on, and the
    original author's doubts about the setup are kept below.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    from ir_gnome_st_tools import IRSTTools
    from ir_text import IRText
    import pymongo
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRText.parse_info_level1('../data/test/info_level1_test')
    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    assert None != db
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert None != col
    # Maybe a bug here:
    # The test of filter (originally) depends on parse_info_level1
    # But parse_info_level1 seems to invoke filter...
    for bug in col.find():
        # TODO: it's not correct. no stacktrace in desc
        desc, stack = IRSTTools.filter(bug["desc"])
    IRLog.get_instance().stop_log()
def run(self):
    """Session main loop: consume message packs until the session expires.

    Each pack is processed in two phases: SET_COMMANDS entries mutate
    self.__report, then CTL_COMMANDS entries run and may signal the end
    of the session. The response pack is sent back over the pack's own
    connection. On exit (expiry or timeout) the session unregisters
    itself from the dispatcher.
    """
    from ir_log import IRLog
    session_state = STATE_ALIVE
    while session_state == STATE_ALIVE:
        try:
            # NOTE(review): get(True) with no timeout blocks forever, so
            # Queue.Empty can only fire if a timeout is configured
            # elsewhere -- confirm against the queue's setup.
            msgpack = self.__msg_queue.get(True)
            # do something to msgpack
            conn = msgpack['connection']
            respack = msgpack['respack']
            # echo the session id back to the client
            respack[SESSION_ID] = msgpack[SESSION_ID]
            # set phase
            for key, value in msgpack.items():
                if key in SET_COMMANDS:
                    self.__report = SET_COMMANDS[key](self.__report, value)
            # do phase
            signal = SIGNAL_CONTINUE
            for key, value in msgpack.items():
                if key in CTL_COMMANDS:
                    signal = CTL_COMMANDS[key](self.__report, respack)
            if signal == SIGNAL_BREAK:
                # a control command asked to terminate this session
                session_state = STATE_EXPIRED
            self.__pack_report_info(respack)
            IRLog.get_instance().println('Send message: %s' % str(respack))
            conn.send(str(respack))
        except Queue.Empty:
            from ir_log import IRLog
            IRLog.get_instance().println('Session %d time out' % self.__id, 2)
            break
    # unregister from the dispatcher on every exit path
    self.__dispatcher.remove_session(self.__id)
def func_each_line(line):
    """Parse one level-1 info line and insert it into the collections.

    Closure: reads cls, community_name, the target collections and the
    *_name field keys from the enclosing scope (not visible here).
    Reports with no resolution or resolution INCOMPLETE are skipped, as
    are reports whose description contains an unrecognizable stacktrace.
    """
    bug_id, summary, description, resolution, create_ts, product = \
        cls.__extract_information_from_info_level1_line(line)
    if resolution is not None and resolution != "INCOMPLETE":
        # post process description
        description, stacktrace = \
            cls.extract_raw_description_info(description, community_name)
        # drop the report whose description containing stacktrace info
        if cls.is_drop_report(description):
            from ir_log import IRLog
            IRLog.get_instance().println('Drop report#=%d because it '\
                'contains unrecognizable stacktrace.' % bug_id, 3)
            return
        # text fields go into the text collection...
        collection.insert({
            bug_id_name: bug_id,
            summary_name: summary,
            description_name: description,
            stacktrace_name: stacktrace })
        # ...while metadata goes into the basic collection
        collection_basic.insert({
            bug_id_name: bug_id,
            create_ts_name: create_ts,
            product_name: product })
def __show_similarity_distribution(self, sorted_similarities):
    """Show the distribution of similarities.

    Args:
        sorted_similarities: [(bug_id, (score, ...))], sorted by score
            in descending order
    """
    from ir_log import IRLog
    tot = sorted_similarities.__len__()
    # number of near top
    print sorted_similarities[0]
    max_score = sorted_similarities[0][1][0]
    min_score = sorted_similarities[-1][1][0]
    # a report counts as 'near the top' when its score lies within the
    # top score_span fraction of the observed score range
    score_span = 0.1
    near_threshold = max_score - (max_score - min_score) * score_span
    near_one_number = 0
    for item in sorted_similarities:
        if item[1][0] > near_threshold:
            near_one_number += 1
        else:
            # input is sorted descending, so no later item can qualify
            break
    IRLog.get_instance().println('%d in %d (%f) reports have score ' \
        'greater than %f (%f of the score span)' % \
        (near_one_number, tot, float(near_one_number)/tot,
         near_threshold, score_span))
    # quantiles
    quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    for quan in quantiles:
        pos = int(quan * tot)
        if pos >= tot:
            pos = tot-1
        IRLog.get_instance().println('Top %d: %f' \
            % (int(quan*100), sorted_similarities[pos][1][0]))
def __show_similarity_distribution(self, sorted_similarities):
    """Show the distribution of similarities.

    Args:
        sorted_similarities: [(bug_id, (score, ...))], sorted by score
            in descending order
    """
    from ir_log import IRLog
    tot = sorted_similarities.__len__()
    # number of near top
    print sorted_similarities[0]
    max_score = sorted_similarities[0][1][0]
    min_score = sorted_similarities[-1][1][0]
    # a report counts as 'near the top' when its score lies within the
    # top score_span fraction of the observed score range
    score_span = 0.1
    near_threshold = max_score - (max_score - min_score) * score_span
    near_one_number = 0
    for item in sorted_similarities:
        if item[1][0] > near_threshold:
            near_one_number += 1
        else:
            # input is sorted descending, so no later item can qualify
            break
    IRLog.get_instance().println('%d in %d (%f) reports have score ' \
        'greater than %f (%f of the score span)' % \
        (near_one_number, tot, float(near_one_number)/tot,
         near_threshold, score_span))
    # quantiles
    quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
    for quan in quantiles:
        pos = int(quan * tot)
        if pos >= tot:
            pos = tot - 1
        IRLog.get_instance().println('Top %d: %f' \
            % (int(quan*100), sorted_similarities[pos][1][0]))
def similarity_over_all(self):
    """Calculate similarity between bug (summary, description) over all.

    Scans reports of the same product created within the search time
    span before this report, excluding self.__exclude_report_ids.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
            stacktrace_score]}
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    logger = IRLog.get_instance()
    # only reports created within two years before this one are scanned
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    basic_collection = IRCollection('bug_db_name',
        'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: { '$gt': self.get_create_ts() - search_time_span },
        bug_id_name: { '$nin': self.__exclude_report_ids } })
    result = {}
    IRLog.get_instance().println('Comparing with %d reports.' \
        % (reports2scan.count()) )
    print self.__summary_text
    print self.__description_text
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of self.__stacktrace
        # being none, we create and fill the info of report manually
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        # if self.__stacktrace is empty, we don't need to do this
        if self.get_stacktrace() is not None and \
                self.get_stacktrace().__len__() > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def __del__(self):
    """Flag writable collections that are destroyed before being closed."""
    writable = self.__mode in ('w', 'a')
    if writable and self.__is_closed == False:
        from ir_log import IRLog
        IRLog.get_instance().println('Error! Collection in modifying mode '
            'is destoried before being closed.')
        assert False
def test_parse_info_level1(self):
    """Parse a level-1 info file into MongoDB and verify the stored text.

    Checks the number of surviving reports after filtering, a known
    summary value, and that no escaped HTML entities remain.
    """
    #import sys
    #sys.path.append('../bin/')
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert None != IRConfig.get_instance()
    IRText.parse_info_level1('../data/test/info_level1_test')
    IRLog.get_instance().stop_log()
    con = IRMongodbHelper.get_instance().get_connection()
    db = con[IRConfig.get_instance().get('bug_db_name')]
    assert None != db
    col = db[IRConfig.get_instance().get('bug_text_collection_name')]
    assert None != col
    # in the test data, we have 1000 in total.
    # within, 40 have no resolution, 154 are incomplete
    assert 833 == col.count()
    assert 'gnome is full of bugs ! (100000 currently)' == \
        col.find({'bug_id':100000})[0]["summ"]
    # no escaped HTML entity fragments should survive in any summary
    res = col.find(
        {"summ": { '$regex': '(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)' }})
    assert res.count() == 0
def show_dict_compare(cls, dicta, dictb, log_level = 1):
    """Print a side-by-side term-count comparison of two BoW dicts.

    Terms are ranked by the count the two dicts have in common.

    Args:
        dicta: dict, term -> count (may be None)
        dictb: dict, term -> count (may be None)
        log_level: int, verbosity level passed to the logger
    """
    from ir_log import IRLog
    # union of terms appearing in either dict
    keys = set()
    for source in (dicta, dictb):
        if None != source:
            for key in source:
                keys.add(key)
    # rank terms by the count shared between the two dicts
    rows = []
    for key in keys:
        counta = dicta[key] if (None != dicta and key in dicta) else 0
        countb = dictb[key] if (None != dictb and key in dictb) else 0
        rows.append((key, min(counta, countb), counta, countb))
    rows.sort(cmp=lambda a,b:cmp(a[1],b[1]), reverse=True)
    # print it out
    for row in rows:
        IRLog.get_instance().println('%16s\t%8d\t%8d' \
            % (row[0], row[2], row[3]), log_level)
def test_cache_all_data(self):
    """Load the test config and warm the document-count cache."""
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount
    from ir_log import IRLog
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRDocumentCount.cache_all_data()
def __is_modification_legal_in_current_mode(self):
    """Abort unless the collection was opened in a writable mode."""
    self.__is_collection_close()
    if self.__mode != 'r':
        return
    from ir_log import IRLog
    IRLog.get_instance().println(
        'Error! Cannot write to collection being opened in read mode.')
    assert False
def similarities_and_duplicates(self):
    """Calculate the similarities over all existing reports and return the
    similar reports and duplicate reports.

    Returns:
        ([(bug_id, (score, ...))], [(bug_id, (score, ...))]),
        similar reports and duplicate reports, both sorted by
        descending score
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    # NOTE(review): similar_threshold is read but never used below
    similar_threshold = IRConfig.get_instance().get_float(
        'bug_similar_threshold', 0.7)
    duplicate_num = IRConfig.get_instance().get_int(
        'bug_duplicate_number', 5)
    duplicate_threshold = IRConfig.get_instance().get_int(
        'bug_duplicate_threshold', 10)
    max_similar_number = IRConfig.get_instance().get_int(
        'bug_similar_max', 10000000)
    similar_threshold_percent = IRConfig.get_instance().get_float(
        'bug_similar_threshold_percent', 0.8)
    no_similar_threshold = IRConfig.get_instance().get_float(
        'bug_no_similar_threshold', 0.65)
    similarities = self.similarity_over_all().items()
    if similarities.__len__() == 0:
        return [], []
    # highest total score first
    similarities.sort(key=lambda x:x[1][0], reverse = True)
    # report scoring
    IRLog.get_instance().println('Max score report: %s'
        % str(similarities[0]))
    if similarities.__len__() > 1:
        IRLog.get_instance().println('Second score report: %s'
            % str(similarities[1]))
    # find cutting edge of similar reports
    max_score = similarities[0][1][0]
    min_score = similarities[-1][1][0]
    IRLog.get_instance().println('max score:%f, min score: %f'
        %(max_score, min_score))
    IRLog.get_instance().println('no threshold:%f' % no_similar_threshold)
    if max_score < no_similar_threshold:
        # even the best match is too weak: report nothing
        return [], []
    similar_threshold_percent_cut = min_score + (max_score - min_score) *\
        similar_threshold_percent
    print 'cut:', similar_threshold_percent_cut
    # keep at most max_similar_number reports scoring above the cut
    cut_position = min(max_similar_number,
        self.__binary_search_less(similarities,
            lambda x:x[1][0], similar_threshold_percent_cut))
    IRLog.get_instance().println('Get %d similar reports.'
        % cut_position)
    # find number of duplicate groups in similar reports
    group_set = set()
    for report in similarities[:cut_position]:
        group_set.add(IRDuplicateGroup.get_group_of_bug(report[0]))
    if None in group_set:
        group_set.remove(None)
    duplicate_reports = []
    # too many distinct groups means no confident duplicate call
    if group_set.__len__() <= duplicate_threshold:
        duplicate_reports = similarities[:min(cut_position, duplicate_num)]
    return similarities[:cut_position], duplicate_reports
def test_get_summary_and_description_of_bug(self):
    """Fetch and log the stored summary/description of bug 100000."""
    from ir_config import IRConfig
    from ir_log import IRLog
    from ir_text import IRText
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summ, desc = IRText.get_summary_and_description_of_bug(100000)
    IRLog.get_instance().println('summary: %s' % summ)
    IRLog.get_instance().println('description: %s' % desc)
def test_get_squared_length(self):
    """Squared L2 length of a tfidf vector: 0.4**2 + 0.6**2 == 0.52."""
    from ir_config import IRConfig
    from ir_log import IRLog
    from ir_tfidf import IRTFIDF
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    vector = {'firefox':0.4, 'chrome':0.6}
    assert abs(IRTFIDF.get_squared_length(vector) - 0.52) < 0.00001
def test_cache_all_data(self):
    """Warm the TFIDF cache from the test database."""
    from ir_config import IRConfig
    from ir_log import IRLog
    from ir_tfidf import IRTFIDF
    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRTFIDF.cache_all_data()
    log.stop_log()
def test_generate_document_count(self):
    """Batch-generate document counts for the test database."""
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount
    from ir_log import IRLog
    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRDocumentCount.batch_generate_document_count()
    log.stop_log()
def print_similarity_score(cls, report_a, report_b):
    """Warning: report_a is primary! It is critial in asymatric algorithm"""
    from ir_log import IRLog
    scores = report_a.similarity_with(report_b)
    total, summary, description, stacktrace = scores
    IRLog.get_instance().println('[Similarity] %f '\
        '=[Summary]%f[Description]%f[Stacktrace]%f' \
        % (total, summary, description, stacktrace))
def test_get_summary_and_description_of_bug(self):
    """Log the summary and description text stored for bug 100000."""
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_log import IRLog
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    fields = IRText.get_summary_and_description_of_bug(100000)
    IRLog.get_instance().println('summary: %s' % fields[0])
    IRLog.get_instance().println('description: %s' % fields[1])
def get_int(self, name, default_value = None):
    """Get the int value with the given name.

    Args:
        name: str, the config key to look up
        default_value: returned when the key is missing or the stored
            value cannot be converted to an int

    Returns:
        int, the converted value, or default_value on failure
    """
    # fetch once and reuse the raw value in the error path
    res = self.get(name, default_value)
    try:
        return int(res)
    # TypeError covers int(None) when the key is missing and no
    # default was supplied; ValueError covers non-numeric strings
    except (TypeError, ValueError):
        from ir_log import IRLog
        # bug fix: was '%d' % self.get(name) -- %d itself raises on the
        # very string that failed int() conversion
        IRLog.get_instance().println('Could not convert %s to int.' \
            % (res))
        return default_value
def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms=None):
    """Get the best term which has most entropy in diff.

    Args:
        diff: [(set, set)], generated by get_all_reports_difference
        sim_bug_ids: [(bug_id, (score, ...))], similarity entries
            aligned with diff by index; score weights the term counts
        penalty_terms: iterable of terms to skip, or None

    Returns:
        str, The term
    """
    termcount = {}
    max_score = -1.0
    max_score_term = None
    # count the occurance of term
    total_score = 0.0
    for index, delta in enumerate(diff):
        total_score += sim_bug_ids[index][1][0]
        # only account for
        for term in delta[1]:
            if penalty_terms is not None and term in penalty_terms:
                continue
            if not term in termcount:
                termcount[term] = 0.0
            # weight each occurrence by the report's similarity score
            termcount[term] += sim_bug_ids[index][1][0]
    # calcualte the value and pick the most
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount
    from ir_tfidf import IRTFIDF
    description_name = IRConfig.get_instance().get('bug_description_name')
    # debug use
    scoreboard = []
    # /debug use
    from math import log
    for term in termcount:
        # NOTE(review): bg_score is computed but never used below
        bg_score = termcount[term] / total_score
        # peaks at 1.0 when the term covers exactly half of the total
        # similarity mass, falls to 0.0 at the extremes
        ig_score = -2.0 * abs(float(termcount[term]) / total_score - 0.5) + 1
        idf = IRTFIDF.get_unit_idf(IRDocumentCount.get_documentcount(term, \
            description_name))
        score = ig_score * idf
        scoreboard.append((term, score, ig_score, idf))
        if score > max_score:
            max_score = score
            max_score_term = term
    scoreboard.sort(cmp=lambda x, y: cmp(x[1], y[1]), reverse=True)
    from ir_log import IRLog
    IRLog.get_instance().println(
        'Candidate keywords: %s' % ','.join(['word', 'score', 'ig_score', 'idf']))
    IRLog.get_instance().println('\n'.join([ \
        ','.join([t[0],str(t[1]), str(t[2]), str(t[3])]) for t in scoreboard[:10] \
        ]))
    return max_score_term
def server_cache(msg, res):
    """Preload text, tfidf and document-count data into memory."""
    from ir_document_count import IRDocumentCount
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    log = IRLog.get_instance()
    log.println('Server is caching data')
    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    log.println('Server cached data')
    return SIGNAL_CONTINUE
def test_batch_generate_tfidf(self):
    """Batch-generate tfidf vectors for the whole test database."""
    from ir_tfidf import IRTFIDF
    from ir_config import IRConfig
    from ir_log import IRLog
    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRTFIDF.batch_generate_tfidf()
    log.stop_log()
def similarity_over_all(self):
    """Calculate similarity between bug (summary, description) over all.

    Scans reports of the same product created within the search time
    span before this report, excluding self.__exclude_report_ids.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
            stacktrace_score]}
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    logger = IRLog.get_instance()
    # only reports created within two years before this one are scanned
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name : self.get_product(),
        create_ts_name : {'$gt' : self.get_create_ts() - search_time_span},
        bug_id_name : {'$nin' : self.__exclude_report_ids} })
    result = {}
    IRLog.get_instance().println('Comparing with %d reports.' \
        % (reports2scan.count()) )
    print self.__summary_text
    print self.__description_text
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of self.__stacktrace
        # being none, we create and fill the info of report manually
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        # if self.__stacktrace is empty, we don't need to do this
        if self.get_stacktrace() is not None and \
                self.get_stacktrace().__len__() > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms = None):
    """Get the best term which has most entropy in diff.

    Args:
        diff: [(set, set)], generated by get_all_reports_difference
        sim_bug_ids: [(bug_id, (score, ...))], similarity entries
            aligned with diff by index; score weights the term counts
        penalty_terms: iterable of terms to skip, or None

    Returns:
        str, The term
    """
    termcount = {}
    max_score = -1.0
    max_score_term = None
    # count the occurance of term
    total_score = 0.0
    for index, delta in enumerate(diff):
        total_score += sim_bug_ids[index][1][0]
        # only account for
        for term in delta[1]:
            if penalty_terms is not None and term in penalty_terms:
                continue
            if not term in termcount:
                termcount[term] = 0.0
            # weight each occurrence by the report's similarity score
            termcount[term] += sim_bug_ids[index][1][0]
    # calcualte the value and pick the most
    from ir_config import IRConfig
    from ir_document_count import IRDocumentCount
    from ir_tfidf import IRTFIDF
    description_name = IRConfig.get_instance().get('bug_description_name')
    # debug use
    scoreboard = []
    # /debug use
    from math import log
    for term in termcount:
        # NOTE(review): bg_score is computed but never used below
        bg_score = termcount[term] / total_score
        # peaks at 1.0 when the term covers exactly half of the total
        # similarity mass, falls to 0.0 at the extremes
        ig_score = -2.0 * abs(float(termcount[term]) / total_score - 0.5) + 1
        idf = IRTFIDF.get_unit_idf(IRDocumentCount.get_documentcount(term, \
            description_name))
        score = ig_score * idf
        scoreboard.append((term, score, ig_score, idf))
        if score > max_score:
            max_score = score
            max_score_term = term
    scoreboard.sort(cmp=lambda x,y:cmp(x[1],y[1]), reverse=True)
    from ir_log import IRLog
    IRLog.get_instance().println('Candidate keywords: %s'
        % ','.join(['word','score','ig_score','idf']))
    IRLog.get_instance().println('\n'.join([ \
        ','.join([t[0],str(t[1]), str(t[2]), str(t[3])]) for t in scoreboard[:10] \
        ]))
    return max_score_term
def test_get_termcount_of_bug(self):
    """Term counts of bug 100000 exist and can be printed."""
    from ir_term_count import IRTermCount
    from ir_config import IRConfig
    from ir_log import IRLog
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    counts = IRTermCount.get_termcount_of_bug(100000)
    assert None != counts[0]
    assert None != counts[1]
    IRLog.get_instance().println('Summary')
    IRTermCount.show_dict_compare(counts[0], {})
    IRLog.get_instance().println('Description')
    IRTermCount.show_dict_compare(counts[1], {})
def test_get_documentcount(self):
    """Document counts for 'click' agree across field-name lookups."""
    from ir_document_count import IRDocumentCount
    from ir_config import IRConfig
    from ir_log import IRLog
    IRLog.get_instance().start_log()
    summ_count, desc_count = IRDocumentCount.get_documentcount('click')
    IRLog.get_instance().println('\'click\', Document Count of summary: %d, description: %d' % (summ_count, desc_count))
    cfg = IRConfig.get_instance()
    assert summ_count == IRDocumentCount.get_documentcount(
        'click', cfg.get('bug_summary_name'))
    assert desc_count == IRDocumentCount.get_documentcount(
        'click', cfg.get('bug_description_name'))
def compare_and_print_termcount(cls, title_a, report_a, title_b, report_b):
    """Print a side-by-side term-count comparison of two reports."""
    from ir_term_count import IRTermCount
    from ir_log import IRLog
    log = IRLog.get_instance()
    counts_a = report_a.get_summary_and_description_termcount()
    counts_b = report_b.get_summary_and_description_termcount()
    log.println('[Termcount][Summary][%s][%s]' \
        % (title_a, title_b))
    IRTermCount.show_dict_compare(counts_a[0], counts_b[0])
    log.println('[Termcount][Description][%s][%s]' \
        % (title_a, title_b))
    IRTermCount.show_dict_compare(counts_a[1], counts_b[1])
def __report_result(self, bug_id, hit, nothit, duplicates):
    """ Print the evaluation result.
    hit: actual duplicates found by algorithm
    nothit: actual non-duplicates, but are detected as duplicate by algorithm
    Return: precision, recall
    """
    from ir_log import IRLog
    detected = hit.__len__() + nothit.__len__()
    # precision over everything the algorithm flagged as duplicate
    precision = float(hit.__len__()) / detected if detected > 0 else 0.0
    # recall over the actual duplicate set
    recall = float(hit.__len__()) / duplicates.__len__() \
        if duplicates.__len__() > 0 else 0.0
    log = IRLog.get_instance()
    log.println('Bug %d, precision %f, recall %f, ' \
        'duplicate size %d' \
        % (bug_id, precision, recall, duplicates.__len__()), 2)
    log.println('Hit %d duplicates: %s' \
        % (hit.__len__(), ','.join([str(bug_id) for bug_id in hit])), 1)
    log.println('Hit %d nonduplicates: %s' \
        % (nothit.__len__(),
           ','.join([str(bug_id) for bug_id in nothit])), 1)
    log.println('Actual %d duplicates: %s' \
        % (duplicates.__len__(),
           ','.join([str(bug_id) for bug_id in duplicates])), 1)
    return precision, recall
def __report_result(self, bug_id, hit, nothit, duplicates):
    """ Print the evaluation result.
    hit: actual duplicates found by algorithm
    nothit: actual non-duplicates, but are detected as duplicate by algorithm
    Return: precision, recall
    """
    from ir_log import IRLog
    num_hit = hit.__len__()
    num_nothit = nothit.__len__()
    num_dup = duplicates.__len__()
    # precision over everything the algorithm flagged as duplicate
    if num_hit + num_nothit == 0:
        precision = 0.0
    else:
        precision = float(num_hit) / (num_hit + num_nothit)
    # recall over the actual duplicate set
    if num_dup == 0:
        recall = 0.0
    else:
        recall = float(num_hit) / num_dup
    IRLog.get_instance().println('Bug %d, precision %f, recall %f, ' \
        'duplicate size %d' \
        % (bug_id, precision, recall, num_dup), 2)
    IRLog.get_instance().println('Hit %d duplicates: %s' \
        % (num_hit, ','.join([str(bug_id) for bug_id in hit])), 1)
    IRLog.get_instance().println('Hit %d nonduplicates: %s' \
        % (num_nothit, ','.join([str(bug_id) for bug_id in nothit])), 1)
    IRLog.get_instance().println('Actual %d duplicates: %s' \
        % (num_dup, ','.join([str(bug_id) for bug_id in duplicates])), 1)
    return precision, recall
def __generate_sample_over_a_list(self, infile, group_ids, sample_num, drop_rate):
    """ Conduct evaluation over the bugs within the groups in group ids
    group_ids: a list of group_ids
    sample_num: the number of bugs being sampled
    drop_rate: the probability of chance to drop a word
    """
    from ir_log import IRLog
    log = IRLog.get_instance()
    sampled = self.__get_sample_bugs_within_groups(group_ids, sample_num)
    for sampled_id in sampled:
        # degrade the sampled bug by randomly dropping words
        report = self.__generate_single_bug(sampled_id, drop_rate)
        log.println('%d' % sampled_id)
        infile.write('%s\n' % (report.to_string()))
def test_similarities_and_duplicates(self):
    """End-to-end similarity/duplicate detection for report 100000."""
    from ir_report import IRReport
    from ir_config import IRConfig
    from ir_log import IRLog
    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    report = IRReport(100000)
    similarities, duplicates = report.similarities_and_duplicates()
    log.println('Report %d' % (100000))
    log.println('%d Similar Reports: %s' % (
        len(similarities),
        ','.join([str(item[0]) for item in similarities])))
    log.println('%d Duplicate Reports: %s' % (
        len(duplicates),
        ','.join([str(item[0]) for item in duplicates])))
    log.stop_log()
def test_stemming(self):
    """Compare porter/lancaster/snowball stemmer output on sample words."""
    from ir_term_count import IRTermCount
    from ir_config import IRConfig
    from ir_log import IRLog
    cfg = IRConfig.get_instance()
    cfg.load('../data/test/bug_test.cfg')
    words = ['discrimination', 'disgusting', 'visualization',
             'configuration']
    for word in words:
        results = []
        for stemmer in ('porter', 'lancaster', 'snowball'):
            cfg.set('stemmer', stemmer)
            stemmed = IRTermCount.do_stemming([word])
            results.append(':'.join([stemmer, stemmed[0]]))
        IRLog.get_instance().println('%s > %s' % (word, ', '.join(results)))
def test_create_new_report(self):
    """A report built from raw text exposes that text and its tfidf."""
    from ir_report import IRReport
    from ir_config import IRConfig
    from ir_log import IRLog
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    summary_text = 'Firefox crashed'
    description_text = 'When I was openning history folder, the f**king' \
        ' Firefox just crashed!'
    report = IRReport(summary_text, description_text)
    assert report.get_summary_text() == summary_text
    assert report.get_description_text() == description_text
    # smoke-test the tfidf accessors
    report.get_summary_and_description_tfidf()
    report.get_summary_and_description_tfidf_squared_length()
    IRLog.get_instance().stop_log()
def compare_and_print_tfidf(cls, title_a, report_a, title_b, report_b):
    """Print a side-by-side tfidf comparison of two reports."""
    from ir_tfidf import IRTFIDF
    from ir_config import IRConfig
    from ir_log import IRLog
    cfg = IRConfig.get_instance()
    summary_field_name = cfg.get('bug_summary_name')
    description_field_name = cfg.get('bug_description_name')
    tfidf_a = report_a.get_summary_and_description_tfidf()
    tfidf_b = report_b.get_summary_and_description_tfidf()
    IRLog.get_instance().println('[TFIDF][Summary][%s][%s]' \
        % (title_a, title_b))
    IRTFIDF.show_dict_compare(tfidf_a[0], tfidf_b[0], summary_field_name)
    IRLog.get_instance().println('[TFIDF][Description][%s][%s]' \
        % (title_a, title_b))
    IRTFIDF.show_dict_compare(tfidf_a[1], tfidf_b[1],
        description_field_name)
def test_progress_bar(self):
    """Drive IRProgressBar through its full range in both verbosity modes.

    Bug fix: the test previously ended with start_log() again instead
    of stop_log(), leaving the log session it opened dangling.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    IRLog.get_instance().start_log(True)
    title = 'ProgressBar Output Not Verbose'
    bar = IRProgressBar(1000, title, False, 0, 1)
    assert bar is not None
    for i in range(0,1001):
        bar.set_value(i)
    title = 'ProgressBar Output Verbose'
    bar = IRProgressBar(1000, title, True, 1, 0)
    assert bar is not None
    for i in range(0,1001):
        bar.set_value(i)
    # close the session opened at the top of the test
    IRLog.get_instance().stop_log()
def test_is_in_same_duplicate_group(self):
    """Group-membership check of bug 100000 against a candidate list."""
    from ir_duplicate_group import IRDuplicateGroup
    from ir_config import IRConfig
    from ir_log import IRLog
    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    hit, nothit, duplicates = \
        IRDuplicateGroup.is_in_same_duplicate_group(100000,
                                                    [(100000, 0.93)])
    log.println(
        'Hit: %s' % (','.join([str(bug_id) for bug_id in hit])))
    log.println(
        'Not Hit: %s' % (','.join([str(bug_id) for bug_id in nothit])))
    log.println(
        'Actual Duplicate: %s' % (','.join([str(bug_id)
                                            for bug_id in duplicates])))
    log.stop_log()
def test_progress_bar(self):
    """Drive IRProgressBar through its full range in both verbosity modes.

    Bug fix: the test previously ended with start_log() again instead
    of stop_log(), leaving the log session it opened dangling.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    IRLog.get_instance().start_log(True)
    title = 'ProgressBar Output Not Verbose'
    bar = IRProgressBar(1000, title, False, 0, 1)
    assert bar is not None
    for i in range(0, 1001):
        bar.set_value(i)
    title = 'ProgressBar Output Verbose'
    bar = IRProgressBar(1000, title, True, 1, 0)
    assert bar is not None
    for i in range(0, 1001):
        bar.set_value(i)
    # close the session opened at the top of the test
    IRLog.get_instance().stop_log()
def test_tfidf_asm_similarity(self):
    """Asymmetric tfidf similarity on small hand-built vectors."""
    from ir_tfidf import IRTFIDF
    from ir_config import IRConfig
    from ir_log import IRLog
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    base = {'firefox':1, 'chrome':1}
    superset = {'firefox':1, 'chrome':1, 'ie':1}
    partial = {'firefox':1, 'windows':1, 'linux':1}
    delta = 0.0001
    # base fully contained in the other vector -> similarity 1.0
    assert abs(1.0 - IRTFIDF.tfidf_asm_similarity(base, superset)) < delta
    # only one of base's two terms matches -> similarity 0.5
    assert abs(0.5 - IRTFIDF.tfidf_asm_similarity(base, partial)) < delta
    # the extra args (None, ['ie'], 100) should strictly lower the score
    assert IRTFIDF.tfidf_asm_similarity(base, superset) > \
        IRTFIDF.tfidf_asm_similarity(base, superset, None, ['ie'], 100)
def get_artifact(self):
    """ Check whether all the dependencies are updated.
    (If not, update the dependencies recursively)
    Check whether this artifact needs updating.
    """
    # refresh every dependency before considering this artifact
    for dependency in self.dependencies.values():
        dependency.get_artifact()
    # regenerate only when an action exists and the artifact is stale
    if self.action is not None and self.__need_update():
        self.action()
    if not self.is_success():
        from ir_log import IRLog
        IRLog.get_instance().println("Fail to generate %s." % self.id)
        assert False
def test_similarities_and_duplicates(self):
    """Similar/duplicate report lists can be computed for report 100000."""
    from ir_config import IRConfig
    from ir_log import IRLog
    from ir_report import IRReport
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    report = IRReport(100000)
    similar, duplicate = report.similarities_and_duplicates()
    IRLog.get_instance().println('Report %d' % (100000))
    IRLog.get_instance().println(
        '%d Similar Reports: %s' % (len(similar),
            ','.join([str(pair[0]) for pair in similar])))
    IRLog.get_instance().println(
        '%d Duplicate Reports: %s' % (len(duplicate),
            ','.join([str(pair[0]) for pair in duplicate])))
    IRLog.get_instance().stop_log()
def test_create_new_sentence(self):
    """An IRSentence keeps its text/bug id and exposes term features."""
    from ir_sentence import IRSentence
    from ir_config import IRConfig
    from ir_log import IRLog
    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    bug_id = 10000
    description_text = 'When I was openning history folder, the f**king' \
        ' Firefox just crashed!'
    sent = IRSentence(description_text, bug_id)
    assert sent.get_text() == description_text
    assert sent.get_bug_id() == bug_id
    assert sent.contain_term('folder')
    # smoke-test the feature accessors
    sent.get_termcount()
    sent.get_tfidf()
    IRLog.get_instance().stop_log()
def show_dict_compare(cls, dicta, dictb, field_name = 'summ', log_level = 1):
    """Compare and print the tfidf of two tfidf vectors, sorted by the
    product of the paired weights.

    Args:
        dicta: dict, TFIDF, term -> weight (may be None)
        dictb: dict, TFIDF, term -> weight (may be None)
        field_name: str, summary or description field key for the
            document-count lookup
        log_level: int, verbosity level passed to the logger
    """
    from ir_log import IRLog
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')
    # union of terms appearing in either vector
    keys = set()
    if None != dicta:
        for key in dicta:
            keys.add(key)
    if None != dictb:
        for key in dictb:
            keys.add(key)
    # sort by product
    product = []
    for key in keys:
        tfidf_a = 0.0
        tfidf_b = 0.0
        if (None != dicta) and (key in dicta):
            tfidf_a = dicta[key]
        if (None != dictb) and (key in dictb):
            tfidf_b = dictb[key]
        documentcount = IRDocumentCount.get_documentcount(
            key, field_name, documentcount_collection)
        idf = cls.get_idf(documentcount)
        # row layout: (term, a*b product, tfidf_a, tfidf_b, doccount, idf)
        product.append((key, tfidf_a*tfidf_b, tfidf_a, tfidf_b,
            documentcount, idf))
    product.sort(cmp=lambda a,b:cmp(a[1],b[1]), reverse = True)
    # print it out; 'sim' in the header is the a*b product column
    IRLog.get_instance().println('%16s\t%8s\t%8s\t%8s\t%8s\t%8s' \
        % ('term', 'tfidf a', 'tfidf b', 'doccount', 'idf', 'sim'))
    for item in product:
        IRLog.get_instance().println('%16s\t%8f\t%8f\t%8d\t%8f\t%8f' \
            % (item[0], item[2], item[3], item[4], item[5], item[1]), log_level)