def get_summary_and_description_of_bug(cls, bug_id):
    """Fetch the summary and description text of one bug.

    When caching is enabled and the bug is already cached, the cached
    pair is returned. Otherwise the text collection is queried by bug id
    and, if caching is enabled, the result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        (str, str), (summary, description). Both are empty strings when
        the query matches no document.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    if cls.__is_cache and bug_id in cls.__cache_summary_description:
        return cls.__cache_summary_description[bug_id]

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        text_pair = (cursor[0][summary_key], cursor[0][description_key])
    else:
        text_pair = ('', '')

    if cls.__is_cache:
        cls.__cache_summary_description[bug_id] = text_pair
    return text_pair
def cache_all_data(cls):
    """Switch caching on and preload every document count.

    Each document of the document-count collection is stored in the
    in-memory cache as term -> (summary count, description count). A
    count that is missing from a document is cached as 0.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    term_key = config.get('bug_term_name')

    cls.__is_cache = True
    collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')

    def cache_one(entry):
        # Build the (summary, description) pair, defaulting absent
        # fields to 0.
        counts = []
        for key in (summary_key, description_key):
            if key in entry:
                counts.append(entry[key])
            else:
                counts.append(0)
        cls.__cache_document_count[entry[term_key]] = tuple(counts)

    IRProgressBar.execute_iteration_for_cursor(
        collection.find({}), cache_one, "Caching Document Count")
def get_tfidf_of_bug(cls, bug_id):
    """Look up the TFIDF vectors of one bug.

    When caching is enabled and the bug is already cached, the cached
    pair is returned without touching the database. Otherwise the TFIDF
    collection is queried by bug id and, if caching is enabled, the
    result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (TFIDF of summary, TFIDF of description). Both are
        empty dicts when the query matches no document.
    """
    if cls.__is_cache and bug_id in cls.__cache:
        return cls.__cache[bug_id]

    # Not cached (or caching disabled): read from the database.
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        tfidf_pair = (cursor[0][summary_key], cursor[0][description_key])
    else:
        tfidf_pair = ({}, {})

    if cls.__is_cache:
        cls.__cache[bug_id] = tfidf_pair
    return tfidf_pair
def get_tfidf_of_bug(cls, bug_id):
    """Look up the TFIDF vectors of one bug.

    When caching is enabled and the bug is already cached, the cached
    pair is returned without touching the database. Otherwise the TFIDF
    collection is queried by bug id and, if caching is enabled, the
    result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (TFIDF of summary, TFIDF of description). Both are
        empty dicts when the query matches no document.
    """
    if cls.__is_cache and bug_id in cls.__cache:
        return cls.__cache[bug_id]

    # Not cached (or caching disabled): read from the database.
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        tfidf_pair = (cursor[0][summary_key], cursor[0][description_key])
    else:
        tfidf_pair = ({}, {})

    if cls.__is_cache:
        cls.__cache[bug_id] = tfidf_pair
    return tfidf_pair
def batch_generate_tfidf(cls):
    """Compute TFIDF for every term-count record and store the results.

    The document counts are cached first. Then, for each record yielded
    by IRTermCount.get_iterator(), the summary and description TFIDF are
    calculated with the configured 'tfidf_algorithm' and inserted into
    the TFIDF collection. Finally an ascending index on the bug id is
    created and the collection is closed.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    from ir_term_count import IRTermCount

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    algorithm = config.get('tfidf_algorithm')

    IRDocumentCount.cache_all_data()
    output = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'w')

    records = IRTermCount.get_iterator()
    # The number of term-count records is used as the total document
    # count.
    total = records.count()

    def store_tfidf(record):
        tfidf_by_field = [
            cls.calculate_tfidf(record[field], field, total, None,
                                algorithm)
            for field in (summary_key, description_key)
        ]
        output.insert({
            id_key: record[id_key],
            summary_key: tfidf_by_field[0],
            description_key: tfidf_by_field[1],
        })

    IRProgressBar.execute_iteration_for_cursor(
        records, store_tfidf, "Calculating TFIDF")
    output.create_index([(id_key, IRCollection.ASCENDING)])
    output.close()
def get_summary_and_description_of_bug(cls, bug_id):
    """Fetch the summary and description text of one bug.

    When caching is enabled and the bug is already cached, the cached
    pair is returned. Otherwise the text collection is queried by bug id
    and, if caching is enabled, the result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        (str, str), (summary, description). Both are empty strings when
        the query matches no document.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    if cls.__is_cache and bug_id in cls.__cache_summary_description:
        return cls.__cache_summary_description[bug_id]

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        text_pair = (cursor[0][summary_key], cursor[0][description_key])
    else:
        text_pair = ('', '')

    if cls.__is_cache:
        cls.__cache_summary_description[bug_id] = text_pair
    return text_pair
def get_termcount_of_bug(cls, bug_id):
    """Look up the term counts of one bug.

    When caching is enabled and the bug is already cached, the cached
    pair is returned. Otherwise the termcount collection is queried by
    bug id and, if caching is enabled, the result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        (dict, dict), (termcount of summary, termcount of description).
        Both are empty dicts when the query matches no document.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    if cls.__is_cache and bug_id in cls.__cache_term_count:
        return cls.__cache_term_count[bug_id]

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        count_pair = (cursor[0][summary_key], cursor[0][description_key])
    else:
        count_pair = ({}, {})

    if cls.__is_cache:
        cls.__cache_term_count[bug_id] = count_pair
    return count_pair
def get_stacktrace_of_bug(cls, bug_id):
    """Fetch the stored stacktrace of one bug.

    When caching is enabled and the bug is already cached, the cached
    value is returned without touching the database. Otherwise the text
    collection is queried by bug id and, if caching is enabled, the
    result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        [[str]], [[signature]]; an empty list when the query matches no
        document.
    """
    if cls.__is_cache and bug_id in cls.__cache_stacktrace:
        return cls.__cache_stacktrace[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    stacktrace_key = config.get('bug_stacktrace_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        frames = cursor[0][stacktrace_key]
    else:
        frames = []

    if cls.__is_cache:
        cls.__cache_stacktrace[bug_id] = frames
    return frames
def cache_all_data(cls):
    """Switch caching on and preload all text data.

    Both caches are reset, then every document of the text collection is
    stored as bug id -> (summary, description) and bug id -> stacktrace.
    The collection is closed afterwards.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    stacktrace_key = config.get('bug_stacktrace_name')

    cls.set_is_cache(True)
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    cls.__cache_summary_description = {}
    cls.__cache_stacktrace = {}

    def remember(record):
        text_pair = (record[summary_key], record[description_key])
        bug_id = record[id_key]
        cls.__cache_summary_description[bug_id] = text_pair
        cls.__cache_stacktrace[bug_id] = record[stacktrace_key]

    IRProgressBar.execute_iteration_for_cursor(
        collection.find(), remember, 'Caching Text Data')
    collection.close()
def get_stacktrace_of_bug(cls, bug_id):
    """Fetch the stored stacktrace of one bug.

    When caching is enabled and the bug is already cached, the cached
    value is returned without touching the database. Otherwise the text
    collection is queried by bug id and, if caching is enabled, the
    result is stored in the cache.

    Args:
        bug_id: int

    Returns:
        [[str]], [[signature]]; an empty list when the query matches no
        document.
    """
    if cls.__is_cache and bug_id in cls.__cache_stacktrace:
        return cls.__cache_stacktrace[bug_id]

    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    stacktrace_key = config.get('bug_stacktrace_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() > 0:
        frames = cursor[0][stacktrace_key]
    else:
        frames = []

    if cls.__is_cache:
        cls.__cache_stacktrace[bug_id] = frames
    return frames
def similarity_over_all(self):
    """Calculate similarity between this report and the stored reports.

    The candidates are the documents of the basic collection that have
    the same product as this report, a creation timestamp greater than
    this report's creation timestamp minus the search time span, and an
    id that is not in the excluded report ids. The report carrying the
    dummy bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
        stacktrace_score]}, each value being the result of
        similarity_with() for that report.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    # Two 365-day years; presumably seconds, i.e. create_ts is assumed
    # to be a Unix timestamp -- TODO confirm.
    search_time_span = 2 * 3600 * 24 * 365

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    })

    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # Debug output. The parenthesised form prints exactly what the
    # Python 2 print statement did and is also valid Python 3.
    print(self.__summary_text)
    print(self.__description_text)

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of
        # self.__stacktrace being none, we create and fill the info of
        # report manually
        other_report = IRReport("", "")
        summary_tfidf, description_tfidf = IRTFIDF.get_tfidf_of_bug(bug_id)
        other_report.__summary_tfidf = summary_tfidf
        other_report.__description_tfidf = description_tfidf
        # if self.__stacktrace is empty, we don't need to do this
        own_stacktrace = self.get_stacktrace()
        if own_stacktrace is not None and len(own_stacktrace) > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        if other_report.__stacktrace is None:
            other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def get_documentcount(cls, term, field=None, documentcount_collection=None):
    """Return how many documents contain a term.

    A cached value is used when caching is enabled and the term is
    cached. Otherwise the document-count collection is queried by term
    and, if caching is enabled, the result is stored in the cache.

    Args:
        term: str
        field: str or None, name of the summary or description field.
        documentcount_collection: collection to query; when None, the
            document-count collection is opened here.

    Returns:
        if field is None:
            (int, int), (summary document count, description document
            count)
        else:
            int, the document count of the corresponding field; 0 when
            field is neither the summary nor the description name.
    """
    if cls.__is_cache and term in cls.__cache_document_count:
        cached = cls.__cache_document_count[term]
        if field is None:
            return cached
        from ir_config import IRConfig
        summary_key = IRConfig.get_instance().get('bug_summary_name')
        description_key = IRConfig.get_instance().get(
            'bug_description_name')
        if field == summary_key:
            return cached[0]
        if field == description_key:
            return cached[1]
        return 0

    # Not cached (or caching disabled): read from the database.
    from ir_mongodb_helper import IRCollection
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    term_key = config.get('bug_term_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    if documentcount_collection is None:
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')
    cursor = documentcount_collection.find({term_key: term})

    summary_count = 0
    description_count = 0
    if cursor.count() > 0:
        if summary_key in cursor[0]:
            summary_count = cursor[0][summary_key]
        if description_key in cursor[0]:
            description_count = cursor[0][description_key]

    if cls.__is_cache:
        cls.__cache_document_count[term] = (summary_count,
                                            description_count)

    if field is None:
        return summary_count, description_count
    if field == summary_key:
        return summary_count
    if field == description_key:
        return description_count
    return 0
def similarity_over_all(self):
    """Calculate similarity between this report and the stored reports.

    The candidates are the documents of the basic collection that have
    the same product as this report, a creation timestamp greater than
    this report's creation timestamp minus the search time span, and an
    id that is not in the excluded report ids. The report carrying the
    dummy bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
        stacktrace_score]}, each value being the result of
        similarity_with() for that report.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    # Two 365-day years; presumably seconds, i.e. create_ts is assumed
    # to be a Unix timestamp -- TODO confirm.
    search_time_span = 2 * 3600 * 24 * 365

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    })

    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # Debug output. The parenthesised form prints exactly what the
    # Python 2 print statement did and is also valid Python 3.
    print(self.__summary_text)
    print(self.__description_text)

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # because we don't want to load stacktrace in case of
        # self.__stacktrace being none, we create and fill the info of
        # report manually
        other_report = IRReport("", "")
        summary_tfidf, description_tfidf = IRTFIDF.get_tfidf_of_bug(bug_id)
        other_report.__summary_tfidf = summary_tfidf
        other_report.__description_tfidf = description_tfidf
        # if self.__stacktrace is empty, we don't need to do this
        own_stacktrace = self.get_stacktrace()
        if own_stacktrace is not None and len(own_stacktrace) > 0:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        if other_report.__stacktrace is None:
            other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def get_documentcount(cls, term, field=None, documentcount_collection=None):
    """Return how many documents contain a term.

    A cached value is used when caching is enabled and the term is
    cached. Otherwise the document-count collection is queried by term
    and, if caching is enabled, the result is stored in the cache.

    Args:
        term: str
        field: str or None, name of the summary or description field.
        documentcount_collection: collection to query; when None, the
            document-count collection is opened here.

    Returns:
        if field is None:
            (int, int), (summary document count, description document
            count)
        else:
            int, the document count of the corresponding field; 0 when
            field is neither the summary nor the description name.
    """
    if cls.__is_cache and term in cls.__cache_document_count:
        cached = cls.__cache_document_count[term]
        if field is None:
            return cached
        from ir_config import IRConfig
        summary_key = IRConfig.get_instance().get('bug_summary_name')
        description_key = IRConfig.get_instance().get(
            'bug_description_name')
        if field == summary_key:
            return cached[0]
        if field == description_key:
            return cached[1]
        return 0

    # Not cached (or caching disabled): read from the database.
    from ir_mongodb_helper import IRCollection
    from ir_config import IRConfig

    config = IRConfig.get_instance()
    term_key = config.get('bug_term_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    if documentcount_collection is None:
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')
    cursor = documentcount_collection.find({term_key: term})

    summary_count = 0
    description_count = 0
    if cursor.count() > 0:
        if summary_key in cursor[0]:
            summary_count = cursor[0][summary_key]
        if description_key in cursor[0]:
            description_count = cursor[0][description_key]

    if cls.__is_cache:
        cls.__cache_document_count[term] = (summary_count,
                                            description_count)

    if field is None:
        return summary_count, description_count
    if field == summary_key:
        return summary_count
    if field == description_key:
        return description_count
    return 0
def get_iterator(cls, arg):
    """Query the bug text collection.

    Args:
        arg: dict, query condition handed to find().

    Returns:
        cursor, the result of find(arg) on the text collection.
    """
    from ir_mongodb_helper import IRCollection

    return IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(arg)
def get_iterator(cls, arg):
    """Query the bug text collection.

    Args:
        arg: dict, query condition handed to find().

    Returns:
        cursor, the result of find(arg) on the text collection.
    """
    from ir_mongodb_helper import IRCollection

    return IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r').find(arg)
def get_iterator(cls, arg=None):
    """Query the termcount collection.

    Args:
        arg: dict, query condition handed to find(). Any falsy value
            (None, empty dict) is replaced by an empty condition.

    Returns:
        cursor, the result of find() on the termcount collection.
    """
    condition = arg if arg else {}

    from ir_mongodb_helper import IRCollection

    return IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'r').find(
            condition)
def get_total_report_number(cls):
    """Return the number of documents in the termcount collection.

    When caching is enabled, a previously stored total is returned
    without touching the database, and a freshly counted total is
    remembered for later calls.

    Returns:
        int
    """
    if cls.__is_cache:
        remembered = cls.__total_report_number
        if remembered is not None:
            return remembered

    from ir_mongodb_helper import IRCollection

    counted = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'r').count()
    if cls.__is_cache:
        cls.__total_report_number = counted
    return counted
def get_total_report_number(cls):
    """Return the number of documents in the termcount collection.

    When caching is enabled, a previously stored total is returned
    without touching the database, and a freshly counted total is
    remembered for later calls.

    Returns:
        int
    """
    if cls.__is_cache:
        remembered = cls.__total_report_number
        if remembered is not None:
            return remembered

    from ir_mongodb_helper import IRCollection

    counted = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'r').count()
    if cls.__is_cache:
        cls.__total_report_number = counted
    return counted
def calculate_tfidf_for_report_termcount(cls, summary_termcount,
                                         description_termcount):
    """Calculate the TFIDF vectors of a single report.

    Both vectors are computed by calculate_tfidf() with the total report
    number as the document total and one shared document-count
    collection.

    Args:
        summary_termcount: dict, {term -> termcount}
        description_termcount: dict, {term -> termcount}

    Returns:
        (dict, dict), (tfidf of summary, tfidf of description)
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    shared_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')
    report_total = cls.get_total_report_number()

    inputs = (
        (summary_termcount, 'bug_summary_name'),
        (description_termcount, 'bug_description_name'),
    )
    tfidf_vectors = []
    for termcount, config_key in inputs:
        field_name = IRConfig.get_instance().get(config_key)
        tfidf_vectors.append(cls.calculate_tfidf(
            termcount, field_name, report_total, shared_collection))
    return tfidf_vectors[0], tfidf_vectors[1]
def cache_all_data(cls):
    """Switch caching on and preload every stored TFIDF pair.

    The cache is reset, then each document of the TFIDF collection is
    stored as bug id -> (summary TFIDF, description TFIDF).
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r')
    cls.set_is_cache(True)
    cls.__cache = {}

    def remember(record):
        tfidf_pair = (record[summary_key], record[description_key])
        cls.__cache[record[id_key]] = tfidf_pair

    IRProgressBar.execute_iteration_for_cursor(
        collection.find(), remember, "Caching TFIDF")
def show_distribution_on_product_and_create_ts(cls):
    """Log the creation-time span and product count of each duplicate group.

    For every distinct group id in the duplicate collection, the basic
    record of each member bug is looked up. Members without a basic
    record are ignored. One line per group is logged at level 2 with the
    difference between the largest and smallest creation timestamp and
    the number of distinct products.

    A group with no basic record at all is reported with a span of 0.
    The earlier sentinel-based min/max logged a meaningless negative
    span in that case.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    bug2group_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    group_name = config.get('bug_group_name')
    product_name = config.get('bug_product_name')
    create_ts_name = config.get('bug_create_ts_name')

    group_ids = bug2group_collection.distinct(group_name)
    progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
    for group_num, group_id in enumerate(group_ids, 1):
        progress_bar.set_value(group_num)
        timestamps = []
        product_set = set()
        for bug in bug2group_collection.find({group_name: group_id}):
            basic = basic_collection.find({bug_id_name: bug[bug_id_name]})
            if basic.count() == 0:
                continue
            # Read the matching document once instead of once per field.
            info = basic[0]
            timestamps.append(info[create_ts_name])
            product_set.add(info[product_name])
        if timestamps:
            ts_span = max(timestamps) - min(timestamps)
        else:
            ts_span = 0
        IRLog.get_instance().println(
            'ts span:%d;product number:%d' % (ts_span, len(product_set)),
            2)
def test_parse_info_level0(self):
    """Check that parse_info_level0 leaves no entry for bug 102500."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_duplicate_group import IRDuplicateGroup

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    parser = IRDuplicateGroup()
    parser.parse_info_level0('../data/test/info_level0_test')

    # Incomplete bugs must have been removed from the duplicate
    # collection.
    bug2group = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    assert bug2group is not None
    matches = bug2group.find({'bug_id': 102500})
    assert matches.count() == 0

    IRLog.get_instance().stop_log()
def get_bugs_in_group(cls, group_id):
    """List the bugs that belong to one duplicate group.

    Args:
        group_id: int

    Returns:
        [int], [bug_id], in the order the duplicate collection yields
        the matching documents.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    group_key = config.get('bug_group_name')

    bug_ids = []
    for entry in collection.find({group_key: group_id}):
        bug_ids.append(entry[id_key])
    return bug_ids
def test_parse_info_level0(self):
    """Check that parse_info_level0 leaves no entry for bug 102500."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_duplicate_group import IRDuplicateGroup

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    parser = IRDuplicateGroup()
    parser.parse_info_level0('../data/test/info_level0_test')

    # Incomplete bugs must have been removed from the duplicate
    # collection.
    bug2group = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    assert bug2group is not None
    matches = bug2group.find({'bug_id': 102500})
    assert matches.count() == 0

    IRLog.get_instance().stop_log()
def cache_all_data(cls):
    """Switch caching on and preload every stored TFIDF pair.

    The cache is reset, then each document of the TFIDF collection is
    stored as bug id -> (summary TFIDF, description TFIDF).
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')

    collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'r')
    cls.set_is_cache(True)
    cls.__cache = {}

    def remember(record):
        tfidf_pair = (record[summary_key], record[description_key])
        cls.__cache[record[id_key]] = tfidf_pair

    IRProgressBar.execute_iteration_for_cursor(
        collection.find(), remember, "Caching TFIDF")
def get_bugs_in_group(cls, group_id):
    """List the bugs that belong to one duplicate group.

    Args:
        group_id: int

    Returns:
        [int], [bug_id], in the order the duplicate collection yields
        the matching documents.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    group_key = config.get('bug_group_name')

    bug_ids = []
    for entry in collection.find({group_key: group_id}):
        bug_ids.append(entry[id_key])
    return bug_ids
def get_basic_info_of_bug(cls, bug_id):
    """Fetch the creation timestamp and product of one bug.

    Args:
        bug_id: int

    Returns:
        (int, str): (create_ts, product); (-1, '') when the basic
        collection holds no document for the bug.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    create_ts_key = config.get('bug_create_ts_name')
    product_key = config.get('bug_product_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() <= 0:
        return -1, ''
    return cursor[0][create_ts_key], cursor[0][product_key]
def get_basic_info_of_bug(cls, bug_id):
    """Fetch the creation timestamp and product of one bug.

    Args:
        bug_id: int

    Returns:
        (int, str): (create_ts, product); (-1, '') when the basic
        collection holds no document for the bug.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    create_ts_key = config.get('bug_create_ts_name')
    product_key = config.get('bug_product_name')

    cursor = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r').find(
            {id_key: bug_id})
    if cursor.count() <= 0:
        return -1, ''
    return cursor[0][create_ts_key], cursor[0][product_key]
def parse_dump_basic_file(cls, dump_filename=None):
    # Not finished yet
    """Extract basic information from a mysql dump and insert it into mongodb.

    Each line of the dump file is parsed into (bug id, product, creation
    timestamp) and inserted into the basic collection. Afterwards an
    ascending index on the bug id is created and the collection is
    closed.

    Args:
        dump_filename: str, Filename of dump file. If this parameter is
            not given, dump_filename will be fetched from config file
            (key 'bug_dump_basic_filename').
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    product_name = config.get('bug_product_name', 'product')
    create_ts_name = config.get('bug_create_ts_name', 'ts')

    collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')

    # load and insert text file
    if dump_filename is None:
        dump_filename = config.get('bug_dump_basic_filename')

    def iter_for_line(line):
        # TODO here
        bug_id, product, ts = \
            cls.__extract_basic_from_dump_file_line__(line)
        collection.insert({
            bug_id_name: int(bug_id),
            product_name: product,
            create_ts_name: int(ts),
        })

    # The with-block closes the dump file even when parsing or
    # inserting a line raises.
    with open(dump_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, iter_for_line, 'Parsing Dump Basic')

    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def batch_generate_term_count(cls):
    """Compute the term counts of every stored text and save them.

    Each document yielded by IRText.get_iterator({}) is passed through
    calculate_term_count() and the resulting summary and description
    term counts are inserted into the termcount collection. Afterwards
    an ascending index on the bug id is created and the collection is
    closed.
    """
    from ir_log import IRProgressBar
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name', 'bug_id')
    summary_key = config.get('bug_summary_name', 'summ')
    description_key = config.get('bug_description_name', 'desc')

    output = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'w')

    def store_term_count(record):
        term_counts = cls.calculate_term_count(
            record[summary_key], record[description_key])
        summary_counts, description_counts = term_counts
        output.insert({
            id_key: record[id_key],
            summary_key: summary_counts,
            description_key: description_counts,
        })

    IRProgressBar.execute_iteration_for_cursor(
        IRText.get_iterator({}), store_term_count,
        "From Text to Term Count")
    output.create_index([(id_key, IRCollection.ASCENDING)])
    output.close()
def get_duplicate_group_information(cls, group_size_min, group_size_max):
    """Find the duplicate groups whose size lies between two bounds.

    Args:
        group_size_min: int, exclusive lower bound ("$gt") on the group
            size.
        group_size_max: int, exclusive upper bound ("$lt") on the group
            size.

    Returns:
        [int], [group_id]
    """
    # IRLog is imported here although nothing below uses it.
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    group_counts = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'r')
    config = IRConfig.get_instance()
    group_key = config.get('bug_group_name')
    size_key = config.get('bug_group_size')

    size_range = {"$gt": group_size_min, "$lt": group_size_max}
    group_ids = []
    for entry in group_counts.find({size_key: size_range}):
        group_ids.append(entry[group_key])
    return group_ids
def batch_generate_document_count(cls):
    """Batch calculate term count over documents.

    Input is from mongodb, termcount collection. For every term, the
    number of termcount records whose summary (respectively description)
    contains it is accumulated in memory and then written to the
    document-count collection, one document per term.

    The index is created on the term field, because that is the only
    lookup key the stored documents carry. The earlier index on the bug
    id referred to a field these documents never contain.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_term_count import IRTermCount

    config = IRConfig.get_instance()
    term_name = config.get('bug_term_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')

    # Calculate document count and stored in document_count
    document_count = {}

    def iter_term_count(bug):
        # Iterating a field yields each of its terms; presumably the
        # field is a {term -> count} mapping, so a term counts once per
        # record -- TODO confirm against the termcount schema.
        for field in (summary_name, description_name):
            for term in bug[field]:
                if term not in document_count:
                    document_count[term] = {
                        term_name: term,
                        summary_name: 0,
                        description_name: 0,
                    }
                document_count[term][field] += 1

    IRProgressBar.execute_iteration_for_cursor(
        IRTermCount.get_iterator({}), iter_term_count,
        "Counting Document Count")

    # Write to db
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'w')

    def write_to_mongo(term):
        documentcount_collection.insert(document_count[term])

    IRProgressBar.execute_iteration_for_dict(
        document_count, write_to_mongo, "Write to database")
    documentcount_collection.create_index(
        [(term_name, IRCollection.ASCENDING)])
    documentcount_collection.close()
def get_group_of_bug(cls, bug_id):
    """Look up the duplicate group a bug belongs to.

    Args:
        bug_id: int

    Returns:
        int, group_id; None when the duplicate collection holds no
        document for the bug.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    group_key = config.get('bug_group_name')

    cursor = collection.find({id_key: bug_id})
    if cursor.count() != 0:
        return cursor[0][group_key]
    return None
def cache_all_data(cls):
    """Switch caching on and preload every document count.

    Each document of the document-count collection is stored in the
    in-memory cache as term -> (summary count, description count). A
    count that is missing from a document is cached as 0.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    term_key = config.get('bug_term_name')

    cls.__is_cache = True
    collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')

    def cache_one(entry):
        # Build the (summary, description) pair, defaulting absent
        # fields to 0.
        counts = []
        for key in (summary_key, description_key):
            if key in entry:
                counts.append(entry[key])
            else:
                counts.append(0)
        cls.__cache_document_count[entry[term_key]] = tuple(counts)

    IRProgressBar.execute_iteration_for_cursor(
        collection.find({}), cache_one, "Caching Document Count")
def cache_all_data(cls):
    """Switch caching on and preload all text data.

    Both caches are reset, then every document of the text collection is
    stored as bug id -> (summary, description) and bug id -> stacktrace.
    The collection is closed afterwards.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    stacktrace_key = config.get('bug_stacktrace_name')

    cls.set_is_cache(True)
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'r')
    cls.__cache_summary_description = {}
    cls.__cache_stacktrace = {}

    def remember(record):
        text_pair = (record[summary_key], record[description_key])
        bug_id = record[id_key]
        cls.__cache_summary_description[bug_id] = text_pair
        cls.__cache_stacktrace[bug_id] = record[stacktrace_key]

    IRProgressBar.execute_iteration_for_cursor(
        collection.find(), remember, 'Caching Text Data')
    collection.close()
def parse_dump_file(cls, dump_filename=None):
    """Extract text from a mysql dump and insert it into mongodb.

    Each line of the dump file is parsed into (bug id, summary,
    description) and inserted into the text collection. Afterwards an
    ascending index on the bug id is created and the collection is
    closed.

    Args:
        dump_filename: str, Filename of dump file. If this parameter is
            not given, dump_filename will be fetched from config file
            (key 'bug_dump_text_filename').
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # get key name
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')

    # collection
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')

    # load and insert text file
    if dump_filename is None:
        dump_filename = config.get('bug_dump_text_filename')

    def iter_for_line(line):
        bug_id, summary, description = \
            cls.__extract_summary_and_description_from_dump_file_line(line)
        collection.insert({
            bug_id_name: int(bug_id),
            summary_name: summary,
            description_name: description,
        })

    # The with-block closes the dump file even when parsing or
    # inserting a line raises.
    with open(dump_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, iter_for_line, 'Parsing Dump')

    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def get_group_of_bug(cls, bug_id):
    """Look up the duplicate group a bug belongs to.

    Args:
        bug_id: int

    Returns:
        int, the group id stored for the bug, or None when the duplicate
        collection holds no document for this bug id.
    """
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    dup_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    group_key = config.get('bug_group_name')

    matches = dup_collection.find({id_key: bug_id})
    if matches.count() != 0:
        # Only the first matching document is consulted.
        return matches[0][group_key]
    return None
def show_dict_compare(cls, dicta, dictb, field_name='summ', log_level=1):
    """Compare and print two tfidf vectors term by term.

    One row is printed per term that occurs in either vector. Rows are
    sorted by the product of the two weights, largest first.

    Args:
        dicta: dict, TFIDF (term -> weight); None is treated as empty
        dictb: dict, TFIDF (term -> weight); None is treated as empty
        field_name: str, summary or description?
        log_level: int, level handed to IRLog.println for the header
            and for every row
    """
    from ir_log import IRLog
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount

    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'r')
    try:
        vec_a = dicta if dicta is not None else {}
        vec_b = dictb if dictb is not None else {}
        keys = set()
        keys.update(vec_a)
        keys.update(vec_b)

        # One tuple per term:
        # (term, product, tfidf a, tfidf b, document count, idf)
        product = []
        for key in keys:
            tfidf_a = vec_a[key] if key in vec_a else 0.0
            tfidf_b = vec_b[key] if key in vec_b else 0.0
            documentcount = IRDocumentCount.get_documentcount(
                key, field_name, documentcount_collection)
            idf = cls.get_idf(documentcount)
            product.append(
                (key, tfidf_a * tfidf_b, tfidf_a, tfidf_b,
                 documentcount, idf))
    finally:
        # The collection is only needed for the document-count lookups.
        documentcount_collection.close()

    # sort by product; key= (instead of cmp=) also runs on Python 3 and
    # keeps the same stable descending order
    product.sort(key=lambda item: item[1], reverse=True)

    # print it out; the header uses the same log level as the rows so it
    # is never shown (or hidden) on its own
    log = IRLog.get_instance()
    log.println('%16s\t%8s\t%8s\t%8s\t%8s\t%8s'
                % ('term', 'tfidf a', 'tfidf b', 'doccount', 'idf', 'sim'),
                log_level)
    for item in product:
        log.println('%16s\t%8f\t%8f\t%8d\t%8f\t%8f'
                    % (item[0], item[2], item[3], item[4], item[5],
                       item[1]),
                    log_level)
def batch_generate_document_count(cls):
    """Batch calculate term count over documents.

    Input is from mongodb, termcount collection. For every term the
    number of bugs whose summary contains it and the number of bugs whose
    description contains it are counted, and one document per term is
    written to the documentcount collection.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_term_count import IRTermCount

    config = IRConfig.get_instance()
    term_name = config.get('bug_term_name')
    summary_name = config.get('bug_summary_name')
    description_name = config.get('bug_description_name')

    # Calculate document count and stored in document_count
    document_count = {}

    def count_terms(terms, field_name):
        # Add one to the field_name counter of every term in terms,
        # creating the per-term record on first sight.
        for term in terms:
            if term not in document_count:
                document_count[term] = {
                    term_name: term,
                    summary_name: 0,
                    description_name: 0,
                }
            document_count[term][field_name] += 1

    def iter_term_count(bug):
        count_terms(bug[summary_name], summary_name)
        count_terms(bug[description_name], description_name)

    IRProgressBar.execute_iteration_for_cursor(
        IRTermCount.get_iterator({}), iter_term_count,
        "Counting Document Count")

    # Write to db
    documentcount_collection = IRCollection(
        'bug_db_name', 'bug_documentcount_collection_name', 'w')

    def write_to_mongo(term):
        documentcount_collection.insert(document_count[term])

    IRProgressBar.execute_iteration_for_dict(
        document_count, write_to_mongo, "Write to database")
    # The stored documents are keyed by term and carry no bug id field,
    # so the index is built on the term field (it was on the bug id).
    documentcount_collection.create_index(
        [(term_name, IRCollection.ASCENDING)])
    documentcount_collection.close()
def show_distribution_on_product_and_create_ts(cls):
    """Show the distribution of create time and number of products on
    each duplicate group.

    For every duplicate group one line is logged with the span between
    the earliest and latest creation timestamp of its bugs and the number
    of distinct products. Bugs without a document in the basic collection
    are ignored, and a group in which no bug has one is skipped.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    bug2group_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    try:
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        group_name = config.get('bug_group_name')
        product_name = config.get('bug_product_name')
        create_ts_name = config.get('bug_create_ts_name')

        group_ids = bug2group_collection.distinct(group_name)
        progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
        for group_num, group_id in enumerate(group_ids, 1):
            progress_bar.set_value(group_num)
            timestamps = []
            product_set = set()
            for bug in bug2group_collection.find({group_name: group_id}):
                basic = basic_collection.find(
                    {bug_id_name: bug[bug_id_name]})
                if basic.count() == 0:
                    continue
                record = basic[0]
                timestamps.append(record[create_ts_name])
                product_set.add(record[product_name])
            if not timestamps:
                # No basic information for any bug of this group: there
                # is no meaningful span to report. The sentinel-based
                # version printed a large negative span here.
                continue
            IRLog.get_instance().println(
                'ts span:%d;product number:%d'
                % (max(timestamps) - min(timestamps), len(product_set)),
                2)
    finally:
        basic_collection.close()
        bug2group_collection.close()
def test_parse_dump_dup_file(self):
    """Parse the duplicate test dump and log the bugs of group 1.

    Loads the test configuration, runs IRDuplicateGroup over the test
    dump file, checks that the duplicate collection can be opened and
    logs the bug ids reported for group 1.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_duplicate_group import IRDuplicateGroup

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    groups = IRDuplicateGroup()
    groups.parse_dump_dup_file('../data/test/dump_dup_file_test')

    bug2group = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'r')
    assert bug2group is not None

    member_ids = groups.get_bugs_in_group(1)
    heading = 'In dump-dup_file_test: Group %d has bugs: ' % (1)
    members = ' '.join(map(str, member_ids))
    log.println(heading + members)
    log.stop_log()
def parse_dump_basic_file(cls, dump_filename=None):  # Not finished yet
    """Extract basic information from mysql dump and insert into mongo db.

    Every line of the dump is split into bug id, product and creation
    timestamp and stored as one document in the basic collection, which
    is then indexed by bug id.

    Args:
        dump_filename: str, Filename of dump file. If this parameter is not
            given, dump_filename will be fetched from config file
            (entry 'bug_dump_basic_filename').
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    product_name = config.get('bug_product_name', 'product')
    create_ts_name = config.get('bug_create_ts_name', 'ts')
    collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')
    # load and insert text file
    if dump_filename is None:
        dump_filename = config.get('bug_dump_basic_filename')

    def iter_for_line(line):
        # TODO here
        bug_id, product, ts = cls.__extract_basic_from_dump_file_line__(
            line)
        collection.insert({
            bug_id_name: int(bug_id),
            product_name: product,
            create_ts_name: int(ts),
        })

    # Close the dump file even when a line fails to parse or insert; the
    # previous open()/close() pair leaked the handle in that case.
    with open(dump_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, iter_for_line, 'Parsing Dump Basic')
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def parse_dump_file(cls, dump_filename=None):
    """Extract text from mysql dump and insert into mongo db.

    Each dump line yields one document (bug id, summary, description) in
    the text collection; an ascending index on the bug id is created once
    all lines are stored.

    Args:
        dump_filename: str, Filename of dump file. If this parameter is not
            given, dump_filename will be fetched from config file
            (entry 'bug_dump_text_filename').
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # get key name
    bug_id_name = IRConfig.get_instance().get('bug_id_name', 'bug_id')
    summary_name = IRConfig.get_instance().get('bug_summary_name', 'summ')
    description_name = IRConfig.get_instance().get(
        'bug_description_name', 'desc')
    # collection
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')
    # load and insert text file
    if dump_filename is None:
        dump_filename = IRConfig.get_instance().get(
            'bug_dump_text_filename')

    def iter_for_line(line):
        bug_id, summary, description = \
            cls.__extract_summary_and_description_from_dump_file_line(line)
        document = {
            bug_id_name: int(bug_id),
            summary_name: summary,
            description_name: description,
        }
        collection.insert(document)

    # 'with' guarantees the dump file is closed if iteration raises,
    # which the manual open()/close() pair did not.
    with open(dump_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, iter_for_line, 'Parsing Dump')
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection.close()
def batch_generate_tfidf(cls):
    """Compute the TFIDF of every bug and store it in MongoDB.

    Iterates the term counts of all bugs, calculates the TFIDF of the
    summary and of the description with the configured algorithm, and
    writes one document per bug to the tfidf collection, which is then
    indexed by bug id.
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_document_count import IRDocumentCount
    from ir_term_count import IRTermCount

    # Field names and algorithm selection.
    config = IRConfig.get_instance()
    id_key = config.get('bug_id_name')
    summary_key = config.get('bug_summary_name')
    description_key = config.get('bug_description_name')
    algorithm = config.get('tfidf_algorithm')

    # Document counts are cached up front, before the output collection
    # is opened.
    IRDocumentCount.cache_all_data()
    tfidf_collection = IRCollection(
        'bug_db_name', 'bug_tfidf_collection_name', 'w')

    termcount_cursor = IRTermCount.get_iterator()
    total_bugs = termcount_cursor.count()

    def store_tfidf(bug):
        def tfidf_of(field_key):
            return cls.calculate_tfidf(
                bug[field_key], field_key, total_bugs, None, algorithm)

        summary_weights = tfidf_of(summary_key)
        description_weights = tfidf_of(description_key)
        tfidf_collection.insert({
            id_key: bug[id_key],
            summary_key: summary_weights,
            description_key: description_weights,
        })

    IRProgressBar.execute_iteration_for_cursor(
        termcount_cursor, store_tfidf, "Calculating TFIDF")
    tfidf_collection.create_index([(id_key, IRCollection.ASCENDING)])
    tfidf_collection.close()
def __store_to_mongodb(cls, bug2group, group2bug):
    """Store duplicate group information into Mongodb.

    Writes two collections: the duplicate collection with one
    (bug id, group id) document per bug, indexed on both fields, and the
    duplicate group count collection with one (group id, group size)
    document per group, indexed on the group id.

    Args:
        bug2group: dict, {bug_id -> group_id}
        group2bug: dict, {group_id -> [bug_id]}
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    # Used as the group field of both collections; it was read from the
    # config twice under two local names.
    bug_group_name = config.get('bug_group_name')
    group_size_name = config.get('bug_group_size')

    # bug -> group collection
    duplicate_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'w')

    def iter_bug_group(bug):
        duplicate_collection.insert({
            bug_id_name: bug,
            bug_group_name: bug2group[bug],
        })

    IRProgressBar.execute_iteration_for_dict(
        bug2group, iter_bug_group, "Store to db")
    duplicate_collection.create_index(
        [(bug_id_name, IRCollection.ASCENDING)])
    duplicate_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_collection.close()

    # duplicate group size collection
    duplicate_group_count_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')

    def iter_group_bug(group):
        duplicate_group_count_collection.insert({
            bug_group_name: group,
            group_size_name: len(group2bug[group]),
        })

    IRProgressBar.execute_iteration_for_dict(
        group2bug, iter_group_bug, 'Store Index')
    duplicate_group_count_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_group_count_collection.close()
def __store_to_mongodb(cls, bug2group, group2bug):
    """Store duplicate group information into Mongodb.

    First the bug -> group mapping is written to the duplicate collection
    (indexed by bug id and by group id); then the size of every group is
    written to the duplicate group count collection (indexed by group id).

    Args:
        bug2group: dict, {bug_id -> group_id}
        group2bug: dict, {group_id -> [bug_id]}
    """
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    bug_id_name = IRConfig.get_instance().get('bug_id_name')
    bug_group_name = IRConfig.get_instance().get('bug_group_name')
    duplicate_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_collection_name', 'w')

    def iter_bug_group(bug):
        duplicate_collection.insert({
            bug_id_name: bug,
            bug_group_name: bug2group[bug],
        })

    IRProgressBar.execute_iteration_for_dict(
        bug2group, iter_bug_group, "Store to db")
    for indexed_field in (bug_id_name, bug_group_name):
        duplicate_collection.create_index(
            [(indexed_field, IRCollection.ASCENDING)])
    duplicate_collection.close()

    # duplicate group size collection
    # (the group field reuses bug_group_name; a loop that only counted
    # the groups into an unused variable has been dropped)
    group_size_name = IRConfig.get_instance().get('bug_group_size')
    duplicate_group_count_collection = IRCollection(
        'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')

    def iter_group_bug(group):
        duplicate_group_count_collection.insert({
            bug_group_name: group,
            group_size_name: len(group2bug[group]),
        })

    IRProgressBar.execute_iteration_for_dict(
        group2bug, iter_group_bug, 'Store Index')
    duplicate_group_count_collection.create_index(
        [(bug_group_name, IRCollection.ASCENDING)])
    duplicate_group_count_collection.close()
def test_ir_collection(self):
    """Exercise IRCollection: write, reopen for reading, verify, clean.

    A document is inserted through a collection opened in 'w' mode, the
    collection is reopened in 'r' mode, the inserted document is looked
    up through a direct pymongo connection, and finally the collection is
    cleaned.
    """
    from ir_mongodb_helper import IRCollection
    from ir_config import IRConfig
    import pymongo

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    db_cfg_name = 'bug_db_name'
    collection_cfg_name = 'bug_mongodb_helper_collection_name'

    # create empty collection
    writer = IRCollection(db_cfg_name, collection_cfg_name, 'w')
    assert None != writer
    writer.insert({'abc':'abc'})
    writer.close()

    # access existing collection
    reader = IRCollection(db_cfg_name, collection_cfg_name, 'r')
    assert None != reader
    reader.close()

    # test result
    host = config.get('db_host')
    port = config.get_int('db_port')
    connection = pymongo.Connection(host, port)
    db_name = config.get(db_cfg_name)
    collection_name = config.get(collection_cfg_name)
    raw_collection = connection[db_name][collection_name]
    found = raw_collection.find({'abc':'abc'})
    assert found.count() > 0

    # Remove the test data again.
    cleaner = IRCollection(db_cfg_name, collection_cfg_name, 'w')
    cleaner.clean()
    cleaner.close()
def parse_info_level1(cls, info_level1_filename=None):
    """Extract text and basic information and insert into mongo db.

    Every line of the info level1 file is parsed. Reports whose
    resolution is missing or "INCOMPLETE" are skipped, as are reports
    rejected by is_drop_report. For the others the text fields go to the
    text collection and the creation timestamp and product go to the
    basic collection.

    Args:
        info_level1_filename: str, Filename of info level1. If this
            parameter is not given, bug_info_level1_filename will be
            fetched from config file
    """
    import pymongo
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_gnome_st_tools import IRSTTools

    # get config
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')
    stacktrace_name = config.get('bug_stacktrace_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')
    # collections
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')
    collection_basic = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')
    community_name = config.get('community')
    # load and insert text file
    if info_level1_filename is None:
        info_level1_filename = config.get('bug_info_level1_filename')

    def func_each_line(line):
        bug_id, summary, description, resolution, create_ts, product = \
            cls.__extract_information_from_info_level1_line(line)
        if resolution is None or resolution == "INCOMPLETE":
            return
        # post process description
        description, stacktrace = \
            cls.extract_raw_description_info(description, community_name)
        # drop the report whose description containing stacktrace info
        if cls.is_drop_report(description):
            IRLog.get_instance().println('Drop report#=%d because it '\
                    'contains unrecognizable stacktrace.' 
                    % bug_id, 3)
            return
        collection.insert({
            bug_id_name: bug_id,
            summary_name: summary,
            description_name: description,
            stacktrace_name: stacktrace,
        })
        collection_basic.insert({
            bug_id_name: bug_id,
            create_ts_name: create_ts,
            product_name: product,
        })

    # The context manager closes the input file even when a line fails
    # to parse or insert; the previous open()/close() pair leaked it.
    with open(info_level1_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, func_each_line, "Parsing Infolevel 1")
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection_basic.create_index([
        (bug_id_name, IRCollection.ASCENDING),
        (create_ts_name, IRCollection.ASCENDING),
        (product_name, IRCollection.ASCENDING),
    ])
    collection.close()
    collection_basic.close()
#!/usr/bin/python2.7 if __name__ == '__main__': import sys from ir_config import IRConfig from ir_text import IRText from ir_mongodb_helper import IRCollection config = IRConfig.get_instance() config.load(sys.argv[1]) product_name = config.get('bug_product_name') products = dict() basic = IRCollection('bug_db_name', 'bug_basic_collection_name', 'r') cursor = basic.find(None) for bug in cursor: product = bug[product_name] if product not in products: products[product] = 0 products[product] += 1 product_list = products.items() product_list.sort(cmp=lambda x,y:cmp(x[1],y[1]), reverse=True) prefix = '' if sys.argv.__len__() < 3 else sys.argv[2] surfix = '' if sys.argv.__len__() < 4 else sys.argv[3] threshold = 100 if sys.argv.__len__() <5 else int(sys.argv[4]) for product in product_list: if product[1] < threshold:
def parse_info_level1(cls, info_level1_filename=None):
    """Extract text and insert into mongo db.

    Reads the info level1 file line by line. A report is stored only when
    its resolution is set and is not "INCOMPLETE" and is_drop_report does
    not reject its post-processed description. Stored reports produce one
    document in the text collection (summary, description, stacktrace)
    and one in the basic collection (creation timestamp, product).

    Args:
        info_level1_filename: str, Filename of info level1. If this
            parameter is not given, bug_info_level1_filename will be
            fetched from config file
    """
    import pymongo
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_gnome_st_tools import IRSTTools

    # get config
    bug_id_name = IRConfig.get_instance().get('bug_id_name', 'bug_id')
    summary_name = IRConfig.get_instance().get('bug_summary_name', 'summ')
    description_name = IRConfig.get_instance().get(
        'bug_description_name', 'desc')
    stacktrace_name = IRConfig.get_instance().get('bug_stacktrace_name')
    create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
    product_name = IRConfig.get_instance().get('bug_product_name')
    # collections
    collection = IRCollection(
        'bug_db_name', 'bug_text_collection_name', 'w')
    collection_basic = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'w')
    community_name = IRConfig.get_instance().get('community')
    # load and insert text file
    if info_level1_filename is None:
        info_level1_filename = IRConfig.get_instance().get(
            'bug_info_level1_filename')

    def func_each_line(line):
        bug_id, summary, description, resolution, create_ts, product = \
            cls.__extract_information_from_info_level1_line(line)
        if resolution is not None and resolution != "INCOMPLETE":
            # post process description
            description, stacktrace = \
                cls.extract_raw_description_info(description,
                                                 community_name)
            # drop the report whose description containing stacktrace
            # info
            if cls.is_drop_report(description):
                from ir_log import IRLog
                IRLog.get_instance().println('Drop report#=%d because it '\
                        'contains unrecognizable stacktrace.' 
                        % bug_id, 3)
                return
            text_document = {
                bug_id_name: bug_id,
                summary_name: summary,
                description_name: description,
                stacktrace_name: stacktrace,
            }
            basic_document = {
                bug_id_name: bug_id,
                create_ts_name: create_ts,
                product_name: product,
            }
            collection.insert(text_document)
            collection_basic.insert(basic_document)

    # 'with' guarantees the input file is closed if iteration raises,
    # which the manual open()/close() pair did not.
    with open(info_level1_filename, 'r') as in_file:
        IRProgressBar.execute_iteration_for_file(
            in_file, func_each_line, "Parsing Infolevel 1")
    collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
    collection_basic.create_index([
        (bug_id_name, IRCollection.ASCENDING),
        (create_ts_name, IRCollection.ASCENDING),
        (product_name, IRCollection.ASCENDING),
    ])
    collection.close()
    collection_basic.close()